xref: /freebsd/sys/dev/hyperv/pcib/vmbus_pcib.c (revision f391d6bc1d0464f62f1b8264666c897a680156b1)
1 /*-
2  * Copyright (c) 2016 Microsoft Corp.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/types.h>
33 #include <sys/malloc.h>
34 #include <sys/module.h>
35 #include <sys/kernel.h>
36 #include <sys/queue.h>
37 #include <sys/lock.h>
38 #include <sys/sx.h>
39 #include <sys/smp.h>
40 #include <sys/sysctl.h>
41 #include <sys/bus.h>
42 #include <sys/rman.h>
43 #include <sys/mutex.h>
44 #include <sys/errno.h>
45 
46 #include <vm/vm.h>
47 #include <vm/vm_param.h>
48 #include <vm/vm_kern.h>
49 #include <vm/pmap.h>
50 
51 #include <machine/atomic.h>
52 #include <machine/bus.h>
53 #include <machine/frame.h>
54 #include <machine/pci_cfgreg.h>
55 #include <machine/resource.h>
56 
57 #include <sys/pciio.h>
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 #include <dev/pci/pci_private.h>
61 #include <dev/pci/pcib_private.h>
62 #include "pcib_if.h"
63 
64 #include <machine/intr_machdep.h>
65 #include <x86/apicreg.h>
66 
67 #include <dev/hyperv/include/hyperv.h>
68 #include <dev/hyperv/include/hyperv_busdma.h>
69 #include <dev/hyperv/include/vmbus_xact.h>
70 #include <dev/hyperv/vmbus/vmbus_reg.h>
71 #include <dev/hyperv/vmbus/vmbus_chanvar.h>
72 
73 #include "vmbus_if.h"
74 
/*
 * Compatibility shim: rman_res_t and RM_MAX_END first appeared in
 * FreeBSD 11; provide equivalents when building on older trees.
 */
#if __FreeBSD_version < 1100000
typedef u_long rman_res_t;
#define RM_MAX_END	(~(rman_res_t)0)
#endif
79 
/*
 * Minimal clone of the Linux "completion" primitive, built on a mutex
 * plus sleep/wakeup: one thread signals via complete(), another blocks
 * in wait_for_completion() until a signal has arrived.
 */
struct completion {
	unsigned int done;	/* count of un-consumed complete() calls */
	struct mtx lock;	/* protects 'done' */
};

/* Initialize a completion to the "not signalled" state. */
static void
init_completion(struct completion *c)
{
	memset(c, 0, sizeof(*c));
	mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
	c->done = 0;	/* redundant after memset(); kept for clarity */
}

/* Destroy the embedded mutex; 'c' must not be used afterwards. */
static void
free_completion(struct completion *c)
{
	mtx_destroy(&c->lock);
}

/* Signal the completion and wake any thread sleeping on it. */
static void
complete(struct completion *c)
{
	mtx_lock(&c->lock);
	c->done++;
	mtx_unlock(&c->lock);
	wakeup(c);
}

/*
 * Sleep (no timeout, not interruptible) until complete() has been
 * called at least once, then consume one "done" token.
 */
static void
wait_for_completion(struct completion *c)
{
	mtx_lock(&c->lock);
	while (c->done == 0)
		mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
	c->done--;
	mtx_unlock(&c->lock);
}
117 
/*
 * Build a protocol version number: 16-bit major in the high word,
 * 16-bit minor in the low word.
 *
 * Bug fix: the original macro OR'ed in (major) twice instead of
 * (minor).  It happened to yield the correct value for version 1.1
 * only because major == minor there; any other version would have
 * been encoded incorrectly.
 */
#define PCI_MAKE_VERSION(major, minor) \
	((uint32_t)(((major) << 16) | (minor)))

enum {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
};
124 
/*
 * The per-bus config MMIO window is 8KB; the config registers proper
 * live in the second 4KB page (accesses below add CFG_PAGE_OFFSET).
 */
#define PCI_CONFIG_MMIO_LENGTH	0x2000
#define CFG_PAGE_OFFSET 0x1000
#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
128 
129 /*
130  * Message Types
131  */
132 
/* Message codes exchanged with the Hyper-V host over the VMBus channel. */
enum pci_message_type {
	/*
	 * Version 1.1
	 */
	PCI_MESSAGE_BASE                = 0x42490000,
	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
	PCI_MESSAGE_MAXIMUM
};
160 
161 /*
162  * Structures defining the virtual PCI Express protocol.
163  */
164 
/* Protocol version number: 16-bit major/minor halves of one 32-bit word. */
union pci_version {
	struct {
		uint16_t minor_version;
		uint16_t major_version;
	} parts;
	uint32_t version;
} __packed;

/*
 * This representation is the one used in Windows, which is
 * what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 */
union win_slot_encoding {
	struct {
		uint32_t	slot:5;
		uint32_t	func:3;
		uint32_t	reserved:24;
	} bits;
	uint32_t val;
} __packed;

/* Description of one virtual PCI function, as reported by the host. */
struct pci_func_desc {
	uint16_t	v_id;	/* vendor ID */
	uint16_t	d_id;	/* device ID */
	uint8_t		rev;
	uint8_t		prog_intf;
	uint8_t		subclass;
	uint8_t		base_class;
	uint32_t	subsystem_id;
	union win_slot_encoding wslot;
	uint32_t	ser;	/* serial number */
} __packed;

/* MSI request body sent in PCI_CREATE_INTERRUPT_MESSAGE. */
struct hv_msi_desc {
	uint8_t		vector;
	uint8_t		delivery_mode;
	uint16_t	vector_count;
	uint32_t	reserved;
	uint64_t	cpu_mask;
} __packed;

/* Translated interrupt (address/data pair) returned by the host. */
struct tran_int_desc {
	uint16_t	reserved;
	uint16_t	vector_count;
	uint32_t	data;
	uint64_t	address;
} __packed;

/* Common header carried at the start of every protocol message. */
struct pci_message {
	uint32_t type;
} __packed;

/* Message addressed to one child function via its Windows slot encoding. */
struct pci_child_message {
	struct pci_message message_type;
	union win_slot_encoding wslot;
} __packed;

/* Host-to-guest message, preceded by the VMBus channel packet header. */
struct pci_incoming_message {
	struct vmbus_chanpkt_hdr hdr;
	struct pci_message message_type;
} __packed;

/* Host's reply to a request sent with VMBUS_CHANPKT_FLAG_RC. */
struct pci_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status;	/* negative values are failures */
} __packed;

/*
 * In-flight request context.  Its address is used as the VMBus
 * transaction id, so the channel callback can recover it from a
 * completion packet and invoke completion_func.  The wire message
 * itself follows as a trailing variable-length member.
 */
struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
	    int resp_packet_size);
	void *compl_ctxt;

	struct pci_message message[0];
};
240 
241 /*
242  * Specific message types supporting the PCI protocol.
243  */
244 
/* Guest -> host: propose a protocol version (PCI_QUERY_PROTOCOL_VERSION). */
struct pci_version_request {
	struct pci_message message_type;
	uint32_t protocol_version;
	uint32_t is_last_attempt:1;
	uint32_t reservedz:31;
} __packed;

/* Guest -> host: enter D0, advertising the config MMIO base (PCI_BUS_D0ENTRY). */
struct pci_bus_d0_entry {
	struct pci_message message_type;
	uint32_t reserved;
	uint64_t mmio_base;
} __packed;

/* Host -> guest: current list of child functions (PCI_BUS_RELATIONS). */
struct pci_bus_relations {
	struct pci_incoming_message incoming;
	uint32_t device_count;
	struct pci_func_desc func[0];
} __packed;

#define MAX_NUM_BARS	(PCIR_MAX_BAR_0 + 1)
/* Host's reply to PCI_QUERY_RESOURCE_REQUIREMENTS: probed BAR values. */
struct pci_q_res_req_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status; /* negative values are failures */
	uint32_t probed_bar[MAX_NUM_BARS];
} __packed;

/* Guest -> host: PCI_RESOURCES_ASSIGNED notification. */
struct pci_resources_assigned {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
	uint32_t msi_descriptors;
	uint32_t reserved[4];
} __packed;

/* Guest -> host: PCI_CREATE_INTERRUPT_MESSAGE request. */
struct pci_create_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc int_desc;
} __packed;

/* Host's reply carrying the translated interrupt descriptor. */
struct pci_create_int_response {
	struct pci_response response;
	uint32_t reserved;
	struct tran_int_desc int_desc;
} __packed;

/* Guest -> host: PCI_DELETE_INTERRUPT_MESSAGE request. */
struct pci_delete_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct tran_int_desc int_desc;
} __packed;

/* Host -> guest: per-device message (e.g. PCI_EJECT). */
struct pci_dev_incoming {
	struct pci_incoming_message incoming;
	union win_slot_encoding wslot;
} __packed;

/* Guest -> host: PCI_EJECTION_COMPLETE acknowledgement. */
struct pci_eject_response {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint32_t status;
} __packed;
307 
308 /*
309  * Driver specific state.
310  */
311 
/* Lifecycle of the virtual bus: freshly created vs. fully attached. */
enum hv_pcibus_state {
	hv_pcibus_init = 0,
	hv_pcibus_installed,
};

/* Per-instance state of one Hyper-V virtual PCI bus. */
struct hv_pcibus {
	device_t pcib;
	device_t pci_bus;
	struct vmbus_pcib_softc *sc;

	uint16_t pci_domain;

	enum hv_pcibus_state state;

	/* MMIO window used for config space access */
	struct resource *cfg_res;

	struct completion query_completion, *query_comp;

	struct mtx config_lock; /* Avoid two threads writing index page */
	struct mtx device_list_lock;    /* Protect lists below */
	TAILQ_HEAD(, hv_pci_dev) children;
	TAILQ_HEAD(, hv_dr_state) dr_list;

	/* Set when detaching; suppresses new work (ejects, relations). */
	volatile int detaching;
};

/* One child PCI function on the virtual bus. */
struct hv_pci_dev {
	TAILQ_ENTRY(hv_pci_dev) link;

	struct pci_func_desc desc;

	/* true while absent from the latest bus-relations message */
	bool reported_missing;

	struct hv_pcibus *hbus;
	struct task eject_task;

	TAILQ_HEAD(, hv_irq_desc) irq_desc_list;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	uint32_t probed_bar[MAX_NUM_BARS];
};

/*
 * Tracks "Device Relations" messages from the host, which must be both
 * processed in order.
 */
struct hv_dr_work {
	struct task task;
	struct hv_pcibus *bus;
};

/* Snapshot of one bus-relations message, queued on hbus->dr_list. */
struct hv_dr_state {
	TAILQ_ENTRY(hv_dr_state) link;
	uint32_t device_count;
	struct pci_func_desc func[0];
};

/* One allocated interrupt for a child device, kept on irq_desc_list. */
struct hv_irq_desc {
	TAILQ_ENTRY(hv_irq_desc) link;
	struct tran_int_desc desc;
	int irq;
};
377 
/* Linux-style devfn encoding: slot in bits 7:3, function in bits 2:0. */
#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)         (((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)         ((devfn) & 0x07)
381 
382 static uint32_t
383 devfn_to_wslot(unsigned int devfn)
384 {
385 	union win_slot_encoding wslot;
386 
387 	wslot.val = 0;
388 	wslot.bits.slot = PCI_SLOT(devfn);
389 	wslot.bits.func = PCI_FUNC(devfn);
390 
391 	return (wslot.val);
392 }
393 
394 static unsigned int
395 wslot_to_devfn(uint32_t wslot)
396 {
397 	union win_slot_encoding encoding;
398 	unsigned int slot;
399 	unsigned int func;
400 
401 	encoding.val = wslot;
402 
403 	slot = encoding.bits.slot;
404 	func = encoding.bits.func;
405 
406 	return (PCI_DEVFN(slot, func));
407 }
408 
/* Softc for the vmbus_pcib device: the VMBus channel and rx buffer. */
struct vmbus_pcib_softc {
	struct vmbus_channel	*chan;
	void *rx_buf;		/* default receive buffer for the channel */

	struct taskqueue	*taskq;

	struct hv_pcibus	*hbus;
};

/* {44C4F61D-4444-4400-9D52-802E27EDE19F} */
static const struct hyperv_guid g_pass_through_dev_type = {
	.hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44,
	    0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F}
};
423 
/* Waiter context for requests that only need the host's status code. */
struct hv_pci_compl {
	struct completion host_event;
	int32_t completion_status;
};

/* Waiter context for PCI_QUERY_RESOURCE_REQUIREMENTS replies. */
struct q_res_req_compl {
	struct completion host_event;
	struct hv_pci_dev *hpdev;	/* device receiving the probed BARs */
};

/* Waiter context for PCI_CREATE_INTERRUPT_MESSAGE replies. */
struct compose_comp_ctxt {
	struct hv_pci_compl comp_pkt;
	struct tran_int_desc int_desc;
};
438 
/*
 * Generic completion callback: record the host's status (or -1 when the
 * response is too short to contain one) and wake the waiting thread.
 */
static void
hv_pci_generic_compl(void *context, struct pci_response *resp,
    int resp_packet_size)
{
	struct hv_pci_compl *comp_pkt = context;

	if (resp_packet_size >= sizeof(struct pci_response))
		comp_pkt->completion_status = resp->status;
	else
		comp_pkt->completion_status = -1;

	complete(&comp_pkt->host_event);
}
452 
/*
 * Completion callback for PCI_QUERY_RESOURCE_REQUIREMENTS: copy the
 * host-probed BAR values into the child device, then wake the waiter.
 */
static void
q_resource_requirements(void *context, struct pci_response *resp,
    int resp_packet_size)
{
	struct q_res_req_compl *completion = context;
	struct pci_q_res_req_response *q_res_req =
	    (struct pci_q_res_req_response *)resp;
	int i;

	if (resp->status < 0) {
		printf("vmbus_pcib: failed to query resource requirements\n");
	} else {
		for (i = 0; i < MAX_NUM_BARS; i++)
			completion->hpdev->probed_bar[i] =
			    q_res_req->probed_bar[i];
	}

	complete(&completion->host_event);
}
472 
/*
 * Completion callback for PCI_CREATE_INTERRUPT_MESSAGE: save both the
 * status and the translated interrupt descriptor, then wake the waiter.
 */
static void
hv_pci_compose_compl(void *context, struct pci_response *resp,
    int resp_packet_size)
{
	struct compose_comp_ctxt *comp_pkt = context;
	struct pci_create_int_response *int_resp =
	    (struct pci_create_int_response *)resp;

	comp_pkt->comp_pkt.completion_status = resp->status;
	comp_pkt->int_desc = int_resp->int_desc;
	complete(&comp_pkt->comp_pkt.host_event);
}
485 
/*
 * Tell the host to delete one interrupt of a child device (fire and
 * forget: no RC flag, no reply expected), then free the descriptor.
 */
static void
hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid)
{
	struct pci_delete_interrupt *int_pkt;
	struct {
		struct pci_packet pkt;
		uint8_t buffer[sizeof(struct pci_delete_interrupt)];
	} ctxt;

	memset(&ctxt, 0, sizeof(ctxt));
	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
	int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE;
	int_pkt->wslot.val = hpdev->desc.wslot.val;
	int_pkt->int_desc = hid->desc;

	vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
	    int_pkt, sizeof(*int_pkt), 0);

	free(hid, M_DEVBUF);
}
506 
/*
 * Tear down one child device: delete its newbus device_t (under Giant,
 * as required by the device tree), unlink it from the bus's child list,
 * release all of its interrupts, and free it.
 */
static void
hv_pci_delete_device(struct hv_pci_dev *hpdev)
{
	struct hv_pcibus *hbus = hpdev->hbus;
	struct hv_irq_desc *hid, *tmp_hid;
	device_t pci_dev;
	int devfn;

	devfn = wslot_to_devfn(hpdev->desc.wslot.val);

	mtx_lock(&Giant);

	/* Bus number is always 0 on a Hyper-V virtual PCI bus. */
	pci_dev = pci_find_dbsf(hbus->pci_domain,
	    0, PCI_SLOT(devfn), PCI_FUNC(devfn));
	if (pci_dev)
		device_delete_child(hbus->pci_bus, pci_dev);

	mtx_unlock(&Giant);

	mtx_lock(&hbus->device_list_lock);
	TAILQ_REMOVE(&hbus->children, hpdev, link);
	mtx_unlock(&hbus->device_list_lock);

	/* Safe iteration: hv_int_desc_free() frees each element. */
	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid)
		hv_int_desc_free(hpdev, hid);

	free(hpdev, M_DEVBUF);
}
535 
/*
 * Create driver state for a newly reported child function: query its
 * resource requirements from the host (synchronously, via a completion)
 * and insert it into the bus's child list.
 *
 * Returns the new child, or NULL if the channel send failed.
 */
static struct hv_pci_dev *
new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc)
{
	struct hv_pci_dev *hpdev;
	struct pci_child_message *res_req;
	struct q_res_req_compl comp_pkt;
	struct {
		struct pci_packet pkt;
		uint8_t buffer[sizeof(struct pci_child_message)];
	} ctxt;
	int ret;

	hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO);
	hpdev->hbus = hbus;

	TAILQ_INIT(&hpdev->irq_desc_list);

	init_completion(&comp_pkt.host_event);
	comp_pkt.hpdev = hpdev;

	ctxt.pkt.compl_ctxt = &comp_pkt;
	ctxt.pkt.completion_func = q_resource_requirements;

	res_req = (struct pci_child_message *)&ctxt.pkt.message;
	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
	res_req->wslot.val = desc->wslot.val;

	/* &ctxt.pkt doubles as the transaction id of the RC request. */
	ret = vmbus_chan_send(hbus->sc->chan,
	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
	    res_req, sizeof(*res_req), (uint64_t)&ctxt.pkt);
	if (ret)
		goto err;

	/* q_resource_requirements() fills hpdev->probed_bar[]. */
	wait_for_completion(&comp_pkt.host_event);
	free_completion(&comp_pkt.host_event);

	hpdev->desc = *desc;

	mtx_lock(&hbus->device_list_lock);
	TAILQ_INSERT_TAIL(&hbus->children, hpdev, link);
	mtx_unlock(&hbus->device_list_lock);
	return (hpdev);
err:
	free_completion(&comp_pkt.host_event);
	free(hpdev, M_DEVBUF);
	return (NULL);
}
583 
584 #if __FreeBSD_version < 1100000
585 
586 /* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */
587 
/*
 * Read config space for one (domain, bus, slot, func) and, if a device
 * responds, add it as a child of 'dev'.  Copied from FreeBSD 11's PCI
 * code for the pre-11 build.
 */
static struct pci_devinfo *
pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
    int slot, int func, size_t dinfo_size)
{
	struct pci_devinfo *dinfo;

	dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size);
	if (dinfo != NULL)
		pci_add_child(dev, dinfo);

	return (dinfo);
}
600 
/*
 * Rescan a PCI bus: enumerate all slot/function pairs, keep device_t's
 * that still respond, delete the ones that vanished, and probe/attach
 * the newly found ones.  Verbatim copy of FreeBSD 11's BUS_RESCAN
 * implementation for older trees.
 */
static int
pci_rescan(device_t dev)
{
#define	REG(n, w)	PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
	device_t pcib = device_get_parent(dev);
	struct pci_softc *sc;
	device_t child, *devlist, *unchanged;
	int devcount, error, i, j, maxslots, oldcount;
	int busno, domain, s, f, pcifunchigh;
	uint8_t hdrtype;

	/* No need to check for ARI on a rescan. */
	error = device_get_children(dev, &devlist, &devcount);
	if (error)
		return (error);
	if (devcount != 0) {
		/* unchanged[i] != NULL marks devices that still exist. */
		unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
		    M_NOWAIT | M_ZERO);
		if (unchanged == NULL) {
			free(devlist, M_TEMP);
			return (ENOMEM);
		}
	} else
		unchanged = NULL;

	sc = device_get_softc(dev);
	domain = pcib_get_domain(dev);
	busno = pcib_get_bus(dev);
	maxslots = PCIB_MAXSLOTS(pcib);
	for (s = 0; s <= maxslots; s++) {
		/* If function 0 is not present, skip to the next slot. */
		f = 0;
		if (REG(PCIR_VENDOR, 2) == 0xffff)
			continue;
		pcifunchigh = 0;
		hdrtype = REG(PCIR_HDRTYPE, 1);
		if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
			continue;
		if (hdrtype & PCIM_MFDEV)
			pcifunchigh = PCIB_MAXFUNCS(pcib);
		for (f = 0; f <= pcifunchigh; f++) {
			if (REG(PCIR_VENDOR, 2) == 0xffff)
				continue;

			/*
			 * Found a valid function.  Check if a
			 * device_t for this device already exists.
			 */
			for (i = 0; i < devcount; i++) {
				child = devlist[i];
				if (child == NULL)
					continue;
				if (pci_get_slot(child) == s &&
				    pci_get_function(child) == f) {
					unchanged[i] = child;
					goto next_func;
				}
			}

			pci_identify_function(pcib, dev, domain, busno, s, f,
			    sizeof(struct pci_devinfo));
		next_func:;
		}
	}

	/* Remove devices that are no longer present. */
	for (i = 0; i < devcount; i++) {
		if (unchanged[i] != NULL)
			continue;
		device_delete_child(dev, devlist[i]);
	}

	free(devlist, M_TEMP);
	oldcount = devcount;

	/* Try to attach the devices just added. */
	error = device_get_children(dev, &devlist, &devcount);
	if (error) {
		free(unchanged, M_TEMP);
		return (error);
	}

	for (i = 0; i < devcount; i++) {
		for (j = 0; j < oldcount; j++) {
			if (devlist[i] == unchanged[j])
				goto next_device;
		}

		device_probe_and_attach(devlist[i]);
	next_device:;
	}

	free(unchanged, M_TEMP);
	free(devlist, M_TEMP);
	return (0);
#undef REG
}
698 
699 #else
700 
/* On FreeBSD 11+ the generic bus method does the work. */
static int
pci_rescan(device_t dev)
{
	return (BUS_RESCAN(dev));
}
706 
707 #endif
708 
/*
 * Taskqueue handler for bus-relations messages.  Only the newest queued
 * snapshot matters: drain the dr_list, discard stale entries, then
 * reconcile the child list against the surviving snapshot — mark all
 * children missing, un-mark those still reported, create new ones,
 * delete the rest, and rescan the bus if anything was added.
 */
static void
pci_devices_present_work(void *arg, int pending __unused)
{
	struct hv_dr_work *dr_wrk = arg;
	struct hv_dr_state *dr = NULL;
	struct hv_pcibus *hbus;
	uint32_t child_no;
	bool found;
	struct pci_func_desc *new_desc;
	struct hv_pci_dev *hpdev, *tmp_hpdev;
	struct completion *query_comp;
	bool need_rescan = false;

	hbus = dr_wrk->bus;
	free(dr_wrk, M_DEVBUF);

	/* Pull this off the queue and process it if it was the last one. */
	mtx_lock(&hbus->device_list_lock);
	while (!TAILQ_EMPTY(&hbus->dr_list)) {
		dr = TAILQ_FIRST(&hbus->dr_list);
		TAILQ_REMOVE(&hbus->dr_list, dr, link);

		/* Throw this away if the list still has stuff in it. */
		if (!TAILQ_EMPTY(&hbus->dr_list)) {
			free(dr, M_DEVBUF);
			continue;
		}
	}
	mtx_unlock(&hbus->device_list_lock);

	/* Another queued instance of this task already did the work. */
	if (!dr)
		return;

	/* First, mark all existing children as reported missing. */
	mtx_lock(&hbus->device_list_lock);
	TAILQ_FOREACH(hpdev, &hbus->children, link)
		hpdev->reported_missing = true;
	mtx_unlock(&hbus->device_list_lock);

	/* Next, add back any reported devices. */
	for (child_no = 0; child_no < dr->device_count; child_no++) {
		found = false;
		new_desc = &dr->func[child_no];

		mtx_lock(&hbus->device_list_lock);
		TAILQ_FOREACH(hpdev, &hbus->children, link) {
			/* Match on slot, vendor/device id and serial. */
			if ((hpdev->desc.wslot.val ==
			    new_desc->wslot.val) &&
			    (hpdev->desc.v_id == new_desc->v_id) &&
			    (hpdev->desc.d_id == new_desc->d_id) &&
			    (hpdev->desc.ser == new_desc->ser)) {
				hpdev->reported_missing = false;
				found = true;
				break;
			}
		}
		mtx_unlock(&hbus->device_list_lock);

		if (!found) {
			if (!need_rescan)
				need_rescan = true;

			hpdev = new_pcichild_device(hbus, new_desc);
			if (!hpdev)
				printf("vmbus_pcib: failed to add a child\n");
		}
	}

	/* Remove missing device(s), if any */
	TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) {
		if (hpdev->reported_missing)
			hv_pci_delete_device(hpdev);
	}

	/* Rescan the bus to find any new device, if necessary. */
	if (hbus->state == hv_pcibus_installed && need_rescan)
		pci_rescan(hbus->pci_bus);

	/* Wake up hv_pci_query_relations(), if it's waiting. */
	query_comp = hbus->query_comp;
	if (query_comp) {
		hbus->query_comp = NULL;
		complete(query_comp);
	}

	free(dr, M_DEVBUF);
}
796 
797 static struct hv_pci_dev *
798 get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot)
799 {
800 	struct hv_pci_dev *hpdev, *ret = NULL;
801 
802 	mtx_lock(&hbus->device_list_lock);
803 	TAILQ_FOREACH(hpdev, &hbus->children, link) {
804 		if (hpdev->desc.wslot.val == wslot) {
805 			ret = hpdev;
806 			break;
807 		}
808 	}
809 	mtx_unlock(&hbus->device_list_lock);
810 
811 	return (ret);
812 }
813 
/*
 * Channel-callback-side half of bus-relations handling: snapshot the
 * message into an hv_dr_state, queue it, and kick a task to reconcile
 * it.  Ignored during detach (unless it's the final empty relations).
 */
static void
hv_pci_devices_present(struct hv_pcibus *hbus,
    struct pci_bus_relations *relations)
{
	struct hv_dr_state *dr;
	struct hv_dr_work *dr_wrk;
	unsigned long dr_size;

	if (hbus->detaching && relations->device_count > 0)
		return;

	dr_size = offsetof(struct hv_dr_state, func) +
	    (sizeof(struct pci_func_desc) * relations->device_count);
	dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO);

	dr->device_count = relations->device_count;
	if (dr->device_count != 0)
		memcpy(dr->func, relations->func,
		    sizeof(struct pci_func_desc) * dr->device_count);

	mtx_lock(&hbus->device_list_lock);
	TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link);
	mtx_unlock(&hbus->device_list_lock);

	dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO);
	dr_wrk->bus = hbus;
	TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk);
	taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task);
}
843 
/*
 * Taskqueue handler for a host-initiated ejection: destroy the child
 * (which frees hpdev — hence wslot/hbus are copied out first), then
 * acknowledge the host with PCI_EJECTION_COMPLETE.
 */
static void
hv_eject_device_work(void *arg, int pending __unused)
{
	struct hv_pci_dev *hpdev = arg;
	union win_slot_encoding wslot = hpdev->desc.wslot;
	struct hv_pcibus *hbus = hpdev->hbus;
	struct pci_eject_response *eject_pkt;
	struct {
		struct pci_packet pkt;
		uint8_t buffer[sizeof(struct pci_eject_response)];
	} ctxt;

	hv_pci_delete_device(hpdev);

	memset(&ctxt, 0, sizeof(ctxt));
	eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
	eject_pkt->message_type.type = PCI_EJECTION_COMPLETE;
	eject_pkt->wslot.val = wslot.val;
	vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
	    eject_pkt, sizeof(*eject_pkt), 0);
}
865 
/*
 * Schedule the ejection of a child device.  No-op while detaching.
 */
static void
hv_pci_eject_device(struct hv_pci_dev *hpdev)
{
	struct hv_pcibus *hbus = hpdev->hbus;
	struct taskqueue *taskq;

	if (hbus->detaching)
		return;

	/*
	 * Push this task into the same taskqueue on which
	 * vmbus_pcib_attach() runs, so we're sure this task can't run
	 * concurrently with vmbus_pcib_attach().
	 */
	TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev);
	taskq = vmbus_chan_mgmt_tq(hbus->sc->chan);
	taskqueue_enqueue(taskq, &hpdev->eject_task);
}
884 
#define PCIB_PACKET_SIZE	0x100

/*
 * VMBus channel callback: drain every pending packet.  Completion
 * packets carry our pci_packet pointer in the transaction id, so we can
 * dispatch them to the waiting request's completion_func; inband
 * packets are host-initiated messages (bus relations, ejects).
 *
 * Starts with the softc's fixed rx_buf; on ENOBUFS a larger temporary
 * buffer is allocated (and freed at the end — rx_buf itself is never
 * freed here, since bufferlen stays PCIB_PACKET_SIZE for it).
 */
static void
vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg)
{
	struct vmbus_pcib_softc *sc = arg;
	struct hv_pcibus *hbus = sc->hbus;

	void *buffer;
	int bufferlen = PCIB_PACKET_SIZE;

	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_msg;
	struct pci_bus_relations *bus_rel;
	struct pci_dev_incoming *dev_msg;
	struct hv_pci_dev *hpdev;

	buffer = sc->rx_buf;
	do {
		struct vmbus_chanpkt_hdr *pkt = buffer;
		uint32_t bytes_rxed;
		int ret;

		bytes_rxed = bufferlen;
		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);

		if (ret == ENOBUFS) {
			/* Handle large packet */
			if (bufferlen > PCIB_PACKET_SIZE) {
				free(buffer, M_DEVBUF);
				buffer = NULL;
			}

			/* alloc new buffer */
			buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO);
			bufferlen = bytes_rxed;

			continue;
		}

		if (ret != 0) {
			/* ignore EIO or EAGAIN */
			break;
		}

		/* Anything this small cannot be a valid response/message. */
		if (bytes_rxed <= sizeof(struct pci_response))
			continue;

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			/*
			 * cph_xactid is the &ctxt.pkt pointer we passed to
			 * vmbus_chan_send() with VMBUS_CHANPKT_FLAG_RC.
			 */
			comp_packet = (struct pci_packet *)pkt->cph_xactid;
			response = (struct pci_response *)pkt;
			comp_packet->completion_func(comp_packet->compl_ctxt,
			    response, bytes_rxed);
			break;
		case VMBUS_CHANPKT_TYPE_INBAND:
			new_msg = (struct pci_incoming_message *)buffer;

			switch (new_msg->message_type.type) {
			case PCI_BUS_RELATIONS:
				bus_rel = (struct pci_bus_relations *)buffer;

				if (bus_rel->device_count == 0)
					break;

				/* Validate the claimed count fits in the packet. */
				if (bytes_rxed <
				    offsetof(struct pci_bus_relations, func) +
				        (sizeof(struct pci_func_desc) *
				            (bus_rel->device_count)))
					break;

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_EJECT:
				dev_msg = (struct pci_dev_incoming *)buffer;
				hpdev = get_pcichild_wslot(hbus,
				    dev_msg->wslot.val);

				if (hpdev)
					hv_pci_eject_device(hpdev);

				break;
			default:
				printf("vmbus_pcib: Unknown msg type 0x%x\n",
				    new_msg->message_type.type);
				break;
			}
			break;
		default:
			printf("vmbus_pcib: Unknown VMBus msg type %hd\n",
			    pkt->cph_type);
			break;
		}
	} while (1);

	if (bufferlen > PCIB_PACKET_SIZE)
		free(buffer, M_DEVBUF);
}
985 
/*
 * Negotiate the virtual PCI protocol version with the host
 * (synchronously).  Returns 0 on success, EPROTO if the host rejected
 * the version, or the vmbus_chan_send() error.
 */
static int
hv_pci_protocol_negotiation(struct hv_pcibus *hbus)
{
	struct pci_version_request *version_req;
	struct hv_pci_compl comp_pkt;
	struct {
		struct pci_packet pkt;
		uint8_t buffer[sizeof(struct pci_version_request)];
	} ctxt;
	int ret;

	init_completion(&comp_pkt.host_event);

	ctxt.pkt.completion_func = hv_pci_generic_compl;
	ctxt.pkt.compl_ctxt = &comp_pkt;
	version_req = (struct pci_version_request *)&ctxt.pkt.message;
	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
	/* Only one version is offered, so this attempt is also the last. */
	version_req->is_last_attempt = 1;

	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
	    VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req),
	    (uint64_t)&ctxt.pkt);
	if (ret)
		goto out;

	wait_for_completion(&comp_pkt.host_event);

	if (comp_pkt.completion_status < 0) {
		device_printf(hbus->pcib,
		    "vmbus_pcib version negotiation failed: %x\n",
		    comp_pkt.completion_status);
		ret = EPROTO;
	} else {
		ret = 0;
	}
out:
	free_completion(&comp_pkt.host_event);
	return (ret);
}
1026 
1027 /* Ask the host to send along the list of child devices */
1028 static int
1029 hv_pci_query_relations(struct hv_pcibus *hbus)
1030 {
1031 	struct pci_message message;
1032 	int ret;
1033 
1034 	message.type = PCI_QUERY_BUS_RELATIONS;
1035 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
1036 	    &message, sizeof(message), 0);
1037 	return (ret);
1038 }
1039 
/*
 * Move the virtual bus into the powered-on (D0) state, telling the
 * host which MMIO region was chosen for config space access.  Returns
 * 0, EPROTO on a negative host status, or the channel send error.
 */
static int
hv_pci_enter_d0(struct hv_pcibus *hbus)
{
	struct pci_bus_d0_entry *d0_entry;
	struct hv_pci_compl comp_pkt;
	struct {
		struct pci_packet pkt;
		uint8_t buffer[sizeof(struct pci_bus_d0_entry)];
	} ctxt;
	int ret;

	/*
	 * Tell the host that the bus is ready to use, and moved into the
	 * powered-on state.  This includes telling the host which region
	 * of memory-mapped I/O space has been chosen for configuration space
	 * access.
	 */
	init_completion(&comp_pkt.host_event);

	ctxt.pkt.completion_func = hv_pci_generic_compl;
	ctxt.pkt.compl_ctxt = &comp_pkt;

	d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message;
	memset(d0_entry, 0, sizeof(*d0_entry));
	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
	d0_entry->mmio_base = rman_get_start(hbus->cfg_res);

	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
	    VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry),
	    (uint64_t)&ctxt.pkt);
	if (ret)
		goto out;

	wait_for_completion(&comp_pkt.host_event);

	if (comp_pkt.completion_status < 0) {
		device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n");
		ret = EPROTO;
	} else {
		ret = 0;
	}

out:
	free_completion(&comp_pkt.host_event);
	return (ret);
}
1086 
1087 /*
1088  * It looks this is only needed by Windows VM, but let's send the message too
1089  * just to make the host happy.
1090  */
/*
 * Send a PCI_RESOURCES_ASSIGNED message for every existing child
 * (all 256 possible wslot values are probed), waiting for each reply.
 * Stops at the first failure and returns its error; 0 on success.
 */
static int
hv_send_resources_allocated(struct hv_pcibus *hbus)
{
	struct pci_resources_assigned *res_assigned;
	struct hv_pci_compl comp_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_packet *pkt;
	uint32_t wslot;
	int ret = 0;

	/* One request buffer, reused (and re-zeroed) for every child. */
	pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned),
	    M_DEVBUF, M_WAITOK | M_ZERO);

	for (wslot = 0; wslot < 256; wslot++) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		init_completion(&comp_pkt.host_event);

		memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
		pkt->completion_func = hv_pci_generic_compl;
		pkt->compl_ctxt = &comp_pkt;

		res_assigned = (struct pci_resources_assigned *)&pkt->message;
		res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
		res_assigned->wslot.val = hpdev->desc.wslot.val;

		ret = vmbus_chan_send(hbus->sc->chan,
		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
		    &pkt->message, sizeof(*res_assigned), (uint64_t)pkt);
		if (ret) {
			free_completion(&comp_pkt.host_event);
			break;
		}

		wait_for_completion(&comp_pkt.host_event);
		free_completion(&comp_pkt.host_event);

		if (comp_pkt.completion_status < 0) {
			ret = EPROTO;
			device_printf(hbus->pcib,
			    "failed to send PCI_RESOURCES_ASSIGNED\n");
			break;
		}
	}

	free(pkt, M_DEVBUF);
	return (ret);
}
1141 
1142 static int
1143 hv_send_resources_released(struct hv_pcibus *hbus)
1144 {
1145 	struct pci_child_message pkt;
1146 	struct hv_pci_dev *hpdev;
1147 	uint32_t wslot;
1148 	int ret;
1149 
1150 	for (wslot = 0; wslot < 256; wslot++) {
1151 		hpdev = get_pcichild_wslot(hbus, wslot);
1152 		if (!hpdev)
1153 			continue;
1154 
1155 		pkt.message_type.type = PCI_RESOURCES_RELEASED;
1156 		pkt.wslot.val = hpdev->desc.wslot.val;
1157 
1158 		ret = vmbus_chan_send(hbus->sc->chan,
1159 		    VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0);
1160 		if (ret)
1161 			return (ret);
1162 	}
1163 
1164 	return (0);
1165 }
1166 
/*
 * Generators for thin accessors around the config-window MMIO resource:
 * hv_cfg_read_<s>() / hv_cfg_write_<s>() access <s> bytes (a uint<x>_t)
 * at the given offset within bus->cfg_res.
 */
#define hv_cfg_read(x, s)						\
static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus,	\
    bus_size_t offset)							\
{									\
	return (bus_read_##s(bus->cfg_res, offset));			\
}

#define hv_cfg_write(x, s)						\
static inline void hv_cfg_write_##s(struct hv_pcibus *bus,		\
    bus_size_t offset, uint##x##_t val)					\
{									\
	return (bus_write_##s(bus->cfg_res, offset, val));		\
}

/* Instantiate the 1-, 2- and 4-byte variants. */
hv_cfg_read(8, 1)
hv_cfg_read(16, 2)
hv_cfg_read(32, 4)

hv_cfg_write(8, 1)
hv_cfg_write(16, 2)
hv_cfg_write(32, 4)
1188 
/*
 * Emulated config-space read for a pass-through device.  Registers up to
 * the capability pointer that the host does not expose (IDs, class code,
 * ROM BAR, INTLINE/INTPIN) are synthesized from the cached device
 * descriptor or forced to zero; everything else is read from the shared
 * config window under the bus's config lock.
 */
static void
_hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size,
    uint32_t *val)
{
	struct hv_pcibus *hbus = hpdev->hbus;
	bus_size_t addr = CFG_PAGE_OFFSET + where;

	/*
	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
	 */
	if (where + size <= PCIR_COMMAND) {
		/* Vendor/device ID: serve from the cached descriptor. */
		memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size);
	} else if (where >= PCIR_REVID && where + size <=
		   PCIR_CACHELNSZ) {
		/* Revision ID and class code, also cached. */
		memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where -
		       PCIR_REVID, size);
	} else if (where >= PCIR_SUBVEND_0 && where + size <=
		   PCIR_BIOS) {
		/* Subsystem vendor/device ID from the descriptor. */
		memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where -
		       PCIR_SUBVEND_0, size);
	} else if (where >= PCIR_BIOS && where + size <=
		   PCIR_CAP_PTR) {
		/* ROM BARs are unimplemented */
		*val = 0;
	} else if ((where >= PCIR_INTLINE && where + size <=
		   PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) {
		/*
		 * Interrupt Line and Interrupt PIN are hard-wired to zero
		 * because this front-end only supports message-signaled
		 * interrupts.
		 */
		*val = 0;
	} else if (where + size <= CFG_PAGE_SIZE) {
		mtx_lock(&hbus->config_lock);

		/* Choose the function to be read. */
		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);

		/* Make sure the function was chosen before we start reading.*/
		mb();

		/* Read from that function's config space. */
		switch (size) {
		case 1:
			*((uint8_t *)val) = hv_cfg_read_1(hbus, addr);
			break;
		case 2:
			*((uint16_t *)val) = hv_cfg_read_2(hbus, addr);
			break;
		default:
			*((uint32_t *)val) = hv_cfg_read_4(hbus, addr);
			break;
		}
		/*
		 * Make sure the write was done before we release the lock,
		 * allowing consecutive reads/writes.
		 */
		mb();

		mtx_unlock(&hbus->config_lock);
	} else {
		/* Invalid config read: it's unlikely to reach here. */
		memset(val, 0, size);
	}
}
1254 
1255 static void
1256 _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size,
1257     uint32_t val)
1258 {
1259 	struct hv_pcibus *hbus = hpdev->hbus;
1260 	bus_size_t addr = CFG_PAGE_OFFSET + where;
1261 
1262 	/* SSIDs and ROM BARs are read-only */
1263 	if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR)
1264 		return;
1265 
1266 	if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) {
1267 		mtx_lock(&hbus->config_lock);
1268 
1269 		/* Choose the function to be written. */
1270 		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1271 
1272 		/* Make sure the function was chosen before we start writing.*/
1273 		wmb();
1274 
1275 		/* Write to that function's config space. */
1276 		switch (size) {
1277 		case 1:
1278 			hv_cfg_write_1(hbus, addr, (uint8_t)val);
1279 			break;
1280 		case 2:
1281 			hv_cfg_write_2(hbus, addr, (uint16_t)val);
1282 			break;
1283 		default:
1284 			hv_cfg_write_4(hbus, addr, (uint32_t)val);
1285 			break;
1286 		}
1287 
1288 		/*
1289 		 * Make sure the write was done before we release the lock,
1290 		 * allowing consecutive reads/writes.
1291 		 */
1292 		mb();
1293 
1294 		mtx_unlock(&hbus->config_lock);
1295 	} else {
1296 		/* Invalid config write: it's unlikely to reach here. */
1297 		return;
1298 	}
1299 }
1300 
1301 static void
1302 vmbus_pcib_set_detaching(void *arg, int pending __unused)
1303 {
1304 	struct hv_pcibus *hbus = arg;
1305 
1306 	atomic_set_int(&hbus->detaching, 1);
1307 }
1308 
1309 static void
1310 vmbus_pcib_pre_detach(struct hv_pcibus *hbus)
1311 {
1312 	struct task task;
1313 
1314 	TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus);
1315 
1316 	/*
1317 	 * Make sure the channel callback won't push any possible new
1318 	 * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq.
1319 	 */
1320 	vmbus_chan_run_task(hbus->sc->chan, &task);
1321 
1322 	taskqueue_drain_all(hbus->sc->taskq);
1323 }
1324 
1325 
1326 /*
1327  * Standard probe entry point.
1328  *
1329  */
1330 static int
1331 vmbus_pcib_probe(device_t dev)
1332 {
1333 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1334 	    &g_pass_through_dev_type) == 0) {
1335 		device_set_desc(dev, "Hyper-V PCI Express Pass Through");
1336 		return (BUS_PROBE_DEFAULT);
1337 	}
1338 	return (ENXIO);
1339 }
1340 
1341 /*
1342  * Standard attach entry point.
1343  *
1344  */
static int
vmbus_pcib_attach(device_t dev)
{
	const int pci_ring_size = (4 * PAGE_SIZE);
	const struct hyperv_guid *inst_guid;
	struct vmbus_channel *channel;
	struct vmbus_pcib_softc *sc;
	struct hv_pcibus *hbus;
	int rid = 0;
	int ret;

	hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO);
	hbus->pcib = dev;

	/*
	 * Derive the PCI domain number from bytes 8-9 of the channel's
	 * instance GUID, so each pass-through bus gets its own domain.
	 */
	channel = vmbus_get_channel(dev);
	inst_guid = vmbus_chan_guid_inst(channel);
	hbus->pci_domain = inst_guid->hv_guid[9] |
			  (inst_guid->hv_guid[8] << 8);

	mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF);
	mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF);
	TAILQ_INIT(&hbus->children);
	TAILQ_INIT(&hbus->dr_list);

	/* Page-aligned MMIO window shared with the host for config access. */
	hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
	    0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH,
	    RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));

	if (!hbus->cfg_res) {
		device_printf(dev, "failed to get resource for cfg window\n");
		ret = ENXIO;
		goto free_bus;
	}

	sc = device_get_softc(dev);
	sc->chan = channel;
	sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
	sc->hbus = hbus;

	/*
	 * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT
	 * messages. NB: we can't handle the messages in the channel callback
	 * directly, because the message handlers need to send new messages
	 * to the host and waits for the host's completion messages, which
	 * must also be handled by the channel callback.
	 */
	sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->taskq);
	taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq");

	hbus->sc = sc;

	init_completion(&hbus->query_completion);
	hbus->query_comp = &hbus->query_completion;

	ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size,
		NULL, 0, vmbus_pcib_on_channel_callback, sc);
	if (ret)
		goto free_res;

	/* Bring-up sequence: negotiate, query children, enter D0, assign. */
	ret = hv_pci_protocol_negotiation(hbus);
	if (ret)
		goto vmbus_close;

	ret = hv_pci_query_relations(hbus);
	if (ret)
		goto vmbus_close;
	/* Block until the channel callback has seen PCI_BUS_RELATIONS. */
	wait_for_completion(hbus->query_comp);

	ret = hv_pci_enter_d0(hbus);
	if (ret)
		goto vmbus_close;

	ret = hv_send_resources_allocated(hbus);
	if (ret)
		goto vmbus_close;

	hbus->pci_bus = device_add_child(dev, "pci", -1);
	if (!hbus->pci_bus) {
		device_printf(dev, "failed to create pci bus\n");
		ret = ENXIO;
		goto vmbus_close;
	}

	bus_generic_attach(dev);

	hbus->state = hv_pcibus_installed;

	return (0);

	/* Error unwind: release in reverse order of acquisition. */
vmbus_close:
	vmbus_pcib_pre_detach(hbus);
	vmbus_chan_close(sc->chan);
free_res:
	taskqueue_free(sc->taskq);
	free_completion(&hbus->query_completion);
	free(sc->rx_buf, M_DEVBUF);
	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
free_bus:
	mtx_destroy(&hbus->device_list_lock);
	mtx_destroy(&hbus->config_lock);
	free(hbus, M_DEVBUF);
	return (ret);
}
1449 
1450 /*
1451  * Standard detach entry point
1452  */
static int
vmbus_pcib_detach(device_t dev)
{
	struct vmbus_pcib_softc *sc = device_get_softc(dev);
	struct hv_pcibus *hbus = sc->hbus;
	struct pci_message teardown_packet;
	struct pci_bus_relations relations;
	int ret;

	/* Stop new hot-plug/eject work and drain what is queued. */
	vmbus_pcib_pre_detach(hbus);

	if (hbus->state == hv_pcibus_installed)
		bus_generic_detach(dev);

	/* Delete any children which might still exist. */
	memset(&relations, 0, sizeof(relations));
	hv_pci_devices_present(hbus, &relations);

	/* Tell the host the children's resources are no longer in use. */
	ret = hv_send_resources_released(hbus);
	if (ret)
		device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n");

	/* Ask the host to move the bus out of D0 (fire-and-forget). */
	teardown_packet.type = PCI_BUS_D0EXIT;
	ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
	    &teardown_packet, sizeof(struct pci_message), 0);
	if (ret)
		device_printf(dev, "failed to send PCI_BUS_D0EXIT\n");

	taskqueue_drain_all(hbus->sc->taskq);
	vmbus_chan_close(sc->chan);
	taskqueue_free(sc->taskq);

	/* Release everything acquired in attach, in reverse order. */
	free_completion(&hbus->query_completion);
	free(sc->rx_buf, M_DEVBUF);
	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);

	mtx_destroy(&hbus->device_list_lock);
	mtx_destroy(&hbus->config_lock);
	free(hbus, M_DEVBUF);

	return (0);
}
1495 
1496 static int
1497 vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val)
1498 {
1499 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1500 
1501 	switch (which) {
1502 	case PCIB_IVAR_DOMAIN:
1503 		*val = sc->hbus->pci_domain;
1504 		return (0);
1505 
1506 	case PCIB_IVAR_BUS:
1507 		/* There is only bus 0. */
1508 		*val = 0;
1509 		return (0);
1510 	}
1511 	return (ENOENT);
1512 }
1513 
/* No ivars of this bridge are writable by children. */
static int
vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val)
{
	return (ENOENT);
}
1519 
/*
 * Resource allocator for children.  Bus numbers come from the domain's
 * bus rman; port I/O is refused; memory BARs are clamped to 32-bit
 * addresses when the probed BAR is not a 64-bit one; everything else is
 * delegated upward.
 */
static struct resource *
vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
	rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
	unsigned int bar_no;
	struct hv_pci_dev *hpdev;
	struct vmbus_pcib_softc *sc = device_get_softc(dev);
	struct resource *res;
	unsigned int devfn;

	if (type == PCI_RES_BUS)
		return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid,
		    start, end, count, flags));

	/* Devices with port I/O BAR are not supported. */
	if (type == SYS_RES_IOPORT)
		return (NULL);

	if (type == SYS_RES_MEMORY) {
		devfn = PCI_DEVFN(pci_get_slot(child),
		    pci_get_function(child));
		hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
		if (!hpdev)
			return (NULL);

		bar_no = PCI_RID2BAR(*rid);
		if (bar_no >= MAX_NUM_BARS)
			return (NULL);

		/* Make sure a 32-bit BAR gets a 32-bit address */
		if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64))
			end = ulmin(end, 0xFFFFFFFF);
	}

	res = bus_generic_alloc_resource(dev, child, type, rid,
		start, end, count, flags);
	/*
	 * If this is a request for a specific range, assume it is
	 * correct and pass it up to the parent.
	 *
	 * NOTE(review): this retry repeats the identical call above;
	 * presumably it is meant to retry with relaxed constraints —
	 * confirm against the platform's intended fallback behavior.
	 */
	if (res == NULL && start + count - 1 == end)
		res = bus_generic_alloc_resource(dev, child, type, rid,
		    start, end, count, flags);
	return (res);
}
1565 
1566 static int
1567 vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid,
1568     struct resource *r)
1569 {
1570 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1571 
1572 	if (type == PCI_RES_BUS)
1573 		return (pci_domain_release_bus(sc->hbus->pci_domain, child,
1574 		    rid, r));
1575 
1576 	if (type == SYS_RES_IOPORT)
1577 		return (EINVAL);
1578 
1579 	return (bus_generic_release_resource(dev, child, type, rid, r));
1580 }
1581 
#if __FreeBSD_version >= 1100000
/* Forward CPU-set queries from children straight to our parent bus. */
static int
vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
    size_t setsize, cpuset_t *cpuset)
{
	return (bus_get_cpus(pcib, op, setsize, cpuset));
}
#endif
1590 
1591 static uint32_t
1592 vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func,
1593     u_int reg, int bytes)
1594 {
1595 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1596 	struct hv_pci_dev *hpdev;
1597 	unsigned int devfn = PCI_DEVFN(slot, func);
1598 	uint32_t data = 0;
1599 
1600 	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1601 
1602 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1603 	if (!hpdev)
1604 		return (~0);
1605 
1606 	_hv_pcifront_read_config(hpdev, reg, bytes, &data);
1607 
1608 	return (data);
1609 }
1610 
1611 static void
1612 vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func,
1613     u_int reg, uint32_t data, int bytes)
1614 {
1615 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1616 	struct hv_pci_dev *hpdev;
1617 	unsigned int devfn = PCI_DEVFN(slot, func);
1618 
1619 	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1620 
1621 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1622 	if (!hpdev)
1623 		return;
1624 
1625 	_hv_pcifront_write_config(hpdev, reg, bytes, data);
1626 }
1627 
/*
 * INTx routing is never possible on this bridge.
 */
static int
vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin)
{
	/* We only support MSI/MSI-X and don't support INTx interrupt. */
	return (PCI_INVALID_IRQ);
}
1634 
/* MSI allocation is delegated to the parent (vmbus) bus. */
static int
vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count,
    int maxcount, int *irqs)
{
	return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount,
	    irqs));
}
1642 
/* MSI release is delegated to the parent (vmbus) bus. */
static int
vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
{
	return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs));
}
1648 
/* MSI-X allocation is delegated to the parent (vmbus) bus. */
static int
vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
{
	return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq));
}
1654 
/* MSI-X release is delegated to the parent (vmbus) bus. */
static int
vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq)
{
	return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq));
}
1660 
/* x86 MSI address/data fields used to decode what the parent composed. */
#define	MSI_INTEL_ADDR_DEST	0x000ff000
#define	MSI_INTEL_DATA_INTVEC	IOART_INTVEC	/* Interrupt vector. */
#define	MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
1664 
1665 static int
1666 vmbus_pcib_map_msi(device_t pcib, device_t child, int irq,
1667     uint64_t *addr, uint32_t *data)
1668 {
1669 	unsigned int devfn;
1670 	struct hv_pci_dev *hpdev;
1671 
1672 	uint64_t v_addr;
1673 	uint32_t v_data;
1674 	struct hv_irq_desc *hid, *tmp_hid;
1675 	unsigned int cpu, vcpu_id;
1676 	unsigned int vector;
1677 
1678 	struct vmbus_pcib_softc *sc = device_get_softc(pcib);
1679 	struct pci_create_interrupt *int_pkt;
1680 	struct compose_comp_ctxt comp;
1681 	struct {
1682 		struct pci_packet pkt;
1683 		uint8_t buffer[sizeof(struct pci_create_interrupt)];
1684 	} ctxt;
1685 
1686 	int ret;
1687 
1688 	devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child));
1689 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1690 	if (!hpdev)
1691 		return (ENOENT);
1692 
1693 	ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq,
1694 	    &v_addr, &v_data);
1695 	if (ret)
1696 		return (ret);
1697 
1698 	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) {
1699 		if (hid->irq == irq) {
1700 			TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link);
1701 			hv_int_desc_free(hpdev, hid);
1702 			break;
1703 		}
1704 	}
1705 
1706 	cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12;
1707 	vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu);
1708 	vector = v_data & MSI_INTEL_DATA_INTVEC;
1709 
1710 	init_completion(&comp.comp_pkt.host_event);
1711 
1712 	memset(&ctxt, 0, sizeof(ctxt));
1713 	ctxt.pkt.completion_func = hv_pci_compose_compl;
1714 	ctxt.pkt.compl_ctxt = &comp;
1715 
1716 	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
1717 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1718 	int_pkt->wslot.val = hpdev->desc.wslot.val;
1719 	int_pkt->int_desc.vector = vector;
1720 	int_pkt->int_desc.vector_count = 1;
1721 	int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED;
1722 	int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id;
1723 
1724 	ret = vmbus_chan_send(sc->chan,	VMBUS_CHANPKT_TYPE_INBAND,
1725 	    VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt),
1726 	    (uint64_t)&ctxt.pkt);
1727 	if (ret) {
1728 		free_completion(&comp.comp_pkt.host_event);
1729 		return (ret);
1730 	}
1731 
1732 	wait_for_completion(&comp.comp_pkt.host_event);
1733 	free_completion(&comp.comp_pkt.host_event);
1734 
1735 	if (comp.comp_pkt.completion_status < 0)
1736 		return (EPROTO);
1737 
1738 	*addr = comp.int_desc.address;
1739 	*data = comp.int_desc.data;
1740 
1741 	hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO);
1742 	hid->irq = irq;
1743 	hid->desc = comp.int_desc;
1744 	TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link);
1745 
1746 	return (0);
1747 }
1748 
/* newbus/pcib method dispatch table for the pass-through bridge. */
static device_method_t vmbus_pcib_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,         vmbus_pcib_probe),
	DEVMETHOD(device_attach,        vmbus_pcib_attach),
	DEVMETHOD(device_detach,        vmbus_pcib_detach),
	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
	DEVMETHOD(device_suspend,	bus_generic_suspend),
	DEVMETHOD(device_resume,	bus_generic_resume),

	/* Bus interface */
	DEVMETHOD(bus_read_ivar,		vmbus_pcib_read_ivar),
	DEVMETHOD(bus_write_ivar,		vmbus_pcib_write_ivar),
	DEVMETHOD(bus_alloc_resource,		vmbus_pcib_alloc_resource),
	DEVMETHOD(bus_release_resource,		vmbus_pcib_release_resource),
	DEVMETHOD(bus_activate_resource,   bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
	DEVMETHOD(bus_setup_intr,	   bus_generic_setup_intr),
	DEVMETHOD(bus_teardown_intr,	   bus_generic_teardown_intr),
#if __FreeBSD_version >= 1100000
	DEVMETHOD(bus_get_cpus,			vmbus_pcib_get_cpus),
#endif

	/* pcib interface */
	DEVMETHOD(pcib_maxslots,		pcib_maxslots),
	DEVMETHOD(pcib_read_config,		vmbus_pcib_read_config),
	DEVMETHOD(pcib_write_config,		vmbus_pcib_write_config),
	DEVMETHOD(pcib_route_interrupt,		vmbus_pcib_route_intr),
	DEVMETHOD(pcib_alloc_msi,		vmbus_pcib_alloc_msi),
	DEVMETHOD(pcib_release_msi,		vmbus_pcib_release_msi),
	DEVMETHOD(pcib_alloc_msix,		vmbus_pcib_alloc_msix),
	DEVMETHOD(pcib_release_msix,		vmbus_pcib_release_msix),
	DEVMETHOD(pcib_map_msi,			vmbus_pcib_map_msi),

	DEVMETHOD_END
};
1784 
static devclass_t pcib_devclass;

/* Register the driver under the "pcib" class, attached below vmbus. */
DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods,
		sizeof(struct vmbus_pcib_softc));
DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0);
MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1);
MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1);
1792