xref: /freebsd/sys/dev/hyperv/vmbus/vmbus_chan.c (revision 43e29d03f416d7dda52112a29600a7c82ee1a91e)
1 /*-
2  * Copyright (c) 2009-2012,2016 Microsoft Corp.
3  * Copyright (c) 2012 NetApp Inc.
4  * Copyright (c) 2012 Citrix Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/callout.h>
35 #include <sys/kernel.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/smp.h>
40 #include <sys/sysctl.h>
41 #include <sys/systm.h>
42 
43 #include <machine/atomic.h>
44 #include <machine/stdarg.h>
45 
46 #include <vm/vm.h>
47 #include <vm/pmap.h>
48 #include <vm/vm_extern.h>
49 
50 #include <dev/hyperv/include/vmbus_xact.h>
51 #include <dev/hyperv/vmbus/hyperv_var.h>
52 #include <dev/hyperv/vmbus/vmbus_reg.h>
53 #include <dev/hyperv/vmbus/vmbus_var.h>
54 #include <dev/hyperv/vmbus/vmbus_brvar.h>
55 #include <dev/hyperv/vmbus/vmbus_chanvar.h>
56 
57 struct vmbus_chan_pollarg {
58 	struct vmbus_channel	*poll_chan;
59 	u_int			poll_hz;
60 };
61 
62 static void			vmbus_chan_update_evtflagcnt(
63 				    struct vmbus_softc *,
64 				    const struct vmbus_channel *);
65 static int			vmbus_chan_close_internal(
66 				    struct vmbus_channel *);
67 static int			vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS);
68 static void			vmbus_chan_sysctl_create(
69 				    struct vmbus_channel *);
70 static struct vmbus_channel	*vmbus_chan_alloc(struct vmbus_softc *);
71 static void			vmbus_chan_free(struct vmbus_channel *);
72 static int			vmbus_chan_add(struct vmbus_channel *);
73 static void			vmbus_chan_cpu_default(struct vmbus_channel *);
74 static int			vmbus_chan_release(struct vmbus_channel *);
75 static void			vmbus_chan_set_chmap(struct vmbus_channel *);
76 static void			vmbus_chan_clear_chmap(struct vmbus_channel *);
77 static void			vmbus_chan_detach(struct vmbus_channel *);
78 static bool			vmbus_chan_wait_revoke(
79 				    const struct vmbus_channel *, bool);
80 static void			vmbus_chan_poll_timeout(void *);
81 static bool			vmbus_chan_poll_cancel_intq(
82 				    struct vmbus_channel *);
83 static void			vmbus_chan_poll_cancel(struct vmbus_channel *);
84 
85 static void			vmbus_chan_ins_prilist(struct vmbus_softc *,
86 				    struct vmbus_channel *);
87 static void			vmbus_chan_rem_prilist(struct vmbus_softc *,
88 				    struct vmbus_channel *);
89 static void			vmbus_chan_ins_list(struct vmbus_softc *,
90 				    struct vmbus_channel *);
91 static void			vmbus_chan_rem_list(struct vmbus_softc *,
92 				    struct vmbus_channel *);
93 static void			vmbus_chan_ins_sublist(struct vmbus_channel *,
94 				    struct vmbus_channel *);
95 static void			vmbus_chan_rem_sublist(struct vmbus_channel *,
96 				    struct vmbus_channel *);
97 
98 static void			vmbus_chan_task(void *, int);
99 static void			vmbus_chan_task_nobatch(void *, int);
100 static void			vmbus_chan_poll_task(void *, int);
101 static void			vmbus_chan_clrchmap_task(void *, int);
102 static void			vmbus_chan_pollcfg_task(void *, int);
103 static void			vmbus_chan_polldis_task(void *, int);
104 static void			vmbus_chan_poll_cancel_task(void *, int);
105 static void			vmbus_prichan_attach_task(void *, int);
106 static void			vmbus_subchan_attach_task(void *, int);
107 static void			vmbus_prichan_detach_task(void *, int);
108 static void			vmbus_subchan_detach_task(void *, int);
109 
110 static void			vmbus_chan_msgproc_choffer(struct vmbus_softc *,
111 				    const struct vmbus_message *);
112 static void			vmbus_chan_msgproc_chrescind(
113 				    struct vmbus_softc *,
114 				    const struct vmbus_message *);
115 
116 static int			vmbus_chan_printf(const struct vmbus_channel *,
117 				    const char *, ...) __printflike(2, 3);
118 
119 /*
120  * Vmbus channel message processing.
121  */
122 static const vmbus_chanmsg_proc_t
123 vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = {
124 	VMBUS_CHANMSG_PROC(CHOFFER,	vmbus_chan_msgproc_choffer),
125 	VMBUS_CHANMSG_PROC(CHRESCIND,	vmbus_chan_msgproc_chrescind),
126 
127 	VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP),
128 	VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP),
129 	VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP)
130 };
131 
132 /*
133  * Notify host that there are data pending on our TX bufring or
134  * we have put some data on the TX bufring.
135  */
136 static __inline void
137 vmbus_chan_signal(const struct vmbus_channel *chan)
138 {
139 	atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask);
140 	if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
141 		atomic_set_int(chan->ch_montrig, chan->ch_montrig_mask);
142 	else
143 		hypercall_signal_event(pmap_kextract(
144 		    (vm_offset_t)chan->ch_monprm));
145 }
146 
147 static __inline void
148 vmbus_chan_signal_tx(struct vmbus_channel *chan)
149 {
150 	chan->ch_txbr.txbr_intrcnt ++;
151 
152 	vmbus_chan_signal(chan);
153 }
154 
155 static __inline void
156 vmbus_chan_signal_rx(struct vmbus_channel *chan)
157 {
158 	chan->ch_rxbr.rxbr_intrcnt ++;
159 
160 	vmbus_chan_signal(chan);
161 }
162 
163 static void
164 vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
165 {
166 
167 	mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
168 	if (atomic_testandset_int(&chan->ch_stflags,
169 	    VMBUS_CHAN_ST_ONPRIL_SHIFT))
170 		panic("channel is already on the prilist");
171 	TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink);
172 }
173 
174 static void
175 vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
176 {
177 
178 	mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
179 	if (atomic_testandclear_int(&chan->ch_stflags,
180 	    VMBUS_CHAN_ST_ONPRIL_SHIFT) == 0)
181 		panic("channel is not on the prilist");
182 	TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink);
183 }
184 
185 static void
186 vmbus_chan_ins_sublist(struct vmbus_channel *prichan,
187     struct vmbus_channel *chan)
188 {
189 
190 	mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
191 
192 	if (atomic_testandset_int(&chan->ch_stflags,
193 	    VMBUS_CHAN_ST_ONSUBL_SHIFT))
194 		panic("channel is already on the sublist");
195 	TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink);
196 
197 	/* Bump sub-channel count. */
198 	prichan->ch_subchan_cnt++;
199 }
200 
201 static void
202 vmbus_chan_rem_sublist(struct vmbus_channel *prichan,
203     struct vmbus_channel *chan)
204 {
205 
206 	mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
207 
208 	KASSERT(prichan->ch_subchan_cnt > 0,
209 	    ("invalid subchan_cnt %d", prichan->ch_subchan_cnt));
210 	prichan->ch_subchan_cnt--;
211 
212 	if (atomic_testandclear_int(&chan->ch_stflags,
213 	    VMBUS_CHAN_ST_ONSUBL_SHIFT) == 0)
214 		panic("channel is not on the sublist");
215 	TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink);
216 }
217 
218 static void
219 vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
220 {
221 
222 	mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
223 	if (atomic_testandset_int(&chan->ch_stflags,
224 	    VMBUS_CHAN_ST_ONLIST_SHIFT))
225 		panic("channel is already on the list");
226 	TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link);
227 }
228 
229 static void
230 vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
231 {
232 
233 	mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
234 	if (atomic_testandclear_int(&chan->ch_stflags,
235 	    VMBUS_CHAN_ST_ONLIST_SHIFT) == 0)
236 		panic("channel is not on the list");
237 	TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link);
238 }
239 
240 static int
241 vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS)
242 {
243 	struct vmbus_channel *chan = arg1;
244 	int mnf = 0;
245 
246 	if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
247 		mnf = 1;
248 	return sysctl_handle_int(oidp, &mnf, 0, req);
249 }
250 
251 static void
252 vmbus_chan_sysctl_create(struct vmbus_channel *chan)
253 {
254 	struct sysctl_oid *ch_tree, *chid_tree, *br_tree;
255 	struct sysctl_ctx_list *ctx;
256 	uint32_t ch_id;
257 	char name[16];
258 
259 	/*
260 	 * Add sysctl nodes related to this channel to this
261 	 * channel's sysctl ctx, so that they can be destroyed
262 	 * independently upon close of this channel, which can
263 	 * happen even if the device is not detached.
264 	 */
265 	ctx = &chan->ch_sysctl_ctx;
266 	sysctl_ctx_init(ctx);
267 
268 	/*
269 	 * Create dev.NAME.UNIT.channel tree.
270 	 */
271 	ch_tree = SYSCTL_ADD_NODE(ctx,
272 	    SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)),
273 	    OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
274 	if (ch_tree == NULL)
275 		return;
276 
277 	/*
278 	 * Create dev.NAME.UNIT.channel.CHANID tree.
279 	 */
280 	if (VMBUS_CHAN_ISPRIMARY(chan))
281 		ch_id = chan->ch_id;
282 	else
283 		ch_id = chan->ch_prichan->ch_id;
284 	snprintf(name, sizeof(name), "%d", ch_id);
285 	chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
286 	    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
287 	if (chid_tree == NULL)
288 		return;
289 
290 	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
291 		/*
292 		 * Create dev.NAME.UNIT.channel.CHANID.sub tree.
293 		 */
294 		ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree),
295 		    OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
296 		if (ch_tree == NULL)
297 			return;
298 
299 		/*
300 		 * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree.
301 		 *
302 		 * NOTE:
303 		 * chid_tree is changed to this new sysctl tree.
304 		 */
305 		snprintf(name, sizeof(name), "%d", chan->ch_subidx);
306 		chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
307 		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
308 		if (chid_tree == NULL)
309 			return;
310 
311 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
312 		    "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id");
313 	}
314 
315 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
316 	    "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id");
317 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
318 	    "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
319 	    chan, 0, vmbus_chan_sysctl_mnf, "I",
320 	    "has monitor notification facilities");
321 
322 	br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
323 	    "br", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
324 	if (br_tree != NULL) {
325 		/*
326 		 * Create sysctl tree for RX bufring.
327 		 */
328 		vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_rxbr.rxbr, "rx");
329 		/*
330 		 * Create sysctl tree for TX bufring.
331 		 */
332 		vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_txbr.txbr, "tx");
333 	}
334 }
335 
336 int
337 vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size,
338     const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
339 {
340 	struct vmbus_chan_br cbr;
341 	int error;
342 
343 	/*
344 	 * Allocate the TX+RX bufrings.
345 	 */
346 	KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated"));
347 	chan->ch_bufring_size = txbr_size + rxbr_size;
348 	chan->ch_bufring = contigmalloc(chan->ch_bufring_size, M_DEVBUF,
349 	    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
350 	if (chan->ch_bufring == NULL) {
351 		vmbus_chan_printf(chan, "bufring allocation failed\n");
352 		return (ENOMEM);
353 	}
354 
355 	cbr.cbr = chan->ch_bufring;
356 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)chan->ch_bufring);
357 	cbr.cbr_txsz = txbr_size;
358 	cbr.cbr_rxsz = rxbr_size;
359 
360 	error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg);
361 	if (error) {
362 		if (error == EISCONN) {
363 			/*
364 			 * XXX
365 			 * The bufring GPADL is still connected; abandon
366 			 * this bufring, instead of having mysterious
367 			 * crash or trashed data later on.
368 			 */
369 			vmbus_chan_printf(chan, "chan%u bufring GPADL "
370 			    "is still connected upon channel open error; "
371 			    "leak %d bytes memory\n", chan->ch_id,
372 			    txbr_size + rxbr_size);
373 		} else {
374 			contigfree(chan->ch_bufring, chan->ch_bufring_size,
375 			    M_DEVBUF);
376 		}
377 		chan->ch_bufring = NULL;
378 	}
379 	return (error);
380 }
381 
382 int
383 vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr,
384     const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
385 {
386 	struct vmbus_softc *sc = chan->ch_vmbus;
387 	const struct vmbus_message *msg;
388 	struct vmbus_chanmsg_chopen *req;
389 	struct vmbus_msghc *mh;
390 	uint32_t status;
391 	int error, txbr_size, rxbr_size;
392 	task_fn_t *task_fn;
393 	uint8_t *br;
394 
395 	if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) {
396 		vmbus_chan_printf(chan,
397 		    "invalid udata len %d for chan%u\n", udlen, chan->ch_id);
398 		return (EINVAL);
399 	}
400 
401 	br = cbr->cbr;
402 	txbr_size = cbr->cbr_txsz;
403 	rxbr_size = cbr->cbr_rxsz;
404 	KASSERT((txbr_size & PAGE_MASK) == 0,
405 	    ("send bufring size is not multiple page"));
406 	KASSERT((rxbr_size & PAGE_MASK) == 0,
407 	    ("recv bufring size is not multiple page"));
408 	KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0,
409 	    ("bufring is not page aligned"));
410 
411 	/*
412 	 * Zero out the TX/RX bufrings, in case that they were used before.
413 	 */
414 	memset(br, 0, txbr_size + rxbr_size);
415 
416 	if (atomic_testandset_int(&chan->ch_stflags,
417 	    VMBUS_CHAN_ST_OPENED_SHIFT))
418 		panic("double-open chan%u", chan->ch_id);
419 
420 	chan->ch_cb = cb;
421 	chan->ch_cbarg = cbarg;
422 
423 	vmbus_chan_update_evtflagcnt(sc, chan);
424 
425 	chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid);
426 	if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
427 		task_fn = vmbus_chan_task;
428 	else
429 		task_fn = vmbus_chan_task_nobatch;
430 	TASK_INIT(&chan->ch_task, 0, task_fn, chan);
431 
432 	/* TX bufring comes first */
433 	vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size);
434 	/* RX bufring immediately follows TX bufring */
435 	vmbus_rxbr_setup(&chan->ch_rxbr, br + txbr_size, rxbr_size);
436 
437 	/* Create sysctl tree for this channel */
438 	vmbus_chan_sysctl_create(chan);
439 
440 	/*
441 	 * Connect the bufrings, both RX and TX, to this channel.
442 	 */
443 	error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr,
444 	    txbr_size + rxbr_size, &chan->ch_bufring_gpadl);
445 	if (error) {
446 		vmbus_chan_printf(chan,
447 		    "failed to connect bufring GPADL to chan%u\n", chan->ch_id);
448 		goto failed;
449 	}
450 
451 	/*
452 	 * Install this channel, before it is opened, but after everything
453 	 * else has been setup.
454 	 */
455 	vmbus_chan_set_chmap(chan);
456 
457 	/*
458 	 * Open channel w/ the bufring GPADL on the target CPU.
459 	 */
460 	mh = vmbus_msghc_get(sc, sizeof(*req));
461 	if (mh == NULL) {
462 		vmbus_chan_printf(chan,
463 		    "can not get msg hypercall for chopen(chan%u)\n",
464 		    chan->ch_id);
465 		error = ENXIO;
466 		goto failed;
467 	}
468 
469 	req = vmbus_msghc_dataptr(mh);
470 	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN;
471 	req->chm_chanid = chan->ch_id;
472 	req->chm_openid = chan->ch_id;
473 	req->chm_gpadl = chan->ch_bufring_gpadl;
474 	req->chm_vcpuid = chan->ch_vcpuid;
475 	req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT;
476 	if (udlen > 0)
477 		memcpy(req->chm_udata, udata, udlen);
478 
479 	error = vmbus_msghc_exec(sc, mh);
480 	if (error) {
481 		vmbus_chan_printf(chan,
482 		    "chopen(chan%u) msg hypercall exec failed: %d\n",
483 		    chan->ch_id, error);
484 		vmbus_msghc_put(sc, mh);
485 		goto failed;
486 	}
487 
488 	for (;;) {
489 		msg = vmbus_msghc_poll_result(sc, mh);
490 		if (msg != NULL)
491 			break;
492 		if (vmbus_chan_is_revoked(chan)) {
493 			int i;
494 
495 			/*
496 			 * NOTE:
497 			 * Hypervisor does _not_ send response CHOPEN to
498 			 * a revoked channel.
499 			 */
500 			vmbus_chan_printf(chan,
501 			    "chan%u is revoked, when it is being opened\n",
502 			    chan->ch_id);
503 
504 			/*
505 			 * XXX
506 			 * Add extra delay before cancel the hypercall
507 			 * execution; mainly to close any possible
508 			 * CHRESCIND and CHOPEN_RESP races on the
509 			 * hypervisor side.
510 			 */
511 #define REVOKE_LINGER	100
512 			for (i = 0; i < REVOKE_LINGER; ++i) {
513 				msg = vmbus_msghc_poll_result(sc, mh);
514 				if (msg != NULL)
515 					break;
516 				pause("rchopen", 1);
517 			}
518 #undef REVOKE_LINGER
519 			if (msg == NULL)
520 				vmbus_msghc_exec_cancel(sc, mh);
521 			break;
522 		}
523 		pause("chopen", 1);
524 	}
525 	if (msg != NULL) {
526 		status = ((const struct vmbus_chanmsg_chopen_resp *)
527 		    msg->msg_data)->chm_status;
528 	} else {
529 		/* XXX any non-0 value is ok here. */
530 		status = 0xff;
531 	}
532 
533 	vmbus_msghc_put(sc, mh);
534 
535 	if (status == 0) {
536 		if (bootverbose)
537 			vmbus_chan_printf(chan, "chan%u opened\n", chan->ch_id);
538 		return (0);
539 	}
540 
541 	vmbus_chan_printf(chan, "failed to open chan%u\n", chan->ch_id);
542 	error = ENXIO;
543 
544 failed:
545 	sysctl_ctx_free(&chan->ch_sysctl_ctx);
546 	vmbus_chan_clear_chmap(chan);
547 	if (chan->ch_bufring_gpadl != 0) {
548 		int error1;
549 
550 		error1 = vmbus_chan_gpadl_disconnect(chan,
551 		    chan->ch_bufring_gpadl);
552 		if (error1) {
553 			/*
554 			 * Give caller a hint that the bufring GPADL is still
555 			 * connected.
556 			 */
557 			error = EISCONN;
558 		}
559 		chan->ch_bufring_gpadl = 0;
560 	}
561 	atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED);
562 	return (error);
563 }
564 
565 int
566 vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr,
567     int size, uint32_t *gpadl0)
568 {
569 	struct vmbus_softc *sc = chan->ch_vmbus;
570 	struct vmbus_msghc *mh;
571 	struct vmbus_chanmsg_gpadl_conn *req;
572 	const struct vmbus_message *msg;
573 	size_t reqsz;
574 	uint32_t gpadl, status;
575 	int page_count, range_len, i, cnt, error;
576 	uint64_t page_id;
577 
578 	KASSERT(*gpadl0 == 0, ("GPADL is not zero"));
579 
580 	/*
581 	 * Preliminary checks.
582 	 */
583 
584 	KASSERT((size & PAGE_MASK) == 0,
585 	    ("invalid GPA size %d, not multiple page size", size));
586 	page_count = size >> PAGE_SHIFT;
587 
588 	KASSERT((paddr & PAGE_MASK) == 0,
589 	    ("GPA is not page aligned %jx", (uintmax_t)paddr));
590 	page_id = paddr >> PAGE_SHIFT;
591 
592 	range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]);
593 	/*
594 	 * We don't support multiple GPA ranges.
595 	 */
596 	if (range_len > UINT16_MAX) {
597 		vmbus_chan_printf(chan, "GPA too large, %d pages\n",
598 		    page_count);
599 		return EOPNOTSUPP;
600 	}
601 
602 	/*
603 	 * Allocate GPADL id.
604 	 */
605 	gpadl = vmbus_gpadl_alloc(sc);
606 
607 	/*
608 	 * Connect this GPADL to the target channel.
609 	 *
610 	 * NOTE:
611 	 * Since each message can only hold small set of page
612 	 * addresses, several messages may be required to
613 	 * complete the connection.
614 	 */
615 	if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX)
616 		cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX;
617 	else
618 		cnt = page_count;
619 	page_count -= cnt;
620 
621 	reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn,
622 	    chm_range.gpa_page[cnt]);
623 	mh = vmbus_msghc_get(sc, reqsz);
624 	if (mh == NULL) {
625 		vmbus_chan_printf(chan,
626 		    "can not get msg hypercall for gpadl_conn(chan%u)\n",
627 		    chan->ch_id);
628 		return EIO;
629 	}
630 
631 	req = vmbus_msghc_dataptr(mh);
632 	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN;
633 	req->chm_chanid = chan->ch_id;
634 	req->chm_gpadl = gpadl;
635 	req->chm_range_len = range_len;
636 	req->chm_range_cnt = 1;
637 	req->chm_range.gpa_len = size;
638 	req->chm_range.gpa_ofs = 0;
639 	for (i = 0; i < cnt; ++i)
640 		req->chm_range.gpa_page[i] = page_id++;
641 
642 	error = vmbus_msghc_exec(sc, mh);
643 	if (error) {
644 		vmbus_chan_printf(chan,
645 		    "gpadl_conn(chan%u) msg hypercall exec failed: %d\n",
646 		    chan->ch_id, error);
647 		vmbus_msghc_put(sc, mh);
648 		return error;
649 	}
650 
651 	while (page_count > 0) {
652 		struct vmbus_chanmsg_gpadl_subconn *subreq;
653 
654 		if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX)
655 			cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX;
656 		else
657 			cnt = page_count;
658 		page_count -= cnt;
659 
660 		reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn,
661 		    chm_gpa_page[cnt]);
662 		vmbus_msghc_reset(mh, reqsz);
663 
664 		subreq = vmbus_msghc_dataptr(mh);
665 		subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN;
666 		subreq->chm_gpadl = gpadl;
667 		for (i = 0; i < cnt; ++i)
668 			subreq->chm_gpa_page[i] = page_id++;
669 
670 		vmbus_msghc_exec_noresult(mh);
671 	}
672 	KASSERT(page_count == 0, ("invalid page count %d", page_count));
673 
674 	msg = vmbus_msghc_wait_result(sc, mh);
675 	status = ((const struct vmbus_chanmsg_gpadl_connresp *)
676 	    msg->msg_data)->chm_status;
677 
678 	vmbus_msghc_put(sc, mh);
679 
680 	if (status != 0) {
681 		vmbus_chan_printf(chan, "gpadl_conn(chan%u) failed: %u\n",
682 		    chan->ch_id, status);
683 		return EIO;
684 	}
685 
686 	/* Done; commit the GPADL id. */
687 	*gpadl0 = gpadl;
688 	if (bootverbose) {
689 		vmbus_chan_printf(chan, "gpadl_conn(chan%u) succeeded\n",
690 		    chan->ch_id);
691 	}
692 	return 0;
693 }
694 
695 static bool
696 vmbus_chan_wait_revoke(const struct vmbus_channel *chan, bool can_sleep)
697 {
698 #define WAIT_COUNT	200	/* 200ms */
699 
700 	int i;
701 
702 	for (i = 0; i < WAIT_COUNT; ++i) {
703 		if (vmbus_chan_is_revoked(chan))
704 			return (true);
705 		if (can_sleep)
706 			pause("wchrev", 1);
707 		else
708 			DELAY(1000);
709 	}
710 	return (false);
711 
712 #undef WAIT_COUNT
713 }
714 
715 /*
716  * Disconnect the GPA from the target channel
717  */
718 int
719 vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl)
720 {
721 	struct vmbus_softc *sc = chan->ch_vmbus;
722 	struct vmbus_msghc *mh;
723 	struct vmbus_chanmsg_gpadl_disconn *req;
724 	int error;
725 
726 	KASSERT(gpadl != 0, ("GPADL is zero"));
727 
728 	mh = vmbus_msghc_get(sc, sizeof(*req));
729 	if (mh == NULL) {
730 		vmbus_chan_printf(chan,
731 		    "can not get msg hypercall for gpadl_disconn(chan%u)\n",
732 		    chan->ch_id);
733 		return (EBUSY);
734 	}
735 
736 	req = vmbus_msghc_dataptr(mh);
737 	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN;
738 	req->chm_chanid = chan->ch_id;
739 	req->chm_gpadl = gpadl;
740 
741 	error = vmbus_msghc_exec(sc, mh);
742 	if (error) {
743 		vmbus_msghc_put(sc, mh);
744 
745 		if (vmbus_chan_wait_revoke(chan, true)) {
746 			/*
747 			 * Error is benign; this channel is revoked,
748 			 * so this GPADL will not be touched anymore.
749 			 */
750 			vmbus_chan_printf(chan,
751 			    "gpadl_disconn(revoked chan%u) msg hypercall "
752 			    "exec failed: %d\n", chan->ch_id, error);
753 			return (0);
754 		}
755 		vmbus_chan_printf(chan,
756 		    "gpadl_disconn(chan%u) msg hypercall exec failed: %d\n",
757 		    chan->ch_id, error);
758 		return (error);
759 	}
760 
761 	vmbus_msghc_wait_result(sc, mh);
762 	/* Discard result; no useful information */
763 	vmbus_msghc_put(sc, mh);
764 
765 	return (0);
766 }
767 
768 static void
769 vmbus_chan_detach(struct vmbus_channel *chan)
770 {
771 	int refs;
772 
773 	KASSERT(chan->ch_refs > 0, ("chan%u: invalid refcnt %d",
774 	    chan->ch_id, chan->ch_refs));
775 	refs = atomic_fetchadd_int(&chan->ch_refs, -1);
776 #ifdef INVARIANTS
777 	if (VMBUS_CHAN_ISPRIMARY(chan)) {
778 		KASSERT(refs == 1, ("chan%u: invalid refcnt %d for prichan",
779 		    chan->ch_id, refs + 1));
780 	}
781 #endif
782 	if (refs == 1) {
783 		/*
784 		 * Detach the target channel.
785 		 */
786 		if (bootverbose) {
787 			vmbus_chan_printf(chan, "chan%u detached\n",
788 			    chan->ch_id);
789 		}
790 		taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task);
791 	}
792 }
793 
794 static void
795 vmbus_chan_clrchmap_task(void *xchan, int pending __unused)
796 {
797 	struct vmbus_channel *chan = xchan;
798 
799 	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL;
800 }
801 
802 static void
803 vmbus_chan_clear_chmap(struct vmbus_channel *chan)
804 {
805 	struct task chmap_task;
806 
807 	TASK_INIT(&chmap_task, 0, vmbus_chan_clrchmap_task, chan);
808 	vmbus_chan_run_task(chan, &chmap_task);
809 }
810 
811 static void
812 vmbus_chan_set_chmap(struct vmbus_channel *chan)
813 {
814 	__compiler_membar();
815 	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
816 }
817 
818 static void
819 vmbus_chan_poll_cancel_task(void *xchan, int pending __unused)
820 {
821 
822 	vmbus_chan_poll_cancel_intq(xchan);
823 }
824 
825 static void
826 vmbus_chan_poll_cancel(struct vmbus_channel *chan)
827 {
828 	struct task poll_cancel;
829 
830 	TASK_INIT(&poll_cancel, 0, vmbus_chan_poll_cancel_task, chan);
831 	vmbus_chan_run_task(chan, &poll_cancel);
832 }
833 
834 static int
835 vmbus_chan_close_internal(struct vmbus_channel *chan)
836 {
837 	struct vmbus_softc *sc = chan->ch_vmbus;
838 	struct vmbus_msghc *mh;
839 	struct vmbus_chanmsg_chclose *req;
840 	uint32_t old_stflags;
841 	int error;
842 
843 	/*
844 	 * NOTE:
845 	 * Sub-channels are closed upon their primary channel closing,
846 	 * so they can be closed even before they are opened.
847 	 */
848 	for (;;) {
849 		old_stflags = chan->ch_stflags;
850 		if (atomic_cmpset_int(&chan->ch_stflags, old_stflags,
851 		    old_stflags & ~VMBUS_CHAN_ST_OPENED))
852 			break;
853 	}
854 	if ((old_stflags & VMBUS_CHAN_ST_OPENED) == 0) {
855 		/* Not opened yet; done */
856 		if (bootverbose) {
857 			vmbus_chan_printf(chan, "chan%u not opened\n",
858 			    chan->ch_id);
859 		}
860 		return (0);
861 	}
862 
863 	/*
864 	 * Free this channel's sysctl tree attached to its device's
865 	 * sysctl tree.
866 	 */
867 	sysctl_ctx_free(&chan->ch_sysctl_ctx);
868 
869 	/*
870 	 * Cancel polling, if it is enabled.
871 	 */
872 	vmbus_chan_poll_cancel(chan);
873 
874 	/*
875 	 * NOTE:
876 	 * Order is critical.  This channel _must_ be uninstalled first,
877 	 * else the channel task may be enqueued by the IDT after it has
878 	 * been drained.
879 	 */
880 	vmbus_chan_clear_chmap(chan);
881 	taskqueue_drain(chan->ch_tq, &chan->ch_task);
882 	chan->ch_tq = NULL;
883 
884 	/*
885 	 * Close this channel.
886 	 */
887 	mh = vmbus_msghc_get(sc, sizeof(*req));
888 	if (mh == NULL) {
889 		vmbus_chan_printf(chan,
890 		    "can not get msg hypercall for chclose(chan%u)\n",
891 		    chan->ch_id);
892 		error = ENXIO;
893 		goto disconnect;
894 	}
895 
896 	req = vmbus_msghc_dataptr(mh);
897 	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE;
898 	req->chm_chanid = chan->ch_id;
899 
900 	error = vmbus_msghc_exec_noresult(mh);
901 	vmbus_msghc_put(sc, mh);
902 
903 	if (error) {
904 		vmbus_chan_printf(chan,
905 		    "chclose(chan%u) msg hypercall exec failed: %d\n",
906 		    chan->ch_id, error);
907 		goto disconnect;
908 	}
909 
910 	if (bootverbose)
911 		vmbus_chan_printf(chan, "chan%u closed\n", chan->ch_id);
912 
913 disconnect:
914 	/*
915 	 * Disconnect the TX+RX bufrings from this channel.
916 	 */
917 	if (chan->ch_bufring_gpadl != 0) {
918 		int error1;
919 
920 		error1 = vmbus_chan_gpadl_disconnect(chan,
921 		    chan->ch_bufring_gpadl);
922 		if (error1) {
923 			/*
924 			 * XXX
925 			 * The bufring GPADL is still connected; abandon
926 			 * this bufring, instead of having mysterious
927 			 * crash or trashed data later on.
928 			 */
929 			vmbus_chan_printf(chan, "chan%u bufring GPADL "
930 			    "is still connected after close\n", chan->ch_id);
931 			chan->ch_bufring = NULL;
932 			/*
933 			 * Give caller a hint that the bufring GPADL is
934 			 * still connected.
935 			 */
936 			error = EISCONN;
937 		}
938 		chan->ch_bufring_gpadl = 0;
939 	}
940 
941 	/*
942 	 * Destroy the TX+RX bufrings.
943 	 */
944 	if (chan->ch_bufring != NULL) {
945 		contigfree(chan->ch_bufring, chan->ch_bufring_size, M_DEVBUF);
946 		chan->ch_bufring = NULL;
947 	}
948 	return (error);
949 }
950 
/*
 * Close exactly the given channel, without touching any other channel.
 *
 * For a primary channel, all of its sub-channels must already be closed
 * or never opened (asserted under INVARIANTS).  For a sub-channel, the
 * reference taken when it was linked to its primary channel is dropped
 * after the close.
 *
 * Returns the result of vmbus_chan_close_internal().
 */
int
vmbus_chan_close_direct(struct vmbus_channel *chan)
{
	int error;

#ifdef INVARIANTS
	if (VMBUS_CHAN_ISPRIMARY(chan)) {
		struct vmbus_channel *sub;

		/*
		 * All sub-channels _must_ have been closed, or are _not_
		 * opened at all.
		 */
		mtx_lock(&chan->ch_subchan_lock);
		TAILQ_FOREACH(sub, &chan->ch_subchans, ch_sublink) {
			KASSERT(
			   (sub->ch_stflags & VMBUS_CHAN_ST_OPENED) == 0,
			   ("chan%u: subchan%u is still opened",
			    chan->ch_id, sub->ch_subidx));
		}
		mtx_unlock(&chan->ch_subchan_lock);
	}
#endif

	error = vmbus_chan_close_internal(chan);
	if (VMBUS_CHAN_ISPRIMARY(chan))
		return (error);

	/*
	 * This sub-channel is referenced, when it is linked to
	 * the primary channel; drop that reference now.
	 */
	vmbus_chan_detach(chan);
	return (error);
}
985 
986 /*
987  * Caller should make sure that all sub-channels have
988  * been added to 'chan' and all to-be-closed channels
989  * are not being opened.
990  */
991 void
992 vmbus_chan_close(struct vmbus_channel *chan)
993 {
994 	int subchan_cnt;
995 
996 	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
997 		/*
998 		 * Sub-channel is closed when its primary channel
999 		 * is closed; done.
1000 		 */
1001 		return;
1002 	}
1003 
1004 	/*
1005 	 * Close all sub-channels, if any.
1006 	 */
1007 	subchan_cnt = chan->ch_subchan_cnt;
1008 	if (subchan_cnt > 0) {
1009 		struct vmbus_channel **subchan;
1010 		int i;
1011 
1012 		subchan = vmbus_subchan_get(chan, subchan_cnt);
1013 		for (i = 0; i < subchan_cnt; ++i) {
1014 			vmbus_chan_close_internal(subchan[i]);
1015 			/*
1016 			 * This sub-channel is referenced, when it is
1017 			 * linked to the primary channel; drop that
1018 			 * reference now.
1019 			 */
1020 			vmbus_chan_detach(subchan[i]);
1021 		}
1022 		vmbus_subchan_rel(subchan, subchan_cnt);
1023 	}
1024 
1025 	/* Then close the primary channel. */
1026 	vmbus_chan_close_internal(chan);
1027 }
1028 
1029 void
1030 vmbus_chan_intr_drain(struct vmbus_channel *chan)
1031 {
1032 
1033 	taskqueue_drain(chan->ch_tq, &chan->ch_task);
1034 }
1035 
1036 uint32_t
1037 vmbus_chan_write_available(struct vmbus_channel *chan)
1038 {
1039 	return (vmbus_txbr_available(&chan->ch_txbr));
1040 }
1041 
1042 bool
1043 vmbus_chan_write_signal(struct vmbus_channel *chan,
1044     int32_t min_signal_size)
1045 {
1046 	if (min_signal_size >= 0 &&
1047 	    vmbus_chan_write_available(chan) > min_signal_size) {
1048 		return false;
1049 	}
1050 
1051 	if (!vmbus_txbr_get_imask(&chan->ch_txbr)) {
1052 		/* txbr imask is not set, signal the reader */
1053 		vmbus_chan_signal_tx(chan);
1054 		return true;
1055 	}
1056 
1057 	return false;
1058 }
1059 
1060 void
1061 vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
1062     uint32_t size)
1063 {
1064 	if (chan)
1065 		vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size);
1066 }
1067 
1068 int
1069 vmbus_chan_iov_send(struct vmbus_channel *chan,
1070     const struct iovec iov[], int iovlen,
1071     vmbus_br_copy_callback_t cb, void *cbarg)
1072 {
1073 	int error;
1074 	boolean_t send_evt;
1075 
1076 	if (iovlen == 0)
1077 		return (0);
1078 
1079 	error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen,
1080 	    cb, cbarg, &send_evt);
1081 
1082 	if (!error && send_evt) {
1083 		vmbus_chan_signal_tx(chan);
1084 	}
1085 
1086 	return error;
1087 }
1088 
1089 int
1090 vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags,
1091     void *data, int dlen, uint64_t xactid)
1092 {
1093 	struct vmbus_chanpkt pkt;
1094 	int pktlen, pad_pktlen, hlen, error;
1095 	uint64_t pad = 0;
1096 	struct iovec iov[3];
1097 	boolean_t send_evt;
1098 
1099 	hlen = sizeof(pkt);
1100 	pktlen = hlen + dlen;
1101 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
1102 	KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
1103 	    ("invalid packet size %d", pad_pktlen));
1104 
1105 	pkt.cp_hdr.cph_type = type;
1106 	pkt.cp_hdr.cph_flags = flags;
1107 	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
1108 	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
1109 	pkt.cp_hdr.cph_xactid = xactid;
1110 
1111 	iov[0].iov_base = &pkt;
1112 	iov[0].iov_len = hlen;
1113 	iov[1].iov_base = data;
1114 	iov[1].iov_len = dlen;
1115 	iov[2].iov_base = &pad;
1116 	iov[2].iov_len = pad_pktlen - pktlen;
1117 
1118 	error = vmbus_txbr_write(&chan->ch_txbr, iov, 3, &send_evt);
1119 	if (!error && send_evt)
1120 		vmbus_chan_signal_tx(chan);
1121 	return error;
1122 }
1123 
1124 int
1125 vmbus_chan_send_sglist(struct vmbus_channel *chan,
1126     struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid)
1127 {
1128 	struct vmbus_chanpkt_sglist pkt;
1129 	int pktlen, pad_pktlen, hlen, error;
1130 	struct iovec iov[4];
1131 	boolean_t send_evt;
1132 	uint64_t pad = 0;
1133 
1134 	hlen = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]);
1135 	pktlen = hlen + dlen;
1136 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
1137 	KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
1138 	    ("invalid packet size %d", pad_pktlen));
1139 
1140 	pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
1141 	pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
1142 	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
1143 	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
1144 	pkt.cp_hdr.cph_xactid = xactid;
1145 	pkt.cp_rsvd = 0;
1146 	pkt.cp_gpa_cnt = sglen;
1147 
1148 	iov[0].iov_base = &pkt;
1149 	iov[0].iov_len = sizeof(pkt);
1150 	iov[1].iov_base = sg;
1151 	iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen;
1152 	iov[2].iov_base = data;
1153 	iov[2].iov_len = dlen;
1154 	iov[3].iov_base = &pad;
1155 	iov[3].iov_len = pad_pktlen - pktlen;
1156 
1157 	error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt);
1158 	if (!error && send_evt)
1159 		vmbus_chan_signal_tx(chan);
1160 	return error;
1161 }
1162 
1163 int
1164 vmbus_chan_send_prplist(struct vmbus_channel *chan,
1165     struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen,
1166     uint64_t xactid)
1167 {
1168 	struct vmbus_chanpkt_prplist pkt;
1169 	int pktlen, pad_pktlen, hlen, error;
1170 	struct iovec iov[4];
1171 	boolean_t send_evt;
1172 	uint64_t pad = 0;
1173 
1174 	hlen = __offsetof(struct vmbus_chanpkt_prplist,
1175 	    cp_range[0].gpa_page[prp_cnt]);
1176 	pktlen = hlen + dlen;
1177 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
1178 	KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
1179 	    ("invalid packet size %d", pad_pktlen));
1180 
1181 	pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
1182 	pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
1183 	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
1184 	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
1185 	pkt.cp_hdr.cph_xactid = xactid;
1186 	pkt.cp_rsvd = 0;
1187 	pkt.cp_range_cnt = 1;
1188 
1189 	iov[0].iov_base = &pkt;
1190 	iov[0].iov_len = sizeof(pkt);
1191 	iov[1].iov_base = prp;
1192 	iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]);
1193 	iov[2].iov_base = data;
1194 	iov[2].iov_len = dlen;
1195 	iov[3].iov_base = &pad;
1196 	iov[3].iov_len = pad_pktlen - pktlen;
1197 
1198 	error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt);
1199 	if (!error && send_evt)
1200 		vmbus_chan_signal_tx(chan);
1201 	return error;
1202 }
1203 
1204 int
1205 vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0,
1206     uint64_t *xactid)
1207 {
1208 	struct vmbus_chanpkt_hdr pkt;
1209 	int error, dlen, hlen;
1210 
1211 	error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt));
1212 	if (error)
1213 		return (error);
1214 
1215 	if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
1216 		vmbus_chan_printf(chan, "invalid hlen %u\n", pkt.cph_hlen);
1217 		/* XXX this channel is dead actually. */
1218 		return (EIO);
1219 	}
1220 	if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) {
1221 		vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
1222 		    pkt.cph_hlen, pkt.cph_tlen);
1223 		/* XXX this channel is dead actually. */
1224 		return (EIO);
1225 	}
1226 
1227 	hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen);
1228 	dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen;
1229 
1230 	if (*dlen0 < dlen) {
1231 		/* Return the size of this packet's data. */
1232 		*dlen0 = dlen;
1233 		return (ENOBUFS);
1234 	}
1235 
1236 	*xactid = pkt.cph_xactid;
1237 	*dlen0 = dlen;
1238 
1239 	/* Skip packet header */
1240 	error = vmbus_rxbr_read(&chan->ch_rxbr, data, dlen, hlen);
1241 	KASSERT(!error, ("vmbus_rxbr_read failed"));
1242 
1243 	return (0);
1244 }
1245 
1246 int
1247 vmbus_chan_recv_pkt(struct vmbus_channel *chan,
1248     struct vmbus_chanpkt_hdr *pkt, int *pktlen0)
1249 {
1250 	int error, pktlen, pkt_hlen;
1251 
1252 	pkt_hlen = sizeof(*pkt);
1253 	error = vmbus_rxbr_peek(&chan->ch_rxbr, pkt, pkt_hlen);
1254 	if (error)
1255 		return (error);
1256 
1257 	if (__predict_false(pkt->cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
1258 		vmbus_chan_printf(chan, "invalid hlen %u\n", pkt->cph_hlen);
1259 		/* XXX this channel is dead actually. */
1260 		return (EIO);
1261 	}
1262 	if (__predict_false(pkt->cph_hlen > pkt->cph_tlen)) {
1263 		vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
1264 		    pkt->cph_hlen, pkt->cph_tlen);
1265 		/* XXX this channel is dead actually. */
1266 		return (EIO);
1267 	}
1268 
1269 	pktlen = VMBUS_CHANPKT_GETLEN(pkt->cph_tlen);
1270 	if (*pktlen0 < pktlen) {
1271 		/* Return the size of this packet. */
1272 		*pktlen0 = pktlen;
1273 		return (ENOBUFS);
1274 	}
1275 	*pktlen0 = pktlen;
1276 
1277 	/*
1278 	 * Skip the fixed-size packet header, which has been filled
1279 	 * by the above vmbus_rxbr_peek().
1280 	 */
1281 	error = vmbus_rxbr_read(&chan->ch_rxbr, pkt + 1,
1282 	    pktlen - pkt_hlen, pkt_hlen);
1283 	KASSERT(!error, ("vmbus_rxbr_read failed"));
1284 
1285 	return (0);
1286 }
1287 
1288 uint32_t
1289 vmbus_chan_read_available(struct vmbus_channel *chan)
1290 {
1291 	return (vmbus_rxbr_available(&chan->ch_rxbr));
1292 }
1293 
1294 /*
1295  * This routine does:
1296  *     - Advance the channel read index for 'advance' bytes
1297  *     - Copy data_len bytes in to the buffer pointed by 'data'
1298  * Return 0 if operation succeed. EAGAIN if operations if failed.
1299  * If failed, the buffer pointed by 'data' is intact, and the
1300  * channel read index is not advanced at all.
1301  */
1302 int
1303 vmbus_chan_recv_peek(struct vmbus_channel *chan,
1304     void *data, int data_len, uint32_t advance)
1305 {
1306 	int error;
1307 	boolean_t sig_event;
1308 
1309 	if (data == NULL || data_len <= 0)
1310 		return (EINVAL);
1311 
1312 	error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr,
1313 	    data, data_len, advance, &sig_event);
1314 
1315 	if (!error && sig_event) {
1316 		vmbus_chan_signal_rx(chan);
1317 	}
1318 
1319 	return (error);
1320 }
1321 
1322 /*
1323  * This routine does:
1324  *     - Advance the channel read index for 'advance' bytes
1325  */
1326 int
1327 vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance)
1328 {
1329 	int error;
1330 	boolean_t sig_event;
1331 
1332 	if (advance == 0)
1333 		return (EINVAL);
1334 
1335 	error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event);
1336 
1337 	if (!error && sig_event) {
1338 		vmbus_chan_signal_rx(chan);
1339 	}
1340 
1341 	return (error);
1342 }
1343 
1344 
1345 /*
1346  * Caller should hold its own lock to serialize the ring buffer
1347  * copy.
1348  */
1349 int
1350 vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len,
1351     uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg)
1352 {
1353 	if (!chan || data_len <= 0 || cb == NULL)
1354 		return (EINVAL);
1355 
1356 	return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip,
1357 	    cb, cbarg));
1358 }
1359 
1360 static void
1361 vmbus_chan_task(void *xchan, int pending __unused)
1362 {
1363 	struct vmbus_channel *chan = xchan;
1364 	vmbus_chan_callback_t cb = chan->ch_cb;
1365 	void *cbarg = chan->ch_cbarg;
1366 
1367 	KASSERT(chan->ch_poll_intvl == 0,
1368 	    ("chan%u: interrupted in polling mode", chan->ch_id));
1369 
1370 	/*
1371 	 * Optimize host to guest signaling by ensuring:
1372 	 * 1. While reading the channel, we disable interrupts from
1373 	 *    host.
1374 	 * 2. Ensure that we process all posted messages from the host
1375 	 *    before returning from this callback.
1376 	 * 3. Once we return, enable signaling from the host. Once this
1377 	 *    state is set we check to see if additional packets are
1378 	 *    available to read. In this case we repeat the process.
1379 	 *
1380 	 * NOTE: Interrupt has been disabled in the ISR.
1381 	 */
1382 	for (;;) {
1383 		uint32_t left;
1384 
1385 		cb(chan, cbarg);
1386 
1387 		left = vmbus_rxbr_intr_unmask(&chan->ch_rxbr);
1388 		if (left == 0) {
1389 			/* No more data in RX bufring; done */
1390 			break;
1391 		}
1392 		vmbus_rxbr_intr_mask(&chan->ch_rxbr);
1393 	}
1394 }
1395 
1396 static void
1397 vmbus_chan_task_nobatch(void *xchan, int pending __unused)
1398 {
1399 	struct vmbus_channel *chan = xchan;
1400 
1401 	KASSERT(chan->ch_poll_intvl == 0,
1402 	    ("chan%u: interrupted in polling mode", chan->ch_id));
1403 	chan->ch_cb(chan, chan->ch_cbarg);
1404 }
1405 
1406 static void
1407 vmbus_chan_poll_timeout(void *xchan)
1408 {
1409 	struct vmbus_channel *chan = xchan;
1410 
1411 	KASSERT(chan->ch_poll_intvl != 0,
1412 	    ("chan%u: polling timeout in interrupt mode", chan->ch_id));
1413 	taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task);
1414 }
1415 
1416 static void
1417 vmbus_chan_poll_task(void *xchan, int pending __unused)
1418 {
1419 	struct vmbus_channel *chan = xchan;
1420 
1421 	KASSERT(chan->ch_poll_intvl != 0,
1422 	    ("chan%u: polling in interrupt mode", chan->ch_id));
1423 	callout_reset_sbt_curcpu(&chan->ch_poll_timeo, chan->ch_poll_intvl, 0,
1424 	    vmbus_chan_poll_timeout, chan, chan->ch_poll_flags);
1425 	chan->ch_cb(chan, chan->ch_cbarg);
1426 }
1427 
1428 static void
1429 vmbus_chan_pollcfg_task(void *xarg, int pending __unused)
1430 {
1431 	const struct vmbus_chan_pollarg *arg = xarg;
1432 	struct vmbus_channel *chan = arg->poll_chan;
1433 	sbintime_t intvl;
1434 	int poll_flags;
1435 
1436 	/*
1437 	 * Save polling interval.
1438 	 */
1439 	intvl = SBT_1S / arg->poll_hz;
1440 	if (intvl == 0)
1441 		intvl = 1;
1442 	if (intvl == chan->ch_poll_intvl) {
1443 		/* Nothing changes; done */
1444 		return;
1445 	}
1446 	chan->ch_poll_intvl = intvl;
1447 
1448 	/* Adjust callout flags. */
1449 	poll_flags = C_DIRECT_EXEC;
1450 	if (arg->poll_hz <= hz)
1451 		poll_flags |= C_HARDCLOCK;
1452 	chan->ch_poll_flags = poll_flags;
1453 
1454 	/*
1455 	 * Disconnect this channel from the channel map to make sure that
1456 	 * the RX bufring interrupt enabling bit can not be touched, and
1457 	 * ISR can not enqueue this channel task anymore.  THEN, disable
1458 	 * interrupt from the RX bufring (TX bufring does not generate
1459 	 * interrupt to VM).
1460 	 *
1461 	 * NOTE: order is critical.
1462 	 */
1463 	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL;
1464 	__compiler_membar();
1465 	vmbus_rxbr_intr_mask(&chan->ch_rxbr);
1466 
1467 	/*
1468 	 * NOTE:
1469 	 * At this point, this channel task will not be enqueued by
1470 	 * the ISR anymore, time to cancel the pending one.
1471 	 */
1472 	taskqueue_cancel(chan->ch_tq, &chan->ch_task, NULL);
1473 
1474 	/* Kick start! */
1475 	taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task);
1476 }
1477 
1478 static bool
1479 vmbus_chan_poll_cancel_intq(struct vmbus_channel *chan)
1480 {
1481 
1482 	if (chan->ch_poll_intvl == 0) {
1483 		/* Not enabled. */
1484 		return (false);
1485 	}
1486 
1487 	/*
1488 	 * Stop polling callout, so that channel polling task
1489 	 * will not be enqueued anymore.
1490 	 */
1491 	callout_drain(&chan->ch_poll_timeo);
1492 
1493 	/*
1494 	 * Disable polling by resetting polling interval.
1495 	 *
1496 	 * NOTE:
1497 	 * The polling interval resetting MUST be conducted
1498 	 * after the callout is drained; mainly to keep the
1499 	 * proper assertion in place.
1500 	 */
1501 	chan->ch_poll_intvl = 0;
1502 
1503 	/*
1504 	 * NOTE:
1505 	 * At this point, this channel polling task will not be
1506 	 * enqueued by the callout anymore, time to cancel the
1507 	 * pending one.
1508 	 */
1509 	taskqueue_cancel(chan->ch_tq, &chan->ch_poll_task, NULL);
1510 
1511 	/* Polling was enabled. */
1512 	return (true);
1513 }
1514 
1515 static void
1516 vmbus_chan_polldis_task(void *xchan, int pending __unused)
1517 {
1518 	struct vmbus_channel *chan = xchan;
1519 
1520 	if (!vmbus_chan_poll_cancel_intq(chan)) {
1521 		/* Already disabled; done. */
1522 		return;
1523 	}
1524 
1525 	/*
1526 	 * Plug this channel back to the channel map and unmask
1527 	 * the RX bufring interrupt.
1528 	 */
1529 	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
1530 	__compiler_membar();
1531 	vmbus_rxbr_intr_unmask(&chan->ch_rxbr);
1532 
1533 	/*
1534 	 * Kick start the interrupt task, just in case unmasking
1535 	 * interrupt races ISR.
1536 	 */
1537 	taskqueue_enqueue(chan->ch_tq, &chan->ch_task);
1538 }
1539 
1540 static __inline void
1541 vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags,
1542     int flag_cnt)
1543 {
1544 	int f;
1545 
1546 	for (f = 0; f < flag_cnt; ++f) {
1547 		uint32_t chid_base;
1548 		u_long flags;
1549 		int chid_ofs;
1550 
1551 		if (event_flags[f] == 0)
1552 			continue;
1553 
1554 		flags = atomic_swap_long(&event_flags[f], 0);
1555 		chid_base = f << VMBUS_EVTFLAG_SHIFT;
1556 
1557 		while ((chid_ofs = ffsl(flags)) != 0) {
1558 			struct vmbus_channel *chan;
1559 
1560 			--chid_ofs; /* NOTE: ffsl is 1-based */
1561 			flags &= ~(1UL << chid_ofs);
1562 
1563 			chan = sc->vmbus_chmap[chid_base + chid_ofs];
1564 			if (__predict_false(chan == NULL)) {
1565 				/* Channel is closed. */
1566 				continue;
1567 			}
1568 			__compiler_membar();
1569 
1570 			if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
1571 				vmbus_rxbr_intr_mask(&chan->ch_rxbr);
1572 			taskqueue_enqueue(chan->ch_tq, &chan->ch_task);
1573 		}
1574 	}
1575 }
1576 
1577 void
1578 vmbus_event_proc(struct vmbus_softc *sc, int cpu)
1579 {
1580 	struct vmbus_evtflags *eventf;
1581 
1582 	/*
1583 	 * On Host with Win8 or above, the event page can be checked directly
1584 	 * to get the id of the channel that has the pending interrupt.
1585 	 */
1586 	eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
1587 	vmbus_event_flags_proc(sc, eventf->evt_flags,
1588 	    VMBUS_PCPU_GET(sc, event_flags_cnt, cpu));
1589 }
1590 
1591 void
1592 vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu)
1593 {
1594 	struct vmbus_evtflags *eventf;
1595 
1596 	eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
1597 	if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) {
1598 		vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags,
1599 		    VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT);
1600 	}
1601 }
1602 
1603 static void
1604 vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc,
1605     const struct vmbus_channel *chan)
1606 {
1607 	volatile int *flag_cnt_ptr;
1608 	int flag_cnt;
1609 
1610 	flag_cnt = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1;
1611 	flag_cnt_ptr = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid);
1612 
1613 	for (;;) {
1614 		int old_flag_cnt;
1615 
1616 		old_flag_cnt = *flag_cnt_ptr;
1617 		if (old_flag_cnt >= flag_cnt)
1618 			break;
1619 		if (atomic_cmpset_int(flag_cnt_ptr, old_flag_cnt, flag_cnt)) {
1620 			if (bootverbose) {
1621 				vmbus_chan_printf(chan,
1622 				    "chan%u update cpu%d flag_cnt to %d\n",
1623 				    chan->ch_id, chan->ch_cpuid, flag_cnt);
1624 			}
1625 			break;
1626 		}
1627 	}
1628 }
1629 
1630 static struct vmbus_channel *
1631 vmbus_chan_alloc(struct vmbus_softc *sc)
1632 {
1633 	struct vmbus_channel *chan;
1634 
1635 	chan = malloc(sizeof(*chan), M_DEVBUF, M_WAITOK | M_ZERO);
1636 
1637 	chan->ch_monprm = contigmalloc(sizeof(struct hyperv_mon_param),
1638 	    M_DEVBUF, M_WAITOK | M_ZERO, 0ul, ~0ul, HYPERCALL_PARAM_ALIGN, 0);
1639 	if (chan->ch_monprm == NULL) {
1640 		device_printf(sc->vmbus_dev, "monprm alloc failed\n");
1641 		free(chan, M_DEVBUF);
1642 		return NULL;
1643 	}
1644 
1645 	chan->ch_refs = 1;
1646 	chan->ch_vmbus = sc;
1647 	mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF);
1648 	sx_init(&chan->ch_orphan_lock, "vmbus chorphan");
1649 	TAILQ_INIT(&chan->ch_subchans);
1650 	vmbus_rxbr_init(&chan->ch_rxbr);
1651 	vmbus_txbr_init(&chan->ch_txbr);
1652 
1653 	TASK_INIT(&chan->ch_poll_task, 0, vmbus_chan_poll_task, chan);
1654 	callout_init(&chan->ch_poll_timeo, 1);
1655 
1656 	return chan;
1657 }
1658 
1659 static void
1660 vmbus_chan_free(struct vmbus_channel *chan)
1661 {
1662 
1663 	KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0,
1664 	    ("still owns sub-channels"));
1665 	KASSERT((chan->ch_stflags &
1666 	    (VMBUS_CHAN_ST_OPENED |
1667 	     VMBUS_CHAN_ST_ONPRIL |
1668 	     VMBUS_CHAN_ST_ONSUBL |
1669 	     VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel"));
1670 	KASSERT(chan->ch_orphan_xact == NULL,
1671 	    ("still has orphan xact installed"));
1672 	KASSERT(chan->ch_refs == 0, ("chan%u: invalid refcnt %d",
1673 	    chan->ch_id, chan->ch_refs));
1674 	KASSERT(chan->ch_poll_intvl == 0, ("chan%u: polling is activated",
1675 	    chan->ch_id));
1676 
1677 	contigfree(chan->ch_monprm, sizeof(struct hyperv_mon_param), M_DEVBUF);
1678 	mtx_destroy(&chan->ch_subchan_lock);
1679 	sx_destroy(&chan->ch_orphan_lock);
1680 	vmbus_rxbr_deinit(&chan->ch_rxbr);
1681 	vmbus_txbr_deinit(&chan->ch_txbr);
1682 	free(chan, M_DEVBUF);
1683 }
1684 
1685 static int
1686 vmbus_chan_add(struct vmbus_channel *newchan)
1687 {
1688 	struct vmbus_softc *sc = newchan->ch_vmbus;
1689 	struct vmbus_channel *prichan;
1690 
1691 	if (newchan->ch_id == 0) {
1692 		/*
1693 		 * XXX
1694 		 * Chan0 will neither be processed nor should be offered;
1695 		 * skip it.
1696 		 */
1697 		device_printf(sc->vmbus_dev, "got chan0 offer, discard\n");
1698 		return EINVAL;
1699 	} else if (newchan->ch_id >= VMBUS_CHAN_MAX) {
1700 		device_printf(sc->vmbus_dev, "invalid chan%u offer\n",
1701 		    newchan->ch_id);
1702 		return EINVAL;
1703 	}
1704 
1705 	mtx_lock(&sc->vmbus_prichan_lock);
1706 	TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) {
1707 		/*
1708 		 * Sub-channel will have the same type GUID and instance
1709 		 * GUID as its primary channel.
1710 		 */
1711 		if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type,
1712 		    sizeof(struct hyperv_guid)) == 0 &&
1713 		    memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst,
1714 		    sizeof(struct hyperv_guid)) == 0)
1715 			break;
1716 	}
1717 	if (VMBUS_CHAN_ISPRIMARY(newchan)) {
1718 		if (prichan == NULL) {
1719 			/* Install the new primary channel */
1720 			vmbus_chan_ins_prilist(sc, newchan);
1721 			mtx_unlock(&sc->vmbus_prichan_lock);
1722 			goto done;
1723 		} else {
1724 			mtx_unlock(&sc->vmbus_prichan_lock);
1725 			device_printf(sc->vmbus_dev,
1726 			    "duplicated primary chan%u\n", newchan->ch_id);
1727 			return EINVAL;
1728 		}
1729 	} else { /* Sub-channel */
1730 		if (prichan == NULL) {
1731 			mtx_unlock(&sc->vmbus_prichan_lock);
1732 			device_printf(sc->vmbus_dev,
1733 			    "no primary chan for chan%u\n", newchan->ch_id);
1734 			return EINVAL;
1735 		}
1736 		/*
1737 		 * Found the primary channel for this sub-channel and
1738 		 * move on.
1739 		 *
1740 		 * XXX refcnt prichan
1741 		 */
1742 	}
1743 	mtx_unlock(&sc->vmbus_prichan_lock);
1744 
1745 	/*
1746 	 * This is a sub-channel; link it with the primary channel.
1747 	 */
1748 	KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan),
1749 	    ("new channel is not sub-channel"));
1750 	KASSERT(prichan != NULL, ("no primary channel"));
1751 
1752 	/*
1753 	 * Reference count this sub-channel; it will be dereferenced
1754 	 * when this sub-channel is closed.
1755 	 */
1756 	KASSERT(newchan->ch_refs == 1, ("chan%u: invalid refcnt %d",
1757 	    newchan->ch_id, newchan->ch_refs));
1758 	atomic_add_int(&newchan->ch_refs, 1);
1759 
1760 	newchan->ch_prichan = prichan;
1761 	newchan->ch_dev = prichan->ch_dev;
1762 
1763 	mtx_lock(&prichan->ch_subchan_lock);
1764 	vmbus_chan_ins_sublist(prichan, newchan);
1765 	mtx_unlock(&prichan->ch_subchan_lock);
1766 	/*
1767 	 * Notify anyone that is interested in this sub-channel,
1768 	 * after this sub-channel is setup.
1769 	 */
1770 	wakeup(prichan);
1771 done:
1772 	/*
1773 	 * Hook this channel up for later revocation.
1774 	 */
1775 	mtx_lock(&sc->vmbus_chan_lock);
1776 	vmbus_chan_ins_list(sc, newchan);
1777 	mtx_unlock(&sc->vmbus_chan_lock);
1778 
1779 	if (bootverbose) {
1780 		vmbus_chan_printf(newchan, "chan%u subidx%u offer\n",
1781 		    newchan->ch_id, newchan->ch_subidx);
1782 	}
1783 
1784 	/* Select default cpu for this channel. */
1785 	vmbus_chan_cpu_default(newchan);
1786 
1787 	return 0;
1788 }
1789 
1790 void
1791 vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu)
1792 {
1793 	KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu));
1794 
1795 	if (chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WS2008 ||
1796 	    chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WIN7) {
1797 		/* Only cpu0 is supported */
1798 		cpu = 0;
1799 	}
1800 
1801 	chan->ch_cpuid = cpu;
1802 	chan->ch_vcpuid = VMBUS_PCPU_GET(chan->ch_vmbus, vcpuid, cpu);
1803 
1804 	if (bootverbose) {
1805 		vmbus_chan_printf(chan,
1806 		    "chan%u assigned to cpu%u [vcpu%u]\n",
1807 		    chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid);
1808 	}
1809 }
1810 
1811 void
1812 vmbus_chan_cpu_rr(struct vmbus_channel *chan)
1813 {
1814 	static uint32_t vmbus_chan_nextcpu;
1815 	int cpu;
1816 
1817 	cpu = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1) % mp_ncpus;
1818 	vmbus_chan_cpu_set(chan, cpu);
1819 }
1820 
/*
 * Apply the default cpu binding, which is cpu0.  Devices that need a
 * particular channel-cpu mapping should call
 * vmbus_chan_cpu_{set,rr}() themselves.
 */
static void
vmbus_chan_cpu_default(struct vmbus_channel *chan)
{
	const int default_cpu = 0;

	vmbus_chan_cpu_set(chan, default_cpu);
}
1831 
1832 static void
1833 vmbus_chan_msgproc_choffer(struct vmbus_softc *sc,
1834     const struct vmbus_message *msg)
1835 {
1836 	const struct vmbus_chanmsg_choffer *offer;
1837 	struct vmbus_channel *chan;
1838 	task_fn_t *detach_fn, *attach_fn;
1839 	int error;
1840 
1841 	offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data;
1842 
1843 	chan = vmbus_chan_alloc(sc);
1844 	if (chan == NULL) {
1845 		device_printf(sc->vmbus_dev, "allocate chan%u failed\n",
1846 		    offer->chm_chanid);
1847 		return;
1848 	}
1849 
1850 	chan->ch_id = offer->chm_chanid;
1851 	chan->ch_subidx = offer->chm_subidx;
1852 	chan->ch_guid_type = offer->chm_chtype;
1853 	chan->ch_guid_inst = offer->chm_chinst;
1854 
1855 	/* Batch reading is on by default */
1856 	chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD;
1857 
1858 	chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT;
1859 	if (sc->vmbus_version != VMBUS_VERSION_WS2008)
1860 		chan->ch_monprm->mp_connid = offer->chm_connid;
1861 
1862 	if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) {
1863 		int trig_idx;
1864 
1865 		/*
1866 		 * Setup MNF stuffs.
1867 		 */
1868 		chan->ch_txflags |= VMBUS_CHAN_TXF_HASMNF;
1869 
1870 		trig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN;
1871 		if (trig_idx >= VMBUS_MONTRIGS_MAX)
1872 			panic("invalid monitor trigger %u", offer->chm_montrig);
1873 		chan->ch_montrig =
1874 		    &sc->vmbus_mnf2->mnf_trigs[trig_idx].mt_pending;
1875 
1876 		chan->ch_montrig_mask =
1877 		    1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN);
1878 	}
1879 
1880 	if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) {
1881 		/* This is HyperV socket channel */
1882 		chan->ch_is_hvs = true;
1883 		/* The first byte != 0 means the host initiated connection. */
1884 		chan->ch_hvs_conn_from_host =
1885 		    offer->chm_udata.pipe.user_def[0];
1886 
1887 		if (bootverbose) {
1888 			device_printf(sc->vmbus_dev,
1889 			    "chan%u is hyperv socket channel "
1890 			    "connected %s host\n",
1891 			    chan->ch_id,
1892 			    (chan->ch_hvs_conn_from_host != 0) ?
1893 			    "from" : "to");
1894 		}
1895 	} else {
1896 		chan->ch_is_hvs = false;
1897 	}
1898 
1899 	/*
1900 	 * Setup event flag.
1901 	 */
1902 	chan->ch_evtflag =
1903 	    &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT];
1904 	chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK);
1905 
1906 	/*
1907 	 * Setup attach and detach tasks.
1908 	 */
1909 	if (VMBUS_CHAN_ISPRIMARY(chan)) {
1910 		chan->ch_mgmt_tq = sc->vmbus_devtq;
1911 		attach_fn = vmbus_prichan_attach_task;
1912 		detach_fn = vmbus_prichan_detach_task;
1913 	} else {
1914 		chan->ch_mgmt_tq = sc->vmbus_subchtq;
1915 		attach_fn = vmbus_subchan_attach_task;
1916 		detach_fn = vmbus_subchan_detach_task;
1917 	}
1918 	TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan);
1919 	TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan);
1920 
1921 	error = vmbus_chan_add(chan);
1922 	if (error) {
1923 		device_printf(sc->vmbus_dev, "add chan%u failed: %d\n",
1924 		    chan->ch_id, error);
1925 		atomic_subtract_int(&chan->ch_refs, 1);
1926 		vmbus_chan_free(chan);
1927 		return;
1928 	}
1929 	taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task);
1930 }
1931 
1932 static void
1933 vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc,
1934     const struct vmbus_message *msg)
1935 {
1936 	const struct vmbus_chanmsg_chrescind *note;
1937 	struct vmbus_channel *chan;
1938 
1939 	note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data;
1940 	if (note->chm_chanid > VMBUS_CHAN_MAX) {
1941 		device_printf(sc->vmbus_dev, "invalid revoked chan%u\n",
1942 		    note->chm_chanid);
1943 		return;
1944 	}
1945 
1946 	/*
1947 	 * Find and remove the target channel from the channel list.
1948 	 */
1949 	mtx_lock(&sc->vmbus_chan_lock);
1950 	TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) {
1951 		if (chan->ch_id == note->chm_chanid)
1952 			break;
1953 	}
1954 	if (chan == NULL) {
1955 		mtx_unlock(&sc->vmbus_chan_lock);
1956 		device_printf(sc->vmbus_dev, "chan%u is not offered\n",
1957 		    note->chm_chanid);
1958 		return;
1959 	}
1960 	vmbus_chan_rem_list(sc, chan);
1961 	mtx_unlock(&sc->vmbus_chan_lock);
1962 
1963 	if (VMBUS_CHAN_ISPRIMARY(chan)) {
1964 		/*
1965 		 * The target channel is a primary channel; remove the
1966 		 * target channel from the primary channel list now,
1967 		 * instead of later, so that it will not be found by
1968 		 * other sub-channel offers, which are processed in
1969 		 * this thread.
1970 		 */
1971 		mtx_lock(&sc->vmbus_prichan_lock);
1972 		vmbus_chan_rem_prilist(sc, chan);
1973 		mtx_unlock(&sc->vmbus_prichan_lock);
1974 	}
1975 
1976 	/*
1977 	 * NOTE:
1978 	 * The following processing order is critical:
1979 	 * Set the REVOKED state flag before orphaning the installed xact.
1980 	 */
1981 
1982 	if (atomic_testandset_int(&chan->ch_stflags,
1983 	    VMBUS_CHAN_ST_REVOKED_SHIFT))
1984 		panic("channel has already been revoked");
1985 
1986 	sx_xlock(&chan->ch_orphan_lock);
1987 	if (chan->ch_orphan_xact != NULL)
1988 		vmbus_xact_ctx_orphan(chan->ch_orphan_xact);
1989 	sx_xunlock(&chan->ch_orphan_lock);
1990 
1991 	if (bootverbose)
1992 		vmbus_chan_printf(chan, "chan%u revoked\n", note->chm_chanid);
1993 	vmbus_chan_detach(chan);
1994 }
1995 
1996 static int
1997 vmbus_chan_release(struct vmbus_channel *chan)
1998 {
1999 	struct vmbus_softc *sc = chan->ch_vmbus;
2000 	struct vmbus_chanmsg_chfree *req;
2001 	struct vmbus_msghc *mh;
2002 	int error;
2003 
2004 	mh = vmbus_msghc_get(sc, sizeof(*req));
2005 	if (mh == NULL) {
2006 		vmbus_chan_printf(chan,
2007 		    "can not get msg hypercall for chfree(chan%u)\n",
2008 		    chan->ch_id);
2009 		return (ENXIO);
2010 	}
2011 
2012 	req = vmbus_msghc_dataptr(mh);
2013 	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE;
2014 	req->chm_chanid = chan->ch_id;
2015 
2016 	error = vmbus_msghc_exec_noresult(mh);
2017 	vmbus_msghc_put(sc, mh);
2018 
2019 	if (error) {
2020 		vmbus_chan_printf(chan,
2021 		    "chfree(chan%u) msg hypercall exec failed: %d\n",
2022 		    chan->ch_id, error);
2023 	} else {
2024 		if (bootverbose)
2025 			vmbus_chan_printf(chan, "chan%u freed\n", chan->ch_id);
2026 	}
2027 	return (error);
2028 }
2029 
2030 static void
2031 vmbus_prichan_detach_task(void *xchan, int pending __unused)
2032 {
2033 	struct vmbus_channel *chan = xchan;
2034 
2035 	KASSERT(VMBUS_CHAN_ISPRIMARY(chan),
2036 	    ("chan%u is not primary channel", chan->ch_id));
2037 
2038 	/* Delete and detach the device associated with this channel. */
2039 	vmbus_delete_child(chan);
2040 
2041 	/* Release this channel (back to vmbus). */
2042 	vmbus_chan_release(chan);
2043 
2044 	/* Free this channel's resource. */
2045 	vmbus_chan_free(chan);
2046 }
2047 
2048 static void
2049 vmbus_subchan_detach_task(void *xchan, int pending __unused)
2050 {
2051 	struct vmbus_channel *chan = xchan;
2052 	struct vmbus_channel *pri_chan = chan->ch_prichan;
2053 
2054 	KASSERT(!VMBUS_CHAN_ISPRIMARY(chan),
2055 	    ("chan%u is primary channel", chan->ch_id));
2056 
2057 	/* Release this channel (back to vmbus). */
2058 	vmbus_chan_release(chan);
2059 
2060 	/* Unlink from its primary channel's sub-channel list. */
2061 	mtx_lock(&pri_chan->ch_subchan_lock);
2062 	vmbus_chan_rem_sublist(pri_chan, chan);
2063 	mtx_unlock(&pri_chan->ch_subchan_lock);
2064 	/* Notify anyone that is waiting for this sub-channel to vanish. */
2065 	wakeup(pri_chan);
2066 
2067 	/* Free this channel's resource. */
2068 	vmbus_chan_free(chan);
2069 }
2070 
2071 static void
2072 vmbus_prichan_attach_task(void *xchan, int pending __unused)
2073 {
2074 
2075 	/*
2076 	 * Add device for this primary channel.
2077 	 */
2078 	vmbus_add_child(xchan);
2079 }
2080 
2081 static void
2082 vmbus_subchan_attach_task(void *xchan __unused, int pending __unused)
2083 {
2084 
2085 	/* Nothing */
2086 }
2087 
2088 void
2089 vmbus_chan_destroy_all(struct vmbus_softc *sc)
2090 {
2091 
2092 	/*
2093 	 * Detach all devices and destroy the corresponding primary
2094 	 * channels.
2095 	 */
2096 	for (;;) {
2097 		struct vmbus_channel *chan;
2098 
2099 		mtx_lock(&sc->vmbus_chan_lock);
2100 		TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) {
2101 			if (VMBUS_CHAN_ISPRIMARY(chan))
2102 				break;
2103 		}
2104 		if (chan == NULL) {
2105 			/* No more primary channels; done. */
2106 			mtx_unlock(&sc->vmbus_chan_lock);
2107 			break;
2108 		}
2109 		vmbus_chan_rem_list(sc, chan);
2110 		mtx_unlock(&sc->vmbus_chan_lock);
2111 
2112 		mtx_lock(&sc->vmbus_prichan_lock);
2113 		vmbus_chan_rem_prilist(sc, chan);
2114 		mtx_unlock(&sc->vmbus_prichan_lock);
2115 
2116 		taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task);
2117 	}
2118 }
2119 
2120 struct vmbus_channel **
2121 vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt)
2122 {
2123 	struct vmbus_channel **ret, *chan;
2124 	int i;
2125 
2126 	KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt));
2127 
2128 	ret = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP,
2129 	    M_WAITOK);
2130 
2131 	mtx_lock(&pri_chan->ch_subchan_lock);
2132 
2133 	while (pri_chan->ch_subchan_cnt < subchan_cnt)
2134 		mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0);
2135 
2136 	i = 0;
2137 	TAILQ_FOREACH(chan, &pri_chan->ch_subchans, ch_sublink) {
2138 		/* TODO: refcnt chan */
2139 		ret[i] = chan;
2140 
2141 		++i;
2142 		if (i == subchan_cnt)
2143 			break;
2144 	}
2145 	KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d",
2146 	    pri_chan->ch_subchan_cnt, subchan_cnt));
2147 
2148 	mtx_unlock(&pri_chan->ch_subchan_lock);
2149 
2150 	return ret;
2151 }
2152 
2153 void
2154 vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt __unused)
2155 {
2156 
2157 	free(subchan, M_TEMP);
2158 }
2159 
2160 void
2161 vmbus_subchan_drain(struct vmbus_channel *pri_chan)
2162 {
2163 	mtx_lock(&pri_chan->ch_subchan_lock);
2164 	while (pri_chan->ch_subchan_cnt > 0)
2165 		mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0);
2166 	mtx_unlock(&pri_chan->ch_subchan_lock);
2167 }
2168 
2169 void
2170 vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg)
2171 {
2172 	vmbus_chanmsg_proc_t msg_proc;
2173 	uint32_t msg_type;
2174 
2175 	msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type;
2176 	KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX,
2177 	    ("invalid message type %u", msg_type));
2178 
2179 	msg_proc = vmbus_chan_msgprocs[msg_type];
2180 	if (msg_proc != NULL)
2181 		msg_proc(sc, msg);
2182 }
2183 
2184 void
2185 vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on)
2186 {
2187 	if (!on)
2188 		chan->ch_flags &= ~VMBUS_CHAN_FLAG_BATCHREAD;
2189 	else
2190 		chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD;
2191 }
2192 
2193 uint32_t
2194 vmbus_chan_id(const struct vmbus_channel *chan)
2195 {
2196 	return chan->ch_id;
2197 }
2198 
2199 uint32_t
2200 vmbus_chan_subidx(const struct vmbus_channel *chan)
2201 {
2202 	return chan->ch_subidx;
2203 }
2204 
/* Return true if chan is a primary channel per VMBUS_CHAN_ISPRIMARY(). */
bool
vmbus_chan_is_primary(const struct vmbus_channel *chan)
{

	return ((VMBUS_CHAN_ISPRIMARY(chan)) ? true : false);
}
2213 
2214 bool
2215 vmbus_chan_is_hvs(const struct vmbus_channel *chan)
2216 {
2217 	return chan->ch_is_hvs;
2218 }
2219 
2220 bool
2221 vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan)
2222 {
2223 	KASSERT(vmbus_chan_is_hvs(chan) == true,
2224 	    ("Not a HyperV Socket channel %u", chan->ch_id));
2225 	if (chan->ch_hvs_conn_from_host != 0)
2226 		return true;
2227 	else
2228 		return false;
2229 }
2230 
2231 struct hyperv_guid *
2232 vmbus_chan_guid_type(struct vmbus_channel *chan)
2233 {
2234 	return &chan->ch_guid_type;
2235 }
2236 
2237 struct hyperv_guid *
2238 vmbus_chan_guid_inst(struct vmbus_channel *chan)
2239 {
2240 	return &chan->ch_guid_inst;
2241 }
2242 
2243 int
2244 vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max)
2245 {
2246 	int elem_size;
2247 
2248 	elem_size = __offsetof(struct vmbus_chanpkt_prplist,
2249 	    cp_range[0].gpa_page[prpcnt_max]);
2250 	elem_size += dlen_max;
2251 	elem_size = VMBUS_CHANPKT_TOTLEN(elem_size);
2252 
2253 	return (vmbus_br_nelem(br_size, elem_size));
2254 }
2255 
2256 bool
2257 vmbus_chan_tx_empty(const struct vmbus_channel *chan)
2258 {
2259 
2260 	return (vmbus_txbr_empty(&chan->ch_txbr));
2261 }
2262 
2263 bool
2264 vmbus_chan_rx_empty(const struct vmbus_channel *chan)
2265 {
2266 
2267 	return (vmbus_rxbr_empty(&chan->ch_rxbr));
2268 }
2269 
2270 static int
2271 vmbus_chan_printf(const struct vmbus_channel *chan, const char *fmt, ...)
2272 {
2273 	va_list ap;
2274 	device_t dev;
2275 	int retval;
2276 
2277 	if (chan->ch_dev == NULL || !device_is_alive(chan->ch_dev))
2278 		dev = chan->ch_vmbus->vmbus_dev;
2279 	else
2280 		dev = chan->ch_dev;
2281 
2282 	retval = device_print_prettyname(dev);
2283 	va_start(ap, fmt);
2284 	retval += vprintf(fmt, ap);
2285 	va_end(ap);
2286 
2287 	return (retval);
2288 }
2289 
2290 void
2291 vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task)
2292 {
2293 
2294 	taskqueue_enqueue(chan->ch_tq, task);
2295 	taskqueue_drain(chan->ch_tq, task);
2296 }
2297 
2298 struct taskqueue *
2299 vmbus_chan_mgmt_tq(const struct vmbus_channel *chan)
2300 {
2301 
2302 	return (chan->ch_mgmt_tq);
2303 }
2304 
2305 bool
2306 vmbus_chan_is_revoked(const struct vmbus_channel *chan)
2307 {
2308 
2309 	if (chan->ch_stflags & VMBUS_CHAN_ST_REVOKED)
2310 		return (true);
2311 	return (false);
2312 }
2313 
2314 void
2315 vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *xact)
2316 {
2317 
2318 	sx_xlock(&chan->ch_orphan_lock);
2319 	chan->ch_orphan_xact = xact;
2320 	sx_xunlock(&chan->ch_orphan_lock);
2321 }
2322 
/*
 * Clear the channel's orphan transaction context: ch_orphan_xact is set
 * to NULL with ch_orphan_lock held exclusively.
 */
void
vmbus_chan_unset_orphan(struct vmbus_channel *chan)
{

	/* Same lock/store/unlock sequence, storing NULL. */
	vmbus_chan_set_orphan(chan, NULL);
}
2331 
/*
 * Wait for the response of xact on chan.
 *
 * With can_sleep the wait uses vmbus_xact_wait(), otherwise
 * vmbus_xact_busywait().  The value returned by that call is returned
 * unchanged, and resp_len is passed through to it.
 *
 * If the channel is revoked when the wait finishes, this function does
 * not return until the channel's RX bufring is empty.
 */
const void *
vmbus_chan_xact_wait(const struct vmbus_channel *chan,
    struct vmbus_xact *xact, size_t *resp_len, bool can_sleep)
{
	const void *resp;

	resp = can_sleep ? vmbus_xact_wait(xact, resp_len) :
	    vmbus_xact_busywait(xact, resp_len);

	if (!vmbus_chan_is_revoked(chan))
		return (resp);

	/*
	 * The xact was probably interrupted, and that interruption can
	 * race with the reception of the reply.  Make sure nothing is
	 * left on the RX bufring, so that this xact is not touched
	 * anymore once this function returns.
	 *
	 * The hypervisor does not put more data onto the RX bufring
	 * after the channel has been revoked, so this loop terminates
	 * as soon as the driver's channel callback has drained all of
	 * the data.
	 */
	while (!vmbus_chan_rx_empty(chan)) {
		if (can_sleep)
			pause("chxact", 1);
		else
			DELAY(1000);
	}
	return (resp);
}
2365 
2366 void
2367 vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz)
2368 {
2369 	struct vmbus_chan_pollarg arg;
2370 	struct task poll_cfg;
2371 
2372 	KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD,
2373 	    ("enable polling on non-batch chan%u", chan->ch_id));
2374 	KASSERT(pollhz >= VMBUS_CHAN_POLLHZ_MIN &&
2375 	    pollhz <= VMBUS_CHAN_POLLHZ_MAX, ("invalid pollhz %u", pollhz));
2376 
2377 	arg.poll_chan = chan;
2378 	arg.poll_hz = pollhz;
2379 	TASK_INIT(&poll_cfg, 0, vmbus_chan_pollcfg_task, &arg);
2380 	vmbus_chan_run_task(chan, &poll_cfg);
2381 }
2382 
2383 void
2384 vmbus_chan_poll_disable(struct vmbus_channel *chan)
2385 {
2386 	struct task poll_dis;
2387 
2388 	KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD,
2389 	    ("disable polling on non-batch chan%u", chan->ch_id));
2390 
2391 	TASK_INIT(&poll_dis, 0, vmbus_chan_polldis_task, chan);
2392 	vmbus_chan_run_task(chan, &poll_dis);
2393 }
2394