xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision b70d2a2aa5003027b422e62435ab5bb9390d543c)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 #include <sys/eventhandler.h>
81 
82 #include <machine/atomic.h>
83 #include <machine/in_cksum.h>
84 
85 #include <net/bpf.h>
86 #include <net/ethernet.h>
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_media.h>
90 #include <net/if_types.h>
91 #include <net/if_var.h>
92 #include <net/rndis.h>
93 #ifdef RSS
94 #include <net/rss_config.h>
95 #endif
96 
97 #include <netinet/in_systm.h>
98 #include <netinet/in.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip6.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_lro.h>
103 #include <netinet/udp.h>
104 
105 #include <dev/hyperv/include/hyperv.h>
106 #include <dev/hyperv/include/hyperv_busdma.h>
107 #include <dev/hyperv/include/vmbus.h>
108 #include <dev/hyperv/include/vmbus_xact.h>
109 
110 #include <dev/hyperv/netvsc/ndis.h>
111 #include <dev/hyperv/netvsc/if_hnreg.h>
112 #include <dev/hyperv/netvsc/if_hnvar.h>
113 #include <dev/hyperv/netvsc/hn_nvs.h>
114 #include <dev/hyperv/netvsc/hn_rndis.h>
115 
116 #include "vmbus_if.h"
117 
118 #define HN_IFSTART_SUPPORT
119 
120 #define HN_RING_CNT_DEF_MAX		8
121 
122 /* YYY should get it from the underlying channel */
123 #define HN_TX_DESC_CNT			512
124 
125 #define HN_RNDIS_PKT_LEN					\
126 	(sizeof(struct rndis_packet_msg) +			\
127 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
129 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
130 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
131 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
132 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
133 
134 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
135 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
136 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
137 /* -1 for RNDIS packet message */
138 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
139 
140 #define HN_DIRECT_TX_SIZE_DEF		128
141 
142 #define HN_EARLY_TXEOF_THRESH		8
143 
144 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
145 
146 #define HN_LROENT_CNT_DEF		128
147 
148 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
149 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
150 /* YYY 2*MTU is a bit rough, but should be good enough. */
151 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
152 
153 #define HN_LRO_ACKCNT_DEF		1
154 
155 #define HN_LOCK_INIT(sc)		\
156 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
157 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
158 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
159 #define HN_LOCK(sc)					\
160 do {							\
161 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
162 		DELAY(1000);				\
163 } while (0)
164 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
165 
166 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
167 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
168 #define HN_CSUM_IP_HWASSIST(sc)		\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
170 #define HN_CSUM_IP6_HWASSIST(sc)	\
171 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
172 
173 #define HN_PKTSIZE_MIN(align)		\
174 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
175 	    HN_RNDIS_PKT_LEN, (align))
176 #define HN_PKTSIZE(m, align)		\
177 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
178 
179 #ifdef RSS
180 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
181 #else
182 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
183 #endif
184 
185 struct hn_txdesc {
186 #ifndef HN_USE_TXDESC_BUFRING
187 	SLIST_ENTRY(hn_txdesc)		link;
188 #endif
189 	STAILQ_ENTRY(hn_txdesc)		agg_link;
190 
191 	/* Aggregated txdescs, in sending order. */
192 	STAILQ_HEAD(, hn_txdesc)	agg_list;
193 
194 	/* The oldest packet, if transmission aggregation happens. */
195 	struct mbuf			*m;
196 	struct hn_tx_ring		*txr;
197 	int				refs;
198 	uint32_t			flags;	/* HN_TXD_FLAG_ */
199 	struct hn_nvs_sendctx		send_ctx;
200 	uint32_t			chim_index;
201 	int				chim_size;
202 
203 	bus_dmamap_t			data_dmap;
204 
205 	bus_addr_t			rndis_pkt_paddr;
206 	struct rndis_packet_msg		*rndis_pkt;
207 	bus_dmamap_t			rndis_pkt_dmap;
208 };
209 
210 #define HN_TXD_FLAG_ONLIST		0x0001
211 #define HN_TXD_FLAG_DMAMAP		0x0002
212 #define HN_TXD_FLAG_ONAGG		0x0004
213 
214 struct hn_rxinfo {
215 	uint32_t			vlan_info;
216 	uint32_t			csum_info;
217 	uint32_t			hash_info;
218 	uint32_t			hash_value;
219 };
220 
221 struct hn_update_vf {
222 	struct hn_rx_ring	*rxr;
223 	struct ifnet		*vf;
224 };
225 
226 #define HN_RXINFO_VLAN			0x0001
227 #define HN_RXINFO_CSUM			0x0002
228 #define HN_RXINFO_HASHINF		0x0004
229 #define HN_RXINFO_HASHVAL		0x0008
230 #define HN_RXINFO_ALL			\
231 	(HN_RXINFO_VLAN |		\
232 	 HN_RXINFO_CSUM |		\
233 	 HN_RXINFO_HASHINF |		\
234 	 HN_RXINFO_HASHVAL)
235 
236 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
237 #define HN_NDIS_RXCSUM_INFO_INVALID	0
238 #define HN_NDIS_HASH_INFO_INVALID	0
239 
240 static int			hn_probe(device_t);
241 static int			hn_attach(device_t);
242 static int			hn_detach(device_t);
243 static int			hn_shutdown(device_t);
244 static void			hn_chan_callback(struct vmbus_channel *,
245 				    void *);
246 
247 static void			hn_init(void *);
248 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
249 #ifdef HN_IFSTART_SUPPORT
250 static void			hn_start(struct ifnet *);
251 #endif
252 static int			hn_transmit(struct ifnet *, struct mbuf *);
253 static void			hn_xmit_qflush(struct ifnet *);
254 static int			hn_ifmedia_upd(struct ifnet *);
255 static void			hn_ifmedia_sts(struct ifnet *,
256 				    struct ifmediareq *);
257 
258 static int			hn_rndis_rxinfo(const void *, int,
259 				    struct hn_rxinfo *);
260 static void			hn_rndis_rx_data(struct hn_rx_ring *,
261 				    const void *, int);
262 static void			hn_rndis_rx_status(struct hn_softc *,
263 				    const void *, int);
264 
265 static void			hn_nvs_handle_notify(struct hn_softc *,
266 				    const struct vmbus_chanpkt_hdr *);
267 static void			hn_nvs_handle_comp(struct hn_softc *,
268 				    struct vmbus_channel *,
269 				    const struct vmbus_chanpkt_hdr *);
270 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
271 				    struct vmbus_channel *,
272 				    const struct vmbus_chanpkt_hdr *);
273 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
274 				    struct vmbus_channel *, uint64_t);
275 
276 #if __FreeBSD_version >= 1100099
277 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
278 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
279 #endif
280 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
282 #if __FreeBSD_version < 1100095
283 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
284 #else
285 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
286 #endif
287 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
288 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
290 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
291 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
294 #ifndef RSS
295 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
296 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
297 #endif
298 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
299 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
300 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
301 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
302 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
303 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
304 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
305 
306 static void			hn_stop(struct hn_softc *, bool);
307 static void			hn_init_locked(struct hn_softc *);
308 static int			hn_chan_attach(struct hn_softc *,
309 				    struct vmbus_channel *);
310 static void			hn_chan_detach(struct hn_softc *,
311 				    struct vmbus_channel *);
312 static int			hn_attach_subchans(struct hn_softc *);
313 static void			hn_detach_allchans(struct hn_softc *);
314 static void			hn_chan_rollup(struct hn_rx_ring *,
315 				    struct hn_tx_ring *);
316 static void			hn_set_ring_inuse(struct hn_softc *, int);
317 static int			hn_synth_attach(struct hn_softc *, int);
318 static void			hn_synth_detach(struct hn_softc *);
319 static int			hn_synth_alloc_subchans(struct hn_softc *,
320 				    int *);
321 static bool			hn_synth_attachable(const struct hn_softc *);
322 static void			hn_suspend(struct hn_softc *);
323 static void			hn_suspend_data(struct hn_softc *);
324 static void			hn_suspend_mgmt(struct hn_softc *);
325 static void			hn_resume(struct hn_softc *);
326 static void			hn_resume_data(struct hn_softc *);
327 static void			hn_resume_mgmt(struct hn_softc *);
328 static void			hn_suspend_mgmt_taskfunc(void *, int);
329 static void			hn_chan_drain(struct hn_softc *,
330 				    struct vmbus_channel *);
331 static void			hn_polling(struct hn_softc *, u_int);
332 static void			hn_chan_polling(struct vmbus_channel *, u_int);
333 
334 static void			hn_update_link_status(struct hn_softc *);
335 static void			hn_change_network(struct hn_softc *);
336 static void			hn_link_taskfunc(void *, int);
337 static void			hn_netchg_init_taskfunc(void *, int);
338 static void			hn_netchg_status_taskfunc(void *, int);
339 static void			hn_link_status(struct hn_softc *);
340 
341 static int			hn_create_rx_data(struct hn_softc *, int);
342 static void			hn_destroy_rx_data(struct hn_softc *);
343 static int			hn_check_iplen(const struct mbuf *, int);
344 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
345 static int			hn_rxfilter_config(struct hn_softc *);
346 #ifndef RSS
347 static int			hn_rss_reconfig(struct hn_softc *);
348 #endif
349 static void			hn_rss_ind_fixup(struct hn_softc *);
350 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
351 				    int, const struct hn_rxinfo *);
352 
353 static int			hn_tx_ring_create(struct hn_softc *, int);
354 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
355 static int			hn_create_tx_data(struct hn_softc *, int);
356 static void			hn_fixup_tx_data(struct hn_softc *);
357 static void			hn_destroy_tx_data(struct hn_softc *);
358 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
359 static void			hn_txdesc_gc(struct hn_tx_ring *,
360 				    struct hn_txdesc *);
361 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
362 				    struct hn_txdesc *, struct mbuf **);
363 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
364 				    struct hn_txdesc *);
365 static void			hn_set_chim_size(struct hn_softc *, int);
366 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
367 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
368 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
369 static void			hn_resume_tx(struct hn_softc *, int);
370 static void			hn_set_txagg(struct hn_softc *);
371 static void			*hn_try_txagg(struct ifnet *,
372 				    struct hn_tx_ring *, struct hn_txdesc *,
373 				    int);
374 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
375 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
376 				    struct hn_softc *, struct vmbus_channel *,
377 				    const void *, int);
378 static int			hn_txpkt_sglist(struct hn_tx_ring *,
379 				    struct hn_txdesc *);
380 static int			hn_txpkt_chim(struct hn_tx_ring *,
381 				    struct hn_txdesc *);
382 static int			hn_xmit(struct hn_tx_ring *, int);
383 static void			hn_xmit_taskfunc(void *, int);
384 static void			hn_xmit_txeof(struct hn_tx_ring *);
385 static void			hn_xmit_txeof_taskfunc(void *, int);
386 #ifdef HN_IFSTART_SUPPORT
387 static int			hn_start_locked(struct hn_tx_ring *, int);
388 static void			hn_start_taskfunc(void *, int);
389 static void			hn_start_txeof(struct hn_tx_ring *);
390 static void			hn_start_txeof_taskfunc(void *, int);
391 #endif
392 
393 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
394     "Hyper-V network interface");
395 
396 /* Trust tcp segment verification on host side. */
397 static int			hn_trust_hosttcp = 1;
398 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
399     &hn_trust_hosttcp, 0,
400     "Trust tcp segement verification on host side, "
401     "when csum info is missing (global setting)");
402 
403 /* Trust udp datagram verification on host side. */
404 static int			hn_trust_hostudp = 1;
405 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
406     &hn_trust_hostudp, 0,
407     "Trust udp datagram verification on host side, "
408     "when csum info is missing (global setting)");
409 
410 /* Trust ip packet verification on host side. */
411 static int			hn_trust_hostip = 1;
412 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
413     &hn_trust_hostip, 0,
414     "Trust ip packet verification on host side, "
415     "when csum info is missing (global setting)");
416 
417 /* Limit TSO burst size */
418 static int			hn_tso_maxlen = IP_MAXPACKET;
419 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
420     &hn_tso_maxlen, 0, "TSO burst limit");
421 
422 /* Limit chimney send size */
423 static int			hn_tx_chimney_size = 0;
424 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
425     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
426 
427 /* Limit the size of packet for direct transmission */
428 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
429 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
430     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
431 
432 /* # of LRO entries per RX ring */
433 #if defined(INET) || defined(INET6)
434 #if __FreeBSD_version >= 1100095
435 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
436 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
437     &hn_lro_entry_count, 0, "LRO entry count");
438 #endif
439 #endif
440 
441 static int			hn_tx_taskq_cnt = 1;
442 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
443     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
444 
445 #define HN_TX_TASKQ_M_INDEP	0
446 #define HN_TX_TASKQ_M_GLOBAL	1
447 #define HN_TX_TASKQ_M_EVTTQ	2
448 
449 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
450 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
451     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
452     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
453 
454 #ifndef HN_USE_TXDESC_BUFRING
455 static int			hn_use_txdesc_bufring = 0;
456 #else
457 static int			hn_use_txdesc_bufring = 1;
458 #endif
459 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
460     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
461 
462 #ifdef HN_IFSTART_SUPPORT
463 /* Use ifnet.if_start instead of ifnet.if_transmit */
464 static int			hn_use_if_start = 0;
465 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
466     &hn_use_if_start, 0, "Use if_start TX method");
467 #endif
468 
469 /* # of channels to use */
470 static int			hn_chan_cnt = 0;
471 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
472     &hn_chan_cnt, 0,
473     "# of channels to use; each channel has one RX ring and one TX ring");
474 
475 /* # of transmit rings to use */
476 static int			hn_tx_ring_cnt = 0;
477 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
478     &hn_tx_ring_cnt, 0, "# of TX rings to use");
479 
480 /* Software TX ring depth */
481 static int			hn_tx_swq_depth = 0;
482 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
483     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
484 
485 /* Depth of the per-channel LRO mbuf queue; nonzero enables sorted LRO. */
486 #if __FreeBSD_version >= 1100095
487 static u_int			hn_lro_mbufq_depth = 0;
488 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
489     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
490 #endif
491 
492 /* Packet transmission aggregation size limit */
493 static int			hn_tx_agg_size = -1;
494 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
495     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
496 
497 /* Packet transmission aggregation count limit */
498 static int			hn_tx_agg_pkts = -1;
499 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
500     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
501 
502 static u_int			hn_cpu_index;	/* next CPU for channel */
503 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
504 
505 #ifndef RSS
506 static const uint8_t
507 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
508 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
509 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
510 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
511 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
512 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
513 };
514 #endif	/* !RSS */
515 
516 static device_method_t hn_methods[] = {
517 	/* Device interface */
518 	DEVMETHOD(device_probe,		hn_probe),
519 	DEVMETHOD(device_attach,	hn_attach),
520 	DEVMETHOD(device_detach,	hn_detach),
521 	DEVMETHOD(device_shutdown,	hn_shutdown),
522 	DEVMETHOD_END
523 };
524 
525 static driver_t hn_driver = {
526 	"hn",
527 	hn_methods,
528 	sizeof(struct hn_softc)
529 };
530 
531 static devclass_t hn_devclass;
532 
533 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
534 MODULE_VERSION(hn, 1);
535 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
536 
537 #if __FreeBSD_version >= 1100099
538 static void
539 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
540 {
541 	int i;
542 
543 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
544 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
545 }
546 #endif
547 
548 static int
549 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
550 {
551 
552 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
553 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
554 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
555 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
556 }
557 
558 static int
559 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
560 {
561 	struct hn_nvs_rndis rndis;
562 
563 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
564 	    txd->chim_size > 0, ("invalid rndis chim txd"));
565 
566 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
567 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
568 	rndis.nvs_chim_idx = txd->chim_index;
569 	rndis.nvs_chim_sz = txd->chim_size;
570 
571 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
572 	    &rndis, sizeof(rndis), &txd->send_ctx));
573 }
574 
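/*
 * Allocate a chimney sending buffer slot from the bitmap.  Returns the
 * slot index, or HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 */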
575 static __inline uint32_t
576 hn_chim_alloc(struct hn_softc *sc)
577 {
578 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
579 	u_long *bmap = sc->hn_chim_bmap;
580 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
581 
582 	for (i = 0; i < bmap_cnt; ++i) {
583 		int idx;
584 
585 		idx = ffsl(~bmap[i]);
586 		if (idx == 0)
587 			continue;
588 
589 		--idx; /* ffsl is 1-based */
590 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
591 		    ("invalid i %d and idx %d", i, idx));
592 
593 		if (atomic_testandset_long(&bmap[i], idx))
594 			continue;
595 
596 		ret = i * LONG_BIT + idx;
597 		break;
598 	}
599 	return (ret);
600 }
601 
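/*
 * Return a chimney sending buffer slot to the bitmap.
 */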
602 static __inline void
603 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
604 {
605 	u_long mask;
606 	uint32_t idx;
607 
608 	idx = chim_idx / LONG_BIT;
609 	KASSERT(idx < sc->hn_chim_bmap_cnt,
610 	    ("invalid chimney index 0x%x", chim_idx));
611 
612 	mask = 1UL << (chim_idx % LONG_BIT);
613 	KASSERT(sc->hn_chim_bmap[idx] & mask,
614 	    ("index bitmap 0x%lx, chimney index %u, "
615 	     "bitmap idx %d, bitmask 0x%lx",
616 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
617 
618 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
619 }
620 
621 #if defined(INET6) || defined(INET)
622 /*
623  * NOTE: If this function fails, the m_head will be freed.
624  */
625 static __inline struct mbuf *
626 hn_tso_fixup(struct mbuf *m_head)
627 {
628 	struct ether_vlan_header *evl;
629 	struct tcphdr *th;
630 	int ehlen;
631 
632 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
633 
634 #define PULLUP_HDR(m, len)				\
635 do {							\
636 	if (__predict_false((m)->m_len < (len))) {	\
637 		(m) = m_pullup((m), (len));		\
638 		if ((m) == NULL)			\
639 			return (NULL);			\
640 	}						\
641 } while (0)
642 
643 	PULLUP_HDR(m_head, sizeof(*evl));
644 	evl = mtod(m_head, struct ether_vlan_header *);
645 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
646 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
647 	else
648 		ehlen = ETHER_HDR_LEN;
649 
650 #ifdef INET
651 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
652 		struct ip *ip;
653 		int iphlen;
654 
655 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
656 		ip = mtodo(m_head, ehlen);
657 		iphlen = ip->ip_hl << 2;
658 
659 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
660 		th = mtodo(m_head, ehlen + iphlen);
661 
662 		ip->ip_len = 0;
663 		ip->ip_sum = 0;
664 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
665 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
666 	}
667 #endif
668 #if defined(INET6) && defined(INET)
669 	else
670 #endif
671 #ifdef INET6
672 	{
673 		struct ip6_hdr *ip6;
674 
675 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
676 		ip6 = mtodo(m_head, ehlen);
677 		if (ip6->ip6_nxt != IPPROTO_TCP) {
678 			m_freem(m_head);
679 			return (NULL);
680 		}
681 
682 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
683 		th = mtodo(m_head, ehlen + sizeof(*ip6));
684 
685 		ip6->ip6_plen = 0;
686 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
687 	}
688 #endif
689 	return (m_head);
690 
691 #undef PULLUP_HDR
692 }
693 #endif	/* INET6 || INET */
694 
695 static int
696 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
697 {
698 	int error = 0;
699 
700 	HN_LOCK_ASSERT(sc);
701 
702 	if (sc->hn_rx_filter != filter) {
703 		error = hn_rndis_set_rxfilter(sc, filter);
704 		if (!error)
705 			sc->hn_rx_filter = filter;
706 	}
707 	return (error);
708 }
709 
710 static int
711 hn_rxfilter_config(struct hn_softc *sc)
712 {
713 	struct ifnet *ifp = sc->hn_ifp;
714 	uint32_t filter;
715 
716 	HN_LOCK_ASSERT(sc);
717 
718 	if ((ifp->if_flags & IFF_PROMISC) ||
719 	    (sc->hn_flags & HN_FLAG_VF)) {
720 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
721 	} else {
722 		filter = NDIS_PACKET_TYPE_DIRECTED;
723 		if (ifp->if_flags & IFF_BROADCAST)
724 			filter |= NDIS_PACKET_TYPE_BROADCAST;
725 		/* TODO: support multicast list */
726 		if ((ifp->if_flags & IFF_ALLMULTI) ||
727 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
728 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
729 	}
730 	return (hn_set_rxfilter(sc, filter));
731 }
732 
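/*
 * Compute the TX aggregation size and packet count limits from the
 * configured tunables and the host-offered RNDIS limits, then propagate
 * the results to all TX rings.  Aggregation is disabled if the limits
 * are too small to be useful.
 */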
733 static void
734 hn_set_txagg(struct hn_softc *sc)
735 {
736 	uint32_t size, pkts;
737 	int i;
738 
739 	/*
740 	 * Setup aggregation size.
741 	 */
742 	if (sc->hn_agg_size < 0)
743 		size = UINT32_MAX;
744 	else
745 		size = sc->hn_agg_size;
746 
747 	if (sc->hn_rndis_agg_size < size)
748 		size = sc->hn_rndis_agg_size;
749 
750 	/* NOTE: We only aggregate packets using chimney sending buffers. */
751 	if (size > (uint32_t)sc->hn_chim_szmax)
752 		size = sc->hn_chim_szmax;
753 
754 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
755 		/* Disable */
756 		size = 0;
757 		pkts = 0;
758 		goto done;
759 	}
760 
761 	/* NOTE: Type of the per TX ring setting is 'int'. */
762 	if (size > INT_MAX)
763 		size = INT_MAX;
764 
765 	/*
766 	 * Setup aggregation packet count.
767 	 */
768 	if (sc->hn_agg_pkts < 0)
769 		pkts = UINT32_MAX;
770 	else
771 		pkts = sc->hn_agg_pkts;
772 
773 	if (sc->hn_rndis_agg_pkts < pkts)
774 		pkts = sc->hn_rndis_agg_pkts;
775 
776 	if (pkts <= 1) {
777 		/* Disable */
778 		size = 0;
779 		pkts = 0;
780 		goto done;
781 	}
782 
783 	/* NOTE: Type of the per TX ring setting is 'short'. */
784 	if (pkts > SHRT_MAX)
785 		pkts = SHRT_MAX;
786 
787 done:
788 	/* NOTE: Type of the per TX ring setting is 'short'. */
789 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
790 		/* Disable */
791 		size = 0;
792 		pkts = 0;
793 	}
794 
795 	if (bootverbose) {
796 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
797 		    size, pkts, sc->hn_rndis_agg_align);
798 	}
799 
800 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
801 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
802 
803 		mtx_lock(&txr->hn_tx_lock);
804 		txr->hn_agg_szmax = size;
805 		txr->hn_agg_pktmax = pkts;
806 		txr->hn_agg_align = sc->hn_rndis_agg_align;
807 		mtx_unlock(&txr->hn_tx_lock);
808 	}
809 }
810 
811 static int
812 hn_get_txswq_depth(const struct hn_tx_ring *txr)
813 {
814 
815 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
816 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
817 		return txr->hn_txdesc_cnt;
818 	return hn_tx_swq_depth;
819 }
820 
821 #ifndef RSS
822 static int
823 hn_rss_reconfig(struct hn_softc *sc)
824 {
825 	int error;
826 
827 	HN_LOCK_ASSERT(sc);
828 
829 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
830 		return (ENXIO);
831 
832 	/*
833 	 * Disable RSS first.
834 	 *
835 	 * NOTE:
836 	 * Direct reconfiguration by setting the UNCHG flags does
837 	 * _not_ work properly.
838 	 */
839 	if (bootverbose)
840 		if_printf(sc->hn_ifp, "disable RSS\n");
841 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
842 	if (error) {
843 		if_printf(sc->hn_ifp, "RSS disable failed\n");
844 		return (error);
845 	}
846 
847 	/*
848 	 * Reenable the RSS w/ the updated RSS key or indirect
849 	 * table.
850 	 */
851 	if (bootverbose)
852 		if_printf(sc->hn_ifp, "reconfig RSS\n");
853 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
854 	if (error) {
855 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
856 		return (error);
857 	}
858 	return (0);
859 }
860 #endif	/* !RSS */
861 
862 static void
863 hn_rss_ind_fixup(struct hn_softc *sc)
864 {
865 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
866 	int i, nchan;
867 
868 	nchan = sc->hn_rx_ring_inuse;
869 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
870 
871 	/*
872 	 * Check indirect table to make sure that all channels in it
873 	 * can be used.
874 	 */
875 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
876 		if (rss->rss_ind[i] >= nchan) {
877 			if_printf(sc->hn_ifp,
878 			    "RSS indirect table %d fixup: %u -> %d\n",
879 			    i, rss->rss_ind[i], nchan - 1);
880 			rss->rss_ind[i] = nchan - 1;
881 		}
882 	}
883 }
884 
885 static int
886 hn_ifmedia_upd(struct ifnet *ifp __unused)
887 {
888 
889 	return EOPNOTSUPP;
890 }
891 
892 static void
893 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
894 {
895 	struct hn_softc *sc = ifp->if_softc;
896 
897 	ifmr->ifm_status = IFM_AVALID;
898 	ifmr->ifm_active = IFM_ETHER;
899 
900 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
901 		ifmr->ifm_active |= IFM_NONE;
902 		return;
903 	}
904 	ifmr->ifm_status |= IFM_ACTIVE;
905 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
906 }
907 
908 static void
909 hn_update_vf_task(void *arg, int pending __unused)
910 {
911 	struct hn_update_vf *uv = arg;
912 
913 	uv->rxr->hn_vf = uv->vf;
914 }
915 
916 static void
917 hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
918 {
919 	struct hn_rx_ring *rxr;
920 	struct hn_update_vf uv;
921 	struct task task;
922 	int i;
923 
924 	HN_LOCK_ASSERT(sc);
925 
926 	TASK_INIT(&task, 0, hn_update_vf_task, &uv);
927 
928 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
929 		rxr = &sc->hn_rx_ring[i];
930 
931 		if (i < sc->hn_rx_ring_inuse) {
932 			uv.rxr = rxr;
933 			uv.vf = vf;
934 			vmbus_chan_run_task(rxr->hn_chan, &task);
935 		} else {
936 			rxr->hn_vf = vf;
937 		}
938 	}
939 }
940 
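/*
 * Check whether 'ifp' is the VF paired with this synthetic NIC (same
 * link-level address, not a lagg/vlan interface) and, if so, switch the
 * NVS data path and RX filter between the VF and the synthetic device.
 */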
941 static void
942 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
943 {
944 	struct ifnet *hn_ifp;
945 
946 	HN_LOCK(sc);
947 
948 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
949 		goto out;
950 
951 	hn_ifp = sc->hn_ifp;
952 
953 	if (ifp == hn_ifp)
954 		goto out;
955 
956 	if (ifp->if_alloctype != IFT_ETHER)
957 		goto out;
958 
959 	/* Ignore lagg/vlan interfaces */
960 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
961 	    strcmp(ifp->if_dname, "vlan") == 0)
962 		goto out;
963 
964 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
965 		goto out;
966 
967 	/* Now we're sure 'ifp' is a real VF device. */
968 	if (vf) {
969 		if (sc->hn_flags & HN_FLAG_VF)
970 			goto out;
971 
972 		sc->hn_flags |= HN_FLAG_VF;
973 		hn_rxfilter_config(sc);
974 	} else {
975 		if (!(sc->hn_flags & HN_FLAG_VF))
976 			goto out;
977 
978 		sc->hn_flags &= ~HN_FLAG_VF;
979 		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
980 			hn_rxfilter_config(sc);
981 		else
982 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
983 	}
984 
985 	hn_nvs_set_datapath(sc,
986 	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
987 
988 	hn_update_vf(sc, vf ? ifp : NULL);
989 
990 	if (vf) {
991 		hn_suspend_mgmt(sc);
992 		sc->hn_link_flags &=
993 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
994 		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
995 	} else {
996 		hn_resume_mgmt(sc);
997 	}
998 
999 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1000 	    vf ? "VF_UP" : "VF_DOWN", NULL);
1001 
1002 	if (bootverbose)
1003 		if_printf(hn_ifp, "Data path is switched %s %s\n",
1004 		    vf ? "to" : "from", if_name(ifp));
1005 out:
1006 	HN_UNLOCK(sc);
1007 }
1008 
1009 static void
1010 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1011 {
1012 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1013 		return;
1014 
1015 	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
1016 }
1017 
1018 static void
1019 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1020 {
1021 	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
1022 }
1023 
1024 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1025 static const struct hyperv_guid g_net_vsc_device_type = {
1026 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1027 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1028 };
1029 
1030 static int
1031 hn_probe(device_t dev)
1032 {
1033 
1034 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1035 	    &g_net_vsc_device_type) == 0) {
1036 		device_set_desc(dev, "Hyper-V Network Interface");
1037 		return BUS_PROBE_DEFAULT;
1038 	}
1039 	return ENXIO;
1040 }
1041 
1042 static int
1043 hn_attach(device_t dev)
1044 {
1045 	struct hn_softc *sc = device_get_softc(dev);
1046 	struct sysctl_oid_list *child;
1047 	struct sysctl_ctx_list *ctx;
1048 	uint8_t eaddr[ETHER_ADDR_LEN];
1049 	struct ifnet *ifp = NULL;
1050 	int error, ring_cnt, tx_ring_cnt;
1051 
1052 	sc->hn_dev = dev;
1053 	sc->hn_prichan = vmbus_get_channel(dev);
1054 	HN_LOCK_INIT(sc);
1055 
1056 	/*
1057 	 * Initialize these tunables once.
1058 	 */
1059 	sc->hn_agg_size = hn_tx_agg_size;
1060 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1061 
1062 	/*
1063 	 * Setup taskqueue for transmission.
1064 	 */
1065 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1066 		int i;
1067 
1068 		sc->hn_tx_taskqs =
1069 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1070 		    M_DEVBUF, M_WAITOK);
1071 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1072 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1073 			    M_WAITOK, taskqueue_thread_enqueue,
1074 			    &sc->hn_tx_taskqs[i]);
1075 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1076 			    "%s tx%d", device_get_nameunit(dev), i);
1077 		}
1078 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1079 		sc->hn_tx_taskqs = hn_tx_taskque;
1080 	}
1081 
1082 	/*
1083 	 * Setup taskqueue for management tasks, e.g. link status.
1084 	 */
1085 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1086 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1087 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1088 	    device_get_nameunit(dev));
1089 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1090 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1091 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1092 	    hn_netchg_status_taskfunc, sc);
1093 
1094 	/*
1095 	 * Allocate ifnet and setup its name earlier, so that if_printf
1096 	 * can be used by functions that are called before
1097 	 * ether_ifattach().
1098 	 */
1099 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1100 	ifp->if_softc = sc;
1101 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1102 
1103 	/*
1104 	 * Initialize ifmedia earlier so that it can be unconditionally
1105 	 * destroyed if an error occurs later on.
1106 	 */
1107 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1108 
1109 	/*
1110 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1111 	 * to use (tx_ring_cnt).
1112 	 *
1113 	 * NOTE:
1114 	 * The # of RX rings to use is the same as the # of channels to use.
1115 	 */
1116 	ring_cnt = hn_chan_cnt;
1117 	if (ring_cnt <= 0) {
1118 		/* Default */
1119 		ring_cnt = mp_ncpus;
1120 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1121 			ring_cnt = HN_RING_CNT_DEF_MAX;
1122 	} else if (ring_cnt > mp_ncpus) {
1123 		ring_cnt = mp_ncpus;
1124 	}
1125 #ifdef RSS
1126 	if (ring_cnt > rss_getnumbuckets())
1127 		ring_cnt = rss_getnumbuckets();
1128 #endif
1129 
1130 	tx_ring_cnt = hn_tx_ring_cnt;
1131 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1132 		tx_ring_cnt = ring_cnt;
1133 #ifdef HN_IFSTART_SUPPORT
1134 	if (hn_use_if_start) {
1135 		/* ifnet.if_start only needs one TX ring. */
1136 		tx_ring_cnt = 1;
1137 	}
1138 #endif
1139 
1140 	/*
1141 	 * Set the leader CPU for channels.
1142 	 */
1143 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1144 
1145 	/*
1146 	 * Create enough TX/RX rings, even if only a limited number of
1147 	 * channels can be allocated.
1148 	 */
1149 	error = hn_create_tx_data(sc, tx_ring_cnt);
1150 	if (error)
1151 		goto failed;
1152 	error = hn_create_rx_data(sc, ring_cnt);
1153 	if (error)
1154 		goto failed;
1155 
1156 	/*
1157 	 * Create transaction context for NVS and RNDIS transactions.
1158 	 */
1159 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1160 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1161 	if (sc->hn_xact == NULL) {
1162 		error = ENXIO;
1163 		goto failed;
1164 	}
1165 
1166 	/*
1167 	 * Install orphan handler for the revocation of this device's
1168 	 * primary channel.
1169 	 *
1170 	 * NOTE:
1171 	 * The processing order is critical here:
1172 	 * Install the orphan handler, _before_ testing whether this
1173 	 * device's primary channel has been revoked or not.
1174 	 */
1175 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1176 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1177 		error = ENXIO;
1178 		goto failed;
1179 	}
1180 
1181 	/*
1182 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1183 	 */
1184 	error = hn_synth_attach(sc, ETHERMTU);
1185 	if (error)
1186 		goto failed;
1187 
1188 	error = hn_rndis_get_eaddr(sc, eaddr);
1189 	if (error)
1190 		goto failed;
1191 
1192 #if __FreeBSD_version >= 1100099
1193 	if (sc->hn_rx_ring_inuse > 1) {
1194 		/*
1195 		 * Reduce TCP segment aggregation limit for multiple
1196 		 * RX rings to increase ACK timeliness.
1197 		 */
1198 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1199 	}
1200 #endif
1201 
1202 	/*
1203 	 * Fix up TX settings after the synthetic parts are attached.
1204 	 */
1205 	hn_fixup_tx_data(sc);
1206 
1207 	ctx = device_get_sysctl_ctx(dev);
1208 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1209 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1210 	    &sc->hn_nvs_ver, 0, "NVS version");
1211 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1212 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1213 	    hn_ndis_version_sysctl, "A", "NDIS version");
1214 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1215 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1216 	    hn_caps_sysctl, "A", "capabilities");
1217 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1218 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1219 	    hn_hwassist_sysctl, "A", "hwassist");
1220 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1221 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1222 	    hn_rxfilter_sysctl, "A", "rxfilter");
1223 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1224 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1225 	    hn_rss_hash_sysctl, "A", "RSS hash");
1226 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1227 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1228 #ifndef RSS
1229 	/*
1230 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
1231 	 */
1232 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1233 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1234 	    hn_rss_key_sysctl, "IU", "RSS key");
1235 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1236 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1237 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1238 #endif
1239 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1240 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1241 	    "RNDIS offered packet transmission aggregation size limit");
1242 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1243 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1244 	    "RNDIS offered packet transmission aggregation count limit");
1245 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1246 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1247 	    "RNDIS packet transmission aggregation alignment");
1248 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1249 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1250 	    hn_txagg_size_sysctl, "I",
1251 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1252 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1253 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1254 	    hn_txagg_pkts_sysctl, "I",
1255 	    "Packet transmission aggregation packets, "
1256 	    "0 -- disable, -1 -- auto");
1257 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1258 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1259 	    hn_polling_sysctl, "I",
1260 	    "Polling frequency: [100,1000000], 0 disable polling");
1261 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1262 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1263 	    hn_vf_sysctl, "A", "Virtual Function's name");
1264 
1265 	/*
1266 	 * Setup the ifmedia, which has been initialized earlier.
1267 	 */
1268 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1269 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1270 	/* XXX ifmedia_set really should do this for us */
1271 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1272 
1273 	/*
1274 	 * Setup the ifnet for this interface.
1275 	 */
1276 
1277 	ifp->if_baudrate = IF_Gbps(10);
1278 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1279 	ifp->if_ioctl = hn_ioctl;
1280 	ifp->if_init = hn_init;
1281 #ifdef HN_IFSTART_SUPPORT
1282 	if (hn_use_if_start) {
1283 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1284 
1285 		ifp->if_start = hn_start;
1286 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1287 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1288 		IFQ_SET_READY(&ifp->if_snd);
1289 	} else
1290 #endif
1291 	{
1292 		ifp->if_transmit = hn_transmit;
1293 		ifp->if_qflush = hn_xmit_qflush;
1294 	}
1295 
1296 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1297 #ifdef foo
1298 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1299 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1300 #endif
1301 	if (sc->hn_caps & HN_CAP_VLAN) {
1302 		/* XXX not sure about VLAN_MTU. */
1303 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1304 	}
1305 
1306 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1307 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1308 		ifp->if_capabilities |= IFCAP_TXCSUM;
1309 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1310 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1311 	if (sc->hn_caps & HN_CAP_TSO4) {
1312 		ifp->if_capabilities |= IFCAP_TSO4;
1313 		ifp->if_hwassist |= CSUM_IP_TSO;
1314 	}
1315 	if (sc->hn_caps & HN_CAP_TSO6) {
1316 		ifp->if_capabilities |= IFCAP_TSO6;
1317 		ifp->if_hwassist |= CSUM_IP6_TSO;
1318 	}
1319 
1320 	/* Enable all available capabilities by default. */
1321 	ifp->if_capenable = ifp->if_capabilities;
1322 
1323 	/*
1324 	 * Disable IPv6 TSO and TXCSUM by default; they can still
1325 	 * be enabled through SIOCSIFCAP.
1326 	 */
1327 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1328 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1329 
1330 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1331 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1332 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1333 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1334 	}
1335 
1336 	ether_ifattach(ifp, eaddr);
1337 
1338 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1339 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1340 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1341 	}
1342 
1343 	/* Inform the upper layer about long frame support. */
1344 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1345 
1346 	/*
1347 	 * Kick off link status check.
1348 	 */
1349 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1350 	hn_update_link_status(sc);
1351 
1352 	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
1353 	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
1354 
1355 	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
1356 	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
1357 
1358 	return (0);
1359 failed:
1360 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1361 		hn_synth_detach(sc);
1362 	hn_detach(dev);
1363 	return (error);
1364 }
1365 
1366 static int
1367 hn_detach(device_t dev)
1368 {
1369 	struct hn_softc *sc = device_get_softc(dev);
1370 	struct ifnet *ifp = sc->hn_ifp;
1371 
1372 	if (sc->hn_ifaddr_evthand != NULL)
1373 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
1374 	if (sc->hn_ifnet_evthand != NULL)
1375 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
1376 
1377 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1378 		/*
1379 		 * In case the vmbus missed the orphan handler
1380 		 * installation.
1381 		 */
1382 		vmbus_xact_ctx_orphan(sc->hn_xact);
1383 	}
1384 
1385 	if (device_is_attached(dev)) {
1386 		HN_LOCK(sc);
1387 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1388 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1389 				hn_stop(sc, true);
1390 			/*
1391 			 * NOTE:
1392 			 * hn_stop() only suspends data, so management
1393 			 * tasks have to be suspended manually here.
1394 			 */
1395 			hn_suspend_mgmt(sc);
1396 			hn_synth_detach(sc);
1397 		}
1398 		HN_UNLOCK(sc);
1399 		ether_ifdetach(ifp);
1400 	}
1401 
1402 	ifmedia_removeall(&sc->hn_media);
1403 	hn_destroy_rx_data(sc);
1404 	hn_destroy_tx_data(sc);
1405 
1406 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1407 		int i;
1408 
1409 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1410 			taskqueue_free(sc->hn_tx_taskqs[i]);
1411 		free(sc->hn_tx_taskqs, M_DEVBUF);
1412 	}
1413 	taskqueue_free(sc->hn_mgmt_taskq0);
1414 
1415 	if (sc->hn_xact != NULL) {
1416 		/*
1417 		 * Uninstall the orphan handler _before_ the xact is
1418 		 * destructed.
1419 		 */
1420 		vmbus_chan_unset_orphan(sc->hn_prichan);
1421 		vmbus_xact_ctx_destroy(sc->hn_xact);
1422 	}
1423 
1424 	if_free(ifp);
1425 
1426 	HN_LOCK_DESTROY(sc);
1427 	return (0);
1428 }
1429 
1430 static int
1431 hn_shutdown(device_t dev)
1432 {
1433 
1434 	return (0);
1435 }
1436 
1437 static void
1438 hn_link_status(struct hn_softc *sc)
1439 {
1440 	uint32_t link_status;
1441 	int error;
1442 
1443 	error = hn_rndis_get_linkstatus(sc, &link_status);
1444 	if (error) {
1445 		/* XXX what to do? */
1446 		return;
1447 	}
1448 
1449 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1450 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1451 	else
1452 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1453 	if_link_state_change(sc->hn_ifp,
1454 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1455 	    LINK_STATE_UP : LINK_STATE_DOWN);
1456 }
1457 
1458 static void
1459 hn_link_taskfunc(void *xsc, int pending __unused)
1460 {
1461 	struct hn_softc *sc = xsc;
1462 
1463 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1464 		return;
1465 	hn_link_status(sc);
1466 }
1467 
1468 static void
1469 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1470 {
1471 	struct hn_softc *sc = xsc;
1472 
1473 	/* Prevent any link status checks from running. */
1474 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1475 
1476 	/*
1477 	 * Fake up a [link down --> link up] state change; a 5 second
1478 	 * delay is used, which closely simulates the miibus reaction
1479 	 * to a link down event.
1480 	 */
1481 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1482 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1483 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1484 	    &sc->hn_netchg_status, 5 * hz);
1485 }
1486 
1487 static void
1488 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1489 {
1490 	struct hn_softc *sc = xsc;
1491 
1492 	/* Re-allow link status checks. */
1493 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1494 	hn_link_status(sc);
1495 }
1496 
1497 static void
1498 hn_update_link_status(struct hn_softc *sc)
1499 {
1500 
1501 	if (sc->hn_mgmt_taskq != NULL)
1502 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1503 }
1504 
1505 static void
1506 hn_change_network(struct hn_softc *sc)
1507 {
1508 
1509 	if (sc->hn_mgmt_taskq != NULL)
1510 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1511 }
1512 
1513 static __inline int
1514 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1515     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1516 {
1517 	struct mbuf *m = *m_head;
1518 	int error;
1519 
1520 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1521 
1522 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1523 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1524 	if (error == EFBIG) {
1525 		struct mbuf *m_new;
1526 
1527 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1528 		if (m_new == NULL)
1529 			return ENOBUFS;
1530 		else
1531 			*m_head = m = m_new;
1532 		txr->hn_tx_collapsed++;
1533 
1534 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1535 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1536 	}
1537 	if (!error) {
1538 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1539 		    BUS_DMASYNC_PREWRITE);
1540 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1541 	}
1542 	return error;
1543 }
1544 
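/*
 * Drop a reference on the txdesc.  When the last reference is released,
 * any aggregated txdescs are freed, the chimney buffer or DMA map is
 * released, the mbuf is freed, and the txdesc is returned to the free
 * list.  Returns 1 if the txdesc was freed, 0 otherwise.
 */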
1545 static __inline int
1546 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1547 {
1548 
1549 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1550 	    ("put an onlist txd %#x", txd->flags));
1551 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1552 	    ("put an onagg txd %#x", txd->flags));
1553 
1554 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1555 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1556 		return 0;
1557 
1558 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1559 		struct hn_txdesc *tmp_txd;
1560 
1561 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1562 			int freed;
1563 
1564 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1565 			    ("resursive aggregation on aggregated txdesc"));
1566 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1567 			    ("not aggregated txdesc"));
1568 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1569 			    ("aggregated txdesc uses dmamap"));
1570 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1571 			    ("aggregated txdesc consumes "
1572 			     "chimney sending buffer"));
1573 			KASSERT(tmp_txd->chim_size == 0,
1574 			    ("aggregated txdesc has non-zero "
1575 			     "chimney sending size"));
1576 
1577 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1578 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1579 			freed = hn_txdesc_put(txr, tmp_txd);
1580 			KASSERT(freed, ("failed to free aggregated txdesc"));
1581 		}
1582 	}
1583 
1584 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1585 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1586 		    ("chim txd uses dmamap"));
1587 		hn_chim_free(txr->hn_sc, txd->chim_index);
1588 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1589 		txd->chim_size = 0;
1590 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1591 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1592 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1593 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1594 		    txd->data_dmap);
1595 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1596 	}
1597 
1598 	if (txd->m != NULL) {
1599 		m_freem(txd->m);
1600 		txd->m = NULL;
1601 	}
1602 
1603 	txd->flags |= HN_TXD_FLAG_ONLIST;
1604 #ifndef HN_USE_TXDESC_BUFRING
1605 	mtx_lock_spin(&txr->hn_txlist_spin);
1606 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1607 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1608 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1609 	txr->hn_txdesc_avail++;
1610 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1611 	mtx_unlock_spin(&txr->hn_txlist_spin);
1612 #else	/* HN_USE_TXDESC_BUFRING */
1613 #ifdef HN_DEBUG
1614 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1615 #endif
1616 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1617 #endif	/* !HN_USE_TXDESC_BUFRING */
1618 
1619 	return 1;
1620 }
1621 
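/*
 * Fetch a free txdesc from the per-ring free list (or buf_ring) and
 * initialize its reference count.  Returns NULL if none is available.
 */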
1622 static __inline struct hn_txdesc *
1623 hn_txdesc_get(struct hn_tx_ring *txr)
1624 {
1625 	struct hn_txdesc *txd;
1626 
1627 #ifndef HN_USE_TXDESC_BUFRING
1628 	mtx_lock_spin(&txr->hn_txlist_spin);
1629 	txd = SLIST_FIRST(&txr->hn_txlist);
1630 	if (txd != NULL) {
1631 		KASSERT(txr->hn_txdesc_avail > 0,
1632 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1633 		txr->hn_txdesc_avail--;
1634 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1635 	}
1636 	mtx_unlock_spin(&txr->hn_txlist_spin);
1637 #else
1638 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1639 #endif
1640 
1641 	if (txd != NULL) {
1642 #ifdef HN_USE_TXDESC_BUFRING
1643 #ifdef HN_DEBUG
1644 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1645 #endif
1646 #endif	/* HN_USE_TXDESC_BUFRING */
1647 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1648 		    STAILQ_EMPTY(&txd->agg_list) &&
1649 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1650 		    txd->chim_size == 0 &&
1651 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1652 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1653 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1654 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1655 		txd->refs = 1;
1656 	}
1657 	return txd;
1658 }
1659 
1660 static __inline void
1661 hn_txdesc_hold(struct hn_txdesc *txd)
1662 {
1663 
1664 	/* 0->1 transition will never work */
1665 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1666 	atomic_add_int(&txd->refs, 1);
1667 }
1668 
1669 static __inline void
1670 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1671 {
1672 
1673 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1674 	    ("recursive aggregation on aggregating txdesc"));
1675 
1676 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1677 	    ("already aggregated"));
1678 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1679 	    ("recursive aggregation on to-be-aggregated txdesc"));
1680 
1681 	txd->flags |= HN_TXD_FLAG_ONAGG;
1682 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1683 }
1684 
1685 static bool
1686 hn_tx_ring_pending(struct hn_tx_ring *txr)
1687 {
1688 	bool pending = false;
1689 
1690 #ifndef HN_USE_TXDESC_BUFRING
1691 	mtx_lock_spin(&txr->hn_txlist_spin);
1692 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1693 		pending = true;
1694 	mtx_unlock_spin(&txr->hn_txlist_spin);
1695 #else
1696 	if (!buf_ring_full(txr->hn_txdesc_br))
1697 		pending = true;
1698 #endif
1699 	return (pending);
1700 }
1701 
1702 static __inline void
1703 hn_txeof(struct hn_tx_ring *txr)
1704 {
1705 	txr->hn_has_txeof = 0;
1706 	txr->hn_txeof(txr);
1707 }
1708 
1709 static void
1710 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1711     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1712 {
1713 	struct hn_txdesc *txd = sndc->hn_cbarg;
1714 	struct hn_tx_ring *txr;
1715 
1716 	txr = txd->txr;
1717 	KASSERT(txr->hn_chan == chan,
1718 	    ("channel mismatch, on chan%u, should be chan%u",
1719 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1720 
1721 	txr->hn_has_txeof = 1;
1722 	hn_txdesc_put(txr, txd);
1723 
1724 	++txr->hn_txdone_cnt;
1725 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1726 		txr->hn_txdone_cnt = 0;
1727 		if (txr->hn_oactive)
1728 			hn_txeof(txr);
1729 	}
1730 }
1731 
1732 static void
1733 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1734 {
1735 #if defined(INET) || defined(INET6)
1736 	tcp_lro_flush_all(&rxr->hn_lro);
1737 #endif
1738 
1739 	/*
1740 	 * NOTE:
1741 	 * 'txr' could be NULL, if multiple channels are used
1742 	 * with the ifnet.if_start method.
1743 	 */
1744 	if (txr == NULL || !txr->hn_has_txeof)
1745 		return;
1746 
1747 	txr->hn_txdone_cnt = 0;
1748 	hn_txeof(txr);
1749 }
1750 
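/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into the on-wire form, which counts from the rm_dataoffset
 * field.
 */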
1751 static __inline uint32_t
1752 hn_rndis_pktmsg_offset(uint32_t ofs)
1753 {
1754 
1755 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1756 	    ("invalid RNDIS packet msg offset %u", ofs));
1757 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1758 }
1759 
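/*
 * Append a per-packet-info record of the given type to the RNDIS
 * packet message and return a pointer to its data area, which the
 * caller is expected to fill in.
 */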
1760 static __inline void *
1761 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1762     size_t pi_dlen, uint32_t pi_type)
1763 {
1764 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1765 	struct rndis_pktinfo *pi;
1766 
1767 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1768 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1769 
1770 	/*
1771 	 * Per-packet-info does not move; it only grows.
1772 	 *
1773 	 * NOTE:
1774 	 * rm_pktinfooffset in this phase counts from the beginning
1775 	 * of rndis_packet_msg.
1776 	 */
1777 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1778 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1779 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1780 	    pkt->rm_pktinfolen);
1781 	pkt->rm_pktinfolen += pi_size;
1782 
1783 	pi->rm_size = pi_size;
1784 	pi->rm_type = pi_type;
1785 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1786 
1787 	return (pi->rm_data);
1788 }
1789 
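/*
 * Send the ring's aggregating txdesc, which carries all of the
 * aggregated packets, and reset the ring's aggregation state.
 */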
1790 static __inline int
1791 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1792 {
1793 	struct hn_txdesc *txd;
1794 	struct mbuf *m;
1795 	int error, pkts;
1796 
1797 	txd = txr->hn_agg_txd;
1798 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1799 
1800 	/*
1801 	 * Since hn_txpkt() will reset this temporary stat, save
1802 	 * it now so that oerrors can be updated properly if
1803 	 * hn_txpkt() fails.
1804 	 */
1805 	pkts = txr->hn_stat_pkts;
1806 
1807 	/*
1808 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1809 	 * failure, save it here so that it can be freed later if
1810 	 * hn_txpkt() fails.
1811 	 */
1812 	m = txd->m;
1813 	error = hn_txpkt(ifp, txr, txd);
1814 	if (__predict_false(error)) {
1815 		/* txd is freed, but m is not. */
1816 		m_freem(m);
1817 
1818 		txr->hn_flush_failed++;
1819 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1820 	}
1821 
1822 	/* Reset all aggregation states. */
1823 	txr->hn_agg_txd = NULL;
1824 	txr->hn_agg_szleft = 0;
1825 	txr->hn_agg_pktleft = 0;
1826 	txr->hn_agg_prevpkt = NULL;
1827 
1828 	return (error);
1829 }
1830 
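/*
 * Try to use the chimney sending buffer for this packet: either append
 * it to the current aggregating txdesc, or allocate a new chimney
 * buffer (possibly starting a new aggregation).  Return a pointer into
 * the chimney buffer where the RNDIS packet message should be built,
 * or NULL if no chimney buffer is available.
 */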
1831 static void *
1832 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1833     int pktsize)
1834 {
1835 	void *chim;
1836 
1837 	if (txr->hn_agg_txd != NULL) {
1838 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1839 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1840 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1841 			int olen;
1842 
1843 			/*
1844 			 * Update the previous RNDIS packet's total length;
1845 			 * it can be increased due to the mandatory alignment
1846 			 * padding for this RNDIS packet.  Also update the
1847 			 * aggregating txdesc's chimney sending buffer size
1848 			 * accordingly.
1849 			 *
1850 			 * XXX
1851 			 * Zero-out the padding, as required by the RNDIS spec.
1852 			 */
1853 			olen = pkt->rm_len;
1854 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1855 			agg_txd->chim_size += pkt->rm_len - olen;
1856 
1857 			/* Link this txdesc to the parent. */
1858 			hn_txdesc_agg(agg_txd, txd);
1859 
1860 			chim = (uint8_t *)pkt + pkt->rm_len;
1861 			/* Save the current packet for later fixup. */
1862 			txr->hn_agg_prevpkt = chim;
1863 
1864 			txr->hn_agg_pktleft--;
1865 			txr->hn_agg_szleft -= pktsize;
1866 			if (txr->hn_agg_szleft <=
1867 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1868 				/*
1869 				 * Probably can't aggregate more packets;
1870 				 * flush this aggregating txdesc proactively.
1871 				 */
1872 				txr->hn_agg_pktleft = 0;
1873 			}
1874 			/* Done! */
1875 			return (chim);
1876 		}
1877 		hn_flush_txagg(ifp, txr);
1878 	}
1879 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1880 
1881 	txr->hn_tx_chimney_tried++;
1882 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1883 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1884 		return (NULL);
1885 	txr->hn_tx_chimney++;
1886 
1887 	chim = txr->hn_sc->hn_chim +
1888 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1889 
1890 	if (txr->hn_agg_pktmax > 1 &&
1891 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1892 		txr->hn_agg_txd = txd;
1893 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1894 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1895 		txr->hn_agg_prevpkt = chim;
1896 	}
1897 	return (chim);
1898 }
1899 
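/*
 * Build the RNDIS packet message for an outgoing mbuf and set up either
 * chimney (copy) or sglist (DMA) transmission on the TX ring.
 */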
1900 /*
1901  * NOTE:
1902  * If this function fails, then both txd and m_head0 will be freed.
1903  */
1904 static int
1905 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1906     struct mbuf **m_head0)
1907 {
1908 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1909 	int error, nsegs, i;
1910 	struct mbuf *m_head = *m_head0;
1911 	struct rndis_packet_msg *pkt;
1912 	uint32_t *pi_data;
1913 	void *chim = NULL;
1914 	int pkt_hlen, pkt_size;
1915 
1916 	pkt = txd->rndis_pkt;
1917 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1918 	if (pkt_size < txr->hn_chim_size) {
1919 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1920 		if (chim != NULL)
1921 			pkt = chim;
1922 	} else {
1923 		if (txr->hn_agg_txd != NULL)
1924 			hn_flush_txagg(ifp, txr);
1925 	}
1926 
1927 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1928 	pkt->rm_len = m_head->m_pkthdr.len;
1929 	pkt->rm_dataoffset = 0;
1930 	pkt->rm_datalen = m_head->m_pkthdr.len;
1931 	pkt->rm_oobdataoffset = 0;
1932 	pkt->rm_oobdatalen = 0;
1933 	pkt->rm_oobdataelements = 0;
1934 	pkt->rm_pktinfooffset = sizeof(*pkt);
1935 	pkt->rm_pktinfolen = 0;
1936 	pkt->rm_vchandle = 0;
1937 	pkt->rm_reserved = 0;
1938 
1939 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1940 		/*
1941 		 * Set the hash value for this packet, so that the host can
1942 		 * dispatch the TX done event for this packet back to this TX
1943 		 * ring's channel.
1944 		 */
1945 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1946 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1947 		*pi_data = txr->hn_tx_idx;
1948 	}
1949 
1950 	if (m_head->m_flags & M_VLANTAG) {
1951 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1952 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1953 		*pi_data = NDIS_VLAN_INFO_MAKE(
1954 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1955 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1956 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1957 	}
1958 
1959 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1960 #if defined(INET6) || defined(INET)
1961 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1962 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1963 #ifdef INET
1964 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1965 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1966 			    m_head->m_pkthdr.tso_segsz);
1967 		}
1968 #endif
1969 #if defined(INET6) && defined(INET)
1970 		else
1971 #endif
1972 #ifdef INET6
1973 		{
1974 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1975 			    m_head->m_pkthdr.tso_segsz);
1976 		}
1977 #endif
1978 #endif	/* INET6 || INET */
1979 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1980 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1981 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1982 		if (m_head->m_pkthdr.csum_flags &
1983 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1984 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1985 		} else {
1986 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1987 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1988 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1989 		}
1990 
1991 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1992 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1993 		else if (m_head->m_pkthdr.csum_flags &
1994 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1995 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1996 	}
1997 
1998 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1999 	/* Fixup RNDIS packet message total length */
2000 	pkt->rm_len += pkt_hlen;
2001 	/* Convert RNDIS packet message offsets */
2002 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2003 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2004 
2005 	/*
2006 	 * Fast path: Chimney sending.
2007 	 */
2008 	if (chim != NULL) {
2009 		struct hn_txdesc *tgt_txd = txd;
2010 
2011 		if (txr->hn_agg_txd != NULL) {
2012 			tgt_txd = txr->hn_agg_txd;
2013 #ifdef INVARIANTS
2014 			*m_head0 = NULL;
2015 #endif
2016 		}
2017 
2018 		KASSERT(pkt == chim,
2019 		    ("RNDIS pkt not in chimney sending buffer"));
2020 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2021 		    ("chimney sending buffer is not used"));
2022 		tgt_txd->chim_size += pkt->rm_len;
2023 
2024 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2025 		    ((uint8_t *)chim) + pkt_hlen);
2026 
2027 		txr->hn_gpa_cnt = 0;
2028 		txr->hn_sendpkt = hn_txpkt_chim;
2029 		goto done;
2030 	}
2031 
2032 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2033 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2034 	    ("chimney buffer is used"));
2035 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2036 
2037 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2038 	if (__predict_false(error)) {
2039 		int freed;
2040 
2041 		/*
2042 		 * This mbuf is not linked w/ the txd yet, so free it now.
2043 		 */
2044 		m_freem(m_head);
2045 		*m_head0 = NULL;
2046 
2047 		freed = hn_txdesc_put(txr, txd);
2048 		KASSERT(freed != 0,
2049 		    ("fail to free txd upon txdma error"));
2050 
2051 		txr->hn_txdma_failed++;
2052 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2053 		return error;
2054 	}
2055 	*m_head0 = m_head;
2056 
2057 	/* +1 RNDIS packet message */
2058 	txr->hn_gpa_cnt = nsegs + 1;
2059 
2060 	/* send packet with page buffer */
2061 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2062 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2063 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2064 
2065 	/*
2066 	 * Fill the page buffers with mbuf info after the page
2067 	 * buffer for RNDIS packet message.
2068 	 */
2069 	for (i = 0; i < nsegs; ++i) {
2070 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2071 
2072 		gpa->gpa_page = atop(segs[i].ds_addr);
2073 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2074 		gpa->gpa_len = segs[i].ds_len;
2075 	}
2076 
2077 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2078 	txd->chim_size = 0;
2079 	txr->hn_sendpkt = hn_txpkt_sglist;
2080 done:
2081 	txd->m = m_head;
2082 
2083 	/* Set the completion routine */
2084 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2085 
2086 	/* Update temporary stats for later use. */
2087 	txr->hn_stat_pkts++;
2088 	txr->hn_stat_size += m_head->m_pkthdr.len;
2089 	if (m_head->m_flags & M_MCAST)
2090 		txr->hn_stat_mcasts++;
2091 
2092 	return 0;
2093 }
2094 
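/*
 * Hand one txdesc (possibly carrying aggregated packets) to the NVS
 * layer for transmission, tap BPF listeners and update the interface
 * counters; on failure, retry once after flagging txeof.
 */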
2095 /*
2096  * NOTE:
2097  * If this function fails, then txd will be freed, but the mbuf
2098  * associated w/ the txd will _not_ be freed.
2099  */
2100 static int
2101 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2102 {
2103 	int error, send_failed = 0, has_bpf;
2104 
2105 again:
2106 	has_bpf = bpf_peers_present(ifp->if_bpf);
2107 	if (has_bpf) {
2108 		/*
2109 		 * Make sure that this txd and any aggregated txds are not
2110 		 * freed before ETHER_BPF_MTAP.
2111 		 */
2112 		hn_txdesc_hold(txd);
2113 	}
2114 	error = txr->hn_sendpkt(txr, txd);
2115 	if (!error) {
2116 		if (has_bpf) {
2117 			const struct hn_txdesc *tmp_txd;
2118 
2119 			ETHER_BPF_MTAP(ifp, txd->m);
2120 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2121 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2122 		}
2123 
2124 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2125 #ifdef HN_IFSTART_SUPPORT
2126 		if (!hn_use_if_start)
2127 #endif
2128 		{
2129 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2130 			    txr->hn_stat_size);
2131 			if (txr->hn_stat_mcasts != 0) {
2132 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2133 				    txr->hn_stat_mcasts);
2134 			}
2135 		}
2136 		txr->hn_pkts += txr->hn_stat_pkts;
2137 		txr->hn_sends++;
2138 	}
2139 	if (has_bpf)
2140 		hn_txdesc_put(txr, txd);
2141 
2142 	if (__predict_false(error)) {
2143 		int freed;
2144 
2145 		/*
2146 		 * This should happen only very rarely.
2147 		 *
2148 		 * XXX Too many RX to be acked or too many sideband
2149 		 * commands to run?  Ask netvsc_channel_rollup()
2150 		 * to kick start later.
2151 		 */
2152 		txr->hn_has_txeof = 1;
2153 		if (!send_failed) {
2154 			txr->hn_send_failed++;
2155 			send_failed = 1;
2156 			/*
2157 			 * Try sending again after setting hn_has_txeof,
2158 			 * in case we missed the last
2159 			 * netvsc_channel_rollup().
2160 			 */
2161 			goto again;
2162 		}
2163 		if_printf(ifp, "send failed\n");
2164 
2165 		/*
2166 		 * Caller will perform further processing on the
2167 		 * associated mbuf, so don't free it in hn_txdesc_put();
2168 		 * only unload it from the DMA map in hn_txdesc_put(),
2169 		 * if it was loaded.
2170 		 */
2171 		txd->m = NULL;
2172 		freed = hn_txdesc_put(txr, txd);
2173 		KASSERT(freed != 0,
2174 		    ("fail to free txd upon send error"));
2175 
2176 		txr->hn_send_failed++;
2177 	}
2178 
2179 	/* Reset temporary stats, after this sending is done. */
2180 	txr->hn_stat_size = 0;
2181 	txr->hn_stat_pkts = 0;
2182 	txr->hn_stat_mcasts = 0;
2183 
2184 	return (error);
2185 }
2186 
2187 /*
2188  * Append the specified data to the indicated mbuf chain.
2189  * Extend the mbuf chain if the new data does not fit in
2190  * existing space.
2191  *
2192  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2193  * There should be an equivalent in the kernel mbuf code,
2194  * but there does not appear to be one yet.
2195  *
2196  * Differs from m_append() in that additional mbufs are
2197  * allocated with cluster size MJUMPAGESIZE, and filled
2198  * accordingly.
2199  *
2200  * Return 1 if able to complete the job; otherwise 0.
2201  */
2202 static int
2203 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2204 {
2205 	struct mbuf *m, *n;
2206 	int remainder, space;
2207 
2208 	for (m = m0; m->m_next != NULL; m = m->m_next)
2209 		;
2210 	remainder = len;
2211 	space = M_TRAILINGSPACE(m);
2212 	if (space > 0) {
2213 		/*
2214 		 * Copy into available space.
2215 		 */
2216 		if (space > remainder)
2217 			space = remainder;
2218 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2219 		m->m_len += space;
2220 		cp += space;
2221 		remainder -= space;
2222 	}
2223 	while (remainder > 0) {
2224 		/*
2225 		 * Allocate a new mbuf with an MJUMPAGESIZE cluster
2226 		 * and fill it with the remaining data.
2227 		 */
2228 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2229 		if (n == NULL)
2230 			break;
2231 		n->m_len = min(MJUMPAGESIZE, remainder);
2232 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2233 		cp += n->m_len;
2234 		remainder -= n->m_len;
2235 		m->m_next = n;
2236 		m = n;
2237 	}
2238 	if (m0->m_flags & M_PKTHDR)
2239 		m0->m_pkthdr.len += len - remainder;
2240 
2241 	return (remainder == 0);
2242 }
2243 
2244 #if defined(INET) || defined(INET6)
2245 static __inline int
2246 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2247 {
2248 #if __FreeBSD_version >= 1100095
2249 	if (hn_lro_mbufq_depth) {
2250 		tcp_lro_queue_mbuf(lc, m);
2251 		return 0;
2252 	}
2253 #endif
2254 	return tcp_lro_rx(lc, m, 0);
2255 }
2256 #endif
2257 
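/*
 * Turn a received data buffer into an mbuf, apply RX checksum, VLAN
 * and RSS metadata from the per-packet-info, and hand the packet to
 * LRO or directly to the network stack.
 */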
2258 static int
2259 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2260     const struct hn_rxinfo *info)
2261 {
2262 	struct ifnet *ifp;
2263 	struct mbuf *m_new;
2264 	int size, do_lro = 0, do_csum = 1;
2265 	int hash_type;
2266 
2267 	/* If the VF is active, inject the packet through the VF */
2268 	ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp;
2269 
2270 	if (dlen <= MHLEN) {
2271 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2272 		if (m_new == NULL) {
2273 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2274 			return (0);
2275 		}
2276 		memcpy(mtod(m_new, void *), data, dlen);
2277 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2278 		rxr->hn_small_pkts++;
2279 	} else {
2280 		/*
2281 		 * Get an mbuf with a cluster.  For packets 2K or less,
2282 		 * get a standard 2K cluster.  For anything larger, get a
2283 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2284 		 * if looped around to the Hyper-V TX channel, so avoid them.
2285 		 */
2286 		size = MCLBYTES;
2287 		if (dlen > MCLBYTES) {
2288 			/* 4096 */
2289 			size = MJUMPAGESIZE;
2290 		}
2291 
2292 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2293 		if (m_new == NULL) {
2294 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2295 			return (0);
2296 		}
2297 
2298 		hv_m_append(m_new, dlen, data);
2299 	}
2300 	m_new->m_pkthdr.rcvif = ifp;
2301 
2302 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2303 		do_csum = 0;
2304 
2305 	/* receive side checksum offload */
2306 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2307 		/* IP csum offload */
2308 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2309 			m_new->m_pkthdr.csum_flags |=
2310 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2311 			rxr->hn_csum_ip++;
2312 		}
2313 
2314 		/* TCP/UDP csum offload */
2315 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2316 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2317 			m_new->m_pkthdr.csum_flags |=
2318 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2319 			m_new->m_pkthdr.csum_data = 0xffff;
2320 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2321 				rxr->hn_csum_tcp++;
2322 			else
2323 				rxr->hn_csum_udp++;
2324 		}
2325 
2326 		/*
2327 		 * XXX
2328 		 * As of this writing (Oct 28th, 2016), the host side will turn
2329 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2330 		 * the do_lro setting here is actually _not_ accurate.  We
2331 		 * depend on the RSS hash type check to reset do_lro.
2332 		 */
2333 		if ((info->csum_info &
2334 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2335 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2336 			do_lro = 1;
2337 	} else {
2338 		const struct ether_header *eh;
2339 		uint16_t etype;
2340 		int hoff;
2341 
2342 		hoff = sizeof(*eh);
2343 		if (m_new->m_len < hoff)
2344 			goto skip;
2345 		eh = mtod(m_new, struct ether_header *);
2346 		etype = ntohs(eh->ether_type);
2347 		if (etype == ETHERTYPE_VLAN) {
2348 			const struct ether_vlan_header *evl;
2349 
2350 			hoff = sizeof(*evl);
2351 			if (m_new->m_len < hoff)
2352 				goto skip;
2353 			evl = mtod(m_new, struct ether_vlan_header *);
2354 			etype = ntohs(evl->evl_proto);
2355 		}
2356 
2357 		if (etype == ETHERTYPE_IP) {
2358 			int pr;
2359 
2360 			pr = hn_check_iplen(m_new, hoff);
2361 			if (pr == IPPROTO_TCP) {
2362 				if (do_csum &&
2363 				    (rxr->hn_trust_hcsum &
2364 				     HN_TRUST_HCSUM_TCP)) {
2365 					rxr->hn_csum_trusted++;
2366 					m_new->m_pkthdr.csum_flags |=
2367 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2368 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2369 					m_new->m_pkthdr.csum_data = 0xffff;
2370 				}
2371 				do_lro = 1;
2372 			} else if (pr == IPPROTO_UDP) {
2373 				if (do_csum &&
2374 				    (rxr->hn_trust_hcsum &
2375 				     HN_TRUST_HCSUM_UDP)) {
2376 					rxr->hn_csum_trusted++;
2377 					m_new->m_pkthdr.csum_flags |=
2378 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2379 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2380 					m_new->m_pkthdr.csum_data = 0xffff;
2381 				}
2382 			} else if (pr != IPPROTO_DONE && do_csum &&
2383 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2384 				rxr->hn_csum_trusted++;
2385 				m_new->m_pkthdr.csum_flags |=
2386 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2387 			}
2388 		}
2389 	}
2390 skip:
2391 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2392 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2393 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2394 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2395 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2396 		m_new->m_flags |= M_VLANTAG;
2397 	}
2398 
2399 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2400 		rxr->hn_rss_pkts++;
2401 		m_new->m_pkthdr.flowid = info->hash_value;
2402 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2403 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2404 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2405 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2406 
2407 			/*
2408 			 * NOTE:
2409 			 * do_lro is reset if the hash types are not TCP
2410 			 * related.  See the comment in the above csum_flags
2411 			 * setup section.
2412 			 */
2413 			switch (type) {
2414 			case NDIS_HASH_IPV4:
2415 				hash_type = M_HASHTYPE_RSS_IPV4;
2416 				do_lro = 0;
2417 				break;
2418 
2419 			case NDIS_HASH_TCP_IPV4:
2420 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2421 				break;
2422 
2423 			case NDIS_HASH_IPV6:
2424 				hash_type = M_HASHTYPE_RSS_IPV6;
2425 				do_lro = 0;
2426 				break;
2427 
2428 			case NDIS_HASH_IPV6_EX:
2429 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2430 				do_lro = 0;
2431 				break;
2432 
2433 			case NDIS_HASH_TCP_IPV6:
2434 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2435 				break;
2436 
2437 			case NDIS_HASH_TCP_IPV6_EX:
2438 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2439 				break;
2440 			}
2441 		}
2442 	} else {
2443 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2444 		hash_type = M_HASHTYPE_OPAQUE;
2445 	}
2446 	M_HASHTYPE_SET(m_new, hash_type);
2447 
2448 	/*
2449 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2450 	 * messages (not just data messages) will trigger a response.
2451 	 */
2452 
2453 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2454 	rxr->hn_pkts++;
2455 
2456 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2457 #if defined(INET) || defined(INET6)
2458 		struct lro_ctrl *lro = &rxr->hn_lro;
2459 
2460 		if (lro->lro_cnt) {
2461 			rxr->hn_lro_tried++;
2462 			if (hn_lro_rx(lro, m_new) == 0) {
2463 				/* DONE! */
2464 				return 0;
2465 			}
2466 		}
2467 #endif
2468 	}
2469 
2470 	/* We're not holding the lock here, so don't release it */
2471 	(*ifp->if_input)(ifp, m_new);
2472 
2473 	return (0);
2474 }
2475 
2476 static int
2477 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2478 {
2479 	struct hn_softc *sc = ifp->if_softc;
2480 	struct ifreq *ifr = (struct ifreq *)data;
2481 	int mask, error = 0;
2482 
2483 	switch (cmd) {
2484 	case SIOCSIFMTU:
2485 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2486 			error = EINVAL;
2487 			break;
2488 		}
2489 
2490 		HN_LOCK(sc);
2491 
2492 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2493 			HN_UNLOCK(sc);
2494 			break;
2495 		}
2496 
2497 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2498 			/* Can't change MTU */
2499 			HN_UNLOCK(sc);
2500 			error = EOPNOTSUPP;
2501 			break;
2502 		}
2503 
2504 		if (ifp->if_mtu == ifr->ifr_mtu) {
2505 			HN_UNLOCK(sc);
2506 			break;
2507 		}
2508 
2509 		/*
2510 		 * Suspend this interface before the synthetic parts
2511 		 * are detached.
2512 		 */
2513 		hn_suspend(sc);
2514 
2515 		/*
2516 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2517 		 */
2518 		hn_synth_detach(sc);
2519 
2520 		/*
2521 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2522 		 * with the new MTU setting.
2523 		 */
2524 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2525 		if (error) {
2526 			HN_UNLOCK(sc);
2527 			break;
2528 		}
2529 
2530 		/*
2531 		 * Commit the requested MTU, after the synthetic parts
2532 		 * have been successfully attached.
2533 		 */
2534 		ifp->if_mtu = ifr->ifr_mtu;
2535 
2536 		/*
2537 		 * Make sure that various parameters based on MTU are
2538 		 * still valid after the MTU change.
2539 		 */
2540 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2541 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2542 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2543 #if __FreeBSD_version >= 1100099
2544 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2545 		    HN_LRO_LENLIM_MIN(ifp))
2546 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2547 #endif
2548 
2549 		/*
2550 		 * All done!  Resume the interface now.
2551 		 */
2552 		hn_resume(sc);
2553 
2554 		HN_UNLOCK(sc);
2555 		break;
2556 
2557 	case SIOCSIFFLAGS:
2558 		HN_LOCK(sc);
2559 
2560 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2561 			HN_UNLOCK(sc);
2562 			break;
2563 		}
2564 
2565 		if (ifp->if_flags & IFF_UP) {
2566 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2567 				/*
2568 				 * Caller might hold a mutex, e.g.
2569 				 * bpf; use busy-wait for the RNDIS
2570 				 * reply.
2571 				 */
2572 				HN_NO_SLEEPING(sc);
2573 				hn_rxfilter_config(sc);
2574 				HN_SLEEPING_OK(sc);
2575 			} else {
2576 				hn_init_locked(sc);
2577 			}
2578 		} else {
2579 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2580 				hn_stop(sc, false);
2581 		}
2582 		sc->hn_if_flags = ifp->if_flags;
2583 
2584 		HN_UNLOCK(sc);
2585 		break;
2586 
2587 	case SIOCSIFCAP:
2588 		HN_LOCK(sc);
2589 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2590 
2591 		if (mask & IFCAP_TXCSUM) {
2592 			ifp->if_capenable ^= IFCAP_TXCSUM;
2593 			if (ifp->if_capenable & IFCAP_TXCSUM)
2594 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2595 			else
2596 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2597 		}
2598 		if (mask & IFCAP_TXCSUM_IPV6) {
2599 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2600 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2601 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2602 			else
2603 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2604 		}
2605 
2606 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2607 		if (mask & IFCAP_RXCSUM)
2608 			ifp->if_capenable ^= IFCAP_RXCSUM;
2609 #ifdef foo
2610 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2611 		if (mask & IFCAP_RXCSUM_IPV6)
2612 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2613 #endif
2614 
2615 		if (mask & IFCAP_LRO)
2616 			ifp->if_capenable ^= IFCAP_LRO;
2617 
2618 		if (mask & IFCAP_TSO4) {
2619 			ifp->if_capenable ^= IFCAP_TSO4;
2620 			if (ifp->if_capenable & IFCAP_TSO4)
2621 				ifp->if_hwassist |= CSUM_IP_TSO;
2622 			else
2623 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2624 		}
2625 		if (mask & IFCAP_TSO6) {
2626 			ifp->if_capenable ^= IFCAP_TSO6;
2627 			if (ifp->if_capenable & IFCAP_TSO6)
2628 				ifp->if_hwassist |= CSUM_IP6_TSO;
2629 			else
2630 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2631 		}
2632 
2633 		HN_UNLOCK(sc);
2634 		break;
2635 
2636 	case SIOCADDMULTI:
2637 	case SIOCDELMULTI:
2638 		HN_LOCK(sc);
2639 
2640 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2641 			HN_UNLOCK(sc);
2642 			break;
2643 		}
2644 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2645 			/*
2646 			 * Multicast uses mutex; use busy-wait for
2647 			 * the RNDIS reply.
2648 			 */
2649 			HN_NO_SLEEPING(sc);
2650 			hn_rxfilter_config(sc);
2651 			HN_SLEEPING_OK(sc);
2652 		}
2653 
2654 		HN_UNLOCK(sc);
2655 		break;
2656 
2657 	case SIOCSIFMEDIA:
2658 	case SIOCGIFMEDIA:
2659 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2660 		break;
2661 
2662 	default:
2663 		error = ether_ioctl(ifp, cmd, data);
2664 		break;
2665 	}
2666 	return (error);
2667 }
2668 
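/*
 * Bring the interface down: disable polling, clear RUNNING/OACTIVE and
 * suspend the data path.  The RX filter is kept non-zero while a VF is
 * active, unless the interface is being detached.
 */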
2669 static void
2670 hn_stop(struct hn_softc *sc, bool detaching)
2671 {
2672 	struct ifnet *ifp = sc->hn_ifp;
2673 	int i;
2674 
2675 	HN_LOCK_ASSERT(sc);
2676 
2677 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2678 	    ("synthetic parts were not attached"));
2679 
2680 	/* Disable polling. */
2681 	hn_polling(sc, 0);
2682 
2683 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2684 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2685 	hn_suspend_data(sc);
2686 
2687 	/* Clear OACTIVE bit. */
2688 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2689 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2690 		sc->hn_tx_ring[i].hn_oactive = 0;
2691 
2692 	/*
2693 	 * If the VF is active, make sure the filter is not 0, even if
2694 	 * the synthetic NIC is down.
2695 	 */
2696 	if (!detaching && (sc->hn_flags & HN_FLAG_VF))
2697 		hn_rxfilter_config(sc);
2698 }
2699 
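/*
 * Bring the interface up: program the RX filter, clear OACTIVE and the
 * TX 'suspended' state, mark the interface RUNNING, and re-enable
 * polling if it was requested.
 */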
2700 static void
2701 hn_init_locked(struct hn_softc *sc)
2702 {
2703 	struct ifnet *ifp = sc->hn_ifp;
2704 	int i;
2705 
2706 	HN_LOCK_ASSERT(sc);
2707 
2708 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2709 		return;
2710 
2711 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2712 		return;
2713 
2714 	/* Configure RX filter */
2715 	hn_rxfilter_config(sc);
2716 
2717 	/* Clear OACTIVE bit. */
2718 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2719 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2720 		sc->hn_tx_ring[i].hn_oactive = 0;
2721 
2722 	/* Clear TX 'suspended' bit. */
2723 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2724 
2725 	/* Everything is ready; unleash! */
2726 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2727 
2728 	/* Re-enable polling if requested. */
2729 	if (sc->hn_pollhz > 0)
2730 		hn_polling(sc, sc->hn_pollhz);
2731 }
2732 
2733 static void
2734 hn_init(void *xsc)
2735 {
2736 	struct hn_softc *sc = xsc;
2737 
2738 	HN_LOCK(sc);
2739 	hn_init_locked(sc);
2740 	HN_UNLOCK(sc);
2741 }
2742 
2743 #if __FreeBSD_version >= 1100099
2744 
2745 static int
2746 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2747 {
2748 	struct hn_softc *sc = arg1;
2749 	unsigned int lenlim;
2750 	int error;
2751 
2752 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2753 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2754 	if (error || req->newptr == NULL)
2755 		return error;
2756 
2757 	HN_LOCK(sc);
2758 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2759 	    lenlim > TCP_LRO_LENGTH_MAX) {
2760 		HN_UNLOCK(sc);
2761 		return EINVAL;
2762 	}
2763 	hn_set_lro_lenlim(sc, lenlim);
2764 	HN_UNLOCK(sc);
2765 
2766 	return 0;
2767 }
2768 
2769 static int
2770 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2771 {
2772 	struct hn_softc *sc = arg1;
2773 	int ackcnt, error, i;
2774 
2775 	/*
2776 	 * lro_ackcnt_lim is the append count limit;
2777 	 * add 1 to turn it into the aggregation limit.
2778 	 */
2779 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2780 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2781 	if (error || req->newptr == NULL)
2782 		return error;
2783 
2784 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2785 		return EINVAL;
2786 
2787 	/*
2788 	 * Convert aggregation limit back to append
2789 	 * count limit.
2790 	 */
2791 	--ackcnt;
2792 	HN_LOCK(sc);
2793 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2794 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2795 	HN_UNLOCK(sc);
2796 	return 0;
2797 }
2798 
2799 #endif
2800 
2801 static int
2802 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2803 {
2804 	struct hn_softc *sc = arg1;
2805 	int hcsum = arg2;
2806 	int on, error, i;
2807 
2808 	on = 0;
2809 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2810 		on = 1;
2811 
2812 	error = sysctl_handle_int(oidp, &on, 0, req);
2813 	if (error || req->newptr == NULL)
2814 		return error;
2815 
2816 	HN_LOCK(sc);
2817 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2818 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2819 
2820 		if (on)
2821 			rxr->hn_trust_hcsum |= hcsum;
2822 		else
2823 			rxr->hn_trust_hcsum &= ~hcsum;
2824 	}
2825 	HN_UNLOCK(sc);
2826 	return 0;
2827 }
2828 
2829 static int
2830 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2831 {
2832 	struct hn_softc *sc = arg1;
2833 	int chim_size, error;
2834 
2835 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2836 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2837 	if (error || req->newptr == NULL)
2838 		return error;
2839 
2840 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2841 		return EINVAL;
2842 
2843 	HN_LOCK(sc);
2844 	hn_set_chim_size(sc, chim_size);
2845 	HN_UNLOCK(sc);
2846 	return 0;
2847 }
2848 
2849 #if __FreeBSD_version < 1100095
2850 static int
2851 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2852 {
2853 	struct hn_softc *sc = arg1;
2854 	int ofs = arg2, i, error;
2855 	struct hn_rx_ring *rxr;
2856 	uint64_t stat;
2857 
2858 	stat = 0;
2859 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2860 		rxr = &sc->hn_rx_ring[i];
2861 		stat += *((int *)((uint8_t *)rxr + ofs));
2862 	}
2863 
2864 	error = sysctl_handle_64(oidp, &stat, 0, req);
2865 	if (error || req->newptr == NULL)
2866 		return error;
2867 
2868 	/* Zero out this stat. */
2869 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2870 		rxr = &sc->hn_rx_ring[i];
2871 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2872 	}
2873 	return 0;
2874 }
2875 #else
2876 static int
2877 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2878 {
2879 	struct hn_softc *sc = arg1;
2880 	int ofs = arg2, i, error;
2881 	struct hn_rx_ring *rxr;
2882 	uint64_t stat;
2883 
2884 	stat = 0;
2885 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2886 		rxr = &sc->hn_rx_ring[i];
2887 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2888 	}
2889 
2890 	error = sysctl_handle_64(oidp, &stat, 0, req);
2891 	if (error || req->newptr == NULL)
2892 		return error;
2893 
2894 	/* Zero out this stat. */
2895 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2896 		rxr = &sc->hn_rx_ring[i];
2897 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2898 	}
2899 	return 0;
2900 }
2901 
2902 #endif
2903 
2904 static int
2905 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2906 {
2907 	struct hn_softc *sc = arg1;
2908 	int ofs = arg2, i, error;
2909 	struct hn_rx_ring *rxr;
2910 	u_long stat;
2911 
2912 	stat = 0;
2913 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2914 		rxr = &sc->hn_rx_ring[i];
2915 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2916 	}
2917 
2918 	error = sysctl_handle_long(oidp, &stat, 0, req);
2919 	if (error || req->newptr == NULL)
2920 		return error;
2921 
2922 	/* Zero out this stat. */
2923 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2924 		rxr = &sc->hn_rx_ring[i];
2925 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2926 	}
2927 	return 0;
2928 }
2929 
2930 static int
2931 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2932 {
2933 	struct hn_softc *sc = arg1;
2934 	int ofs = arg2, i, error;
2935 	struct hn_tx_ring *txr;
2936 	u_long stat;
2937 
2938 	stat = 0;
2939 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2940 		txr = &sc->hn_tx_ring[i];
2941 		stat += *((u_long *)((uint8_t *)txr + ofs));
2942 	}
2943 
2944 	error = sysctl_handle_long(oidp, &stat, 0, req);
2945 	if (error || req->newptr == NULL)
2946 		return error;
2947 
2948 	/* Zero out this stat. */
2949 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2950 		txr = &sc->hn_tx_ring[i];
2951 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2952 	}
2953 	return 0;
2954 }
2955 
2956 static int
2957 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2958 {
2959 	struct hn_softc *sc = arg1;
2960 	int ofs = arg2, i, error, conf;
2961 	struct hn_tx_ring *txr;
2962 
2963 	txr = &sc->hn_tx_ring[0];
2964 	conf = *((int *)((uint8_t *)txr + ofs));
2965 
2966 	error = sysctl_handle_int(oidp, &conf, 0, req);
2967 	if (error || req->newptr == NULL)
2968 		return error;
2969 
2970 	HN_LOCK(sc);
2971 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2972 		txr = &sc->hn_tx_ring[i];
2973 		*((int *)((uint8_t *)txr + ofs)) = conf;
2974 	}
2975 	HN_UNLOCK(sc);
2976 
2977 	return 0;
2978 }
2979 
2980 static int
2981 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2982 {
2983 	struct hn_softc *sc = arg1;
2984 	int error, size;
2985 
2986 	size = sc->hn_agg_size;
2987 	error = sysctl_handle_int(oidp, &size, 0, req);
2988 	if (error || req->newptr == NULL)
2989 		return (error);
2990 
2991 	HN_LOCK(sc);
2992 	sc->hn_agg_size = size;
2993 	hn_set_txagg(sc);
2994 	HN_UNLOCK(sc);
2995 
2996 	return (0);
2997 }
2998 
2999 static int
3000 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3001 {
3002 	struct hn_softc *sc = arg1;
3003 	int error, pkts;
3004 
3005 	pkts = sc->hn_agg_pkts;
3006 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3007 	if (error || req->newptr == NULL)
3008 		return (error);
3009 
3010 	HN_LOCK(sc);
3011 	sc->hn_agg_pkts = pkts;
3012 	hn_set_txagg(sc);
3013 	HN_UNLOCK(sc);
3014 
3015 	return (0);
3016 }
3017 
3018 static int
3019 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3020 {
3021 	struct hn_softc *sc = arg1;
3022 	int pkts;
3023 
3024 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3025 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3026 }
3027 
3028 static int
3029 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3030 {
3031 	struct hn_softc *sc = arg1;
3032 	int align;
3033 
3034 	align = sc->hn_tx_ring[0].hn_agg_align;
3035 	return (sysctl_handle_int(oidp, &align, 0, req));
3036 }
3037 
3038 static void
3039 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3040 {
3041 	if (pollhz == 0)
3042 		vmbus_chan_poll_disable(chan);
3043 	else
3044 		vmbus_chan_poll_enable(chan, pollhz);
3045 }
3046 
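/*
 * Apply the polling rate (0 disables polling) to the primary channel
 * and all opened sub-channels.
 */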
3047 static void
3048 hn_polling(struct hn_softc *sc, u_int pollhz)
3049 {
3050 	int nsubch = sc->hn_rx_ring_inuse - 1;
3051 
3052 	HN_LOCK_ASSERT(sc);
3053 
3054 	if (nsubch > 0) {
3055 		struct vmbus_channel **subch;
3056 		int i;
3057 
3058 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3059 		for (i = 0; i < nsubch; ++i)
3060 			hn_chan_polling(subch[i], pollhz);
3061 		vmbus_subchan_rel(subch, nsubch);
3062 	}
3063 	hn_chan_polling(sc->hn_prichan, pollhz);
3064 }
3065 
3066 static int
3067 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3068 {
3069 	struct hn_softc *sc = arg1;
3070 	int pollhz, error;
3071 
3072 	pollhz = sc->hn_pollhz;
3073 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3074 	if (error || req->newptr == NULL)
3075 		return (error);
3076 
3077 	if (pollhz != 0 &&
3078 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3079 		return (EINVAL);
3080 
3081 	HN_LOCK(sc);
3082 	if (sc->hn_pollhz != pollhz) {
3083 		sc->hn_pollhz = pollhz;
3084 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3085 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3086 			hn_polling(sc, sc->hn_pollhz);
3087 	}
3088 	HN_UNLOCK(sc);
3089 
3090 	return (0);
3091 }
3092 
3093 static int
3094 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3095 {
3096 	struct hn_softc *sc = arg1;
3097 	char verstr[16];
3098 
3099 	snprintf(verstr, sizeof(verstr), "%u.%u",
3100 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3101 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3102 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3103 }
3104 
3105 static int
3106 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3107 {
3108 	struct hn_softc *sc = arg1;
3109 	char caps_str[128];
3110 	uint32_t caps;
3111 
3112 	HN_LOCK(sc);
3113 	caps = sc->hn_caps;
3114 	HN_UNLOCK(sc);
3115 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3116 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3117 }
3118 
3119 static int
3120 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3121 {
3122 	struct hn_softc *sc = arg1;
3123 	char assist_str[128];
3124 	uint32_t hwassist;
3125 
3126 	HN_LOCK(sc);
3127 	hwassist = sc->hn_ifp->if_hwassist;
3128 	HN_UNLOCK(sc);
3129 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3130 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3131 }
3132 
3133 static int
3134 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3135 {
3136 	struct hn_softc *sc = arg1;
3137 	char filter_str[128];
3138 	uint32_t filter;
3139 
3140 	HN_LOCK(sc);
3141 	filter = sc->hn_rx_filter;
3142 	HN_UNLOCK(sc);
3143 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3144 	    NDIS_PACKET_TYPES);
3145 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3146 }
3147 
3148 #ifndef RSS
3149 
3150 static int
3151 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3152 {
3153 	struct hn_softc *sc = arg1;
3154 	int error;
3155 
3156 	HN_LOCK(sc);
3157 
3158 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3159 	if (error || req->newptr == NULL)
3160 		goto back;
3161 
3162 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3163 	if (error)
3164 		goto back;
3165 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3166 
3167 	if (sc->hn_rx_ring_inuse > 1) {
3168 		error = hn_rss_reconfig(sc);
3169 	} else {
3170 		/* Not RSS capable, at least for now; just save the RSS key. */
3171 		error = 0;
3172 	}
3173 back:
3174 	HN_UNLOCK(sc);
3175 	return (error);
3176 }
3177 
3178 static int
3179 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3180 {
3181 	struct hn_softc *sc = arg1;
3182 	int error;
3183 
3184 	HN_LOCK(sc);
3185 
3186 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3187 	if (error || req->newptr == NULL)
3188 		goto back;
3189 
3190 	/*
3191 	 * Don't allow RSS indirect table changes if this interface
3192 	 * is not currently RSS capable.
3193 	 */
3194 	if (sc->hn_rx_ring_inuse == 1) {
3195 		error = EOPNOTSUPP;
3196 		goto back;
3197 	}
3198 
3199 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3200 	if (error)
3201 		goto back;
3202 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3203 
3204 	hn_rss_ind_fixup(sc);
3205 	error = hn_rss_reconfig(sc);
3206 back:
3207 	HN_UNLOCK(sc);
3208 	return (error);
3209 }
3210 
3211 #endif	/* !RSS */
3212 
3213 static int
3214 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3215 {
3216 	struct hn_softc *sc = arg1;
3217 	char hash_str[128];
3218 	uint32_t hash;
3219 
3220 	HN_LOCK(sc);
3221 	hash = sc->hn_rss_hash;
3222 	HN_UNLOCK(sc);
3223 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3224 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3225 }
3226 
3227 static int
3228 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3229 {
3230 	struct hn_softc *sc = arg1;
3231 	char vf_name[128];
3232 	struct ifnet *vf;
3233 
3234 	HN_LOCK(sc);
3235 	vf_name[0] = '\0';
3236 	vf = sc->hn_rx_ring[0].hn_vf;
3237 	if (vf != NULL)
3238 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3239 	HN_UNLOCK(sc);
3240 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3241 }
3242 
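/*
 * Sanity check an IPv4 packet starting at 'hoff' within the mbuf:
 * verify the IP header, reject fragments, and make sure the full
 * TCP/UDP header resides in the first mbuf.  Return the IP protocol
 * on success, or IPPROTO_DONE if the packet should not be handled
 * further (e.g. by LRO or trusted-checksum processing).
 */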
3243 static int
3244 hn_check_iplen(const struct mbuf *m, int hoff)
3245 {
3246 	const struct ip *ip;
3247 	int len, iphlen, iplen;
3248 	const struct tcphdr *th;
3249 	int thoff;				/* TCP data offset */
3250 
3251 	len = hoff + sizeof(struct ip);
3252 
3253 	/* The packet must be at least the size of an IP header. */
3254 	if (m->m_pkthdr.len < len)
3255 		return IPPROTO_DONE;
3256 
3257 	/* The fixed IP header must reside completely in the first mbuf. */
3258 	if (m->m_len < len)
3259 		return IPPROTO_DONE;
3260 
3261 	ip = mtodo(m, hoff);
3262 
3263 	/* Bound check the packet's stated IP header length. */
3264 	iphlen = ip->ip_hl << 2;
3265 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3266 		return IPPROTO_DONE;
3267 
3268 	/* The full IP header must reside completely in the one mbuf. */
3269 	if (m->m_len < hoff + iphlen)
3270 		return IPPROTO_DONE;
3271 
3272 	iplen = ntohs(ip->ip_len);
3273 
3274 	/*
3275 	 * Check that the amount of data in the buffers is at
3276 	 * least as much as the IP header would have us expect.
3277 	 */
3278 	if (m->m_pkthdr.len < hoff + iplen)
3279 		return IPPROTO_DONE;
3280 
3281 	/*
3282 	 * Ignore IP fragments.
3283 	 */
3284 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3285 		return IPPROTO_DONE;
3286 
3287 	/*
3288 	 * The TCP/IP or UDP/IP header must be entirely contained within
3289 	 * the first fragment of a packet.
3290 	 */
3291 	switch (ip->ip_p) {
3292 	case IPPROTO_TCP:
3293 		if (iplen < iphlen + sizeof(struct tcphdr))
3294 			return IPPROTO_DONE;
3295 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3296 			return IPPROTO_DONE;
3297 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3298 		thoff = th->th_off << 2;
3299 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3300 			return IPPROTO_DONE;
3301 		if (m->m_len < hoff + iphlen + thoff)
3302 			return IPPROTO_DONE;
3303 		break;
3304 	case IPPROTO_UDP:
3305 		if (iplen < iphlen + sizeof(struct udphdr))
3306 			return IPPROTO_DONE;
3307 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3308 			return IPPROTO_DONE;
3309 		break;
3310 	default:
3311 		if (iplen < iphlen)
3312 			return IPPROTO_DONE;
3313 		break;
3314 	}
3315 	return ip->ip_p;
3316 }
3317 
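/*
 * Allocate the shared RXBUF, the per-ring bufrings and packet buffers,
 * initialize LRO, and create the RX related sysctl nodes.
 */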
3318 static int
3319 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3320 {
3321 	struct sysctl_oid_list *child;
3322 	struct sysctl_ctx_list *ctx;
3323 	device_t dev = sc->hn_dev;
3324 #if defined(INET) || defined(INET6)
3325 #if __FreeBSD_version >= 1100095
3326 	int lroent_cnt;
3327 #endif
3328 #endif
3329 	int i;
3330 
3331 	/*
3332 	 * Create RXBUF for reception.
3333 	 *
3334 	 * NOTE:
3335 	 * - It is shared by all channels.
3336 	 * - A large enough buffer is allocated; certain versions of NVS
3337 	 *   may further limit the usable space.
3338 	 */
3339 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3340 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3341 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3342 	if (sc->hn_rxbuf == NULL) {
3343 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3344 		return (ENOMEM);
3345 	}
3346 
3347 	sc->hn_rx_ring_cnt = ring_cnt;
3348 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3349 
3350 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3351 	    M_DEVBUF, M_WAITOK | M_ZERO);
3352 
3353 #if defined(INET) || defined(INET6)
3354 #if __FreeBSD_version >= 1100095
3355 	lroent_cnt = hn_lro_entry_count;
3356 	if (lroent_cnt < TCP_LRO_ENTRIES)
3357 		lroent_cnt = TCP_LRO_ENTRIES;
3358 	if (bootverbose)
3359 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3360 #endif
3361 #endif	/* INET || INET6 */
3362 
3363 	ctx = device_get_sysctl_ctx(dev);
3364 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3365 
3366 	/* Create dev.hn.UNIT.rx sysctl tree */
3367 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3368 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3369 
3370 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3371 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3372 
3373 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3374 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3375 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3376 		if (rxr->hn_br == NULL) {
3377 			device_printf(dev, "allocate bufring failed\n");
3378 			return (ENOMEM);
3379 		}
3380 
3381 		if (hn_trust_hosttcp)
3382 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3383 		if (hn_trust_hostudp)
3384 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3385 		if (hn_trust_hostip)
3386 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3387 		rxr->hn_ifp = sc->hn_ifp;
3388 		if (i < sc->hn_tx_ring_cnt)
3389 			rxr->hn_txr = &sc->hn_tx_ring[i];
3390 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3391 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3392 		rxr->hn_rx_idx = i;
3393 		rxr->hn_rxbuf = sc->hn_rxbuf;
3394 
3395 		/*
3396 		 * Initialize LRO.
3397 		 */
3398 #if defined(INET) || defined(INET6)
3399 #if __FreeBSD_version >= 1100095
3400 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3401 		    hn_lro_mbufq_depth);
3402 #else
3403 		tcp_lro_init(&rxr->hn_lro);
3404 		rxr->hn_lro.ifp = sc->hn_ifp;
3405 #endif
3406 #if __FreeBSD_version >= 1100099
3407 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3408 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3409 #endif
3410 #endif	/* INET || INET6 */
3411 
3412 		if (sc->hn_rx_sysctl_tree != NULL) {
3413 			char name[16];
3414 
3415 			/*
3416 			 * Create per RX ring sysctl tree:
3417 			 * dev.hn.UNIT.rx.RINGID
3418 			 */
3419 			snprintf(name, sizeof(name), "%d", i);
3420 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3421 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3422 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3423 
3424 			if (rxr->hn_rx_sysctl_tree != NULL) {
3425 				SYSCTL_ADD_ULONG(ctx,
3426 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3427 				    OID_AUTO, "packets", CTLFLAG_RW,
3428 				    &rxr->hn_pkts, "# of packets received");
3429 				SYSCTL_ADD_ULONG(ctx,
3430 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3431 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3432 				    &rxr->hn_rss_pkts,
3433 				    "# of packets w/ RSS info received");
3434 				SYSCTL_ADD_INT(ctx,
3435 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3436 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3437 				    &rxr->hn_pktbuf_len, 0,
3438 				    "Temporary channel packet buffer length");
3439 			}
3440 		}
3441 	}
3442 
3443 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3444 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3445 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3446 #if __FreeBSD_version < 1100095
3447 	    hn_rx_stat_int_sysctl,
3448 #else
3449 	    hn_rx_stat_u64_sysctl,
3450 #endif
3451 	    "LU", "LRO queued");
3452 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3453 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3454 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3455 #if __FreeBSD_version < 1100095
3456 	    hn_rx_stat_int_sysctl,
3457 #else
3458 	    hn_rx_stat_u64_sysctl,
3459 #endif
3460 	    "LU", "LRO flushed");
3461 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3462 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3463 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3464 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3465 #if __FreeBSD_version >= 1100099
3466 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3467 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3468 	    hn_lro_lenlim_sysctl, "IU",
3469 	    "Max # of data bytes to be aggregated by LRO");
3470 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3471 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3472 	    hn_lro_ackcnt_sysctl, "I",
3473 	    "Max # of ACKs to be aggregated by LRO");
3474 #endif
3475 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3476 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3477 	    hn_trust_hcsum_sysctl, "I",
3478 	    "Trust tcp segment verification on host side, "
3479 	    "when csum info is missing");
3480 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3481 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3482 	    hn_trust_hcsum_sysctl, "I",
3483 	    "Trust udp datagram verification on host side, "
3484 	    "when csum info is missing");
3485 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3486 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3487 	    hn_trust_hcsum_sysctl, "I",
3488 	    "Trust ip packet verification on host side, "
3489 	    "when csum info is missing");
3490 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3491 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3492 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3493 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3494 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3495 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3496 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3497 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3498 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3499 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3500 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3501 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3502 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3503 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3504 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3505 	    hn_rx_stat_ulong_sysctl, "LU",
3506 	    "# of packets that we trust host's csum verification");
3507 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3508 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3509 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3510 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3511 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3512 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3513 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3514 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3515 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3516 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3517 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3518 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3519 
3520 	return (0);
3521 }
3522 
3523 static void
3524 hn_destroy_rx_data(struct hn_softc *sc)
3525 {
3526 	int i;
3527 
3528 	if (sc->hn_rxbuf != NULL) {
3529 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3530 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3531 		else
3532 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3533 		sc->hn_rxbuf = NULL;
3534 	}
3535 
3536 	if (sc->hn_rx_ring_cnt == 0)
3537 		return;
3538 
3539 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3540 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3541 
3542 		if (rxr->hn_br == NULL)
3543 			continue;
3544 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3545 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3546 		} else {
3547 			device_printf(sc->hn_dev,
3548 			    "%dth channel bufring is referenced", i);
3549 		}
3550 		rxr->hn_br = NULL;
3551 
3552 #if defined(INET) || defined(INET6)
3553 		tcp_lro_free(&rxr->hn_lro);
3554 #endif
3555 		free(rxr->hn_pktbuf, M_DEVBUF);
3556 	}
3557 	free(sc->hn_rx_ring, M_DEVBUF);
3558 	sc->hn_rx_ring = NULL;
3559 
3560 	sc->hn_rx_ring_cnt = 0;
3561 	sc->hn_rx_ring_inuse = 0;
3562 }
3563 
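/*
 * Create one TX ring: allocate its TX descriptors, DMA tags/maps and
 * preallocated RNDIS packet messages, set up the txeof handler and
 * taskqueue, and create the per-ring sysctl nodes.
 */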
3564 static int
3565 hn_tx_ring_create(struct hn_softc *sc, int id)
3566 {
3567 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3568 	device_t dev = sc->hn_dev;
3569 	bus_dma_tag_t parent_dtag;
3570 	int error, i;
3571 
3572 	txr->hn_sc = sc;
3573 	txr->hn_tx_idx = id;
3574 
3575 #ifndef HN_USE_TXDESC_BUFRING
3576 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3577 #endif
3578 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3579 
3580 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3581 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3582 	    M_DEVBUF, M_WAITOK | M_ZERO);
3583 #ifndef HN_USE_TXDESC_BUFRING
3584 	SLIST_INIT(&txr->hn_txlist);
3585 #else
3586 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3587 	    M_WAITOK, &txr->hn_tx_lock);
3588 #endif
3589 
3590 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3591 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3592 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3593 	} else {
3594 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3595 	}
3596 
3597 #ifdef HN_IFSTART_SUPPORT
3598 	if (hn_use_if_start) {
3599 		txr->hn_txeof = hn_start_txeof;
3600 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3601 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3602 	} else
3603 #endif
3604 	{
3605 		int br_depth;
3606 
3607 		txr->hn_txeof = hn_xmit_txeof;
3608 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3609 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3610 
3611 		br_depth = hn_get_txswq_depth(txr);
3612 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3613 		    M_WAITOK, &txr->hn_tx_lock);
3614 	}
3615 
3616 	txr->hn_direct_tx_size = hn_direct_tx_size;
3617 
3618 	/*
3619 	 * Always schedule transmission instead of trying to do direct
3620 	 * transmission.  This one gives the best performance so far.
3621 	 * transmission.  This gives the best performance so far.
3622 	txr->hn_sched_tx = 1;
3623 
3624 	parent_dtag = bus_get_dma_tag(dev);
3625 
3626 	/* DMA tag for RNDIS packet messages. */
3627 	error = bus_dma_tag_create(parent_dtag, /* parent */
3628 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3629 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3630 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3631 	    BUS_SPACE_MAXADDR,		/* highaddr */
3632 	    NULL, NULL,			/* filter, filterarg */
3633 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3634 	    1,				/* nsegments */
3635 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3636 	    0,				/* flags */
3637 	    NULL,			/* lockfunc */
3638 	    NULL,			/* lockfuncarg */
3639 	    &txr->hn_tx_rndis_dtag);
3640 	if (error) {
3641 		device_printf(dev, "failed to create rndis dmatag\n");
3642 		return error;
3643 	}
3644 
3645 	/* DMA tag for data. */
3646 	error = bus_dma_tag_create(parent_dtag, /* parent */
3647 	    1,				/* alignment */
3648 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3649 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3650 	    BUS_SPACE_MAXADDR,		/* highaddr */
3651 	    NULL, NULL,			/* filter, filterarg */
3652 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3653 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3654 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3655 	    0,				/* flags */
3656 	    NULL,			/* lockfunc */
3657 	    NULL,			/* lockfuncarg */
3658 	    &txr->hn_tx_data_dtag);
3659 	if (error) {
3660 		device_printf(dev, "failed to create data dmatag\n");
3661 		return error;
3662 	}
3663 
3664 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3665 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3666 
3667 		txd->txr = txr;
3668 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3669 		STAILQ_INIT(&txd->agg_list);
3670 
3671 		/*
3672 		 * Allocate and load RNDIS packet message.
3673 		 */
3674 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3675 		    (void **)&txd->rndis_pkt,
3676 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3677 		    &txd->rndis_pkt_dmap);
3678 		if (error) {
3679 			device_printf(dev,
3680 			    "failed to allocate rndis_packet_msg, %d\n", i);
3681 			return error;
3682 		}
3683 
3684 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3685 		    txd->rndis_pkt_dmap,
3686 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3687 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3688 		    BUS_DMA_NOWAIT);
3689 		if (error) {
3690 			device_printf(dev,
3691 			    "failed to load rndis_packet_msg, %d\n", i);
3692 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3693 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3694 			return error;
3695 		}
3696 
3697 		/* DMA map for TX data. */
3698 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3699 		    &txd->data_dmap);
3700 		if (error) {
3701 			device_printf(dev,
3702 			    "failed to allocate tx data dmamap\n");
3703 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3704 			    txd->rndis_pkt_dmap);
3705 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3706 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3707 			return error;
3708 		}
3709 
3710 		/* All set, put it on the list */
3711 		txd->flags |= HN_TXD_FLAG_ONLIST;
3712 #ifndef HN_USE_TXDESC_BUFRING
3713 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3714 #else
3715 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3716 #endif
3717 	}
3718 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3719 
3720 	if (sc->hn_tx_sysctl_tree != NULL) {
3721 		struct sysctl_oid_list *child;
3722 		struct sysctl_ctx_list *ctx;
3723 		char name[16];
3724 
3725 		/*
3726 		 * Create per TX ring sysctl tree:
3727 		 * dev.hn.UNIT.tx.RINGID
3728 		 */
3729 		ctx = device_get_sysctl_ctx(dev);
3730 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3731 
3732 		snprintf(name, sizeof(name), "%d", id);
3733 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3734 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3735 
3736 		if (txr->hn_tx_sysctl_tree != NULL) {
3737 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3738 
3739 #ifdef HN_DEBUG
3740 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3741 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3742 			    "# of available TX descs");
3743 #endif
3744 #ifdef HN_IFSTART_SUPPORT
3745 			if (!hn_use_if_start)
3746 #endif
3747 			{
3748 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3749 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3750 				    "over active");
3751 			}
3752 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3753 			    CTLFLAG_RW, &txr->hn_pkts,
3754 			    "# of packets transmitted");
3755 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3756 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3757 		}
3758 	}
3759 
3760 	return 0;
3761 }
3762 
3763 static void
3764 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3765 {
3766 	struct hn_tx_ring *txr = txd->txr;
3767 
3768 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3769 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3770 
3771 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3772 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3773 	    txd->rndis_pkt_dmap);
3774 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3775 }
3776 
3777 static void
3778 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3779 {
3780 
3781 	KASSERT(txd->refs == 0 || txd->refs == 1,
3782 	    ("invalid txd refs %d", txd->refs));
3783 
3784 	/* Aggregated txds will be freed by their aggregating txd. */
3785 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3786 		int freed;
3787 
3788 		freed = hn_txdesc_put(txr, txd);
3789 		KASSERT(freed, ("can't free txdesc"));
3790 	}
3791 }
3792 
3793 static void
3794 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3795 {
3796 	int i;
3797 
3798 	if (txr->hn_txdesc == NULL)
3799 		return;
3800 
3801 	/*
3802 	 * NOTE:
3803 	 * Because the freeing of aggregated txds will be deferred
3804 	 * to the aggregating txd, two passes are used here:
3805 	 * - The first pass GCes any pending txds.  This GC is necessary,
3806 	 *   since if the channels are revoked, the hypervisor will not
3807 	 *   deliver send-done for all pending txds.
3808 	 * - The second pass frees the busdma resources, i.e. after all
3809 	 *   txds have been freed.
3810 	 */
3811 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3812 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3813 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3814 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3815 
3816 	if (txr->hn_tx_data_dtag != NULL)
3817 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3818 	if (txr->hn_tx_rndis_dtag != NULL)
3819 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3820 
3821 #ifdef HN_USE_TXDESC_BUFRING
3822 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3823 #endif
3824 
3825 	free(txr->hn_txdesc, M_DEVBUF);
3826 	txr->hn_txdesc = NULL;
3827 
3828 	if (txr->hn_mbuf_br != NULL)
3829 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3830 
3831 #ifndef HN_USE_TXDESC_BUFRING
3832 	mtx_destroy(&txr->hn_txlist_spin);
3833 #endif
3834 	mtx_destroy(&txr->hn_tx_lock);
3835 }
3836 
3837 static int
3838 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3839 {
3840 	struct sysctl_oid_list *child;
3841 	struct sysctl_ctx_list *ctx;
3842 	int i;
3843 
3844 	/*
3845 	 * Create TXBUF for chimney sending.
3846 	 *
3847 	 * NOTE: It is shared by all channels.
3848 	 */
3849 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3850 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3851 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3852 	if (sc->hn_chim == NULL) {
3853 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3854 		return (ENOMEM);
3855 	}
3856 
3857 	sc->hn_tx_ring_cnt = ring_cnt;
3858 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3859 
3860 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3861 	    M_DEVBUF, M_WAITOK | M_ZERO);
3862 
3863 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3864 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3865 
3866 	/* Create dev.hn.UNIT.tx sysctl tree */
3867 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3868 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3869 
3870 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3871 		int error;
3872 
3873 		error = hn_tx_ring_create(sc, i);
3874 		if (error)
3875 			return error;
3876 	}
3877 
3878 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3879 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3880 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3881 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3882 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3883 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3884 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3885 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3886 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3887 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3888 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3889 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3890 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3891 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3892 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3893 	    hn_tx_stat_ulong_sysctl, "LU",
3894 	    "# of packet transmission aggregation flush failures");
3895 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3896 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3897 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3898 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
3899 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3900 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3901 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3902 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3903 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3904 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3905 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3906 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3907 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3908 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3909 	    "# of total TX descs");
3910 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3911 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3912 	    "Chimney send packet size upper boundary");
3913 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3914 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3915 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3916 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3917 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3918 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3919 	    hn_tx_conf_int_sysctl, "I",
3920 	    "Size of the packet for direct transmission");
3921 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3922 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3923 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3924 	    hn_tx_conf_int_sysctl, "I",
3925 	    "Always schedule transmission "
3926 	    "instead of doing direct transmission");
3927 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3928 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3929 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3930 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3931 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3932 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3933 	    "Applied packet transmission aggregation size");
3934 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3935 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3936 	    hn_txagg_pktmax_sysctl, "I",
3937 	    "Applied packet transmission aggregation packets");
3938 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3939 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3940 	    hn_txagg_align_sysctl, "I",
3941 	    "Applied packet transmission aggregation alignment");
3942 
3943 	return 0;
3944 }
3945 
3946 static void
3947 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3948 {
3949 	int i;
3950 
3951 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3952 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3953 }
3954 
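/*
 * Clamp the requested TSO size to the limits reported by NDIS and to
 * IP_MAXPACKET, then program if_hw_tsomax accordingly.
 */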
3955 static void
3956 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3957 {
3958 	struct ifnet *ifp = sc->hn_ifp;
3959 	int tso_minlen;
3960 
3961 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3962 		return;
3963 
3964 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3965 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3966 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3967 
3968 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3969 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3970 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3971 
3972 	if (tso_maxlen < tso_minlen)
3973 		tso_maxlen = tso_minlen;
3974 	else if (tso_maxlen > IP_MAXPACKET)
3975 		tso_maxlen = IP_MAXPACKET;
3976 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3977 		tso_maxlen = sc->hn_ndis_tso_szmax;
3978 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3979 	if (bootverbose)
3980 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3981 }
3982 
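/*
 * Propagate the chimney send size limit and the checksum/hash offload
 * capabilities recorded in sc->hn_caps to all TX rings.
 */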
3983 static void
3984 hn_fixup_tx_data(struct hn_softc *sc)
3985 {
3986 	uint64_t csum_assist;
3987 	int i;
3988 
3989 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3990 	if (hn_tx_chimney_size > 0 &&
3991 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3992 		hn_set_chim_size(sc, hn_tx_chimney_size);
3993 
3994 	csum_assist = 0;
3995 	if (sc->hn_caps & HN_CAP_IPCS)
3996 		csum_assist |= CSUM_IP;
3997 	if (sc->hn_caps & HN_CAP_TCP4CS)
3998 		csum_assist |= CSUM_IP_TCP;
3999 	if (sc->hn_caps & HN_CAP_UDP4CS)
4000 		csum_assist |= CSUM_IP_UDP;
4001 	if (sc->hn_caps & HN_CAP_TCP6CS)
4002 		csum_assist |= CSUM_IP6_TCP;
4003 	if (sc->hn_caps & HN_CAP_UDP6CS)
4004 		csum_assist |= CSUM_IP6_UDP;
4005 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4006 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4007 
4008 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4009 		/*
4010 		 * Support HASHVAL pktinfo on TX path.
4011 		 */
4012 		if (bootverbose)
4013 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4014 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4015 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4016 	}
4017 }
4018 
4019 static void
4020 hn_destroy_tx_data(struct hn_softc *sc)
4021 {
4022 	int i;
4023 
4024 	if (sc->hn_chim != NULL) {
4025 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4026 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4027 		} else {
4028 			device_printf(sc->hn_dev,
4029 			    "chimney sending buffer is referenced\n");
4030 		}
4031 		sc->hn_chim = NULL;
4032 	}
4033 
4034 	if (sc->hn_tx_ring_cnt == 0)
4035 		return;
4036 
4037 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4038 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4039 
4040 	free(sc->hn_tx_ring, M_DEVBUF);
4041 	sc->hn_tx_ring = NULL;
4042 
4043 	sc->hn_tx_ring_cnt = 0;
4044 	sc->hn_tx_ring_inuse = 0;
4045 }
4046 
4047 #ifdef HN_IFSTART_SUPPORT
4048 
4049 static void
4050 hn_start_taskfunc(void *xtxr, int pending __unused)
4051 {
4052 	struct hn_tx_ring *txr = xtxr;
4053 
4054 	mtx_lock(&txr->hn_tx_lock);
4055 	hn_start_locked(txr, 0);
4056 	mtx_unlock(&txr->hn_tx_lock);
4057 }
4058 
4059 static int
4060 hn_start_locked(struct hn_tx_ring *txr, int len)
4061 {
4062 	struct hn_softc *sc = txr->hn_sc;
4063 	struct ifnet *ifp = sc->hn_ifp;
4064 	int sched = 0;
4065 
4066 	KASSERT(hn_use_if_start,
4067 	    ("hn_start_locked is called when if_start is disabled"));
4068 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4069 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4070 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4071 
4072 	if (__predict_false(txr->hn_suspended))
4073 		return (0);
4074 
4075 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4076 	    IFF_DRV_RUNNING)
4077 		return (0);
4078 
4079 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4080 		struct hn_txdesc *txd;
4081 		struct mbuf *m_head;
4082 		int error;
4083 
4084 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4085 		if (m_head == NULL)
4086 			break;
4087 
4088 		if (len > 0 && m_head->m_pkthdr.len > len) {
4089 			/*
4090 			 * This send could be time consuming; let callers
4091 			 * dispatch the sending of this packet (and any
4092 			 * follow-up packets) to the TX taskqueue.
4093 			 */
4094 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4095 			sched = 1;
4096 			break;
4097 		}
4098 
4099 #if defined(INET6) || defined(INET)
4100 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4101 			m_head = hn_tso_fixup(m_head);
4102 			if (__predict_false(m_head == NULL)) {
4103 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4104 				continue;
4105 			}
4106 		}
4107 #endif
4108 
4109 		txd = hn_txdesc_get(txr);
4110 		if (txd == NULL) {
4111 			txr->hn_no_txdescs++;
4112 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4113 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4114 			break;
4115 		}
4116 
4117 		error = hn_encap(ifp, txr, txd, &m_head);
4118 		if (error) {
4119 			/* Both txd and m_head are freed */
4120 			KASSERT(txr->hn_agg_txd == NULL,
4121 			    ("encap failed w/ pending aggregating txdesc"));
4122 			continue;
4123 		}
4124 
4125 		if (txr->hn_agg_pktleft == 0) {
4126 			if (txr->hn_agg_txd != NULL) {
4127 				KASSERT(m_head == NULL,
4128 				    ("pending mbuf for aggregating txdesc"));
4129 				error = hn_flush_txagg(ifp, txr);
4130 				if (__predict_false(error)) {
4131 					atomic_set_int(&ifp->if_drv_flags,
4132 					    IFF_DRV_OACTIVE);
4133 					break;
4134 				}
4135 			} else {
4136 				KASSERT(m_head != NULL, ("mbuf was freed"));
4137 				error = hn_txpkt(ifp, txr, txd);
4138 				if (__predict_false(error)) {
4139 					/* txd is freed, but m_head is not */
4140 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4141 					atomic_set_int(&ifp->if_drv_flags,
4142 					    IFF_DRV_OACTIVE);
4143 					break;
4144 				}
4145 			}
4146 		}
4147 #ifdef INVARIANTS
4148 		else {
4149 			KASSERT(txr->hn_agg_txd != NULL,
4150 			    ("no aggregating txdesc"));
4151 			KASSERT(m_head == NULL,
4152 			    ("pending mbuf for aggregating txdesc"));
4153 		}
4154 #endif
4155 	}
4156 
4157 	/* Flush pending aggregated transmission. */
4158 	if (txr->hn_agg_txd != NULL)
4159 		hn_flush_txagg(ifp, txr);
4160 	return (sched);
4161 }
4162 
4163 static void
4164 hn_start(struct ifnet *ifp)
4165 {
4166 	struct hn_softc *sc = ifp->if_softc;
4167 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4168 
4169 	if (txr->hn_sched_tx)
4170 		goto do_sched;
4171 
4172 	if (mtx_trylock(&txr->hn_tx_lock)) {
4173 		int sched;
4174 
4175 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4176 		mtx_unlock(&txr->hn_tx_lock);
4177 		if (!sched)
4178 			return;
4179 	}
4180 do_sched:
4181 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4182 }
4183 
4184 static void
4185 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4186 {
4187 	struct hn_tx_ring *txr = xtxr;
4188 
4189 	mtx_lock(&txr->hn_tx_lock);
4190 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4191 	hn_start_locked(txr, 0);
4192 	mtx_unlock(&txr->hn_tx_lock);
4193 }
4194 
4195 static void
4196 hn_start_txeof(struct hn_tx_ring *txr)
4197 {
4198 	struct hn_softc *sc = txr->hn_sc;
4199 	struct ifnet *ifp = sc->hn_ifp;
4200 
4201 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4202 
4203 	if (txr->hn_sched_tx)
4204 		goto do_sched;
4205 
4206 	if (mtx_trylock(&txr->hn_tx_lock)) {
4207 		int sched;
4208 
4209 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4210 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4211 		mtx_unlock(&txr->hn_tx_lock);
4212 		if (sched) {
4213 			taskqueue_enqueue(txr->hn_tx_taskq,
4214 			    &txr->hn_tx_task);
4215 		}
4216 	} else {
4217 do_sched:
4218 		/*
4219 		 * Release OACTIVE earlier, in the hope that others
4220 		 * can catch up.  The task will clear the flag again,
4221 		 * with hn_tx_lock held, to avoid possible
4222 		 * races.
4223 		 */
4224 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4225 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4226 	}
4227 }
4228 
4229 #endif	/* HN_IFSTART_SUPPORT */
4230 
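/*
 * Drain the per-ring mbuf buf_ring and transmit the dequeued packets.
 * If 'len' is non-zero, packets larger than 'len' are put back and a
 * non-zero value is returned, telling the caller to defer the rest of
 * the work to the TX taskqueue.
 */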
4231 static int
4232 hn_xmit(struct hn_tx_ring *txr, int len)
4233 {
4234 	struct hn_softc *sc = txr->hn_sc;
4235 	struct ifnet *ifp = sc->hn_ifp;
4236 	struct mbuf *m_head;
4237 	int sched = 0;
4238 
4239 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4240 #ifdef HN_IFSTART_SUPPORT
4241 	KASSERT(hn_use_if_start == 0,
4242 	    ("hn_xmit is called when if_start is enabled"));
4243 #endif
4244 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4245 
4246 	if (__predict_false(txr->hn_suspended))
4247 		return (0);
4248 
4249 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4250 		return (0);
4251 
4252 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4253 		struct hn_txdesc *txd;
4254 		int error;
4255 
4256 		if (len > 0 && m_head->m_pkthdr.len > len) {
4257 			/*
4258 			 * This send could be time consuming; let callers
4259 			 * dispatch the sending of this packet (and any
4260 			 * follow-up packets) to the TX taskqueue.
4261 			 */
4262 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4263 			sched = 1;
4264 			break;
4265 		}
4266 
4267 		txd = hn_txdesc_get(txr);
4268 		if (txd == NULL) {
4269 			txr->hn_no_txdescs++;
4270 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4271 			txr->hn_oactive = 1;
4272 			break;
4273 		}
4274 
4275 		error = hn_encap(ifp, txr, txd, &m_head);
4276 		if (error) {
4277 			/* Both txd and m_head are freed; discard */
4278 			KASSERT(txr->hn_agg_txd == NULL,
4279 			    ("encap failed w/ pending aggregating txdesc"));
4280 			drbr_advance(ifp, txr->hn_mbuf_br);
4281 			continue;
4282 		}
4283 
4284 		if (txr->hn_agg_pktleft == 0) {
4285 			if (txr->hn_agg_txd != NULL) {
4286 				KASSERT(m_head == NULL,
4287 				    ("pending mbuf for aggregating txdesc"));
4288 				error = hn_flush_txagg(ifp, txr);
4289 				if (__predict_false(error)) {
4290 					txr->hn_oactive = 1;
4291 					break;
4292 				}
4293 			} else {
4294 				KASSERT(m_head != NULL, ("mbuf was freed"));
4295 				error = hn_txpkt(ifp, txr, txd);
4296 				if (__predict_false(error)) {
4297 					/* txd is freed, but m_head is not */
4298 					drbr_putback(ifp, txr->hn_mbuf_br,
4299 					    m_head);
4300 					txr->hn_oactive = 1;
4301 					break;
4302 				}
4303 			}
4304 		}
4305 #ifdef INVARIANTS
4306 		else {
4307 			KASSERT(txr->hn_agg_txd != NULL,
4308 			    ("no aggregating txdesc"));
4309 			KASSERT(m_head == NULL,
4310 			    ("pending mbuf for aggregating txdesc"));
4311 		}
4312 #endif
4313 
4314 		/* Sent */
4315 		drbr_advance(ifp, txr->hn_mbuf_br);
4316 	}
4317 
4318 	/* Flush pending aggregated transmission. */
4319 	if (txr->hn_agg_txd != NULL)
4320 		hn_flush_txagg(ifp, txr);
4321 	return (sched);
4322 }
4323 
4324 static int
4325 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4326 {
4327 	struct hn_softc *sc = ifp->if_softc;
4328 	struct hn_tx_ring *txr;
4329 	int error, idx = 0;
4330 
4331 #if defined(INET6) || defined(INET)
4332 	/*
4333 	 * Perform TSO packet header fixup now, since the TSO
4334 	 * packet header should be cache-hot.
4335 	 */
4336 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4337 		m = hn_tso_fixup(m);
4338 		if (__predict_false(m == NULL)) {
4339 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4340 			return EIO;
4341 		}
4342 	}
4343 #endif
4344 
4345 	/*
4346 	 * Select the TX ring based on flowid
4347 	 */
4348 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4349 #ifdef RSS
4350 		uint32_t bid;
4351 
4352 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4353 		    &bid) == 0)
4354 			idx = bid % sc->hn_tx_ring_inuse;
4355 		else
4356 #endif
4357 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4358 	}
4359 	txr = &sc->hn_tx_ring[idx];
4360 
4361 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4362 	if (error) {
4363 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4364 		return error;
4365 	}
4366 
4367 	if (txr->hn_oactive)
4368 		return 0;
4369 
4370 	if (txr->hn_sched_tx)
4371 		goto do_sched;
4372 
4373 	if (mtx_trylock(&txr->hn_tx_lock)) {
4374 		int sched;
4375 
4376 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4377 		mtx_unlock(&txr->hn_tx_lock);
4378 		if (!sched)
4379 			return 0;
4380 	}
4381 do_sched:
4382 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4383 	return 0;
4384 }
4385 
4386 static void
4387 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4388 {
4389 	struct mbuf *m;
4390 
4391 	mtx_lock(&txr->hn_tx_lock);
4392 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4393 		m_freem(m);
4394 	mtx_unlock(&txr->hn_tx_lock);
4395 }
4396 
4397 static void
4398 hn_xmit_qflush(struct ifnet *ifp)
4399 {
4400 	struct hn_softc *sc = ifp->if_softc;
4401 	int i;
4402 
4403 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4404 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4405 	if_qflush(ifp);
4406 }
4407 
4408 static void
4409 hn_xmit_txeof(struct hn_tx_ring *txr)
4410 {
4411 
4412 	if (txr->hn_sched_tx)
4413 		goto do_sched;
4414 
4415 	if (mtx_trylock(&txr->hn_tx_lock)) {
4416 		int sched;
4417 
4418 		txr->hn_oactive = 0;
4419 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4420 		mtx_unlock(&txr->hn_tx_lock);
4421 		if (sched) {
4422 			taskqueue_enqueue(txr->hn_tx_taskq,
4423 			    &txr->hn_tx_task);
4424 		}
4425 	} else {
4426 do_sched:
4427 		/*
4428 		 * Release oactive earlier, in the hope that others
4429 		 * can catch up.  The task will clear oactive again,
4430 		 * with hn_tx_lock held, to avoid possible
4431 		 * races.
4432 		 */
4433 		txr->hn_oactive = 0;
4434 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4435 	}
4436 }
4437 
4438 static void
4439 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4440 {
4441 	struct hn_tx_ring *txr = xtxr;
4442 
4443 	mtx_lock(&txr->hn_tx_lock);
4444 	hn_xmit(txr, 0);
4445 	mtx_unlock(&txr->hn_tx_lock);
4446 }
4447 
4448 static void
4449 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4450 {
4451 	struct hn_tx_ring *txr = xtxr;
4452 
4453 	mtx_lock(&txr->hn_tx_lock);
4454 	txr->hn_oactive = 0;
4455 	hn_xmit(txr, 0);
4456 	mtx_unlock(&txr->hn_tx_lock);
4457 }
4458 
4459 static int
4460 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4461 {
4462 	struct vmbus_chan_br cbr;
4463 	struct hn_rx_ring *rxr;
4464 	struct hn_tx_ring *txr = NULL;
4465 	int idx, error;
4466 
4467 	idx = vmbus_chan_subidx(chan);
4468 
4469 	/*
4470 	 * Link this channel to RX/TX ring.
4471 	 */
4472 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4473 	    ("invalid channel index %d, should be >= 0 && < %d",
4474 	     idx, sc->hn_rx_ring_inuse));
4475 	rxr = &sc->hn_rx_ring[idx];
4476 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4477 	    ("RX ring %d already attached", idx));
4478 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4479 	rxr->hn_chan = chan;
4480 
4481 	if (bootverbose) {
4482 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4483 		    idx, vmbus_chan_id(chan));
4484 	}
4485 
4486 	if (idx < sc->hn_tx_ring_inuse) {
4487 		txr = &sc->hn_tx_ring[idx];
4488 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4489 		    ("TX ring %d already attached", idx));
4490 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4491 
4492 		txr->hn_chan = chan;
4493 		if (bootverbose) {
4494 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4495 			    idx, vmbus_chan_id(chan));
4496 		}
4497 	}
4498 
4499 	/* Bind this channel to a proper CPU. */
4500 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4501 
4502 	/*
4503 	 * Open this channel
4504 	 */
4505 	cbr.cbr = rxr->hn_br;
4506 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4507 	cbr.cbr_txsz = HN_TXBR_SIZE;
4508 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4509 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4510 	if (error) {
4511 		if (error == EISCONN) {
4512 			if_printf(sc->hn_ifp, "bufring is connected after "
4513 			    "chan%u open failure\n", vmbus_chan_id(chan));
4514 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4515 		} else {
4516 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4517 			    vmbus_chan_id(chan), error);
4518 		}
4519 	}
4520 	return (error);
4521 }
4522 
4523 static void
4524 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4525 {
4526 	struct hn_rx_ring *rxr;
4527 	int idx, error;
4528 
4529 	idx = vmbus_chan_subidx(chan);
4530 
4531 	/*
4532 	 * Link this channel to RX/TX ring.
4533 	 */
4534 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4535 	    ("invalid channel index %d, should be >= 0 && < %d",
4536 	     idx, sc->hn_rx_ring_inuse));
4537 	rxr = &sc->hn_rx_ring[idx];
4538 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4539 	    ("RX ring %d is not attached", idx));
4540 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4541 
4542 	if (idx < sc->hn_tx_ring_inuse) {
4543 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4544 
4545 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4546 		    ("TX ring %d is not attached", idx));
4547 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4548 	}
4549 
4550 	/*
4551 	 * Close this channel.
4552 	 *
4553 	 * NOTE:
4554 	 * Channel closing does _not_ destroy the target channel.
4555 	 */
4556 	error = vmbus_chan_close_direct(chan);
4557 	if (error == EISCONN) {
4558 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4559 		    "after being closed\n", vmbus_chan_id(chan));
4560 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4561 	} else if (error) {
4562 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4563 		    vmbus_chan_id(chan), error);
4564 	}
4565 }
4566 
4567 static int
4568 hn_attach_subchans(struct hn_softc *sc)
4569 {
4570 	struct vmbus_channel **subchans;
4571 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4572 	int i, error = 0;
4573 
4574 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4575 
4576 	/* Attach the sub-channels. */
4577 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4578 	for (i = 0; i < subchan_cnt; ++i) {
4579 		int error1;
4580 
4581 		error1 = hn_chan_attach(sc, subchans[i]);
4582 		if (error1) {
4583 			error = error1;
4584 			/* Move on; all channels will be detached later. */
4585 		}
4586 	}
4587 	vmbus_subchan_rel(subchans, subchan_cnt);
4588 
4589 	if (error) {
4590 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4591 	} else {
4592 		if (bootverbose) {
4593 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4594 			    subchan_cnt);
4595 		}
4596 	}
4597 	return (error);
4598 }
4599 
4600 static void
4601 hn_detach_allchans(struct hn_softc *sc)
4602 {
4603 	struct vmbus_channel **subchans;
4604 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4605 	int i;
4606 
4607 	if (subchan_cnt == 0)
4608 		goto back;
4609 
4610 	/* Detach the sub-channels. */
4611 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4612 	for (i = 0; i < subchan_cnt; ++i)
4613 		hn_chan_detach(sc, subchans[i]);
4614 	vmbus_subchan_rel(subchans, subchan_cnt);
4615 
4616 back:
4617 	/*
4618 	 * Detach the primary channel, _after_ all sub-channels
4619 	 * are detached.
4620 	 */
4621 	hn_chan_detach(sc, sc->hn_prichan);
4622 
4623 	/* Wait for sub-channels to be destroyed, if any. */
4624 	vmbus_subchan_drain(sc->hn_prichan);
4625 
4626 #ifdef INVARIANTS
4627 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4628 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4629 		    HN_RX_FLAG_ATTACHED) == 0,
4630 		    ("%dth RX ring is still attached", i));
4631 	}
4632 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4633 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4634 		    HN_TX_FLAG_ATTACHED) == 0,
4635 		    ("%dth TX ring is still attached", i));
4636 	}
4637 #endif
4638 }
4639 
4640 static int
4641 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4642 {
4643 	struct vmbus_channel **subchans;
4644 	int nchan, rxr_cnt, error;
4645 
4646 	nchan = *nsubch + 1;
4647 	if (nchan == 1) {
4648 		/*
4649 		 * Multiple RX/TX rings are not requested.
4650 		 */
4651 		*nsubch = 0;
4652 		return (0);
4653 	}
4654 
4655 	/*
4656 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4657 	 * table entries.
4658 	 */
4659 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4660 	if (error) {
4661 		/* No RSS; this is benign. */
4662 		*nsubch = 0;
4663 		return (0);
4664 	}
4665 	if (bootverbose) {
4666 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4667 		    rxr_cnt, nchan);
4668 	}
4669 
4670 	if (nchan > rxr_cnt)
4671 		nchan = rxr_cnt;
4672 	if (nchan == 1) {
4673 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4674 		*nsubch = 0;
4675 		return (0);
4676 	}
4677 
4678 	/*
4679 	 * Allocate sub-channels from NVS.
4680 	 */
4681 	*nsubch = nchan - 1;
4682 	error = hn_nvs_alloc_subchans(sc, nsubch);
4683 	if (error || *nsubch == 0) {
4684 		/* Failed to allocate sub-channels. */
4685 		*nsubch = 0;
4686 		return (0);
4687 	}
4688 
4689 	/*
4690 	 * Wait for all sub-channels to become ready before moving on.
4691 	 */
4692 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4693 	vmbus_subchan_rel(subchans, *nsubch);
4694 	return (0);
4695 }
4696 
4697 static bool
4698 hn_synth_attachable(const struct hn_softc *sc)
4699 {
4700 	int i;
4701 
4702 	if (sc->hn_flags & HN_FLAG_ERRORS)
4703 		return (false);
4704 
4705 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4706 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4707 
4708 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4709 			return (false);
4710 	}
4711 	return (true);
4712 }
4713 
4714 static int
4715 hn_synth_attach(struct hn_softc *sc, int mtu)
4716 {
4717 #define ATTACHED_NVS		0x0002
4718 #define ATTACHED_RNDIS		0x0004
4719 
4720 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4721 	int error, nsubch, nchan, i;
4722 	uint32_t old_caps, attached = 0;
4723 
4724 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4725 	    ("synthetic parts were attached"));
4726 
4727 	if (!hn_synth_attachable(sc))
4728 		return (ENXIO);
4729 
4730 	/* Save capabilities for later verification. */
4731 	old_caps = sc->hn_caps;
4732 	sc->hn_caps = 0;
4733 
4734 	/* Clear RSS state. */
4735 	sc->hn_rss_ind_size = 0;
4736 	sc->hn_rss_hash = 0;
4737 
4738 	/*
4739 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4740 	 */
4741 	error = hn_chan_attach(sc, sc->hn_prichan);
4742 	if (error)
4743 		goto failed;
4744 
4745 	/*
4746 	 * Attach NVS.
4747 	 */
4748 	error = hn_nvs_attach(sc, mtu);
4749 	if (error)
4750 		goto failed;
4751 	attached |= ATTACHED_NVS;
4752 
4753 	/*
4754 	 * Attach RNDIS _after_ NVS is attached.
4755 	 */
4756 	error = hn_rndis_attach(sc, mtu);
4757 	if (error)
4758 		goto failed;
4759 	attached |= ATTACHED_RNDIS;
4760 
4761 	/*
4762 	 * Make sure capabilities are not changed.
4763 	 */
4764 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4765 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4766 		    old_caps, sc->hn_caps);
4767 		error = ENXIO;
4768 		goto failed;
4769 	}
4770 
4771 	/*
4772 	 * Allocate sub-channels for multi-TX/RX rings.
4773 	 *
4774 	 * NOTE:
4775 	 * The # of RX rings that can be used is equivalent to the # of
4776 	 * channels to be requested.
4777 	 */
4778 	nsubch = sc->hn_rx_ring_cnt - 1;
4779 	error = hn_synth_alloc_subchans(sc, &nsubch);
4780 	if (error)
4781 		goto failed;
4782 	/* NOTE: _Full_ synthetic parts detach is required now. */
4783 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4784 
4785 	/*
4786 	 * Set the # of TX/RX rings that could be used according to
4787 	 * the # of channels that NVS offered.
4788 	 */
4789 	nchan = nsubch + 1;
4790 	hn_set_ring_inuse(sc, nchan);
4791 	if (nchan == 1) {
4792 		/* Only the primary channel can be used; done */
4793 		goto back;
4794 	}
4795 
4796 	/*
4797 	 * Attach the sub-channels.
4798 	 *
4799 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4800 	 */
4801 	error = hn_attach_subchans(sc);
4802 	if (error)
4803 		goto failed;
4804 
4805 	/*
4806 	 * Configure RSS key and indirect table _after_ all sub-channels
4807 	 * are attached.
4808 	 */
4809 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4810 		/*
4811 		 * RSS key is not set yet; set it to the default RSS key.
4812 		 */
4813 		if (bootverbose)
4814 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4815 #ifdef RSS
4816 		rss_getkey(rss->rss_key);
4817 #else
4818 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4819 #endif
4820 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4821 	}
4822 
4823 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4824 		/*
4825 		 * RSS indirect table is not set yet; set it up in round-
4826 		 * robin fashion.
4827 		 */
4828 		if (bootverbose) {
4829 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4830 			    "table\n");
4831 		}
4832 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4833 			uint32_t subidx;
4834 
4835 #ifdef RSS
4836 			subidx = rss_get_indirection_to_bucket(i);
4837 #else
4838 			subidx = i;
4839 #endif
4840 			rss->rss_ind[i] = subidx % nchan;
4841 		}
4842 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4843 	} else {
4844 		/*
4845 		 * # of usable channels may be changed, so we have to
4846 		 * make sure that all entries in the RSS indirect table
4847 		 * are valid.
4848 		 *
4849 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4850 		 */
4851 		hn_rss_ind_fixup(sc);
4852 	}
4853 
4854 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4855 	if (error)
4856 		goto failed;
4857 back:
4858 	/*
4859 	 * Fixup transmission aggregation setup.
4860 	 */
4861 	hn_set_txagg(sc);
4862 	return (0);
4863 
4864 failed:
4865 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4866 		hn_synth_detach(sc);
4867 	} else {
4868 		if (attached & ATTACHED_RNDIS)
4869 			hn_rndis_detach(sc);
4870 		if (attached & ATTACHED_NVS)
4871 			hn_nvs_detach(sc);
4872 		hn_chan_detach(sc, sc->hn_prichan);
4873 		/* Restore old capabilities. */
4874 		sc->hn_caps = old_caps;
4875 	}
4876 	return (error);
4877 
4878 #undef ATTACHED_RNDIS
4879 #undef ATTACHED_NVS
4880 }
4881 
4882 /*
4883  * NOTE:
4884  * The interface must have been suspended through hn_suspend(), before
4885  * this function gets called.
4886  */
4887 static void
4888 hn_synth_detach(struct hn_softc *sc)
4889 {
4890 
4891 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4892 	    ("synthetic parts were not attached"));
4893 
4894 	/* Detach the RNDIS first. */
4895 	hn_rndis_detach(sc);
4896 
4897 	/* Detach NVS. */
4898 	hn_nvs_detach(sc);
4899 
4900 	/* Detach all of the channels. */
4901 	hn_detach_allchans(sc);
4902 
4903 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4904 }
4905 
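/*
 * Record how many RX/TX rings will actually be used; the # of TX
 * rings in use is capped by the # of TX rings created.
 */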
4906 static void
4907 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4908 {
4909 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4910 	    ("invalid ring count %d", ring_cnt));
4911 
4912 	if (sc->hn_tx_ring_cnt > ring_cnt)
4913 		sc->hn_tx_ring_inuse = ring_cnt;
4914 	else
4915 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4916 	sc->hn_rx_ring_inuse = ring_cnt;
4917 
4918 #ifdef RSS
4919 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4920 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4921 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4922 		    rss_getnumbuckets());
4923 	}
4924 #endif
4925 
4926 	if (bootverbose) {
4927 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4928 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4929 	}
4930 }
4931 
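/*
 * Wait for the hypervisor to drain the channel's RX bufring (and TX
 * bufring, unless the primary channel has been revoked), then drain
 * the channel's pending interrupt processing.
 */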
4932 static void
4933 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4934 {
4935 
4936 	/*
4937 	 * NOTE:
4938 	 * The TX bufring will not be drained by the hypervisor,
4939 	 * if the primary channel is revoked.
4940 	 */
4941 	while (!vmbus_chan_rx_empty(chan) ||
4942 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4943 	     !vmbus_chan_tx_empty(chan)))
4944 		pause("waitch", 1);
4945 	vmbus_chan_intr_drain(chan);
4946 }
4947 
4948 static void
4949 hn_suspend_data(struct hn_softc *sc)
4950 {
4951 	struct vmbus_channel **subch = NULL;
4952 	struct hn_tx_ring *txr;
4953 	int i, nsubch;
4954 
4955 	HN_LOCK_ASSERT(sc);
4956 
4957 	/*
4958 	 * Suspend TX.
4959 	 */
4960 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4961 		txr = &sc->hn_tx_ring[i];
4962 
4963 		mtx_lock(&txr->hn_tx_lock);
4964 		txr->hn_suspended = 1;
4965 		mtx_unlock(&txr->hn_tx_lock);
4966 		/* No one is able to send more packets now. */
4967 
4968 		/*
4969 		 * Wait for all pending sends to finish.
4970 		 *
4971 		 * NOTE:
4972 		 * We will _not_ receive all pending send-done, if the
4973 		 * primary channel is revoked.
4974 		 */
4975 		while (hn_tx_ring_pending(txr) &&
4976 		    !vmbus_chan_is_revoked(sc->hn_prichan))
4977 			pause("hnwtx", 1 /* 1 tick */);
4978 	}
4979 
4980 	/*
4981 	 * Disable RX by clearing RX filter.
4982 	 */
4983 	hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
4984 
4985 	/*
4986 	 * Give RNDIS enough time to flush all pending data packets.
4987 	 */
4988 	pause("waitrx", (200 * hz) / 1000);
4989 
4990 	/*
4991 	 * Drain RX/TX bufrings and interrupts.
4992 	 */
4993 	nsubch = sc->hn_rx_ring_inuse - 1;
4994 	if (nsubch > 0)
4995 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4996 
4997 	if (subch != NULL) {
4998 		for (i = 0; i < nsubch; ++i)
4999 			hn_chan_drain(sc, subch[i]);
5000 	}
5001 	hn_chan_drain(sc, sc->hn_prichan);
5002 
5003 	if (subch != NULL)
5004 		vmbus_subchan_rel(subch, nsubch);
5005 
5006 	/*
5007 	 * Drain any pending TX tasks.
5008 	 *
5009 	 * NOTE:
5010 	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
5011 	 * tasks will have to be drained _after_ the above hn_chan_drain()
5012 	 * calls.
5013 	 */
5014 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5015 		txr = &sc->hn_tx_ring[i];
5016 
5017 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5018 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5019 	}
5020 }
5021 
5022 static void
5023 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5024 {
5025 
5026 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5027 }
5028 
5029 static void
5030 hn_suspend_mgmt(struct hn_softc *sc)
5031 {
5032 	struct task task;
5033 
5034 	HN_LOCK_ASSERT(sc);
5035 
5036 	/*
5037 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5038 	 * through hn_mgmt_taskq.
5039 	 */
5040 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5041 	vmbus_chan_run_task(sc->hn_prichan, &task);
5042 
5043 	/*
5044 	 * Make sure that all pending management tasks are completed.
5045 	 */
5046 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5047 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5048 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
5049 }
5050 
5051 static void
5052 hn_suspend(struct hn_softc *sc)
5053 {
5054 
5055 	/* Disable polling. */
5056 	hn_polling(sc, 0);
5057 
5058 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5059 	    (sc->hn_flags & HN_FLAG_VF))
5060 		hn_suspend_data(sc);
5061 	hn_suspend_mgmt(sc);
5062 }
5063 
5064 static void
5065 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5066 {
5067 	int i;
5068 
5069 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5070 	    ("invalid TX ring count %d", tx_ring_cnt));
5071 
5072 	for (i = 0; i < tx_ring_cnt; ++i) {
5073 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5074 
5075 		mtx_lock(&txr->hn_tx_lock);
5076 		txr->hn_suspended = 0;
5077 		mtx_unlock(&txr->hn_tx_lock);
5078 	}
5079 }
5080 
5081 static void
5082 hn_resume_data(struct hn_softc *sc)
5083 {
5084 	int i;
5085 
5086 	HN_LOCK_ASSERT(sc);
5087 
5088 	/*
5089 	 * Re-enable RX.
5090 	 */
5091 	hn_rxfilter_config(sc);
5092 
5093 	/*
5094 	 * Make sure to clear suspend status on "all" TX rings,
5095 	 * since hn_tx_ring_inuse can be changed after
5096 	 * hn_suspend_data().
5097 	 */
5098 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5099 
5100 #ifdef HN_IFSTART_SUPPORT
5101 	if (!hn_use_if_start)
5102 #endif
5103 	{
5104 		/*
5105 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
5106 		 * reduced.
5107 		 */
5108 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5109 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5110 	}
5111 
5112 	/*
5113 	 * Kick start TX.
5114 	 */
5115 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5116 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5117 
5118 		/*
5119 		 * Use txeof task, so that any pending oactive can be
5120 		 * cleared properly.
5121 		 */
5122 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5123 	}
5124 }
5125 
5126 static void
5127 hn_resume_mgmt(struct hn_softc *sc)
5128 {
5129 
5130 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5131 
5132 	/*
5133 	 * Kick off network change detection, if it was pending.
5134 	 * If no network change was pending, start link status
5135 	 * checks, which is more lightweight than network change
5136 	 * detection.
5137 	 */
5138 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5139 		hn_change_network(sc);
5140 	else
5141 		hn_update_link_status(sc);
5142 }
5143 
5144 static void
5145 hn_resume(struct hn_softc *sc)
5146 {
5147 
5148 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5149 	    (sc->hn_flags & HN_FLAG_VF))
5150 		hn_resume_data(sc);
5151 
5152 	/*
5153 	 * When the VF is activated, the synthetic interface is changed
5154 	 * to DOWN in hn_set_vf(). Here, if the VF is still active, we
5155 	 * don't call hn_resume_mgmt() until the VF is deactivated in
5156 	 * hn_set_vf().
5157 	 */
5158 	if (!(sc->hn_flags & HN_FLAG_VF))
5159 		hn_resume_mgmt(sc);
5160 
5161 	/*
5162 	 * Re-enable polling if this interface is running and
5163 	 * the polling is requested.
5164 	 */
5165 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5166 		hn_polling(sc, sc->hn_pollhz);
5167 }
5168 
5169 static void
5170 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5171 {
5172 	const struct rndis_status_msg *msg;
5173 	int ofs;
5174 
5175 	if (dlen < sizeof(*msg)) {
5176 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5177 		return;
5178 	}
5179 	msg = data;
5180 
5181 	switch (msg->rm_status) {
5182 	case RNDIS_STATUS_MEDIA_CONNECT:
5183 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5184 		hn_update_link_status(sc);
5185 		break;
5186 
5187 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5188 		/* Not really useful; ignore. */
5189 		break;
5190 
5191 	case RNDIS_STATUS_NETWORK_CHANGE:
5192 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5193 		if (dlen < ofs + msg->rm_stbuflen ||
5194 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5195 			if_printf(sc->hn_ifp, "network changed\n");
5196 		} else {
5197 			uint32_t change;
5198 
5199 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5200 			    sizeof(change));
5201 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5202 			    change);
5203 		}
5204 		hn_change_network(sc);
5205 		break;
5206 
5207 	default:
5208 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5209 		    msg->rm_status);
5210 		break;
5211 	}
5212 }
5213 
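/*
 * Walk the RNDIS per-packet-info elements and extract the VLAN,
 * checksum and hash value/info fields into 'info'.  Malformed
 * elements yield EINVAL; unrecognized types are skipped.
 */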
5214 static int
5215 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5216 {
5217 	const struct rndis_pktinfo *pi = info_data;
5218 	uint32_t mask = 0;
5219 
5220 	while (info_dlen != 0) {
5221 		const void *data;
5222 		uint32_t dlen;
5223 
5224 		if (__predict_false(info_dlen < sizeof(*pi)))
5225 			return (EINVAL);
5226 		if (__predict_false(info_dlen < pi->rm_size))
5227 			return (EINVAL);
5228 		info_dlen -= pi->rm_size;
5229 
5230 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5231 			return (EINVAL);
5232 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5233 			return (EINVAL);
5234 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5235 		data = pi->rm_data;
5236 
5237 		switch (pi->rm_type) {
5238 		case NDIS_PKTINFO_TYPE_VLAN:
5239 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5240 				return (EINVAL);
5241 			info->vlan_info = *((const uint32_t *)data);
5242 			mask |= HN_RXINFO_VLAN;
5243 			break;
5244 
5245 		case NDIS_PKTINFO_TYPE_CSUM:
5246 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5247 				return (EINVAL);
5248 			info->csum_info = *((const uint32_t *)data);
5249 			mask |= HN_RXINFO_CSUM;
5250 			break;
5251 
5252 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5253 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5254 				return (EINVAL);
5255 			info->hash_value = *((const uint32_t *)data);
5256 			mask |= HN_RXINFO_HASHVAL;
5257 			break;
5258 
5259 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5260 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5261 				return (EINVAL);
5262 			info->hash_info = *((const uint32_t *)data);
5263 			mask |= HN_RXINFO_HASHINF;
5264 			break;
5265 
5266 		default:
5267 			goto next;
5268 		}
5269 
5270 		if (mask == HN_RXINFO_ALL) {
5271 			/* All found; done */
5272 			break;
5273 		}
5274 next:
5275 		pi = (const struct rndis_pktinfo *)
5276 		    ((const uint8_t *)pi + pi->rm_size);
5277 	}
5278 
5279 	/*
5280 	 * Final fixup.
5281 	 * - If there is no hash value, invalidate the hash info.
5282 	 */
5283 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5284 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5285 	return (0);
5286 }
5287 
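/*
 * Return true if the region [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */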
5288 static __inline bool
5289 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5290 {
5291 
5292 	if (off < check_off) {
5293 		if (__predict_true(off + len <= check_off))
5294 			return (false);
5295 	} else if (off > check_off) {
5296 		if (__predict_true(check_off + check_len <= off))
5297 			return (false);
5298 	}
5299 	return (true);
5300 }
5301 
5302 static void
5303 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5304 {
5305 	const struct rndis_packet_msg *pkt;
5306 	struct hn_rxinfo info;
5307 	int data_off, pktinfo_off, data_len, pktinfo_len;
5308 
5309 	/*
5310 	 * Check length.
5311 	 */
5312 	if (__predict_false(dlen < sizeof(*pkt))) {
5313 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5314 		return;
5315 	}
5316 	pkt = data;
5317 
5318 	if (__predict_false(dlen < pkt->rm_len)) {
5319 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5320 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5321 		return;
5322 	}
5323 	if (__predict_false(pkt->rm_len <
5324 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5325 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5326 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5327 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5328 		    pkt->rm_pktinfolen);
5329 		return;
5330 	}
5331 	if (__predict_false(pkt->rm_datalen == 0)) {
5332 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5333 		return;
5334 	}
5335 
5336 	/*
5337 	 * Check offsets.
5338 	 */
5339 #define IS_OFFSET_INVALID(ofs)			\
5340 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5341 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5342 
5343 	/* XXX Hyper-V does not meet data offset alignment requirement */
5344 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5345 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5346 		    "data offset %u\n", pkt->rm_dataoffset);
5347 		return;
5348 	}
5349 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5350 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5351 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5352 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5353 		return;
5354 	}
5355 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5356 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5357 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5358 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5359 		return;
5360 	}
5361 
5362 #undef IS_OFFSET_INVALID
5363 
5364 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5365 	data_len = pkt->rm_datalen;
5366 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5367 	pktinfo_len = pkt->rm_pktinfolen;
5368 
5369 	/*
5370 	 * Check OOB coverage.
5371 	 */
5372 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5373 		int oob_off, oob_len;
5374 
5375 		if_printf(rxr->hn_ifp, "got oobdata\n");
5376 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5377 		oob_len = pkt->rm_oobdatalen;
5378 
5379 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5380 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5381 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5382 			    pkt->rm_len, oob_off, oob_len);
5383 			return;
5384 		}
5385 
5386 		/*
5387 		 * Check against data.
5388 		 */
5389 		if (hn_rndis_check_overlap(oob_off, oob_len,
5390 		    data_off, data_len)) {
5391 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5392 			    "oob overlaps data, oob abs %d len %d, "
5393 			    "data abs %d len %d\n",
5394 			    oob_off, oob_len, data_off, data_len);
5395 			return;
5396 		}
5397 
5398 		/*
5399 		 * Check against pktinfo.
5400 		 */
5401 		if (pktinfo_len != 0 &&
5402 		    hn_rndis_check_overlap(oob_off, oob_len,
5403 		    pktinfo_off, pktinfo_len)) {
5404 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5405 			    "oob overlaps pktinfo, oob abs %d len %d, "
5406 			    "pktinfo abs %d len %d\n",
5407 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5408 			return;
5409 		}
5410 	}
5411 
5412 	/*
5413 	 * Check per-packet-info coverage and find useful per-packet-info.
5414 	 */
5415 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5416 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5417 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5418 	if (__predict_true(pktinfo_len != 0)) {
5419 		bool overlap;
5420 		int error;
5421 
5422 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5423 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5424 			    "pktinfo overflow, msglen %u, "
5425 			    "pktinfo abs %d len %d\n",
5426 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5427 			return;
5428 		}
5429 
5430 		/*
5431 		 * Check packet info coverage.
5432 		 */
5433 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5434 		    data_off, data_len);
5435 		if (__predict_false(overlap)) {
5436 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5437 			    "pktinfo overlaps data, pktinfo abs %d len %d, "
5438 			    "data abs %d len %d\n",
5439 			    pktinfo_off, pktinfo_len, data_off, data_len);
5440 			return;
5441 		}
5442 
5443 		/*
5444 		 * Find useful per-packet-info.
5445 		 */
5446 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5447 		    pktinfo_len, &info);
5448 		if (__predict_false(error)) {
5449 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5450 			    "pktinfo\n");
5451 			return;
5452 		}
5453 	}
5454 
5455 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5456 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5457 		    "data overflow, msglen %u, data abs %d len %d\n",
5458 		    pkt->rm_len, data_off, data_len);
5459 		return;
5460 	}
5461 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5462 }
5463 
5464 static __inline void
5465 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5466 {
5467 	const struct rndis_msghdr *hdr;
5468 
5469 	if (__predict_false(dlen < sizeof(*hdr))) {
5470 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5471 		return;
5472 	}
5473 	hdr = data;
5474 
5475 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5476 		/* Hot data path. */
5477 		hn_rndis_rx_data(rxr, data, dlen);
5478 		/* Done! */
5479 		return;
5480 	}
5481 
5482 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5483 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5484 	else
5485 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5486 }
5487 
5488 static void
5489 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5490 {
5491 	const struct hn_nvs_hdr *hdr;
5492 
5493 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5494 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5495 		return;
5496 	}
5497 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5498 
5499 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5500 		/* Useless; ignore */
5501 		return;
5502 	}
5503 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5504 }
5505 
5506 static void
5507 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5508     const struct vmbus_chanpkt_hdr *pkt)
5509 {
5510 	struct hn_nvs_sendctx *sndc;
5511 
5512 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5513 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5514 	    VMBUS_CHANPKT_DATALEN(pkt));
5515 	/*
5516 	 * NOTE:
5517 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5518 	 * its callback.
5519 	 */
5520 }
5521 
5522 static void
5523 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5524     const struct vmbus_chanpkt_hdr *pkthdr)
5525 {
5526 	const struct vmbus_chanpkt_rxbuf *pkt;
5527 	const struct hn_nvs_hdr *nvs_hdr;
5528 	int count, i, hlen;
5529 
5530 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5531 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5532 		return;
5533 	}
5534 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5535 
5536 	/* Make sure that this is a RNDIS message. */
5537 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5538 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5539 		    nvs_hdr->nvs_type);
5540 		return;
5541 	}
5542 
5543 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5544 	if (__predict_false(hlen < sizeof(*pkt))) {
5545 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5546 		return;
5547 	}
5548 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5549 
5550 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5551 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5552 		    pkt->cp_rxbuf_id);
5553 		return;
5554 	}
5555 
5556 	count = pkt->cp_rxbuf_cnt;
5557 	if (__predict_false(hlen <
5558 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5559 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5560 		return;
5561 	}
5562 
5563 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5564 	for (i = 0; i < count; ++i) {
5565 		int ofs, len;
5566 
5567 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5568 		len = pkt->cp_rxbuf[i].rb_len;
5569 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5570 			if_printf(rxr->hn_ifp, "RNDIS msg %d overflows rxbuf, "
5571 			    "ofs %d, len %d\n", i, ofs, len);
5572 			continue;
5573 		}
5574 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5575 	}
5576 
5577 	/*
5578 	 * Ack the consumed RXBUF associated w/ this channel packet,
5579 	 * so that this RXBUF can be recycled by the hypervisor.
5580 	 */
5581 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5582 }
5583 
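/*
 * Ack the consumed RXBUF by sending an RNDIS ack completion for the
 * given transaction id.  The send is retried a bounded number of
 * times on EAGAIN; if it still fails, the RXBUF is leaked.
 */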
5584 static void
5585 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5586     uint64_t tid)
5587 {
5588 	struct hn_nvs_rndis_ack ack;
5589 	int retries, error;
5590 
5591 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5592 	ack.nvs_status = HN_NVS_STATUS_OK;
5593 
5594 	retries = 0;
5595 again:
5596 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5597 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5598 	if (__predict_false(error == EAGAIN)) {
5599 		/*
5600 		 * NOTE:
5601 		 * This should _not_ happen in practice, since the
5602 		 * consumption of the TX bufring by the TX path is
5603 		 * controlled.
5604 		 */
5605 		if (rxr->hn_ack_failed == 0)
5606 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5607 		rxr->hn_ack_failed++;
5608 		retries++;
5609 		if (retries < 10) {
5610 			DELAY(100);
5611 			goto again;
5612 		}
5613 		/* RXBUF leaks! */
5614 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5615 	}
5616 }
5617 
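/*
 * Per-channel receive callback.  Drain the channel: grow the per-ring
 * packet buffer and retry on ENOBUFS, stop on EAGAIN (ring empty),
 * and dispatch each packet by type (completion, RXBUF or inband
 * notify).  Pending RX/TX work is flushed via hn_chan_rollup().
 */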
5618 static void
5619 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5620 {
5621 	struct hn_rx_ring *rxr = xrxr;
5622 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5623 
5624 	for (;;) {
5625 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5626 		int error, pktlen;
5627 
5628 		pktlen = rxr->hn_pktbuf_len;
5629 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5630 		if (__predict_false(error == ENOBUFS)) {
5631 			void *nbuf;
5632 			int nlen;
5633 
5634 			/*
5635 			 * Expand channel packet buffer.
5636 			 *
5637 			 * XXX
5638 			 * Use M_WAITOK here, since allocation failure
5639 			 * is fatal.
5640 			 */
5641 			nlen = rxr->hn_pktbuf_len * 2;
5642 			while (nlen < pktlen)
5643 				nlen *= 2;
5644 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5645 
5646 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5647 			    rxr->hn_pktbuf_len, nlen);
5648 
5649 			free(rxr->hn_pktbuf, M_DEVBUF);
5650 			rxr->hn_pktbuf = nbuf;
5651 			rxr->hn_pktbuf_len = nlen;
5652 			/* Retry! */
5653 			continue;
5654 		} else if (__predict_false(error == EAGAIN)) {
5655 			/* No more channel packets; done! */
5656 			break;
5657 		}
5658 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5659 
5660 		switch (pkt->cph_type) {
5661 		case VMBUS_CHANPKT_TYPE_COMP:
5662 			hn_nvs_handle_comp(sc, chan, pkt);
5663 			break;
5664 
5665 		case VMBUS_CHANPKT_TYPE_RXBUF:
5666 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5667 			break;
5668 
5669 		case VMBUS_CHANPKT_TYPE_INBAND:
5670 			hn_nvs_handle_notify(sc, pkt);
5671 			break;
5672 
5673 		default:
5674 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5675 			    pkt->cph_type);
5676 			break;
5677 		}
5678 	}
5679 	hn_chan_rollup(rxr, rxr->hn_txr);
5680 }
5681 
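/*
 * SYSINIT hook: clamp the TX taskqueue count and mode settings and,
 * when running on Hyper-V with the global taskqueue mode selected,
 * create the global TX taskqueues.
 */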
5682 static void
5683 hn_tx_taskq_create(void *arg __unused)
5684 {
5685 	int i;
5686 
5687 	/*
5688 	 * Sanitize the # of TX taskqueues.
5689 	 */
5690 	if (hn_tx_taskq_cnt <= 0)
5691 		hn_tx_taskq_cnt = 1;
5692 	else if (hn_tx_taskq_cnt > mp_ncpus)
5693 		hn_tx_taskq_cnt = mp_ncpus;
5694 
5695 	/*
5696 	 * Sanitize the TX taskqueue mode.
5697 	 */
5698 	switch (hn_tx_taskq_mode) {
5699 	case HN_TX_TASKQ_M_INDEP:
5700 	case HN_TX_TASKQ_M_GLOBAL:
5701 	case HN_TX_TASKQ_M_EVTTQ:
5702 		break;
5703 	default:
5704 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5705 		break;
5706 	}
5707 
5708 	if (vm_guest != VM_GUEST_HV)
5709 		return;
5710 
5711 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5712 		return;
5713 
5714 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5715 	    M_DEVBUF, M_WAITOK);
5716 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5717 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5718 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5719 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5720 		    "hn tx%d", i);
5721 	}
5722 }
5723 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5724     hn_tx_taskq_create, NULL);
5725 
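/*
 * SYSUNINIT hook: tear down the global TX taskqueues created above,
 * if any were created.
 */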
5726 static void
5727 hn_tx_taskq_destroy(void *arg __unused)
5728 {
5729 
5730 	if (hn_tx_taskque != NULL) {
5731 		int i;
5732 
5733 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5734 			taskqueue_free(hn_tx_taskque[i]);
5735 		free(hn_tx_taskque, M_DEVBUF);
5736 	}
5737 }
5738 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5739     hn_tx_taskq_destroy, NULL);
5740