xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 9ff086544d5f85b58349e28ed36a9811b8fe5cf9)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 
81 #include <machine/atomic.h>
82 #include <machine/in_cksum.h>
83 
84 #include <net/bpf.h>
85 #include <net/ethernet.h>
86 #include <net/if.h>
87 #include <net/if_media.h>
88 #include <net/if_types.h>
89 #include <net/if_var.h>
90 #include <net/rndis.h>
91 #ifdef RSS
92 #include <net/rss_config.h>
93 #endif
94 
95 #include <netinet/in_systm.h>
96 #include <netinet/in.h>
97 #include <netinet/ip.h>
98 #include <netinet/ip6.h>
99 #include <netinet/tcp.h>
100 #include <netinet/tcp_lro.h>
101 #include <netinet/udp.h>
102 
103 #include <dev/hyperv/include/hyperv.h>
104 #include <dev/hyperv/include/hyperv_busdma.h>
105 #include <dev/hyperv/include/vmbus.h>
106 #include <dev/hyperv/include/vmbus_xact.h>
107 
108 #include <dev/hyperv/netvsc/ndis.h>
109 #include <dev/hyperv/netvsc/if_hnreg.h>
110 #include <dev/hyperv/netvsc/if_hnvar.h>
111 #include <dev/hyperv/netvsc/hn_nvs.h>
112 #include <dev/hyperv/netvsc/hn_rndis.h>
113 
114 #include "vmbus_if.h"
115 
116 #define HN_IFSTART_SUPPORT
117 
118 #define HN_RING_CNT_DEF_MAX		8
119 
120 /* YYY should get it from the underlying channel */
121 #define HN_TX_DESC_CNT			512
122 
123 #define HN_RNDIS_PKT_LEN					\
124 	(sizeof(struct rndis_packet_msg) +			\
125 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
126 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
127 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
129 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
130 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
131 
132 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
133 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
134 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
135 /* -1 for RNDIS packet message */
136 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
137 
138 #define HN_DIRECT_TX_SIZE_DEF		128
139 
140 #define HN_EARLY_TXEOF_THRESH		8
141 
142 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
143 
144 #define HN_LROENT_CNT_DEF		128
145 
146 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
147 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
148 /* YYY 2*MTU is a bit rough, but should be good enough. */
149 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
150 
151 #define HN_LRO_ACKCNT_DEF		1
152 
153 #define HN_LOCK_INIT(sc)		\
154 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
155 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
156 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
157 #define HN_LOCK(sc)					\
158 do {							\
159 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
160 		DELAY(1000);				\
161 } while (0)
162 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
163 
164 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
165 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
166 #define HN_CSUM_IP_HWASSIST(sc)		\
167 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
168 #define HN_CSUM_IP6_HWASSIST(sc)	\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
170 
171 #define HN_PKTSIZE_MIN(align)		\
172 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
173 	    HN_RNDIS_PKT_LEN, (align))
174 #define HN_PKTSIZE(m, align)		\
175 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
176 
177 #ifdef RSS
178 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
179 #else
180 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
181 #endif
182 
183 struct hn_txdesc {
184 #ifndef HN_USE_TXDESC_BUFRING
185 	SLIST_ENTRY(hn_txdesc)		link;
186 #endif
187 	STAILQ_ENTRY(hn_txdesc)		agg_link;
188 
189 	/* Aggregated txdescs, in sending order. */
190 	STAILQ_HEAD(, hn_txdesc)	agg_list;
191 
192 	/* The oldest packet, if transmission aggregation happens. */
193 	struct mbuf			*m;
194 	struct hn_tx_ring		*txr;
195 	int				refs;
196 	uint32_t			flags;	/* HN_TXD_FLAG_ */
197 	struct hn_nvs_sendctx		send_ctx;
198 	uint32_t			chim_index;
199 	int				chim_size;
200 
201 	bus_dmamap_t			data_dmap;
202 
203 	bus_addr_t			rndis_pkt_paddr;
204 	struct rndis_packet_msg		*rndis_pkt;
205 	bus_dmamap_t			rndis_pkt_dmap;
206 };
207 
208 #define HN_TXD_FLAG_ONLIST		0x0001
209 #define HN_TXD_FLAG_DMAMAP		0x0002
210 #define HN_TXD_FLAG_ONAGG		0x0004
211 
212 struct hn_rxinfo {
213 	uint32_t			vlan_info;
214 	uint32_t			csum_info;
215 	uint32_t			hash_info;
216 	uint32_t			hash_value;
217 };
218 
219 #define HN_RXINFO_VLAN			0x0001
220 #define HN_RXINFO_CSUM			0x0002
221 #define HN_RXINFO_HASHINF		0x0004
222 #define HN_RXINFO_HASHVAL		0x0008
223 #define HN_RXINFO_ALL			\
224 	(HN_RXINFO_VLAN |		\
225 	 HN_RXINFO_CSUM |		\
226 	 HN_RXINFO_HASHINF |		\
227 	 HN_RXINFO_HASHVAL)
228 
229 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
230 #define HN_NDIS_RXCSUM_INFO_INVALID	0
231 #define HN_NDIS_HASH_INFO_INVALID	0
232 
233 static int			hn_probe(device_t);
234 static int			hn_attach(device_t);
235 static int			hn_detach(device_t);
236 static int			hn_shutdown(device_t);
237 static void			hn_chan_callback(struct vmbus_channel *,
238 				    void *);
239 
240 static void			hn_init(void *);
241 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
242 #ifdef HN_IFSTART_SUPPORT
243 static void			hn_start(struct ifnet *);
244 #endif
245 static int			hn_transmit(struct ifnet *, struct mbuf *);
246 static void			hn_xmit_qflush(struct ifnet *);
247 static int			hn_ifmedia_upd(struct ifnet *);
248 static void			hn_ifmedia_sts(struct ifnet *,
249 				    struct ifmediareq *);
250 
251 static int			hn_rndis_rxinfo(const void *, int,
252 				    struct hn_rxinfo *);
253 static void			hn_rndis_rx_data(struct hn_rx_ring *,
254 				    const void *, int);
255 static void			hn_rndis_rx_status(struct hn_softc *,
256 				    const void *, int);
257 
258 static void			hn_nvs_handle_notify(struct hn_softc *,
259 				    const struct vmbus_chanpkt_hdr *);
260 static void			hn_nvs_handle_comp(struct hn_softc *,
261 				    struct vmbus_channel *,
262 				    const struct vmbus_chanpkt_hdr *);
263 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
264 				    struct vmbus_channel *,
265 				    const struct vmbus_chanpkt_hdr *);
266 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
267 				    struct vmbus_channel *, uint64_t);
268 
269 #if __FreeBSD_version >= 1100099
270 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
271 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
272 #endif
273 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
274 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
275 #if __FreeBSD_version < 1100095
276 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
277 #else
278 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
279 #endif
280 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
282 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
283 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
284 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
285 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
286 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
287 #ifndef RSS
288 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
290 #endif
291 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
294 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
295 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
296 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
297 
298 static void			hn_stop(struct hn_softc *);
299 static void			hn_init_locked(struct hn_softc *);
300 static int			hn_chan_attach(struct hn_softc *,
301 				    struct vmbus_channel *);
302 static void			hn_chan_detach(struct hn_softc *,
303 				    struct vmbus_channel *);
304 static int			hn_attach_subchans(struct hn_softc *);
305 static void			hn_detach_allchans(struct hn_softc *);
306 static void			hn_chan_rollup(struct hn_rx_ring *,
307 				    struct hn_tx_ring *);
308 static void			hn_set_ring_inuse(struct hn_softc *, int);
309 static int			hn_synth_attach(struct hn_softc *, int);
310 static void			hn_synth_detach(struct hn_softc *);
311 static int			hn_synth_alloc_subchans(struct hn_softc *,
312 				    int *);
313 static bool			hn_synth_attachable(const struct hn_softc *);
314 static void			hn_suspend(struct hn_softc *);
315 static void			hn_suspend_data(struct hn_softc *);
316 static void			hn_suspend_mgmt(struct hn_softc *);
317 static void			hn_resume(struct hn_softc *);
318 static void			hn_resume_data(struct hn_softc *);
319 static void			hn_resume_mgmt(struct hn_softc *);
320 static void			hn_suspend_mgmt_taskfunc(void *, int);
321 static void			hn_chan_drain(struct hn_softc *,
322 				    struct vmbus_channel *);
323 static void			hn_polling(struct hn_softc *, u_int);
324 static void			hn_chan_polling(struct vmbus_channel *, u_int);
325 
326 static void			hn_update_link_status(struct hn_softc *);
327 static void			hn_change_network(struct hn_softc *);
328 static void			hn_link_taskfunc(void *, int);
329 static void			hn_netchg_init_taskfunc(void *, int);
330 static void			hn_netchg_status_taskfunc(void *, int);
331 static void			hn_link_status(struct hn_softc *);
332 
333 static int			hn_create_rx_data(struct hn_softc *, int);
334 static void			hn_destroy_rx_data(struct hn_softc *);
335 static int			hn_check_iplen(const struct mbuf *, int);
336 static int			hn_set_rxfilter(struct hn_softc *);
337 #ifndef RSS
338 static int			hn_rss_reconfig(struct hn_softc *);
339 #endif
340 static void			hn_rss_ind_fixup(struct hn_softc *);
341 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
342 				    int, const struct hn_rxinfo *);
343 
344 static int			hn_tx_ring_create(struct hn_softc *, int);
345 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
346 static int			hn_create_tx_data(struct hn_softc *, int);
347 static void			hn_fixup_tx_data(struct hn_softc *);
348 static void			hn_destroy_tx_data(struct hn_softc *);
349 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
350 static void			hn_txdesc_gc(struct hn_tx_ring *,
351 				    struct hn_txdesc *);
352 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
353 				    struct hn_txdesc *, struct mbuf **);
354 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
355 				    struct hn_txdesc *);
356 static void			hn_set_chim_size(struct hn_softc *, int);
357 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
358 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
359 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
360 static void			hn_resume_tx(struct hn_softc *, int);
361 static void			hn_set_txagg(struct hn_softc *);
362 static void			*hn_try_txagg(struct ifnet *,
363 				    struct hn_tx_ring *, struct hn_txdesc *,
364 				    int);
365 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
366 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
367 				    struct hn_softc *, struct vmbus_channel *,
368 				    const void *, int);
369 static int			hn_txpkt_sglist(struct hn_tx_ring *,
370 				    struct hn_txdesc *);
371 static int			hn_txpkt_chim(struct hn_tx_ring *,
372 				    struct hn_txdesc *);
373 static int			hn_xmit(struct hn_tx_ring *, int);
374 static void			hn_xmit_taskfunc(void *, int);
375 static void			hn_xmit_txeof(struct hn_tx_ring *);
376 static void			hn_xmit_txeof_taskfunc(void *, int);
377 #ifdef HN_IFSTART_SUPPORT
378 static int			hn_start_locked(struct hn_tx_ring *, int);
379 static void			hn_start_taskfunc(void *, int);
380 static void			hn_start_txeof(struct hn_tx_ring *);
381 static void			hn_start_txeof_taskfunc(void *, int);
382 #endif
383 
384 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
385     "Hyper-V network interface");
386 
387 /* Trust TCP segment verification on host side. */
388 static int			hn_trust_hosttcp = 1;
389 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
390     &hn_trust_hosttcp, 0,
391     "Trust tcp segement verification on host side, "
392     "when csum info is missing (global setting)");
393 
394 /* Trust UDP datagram verification on host side. */
395 static int			hn_trust_hostudp = 1;
396 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
397     &hn_trust_hostudp, 0,
398     "Trust udp datagram verification on host side, "
399     "when csum info is missing (global setting)");
400 
401 /* Trust IP packet verification on host side. */
402 static int			hn_trust_hostip = 1;
403 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
404     &hn_trust_hostip, 0,
405     "Trust ip packet verification on host side, "
406     "when csum info is missing (global setting)");
407 
408 /* Limit TSO burst size */
409 static int			hn_tso_maxlen = IP_MAXPACKET;
410 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
411     &hn_tso_maxlen, 0, "TSO burst limit");
412 
413 /* Limit chimney send size */
414 static int			hn_tx_chimney_size = 0;
415 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
416     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
417 
418 /* Limit the size of packet for direct transmission */
419 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
420 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
421     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
422 
423 /* # of LRO entries per RX ring */
424 #if defined(INET) || defined(INET6)
425 #if __FreeBSD_version >= 1100095
426 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
427 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
428     &hn_lro_entry_count, 0, "LRO entry count");
429 #endif
430 #endif
431 
432 static int			hn_tx_taskq_cnt = 1;
433 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
434     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
435 
436 #define HN_TX_TASKQ_M_INDEP	0
437 #define HN_TX_TASKQ_M_GLOBAL	1
438 #define HN_TX_TASKQ_M_EVTTQ	2
439 
440 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
441 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
442     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
443     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
444 
445 #ifndef HN_USE_TXDESC_BUFRING
446 static int			hn_use_txdesc_bufring = 0;
447 #else
448 static int			hn_use_txdesc_bufring = 1;
449 #endif
450 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
451     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
452 
453 #ifdef HN_IFSTART_SUPPORT
454 /* Use ifnet.if_start instead of ifnet.if_transmit */
455 static int			hn_use_if_start = 0;
456 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
457     &hn_use_if_start, 0, "Use if_start TX method");
458 #endif
459 
460 /* # of channels to use */
461 static int			hn_chan_cnt = 0;
462 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
463     &hn_chan_cnt, 0,
464     "# of channels to use; each channel has one RX ring and one TX ring");
465 
466 /* # of transmit rings to use */
467 static int			hn_tx_ring_cnt = 0;
468 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
469     &hn_tx_ring_cnt, 0, "# of TX rings to use");
470 
471 /* Software TX ring depth */
472 static int			hn_tx_swq_depth = 0;
473 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
474     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
475 
476 /* Enable sorted LRO by setting the depth of the per-channel mbuf queue */
477 #if __FreeBSD_version >= 1100095
478 static u_int			hn_lro_mbufq_depth = 0;
479 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
480     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
481 #endif
482 
483 /* Packet transmission aggregation size limit */
484 static int			hn_tx_agg_size = -1;
485 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
486     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
487 
488 /* Packet transmission aggregation count limit */
489 static int			hn_tx_agg_pkts = -1;
490 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
491     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
492 
493 static u_int			hn_cpu_index;	/* next CPU for channel */
494 static struct taskqueue		**hn_tx_taskque; /* shared TX taskqueues */
495 
496 #ifndef RSS
497 static const uint8_t
498 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
499 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
500 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
501 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
502 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
503 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
504 };
505 #endif	/* !RSS */
506 
507 static device_method_t hn_methods[] = {
508 	/* Device interface */
509 	DEVMETHOD(device_probe,		hn_probe),
510 	DEVMETHOD(device_attach,	hn_attach),
511 	DEVMETHOD(device_detach,	hn_detach),
512 	DEVMETHOD(device_shutdown,	hn_shutdown),
513 	DEVMETHOD_END
514 };
515 
516 static driver_t hn_driver = {
517 	"hn",
518 	hn_methods,
519 	sizeof(struct hn_softc)
520 };
521 
522 static devclass_t hn_devclass;
523 
524 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
525 MODULE_VERSION(hn, 1);
526 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
527 
528 #if __FreeBSD_version >= 1100099
529 static void
530 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
531 {
532 	int i;
533 
534 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
535 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
536 }
537 #endif
538 
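/*
 * Send an RNDIS data packet described by the TX ring's GPA (guest
 * physical address) array through NVS.  This path is used when the
 * packet has not been copied into a chimney sending buffer.
 */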
539 static int
540 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
541 {
542 
543 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
544 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
545 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
546 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
547 }
548 
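/*
 * Send an RNDIS data packet that has been copied into a chimney
 * sending buffer; the buffer index and size were recorded in the
 * txdesc beforehand.
 */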
549 static int
550 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
551 {
552 	struct hn_nvs_rndis rndis;
553 
554 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
555 	    txd->chim_size > 0, ("invalid rndis chim txd"));
556 
557 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
558 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
559 	rndis.nvs_chim_idx = txd->chim_index;
560 	rndis.nvs_chim_sz = txd->chim_size;
561 
562 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
563 	    &rndis, sizeof(rndis), &txd->send_ctx));
564 }
565 
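/*
 * Allocate a chimney sending buffer slot by scanning the allocation
 * bitmap for a clear bit and setting it atomically, so no lock is
 * needed.  Returns HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 * For example, with LONG_BIT == 64, slot 70 is bit 6 of bitmap word 1.
 */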
566 static __inline uint32_t
567 hn_chim_alloc(struct hn_softc *sc)
568 {
569 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
570 	u_long *bmap = sc->hn_chim_bmap;
571 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
572 
573 	for (i = 0; i < bmap_cnt; ++i) {
574 		int idx;
575 
576 		idx = ffsl(~bmap[i]);
577 		if (idx == 0)
578 			continue;
579 
580 		--idx; /* ffsl is 1-based */
581 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
582 		    ("invalid i %d and idx %d", i, idx));
583 
584 		if (atomic_testandset_long(&bmap[i], idx))
585 			continue;
586 
587 		ret = i * LONG_BIT + idx;
588 		break;
589 	}
590 	return (ret);
591 }
592 
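/*
 * Return a chimney sending buffer slot to the allocation bitmap.
 */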
593 static __inline void
594 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
595 {
596 	u_long mask;
597 	uint32_t idx;
598 
599 	idx = chim_idx / LONG_BIT;
600 	KASSERT(idx < sc->hn_chim_bmap_cnt,
601 	    ("invalid chimney index 0x%x", chim_idx));
602 
603 	mask = 1UL << (chim_idx % LONG_BIT);
604 	KASSERT(sc->hn_chim_bmap[idx] & mask,
605 	    ("index bitmap 0x%lx, chimney index %u, "
606 	     "bitmap idx %d, bitmask 0x%lx",
607 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
608 
609 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
610 }
611 
612 #if defined(INET6) || defined(INET)
613 /*
614  * NOTE: If this function fails, m_head will be freed.
615  */
616 static __inline struct mbuf *
617 hn_tso_fixup(struct mbuf *m_head)
618 {
619 	struct ether_vlan_header *evl;
620 	struct tcphdr *th;
621 	int ehlen;
622 
623 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
624 
625 #define PULLUP_HDR(m, len)				\
626 do {							\
627 	if (__predict_false((m)->m_len < (len))) {	\
628 		(m) = m_pullup((m), (len));		\
629 		if ((m) == NULL)			\
630 			return (NULL);			\
631 	}						\
632 } while (0)
633 
634 	PULLUP_HDR(m_head, sizeof(*evl));
635 	evl = mtod(m_head, struct ether_vlan_header *);
636 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
637 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
638 	else
639 		ehlen = ETHER_HDR_LEN;
640 
641 #ifdef INET
642 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
643 		struct ip *ip;
644 		int iphlen;
645 
646 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
647 		ip = mtodo(m_head, ehlen);
648 		iphlen = ip->ip_hl << 2;
649 
650 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
651 		th = mtodo(m_head, ehlen + iphlen);
652 
653 		ip->ip_len = 0;
654 		ip->ip_sum = 0;
655 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
656 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
657 	}
658 #endif
659 #if defined(INET6) && defined(INET)
660 	else
661 #endif
662 #ifdef INET6
663 	{
664 		struct ip6_hdr *ip6;
665 
666 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
667 		ip6 = mtodo(m_head, ehlen);
668 		if (ip6->ip6_nxt != IPPROTO_TCP) {
669 			m_freem(m_head);
670 			return (NULL);
671 		}
672 
673 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
674 		th = mtodo(m_head, ehlen + sizeof(*ip6));
675 
676 		ip6->ip6_plen = 0;
677 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
678 	}
679 #endif
680 	return (m_head);
681 
682 #undef PULLUP_HDR
683 }
684 #endif	/* INET6 || INET */
685 
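/*
 * Program the RNDIS RX filter from the ifnet flags: promiscuous mode
 * maps to NDIS_PACKET_TYPE_PROMISCUOUS; otherwise directed packets
 * plus broadcast/all-multicast are accepted as appropriate.  The RNDIS
 * request is only issued when the filter actually changes.
 */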
686 static int
687 hn_set_rxfilter(struct hn_softc *sc)
688 {
689 	struct ifnet *ifp = sc->hn_ifp;
690 	uint32_t filter;
691 	int error = 0;
692 
693 	HN_LOCK_ASSERT(sc);
694 
695 	if (ifp->if_flags & IFF_PROMISC) {
696 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
697 	} else {
698 		filter = NDIS_PACKET_TYPE_DIRECTED;
699 		if (ifp->if_flags & IFF_BROADCAST)
700 			filter |= NDIS_PACKET_TYPE_BROADCAST;
701 		/* TODO: support multicast list */
702 		if ((ifp->if_flags & IFF_ALLMULTI) ||
703 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
704 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
705 	}
706 
707 	if (sc->hn_rx_filter != filter) {
708 		error = hn_rndis_set_rxfilter(sc, filter);
709 		if (!error)
710 			sc->hn_rx_filter = filter;
711 	}
712 	return (error);
713 }
714 
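/*
 * Compute the packet transmission aggregation limits (size, packet
 * count and alignment) from the driver tunables and the limits offered
 * by RNDIS and the chimney sending buffer, then propagate the result
 * to all TX rings.  Aggregation is disabled by setting size/pkts to 0.
 */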
715 static void
716 hn_set_txagg(struct hn_softc *sc)
717 {
718 	uint32_t size, pkts;
719 	int i;
720 
721 	/*
722 	 * Setup aggregation size.
723 	 */
724 	if (sc->hn_agg_size < 0)
725 		size = UINT32_MAX;
726 	else
727 		size = sc->hn_agg_size;
728 
729 	if (sc->hn_rndis_agg_size < size)
730 		size = sc->hn_rndis_agg_size;
731 
732 	/* NOTE: We only aggregate packets using chimney sending buffers. */
733 	if (size > (uint32_t)sc->hn_chim_szmax)
734 		size = sc->hn_chim_szmax;
735 
736 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
737 		/* Disable */
738 		size = 0;
739 		pkts = 0;
740 		goto done;
741 	}
742 
743 	/* NOTE: Type of the per TX ring setting is 'int'. */
744 	if (size > INT_MAX)
745 		size = INT_MAX;
746 
747 	/*
748 	 * Setup aggregation packet count.
749 	 */
750 	if (sc->hn_agg_pkts < 0)
751 		pkts = UINT32_MAX;
752 	else
753 		pkts = sc->hn_agg_pkts;
754 
755 	if (sc->hn_rndis_agg_pkts < pkts)
756 		pkts = sc->hn_rndis_agg_pkts;
757 
758 	if (pkts <= 1) {
759 		/* Disable */
760 		size = 0;
761 		pkts = 0;
762 		goto done;
763 	}
764 
765 	/* NOTE: Type of the per TX ring setting is 'short'. */
766 	if (pkts > SHRT_MAX)
767 		pkts = SHRT_MAX;
768 
769 done:
770 	/* NOTE: Type of the per TX ring setting is 'short'. */
771 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
772 		/* Disable */
773 		size = 0;
774 		pkts = 0;
775 	}
776 
777 	if (bootverbose) {
778 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
779 		    size, pkts, sc->hn_rndis_agg_align);
780 	}
781 
782 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
783 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
784 
785 		mtx_lock(&txr->hn_tx_lock);
786 		txr->hn_agg_szmax = size;
787 		txr->hn_agg_pktmax = pkts;
788 		txr->hn_agg_align = sc->hn_rndis_agg_align;
789 		mtx_unlock(&txr->hn_tx_lock);
790 	}
791 }
792 
793 static int
794 hn_get_txswq_depth(const struct hn_tx_ring *txr)
795 {
796 
797 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
798 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
799 		return txr->hn_txdesc_cnt;
800 	return hn_tx_swq_depth;
801 }
802 
803 #ifndef RSS
804 static int
805 hn_rss_reconfig(struct hn_softc *sc)
806 {
807 	int error;
808 
809 	HN_LOCK_ASSERT(sc);
810 
811 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
812 		return (ENXIO);
813 
814 	/*
815 	 * Disable RSS first.
816 	 *
817 	 * NOTE:
818 	 * Direct reconfiguration by setting the UNCHG flags does
819 	 * _not_ work properly.
820 	 */
821 	if (bootverbose)
822 		if_printf(sc->hn_ifp, "disable RSS\n");
823 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
824 	if (error) {
825 		if_printf(sc->hn_ifp, "RSS disable failed\n");
826 		return (error);
827 	}
828 
829 	/*
830 	 * Reenable the RSS w/ the updated RSS key or indirect
831 	 * table.
832 	 */
833 	if (bootverbose)
834 		if_printf(sc->hn_ifp, "reconfig RSS\n");
835 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
836 	if (error) {
837 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
838 		return (error);
839 	}
840 	return (0);
841 }
842 #endif	/* !RSS */
843 
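/*
 * Clamp RSS indirect table entries so that they only reference
 * channels (RX rings) which are actually in use.
 */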
844 static void
845 hn_rss_ind_fixup(struct hn_softc *sc)
846 {
847 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
848 	int i, nchan;
849 
850 	nchan = sc->hn_rx_ring_inuse;
851 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
852 
853 	/*
854 	 * Check indirect table to make sure that all channels in it
855 	 * can be used.
856 	 */
857 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
858 		if (rss->rss_ind[i] >= nchan) {
859 			if_printf(sc->hn_ifp,
860 			    "RSS indirect table %d fixup: %u -> %d\n",
861 			    i, rss->rss_ind[i], nchan - 1);
862 			rss->rss_ind[i] = nchan - 1;
863 		}
864 	}
865 }
866 
867 static int
868 hn_ifmedia_upd(struct ifnet *ifp __unused)
869 {
870 
871 	return EOPNOTSUPP;
872 }
873 
874 static void
875 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
876 {
877 	struct hn_softc *sc = ifp->if_softc;
878 
879 	ifmr->ifm_status = IFM_AVALID;
880 	ifmr->ifm_active = IFM_ETHER;
881 
882 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
883 		ifmr->ifm_active |= IFM_NONE;
884 		return;
885 	}
886 	ifmr->ifm_status |= IFM_ACTIVE;
887 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
888 }
889 
890 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
891 static const struct hyperv_guid g_net_vsc_device_type = {
892 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
893 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
894 };
895 
896 static int
897 hn_probe(device_t dev)
898 {
899 
900 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
901 	    &g_net_vsc_device_type) == 0) {
902 		device_set_desc(dev, "Hyper-V Network Interface");
903 		return BUS_PROBE_DEFAULT;
904 	}
905 	return ENXIO;
906 }
907 
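/*
 * Attach sequence: set up the TX and management taskqueues, allocate
 * the ifnet and ifmedia, create the TX/RX rings, create the vmbus
 * transaction context, attach the synthetic parts (NVS/RNDIS), hook
 * up the sysctls, set up the ifnet, and finally ether_ifattach() and
 * kick off the initial link status check.
 */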
908 static int
909 hn_attach(device_t dev)
910 {
911 	struct hn_softc *sc = device_get_softc(dev);
912 	struct sysctl_oid_list *child;
913 	struct sysctl_ctx_list *ctx;
914 	uint8_t eaddr[ETHER_ADDR_LEN];
915 	struct ifnet *ifp = NULL;
916 	int error, ring_cnt, tx_ring_cnt;
917 
918 	sc->hn_dev = dev;
919 	sc->hn_prichan = vmbus_get_channel(dev);
920 	HN_LOCK_INIT(sc);
921 
922 	/*
923 	 * Initialize these tunables once.
924 	 */
925 	sc->hn_agg_size = hn_tx_agg_size;
926 	sc->hn_agg_pkts = hn_tx_agg_pkts;
927 
928 	/*
929 	 * Setup taskqueue for transmission.
930 	 */
931 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
932 		int i;
933 
934 		sc->hn_tx_taskqs =
935 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
936 		    M_DEVBUF, M_WAITOK);
937 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
938 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
939 			    M_WAITOK, taskqueue_thread_enqueue,
940 			    &sc->hn_tx_taskqs[i]);
941 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
942 			    "%s tx%d", device_get_nameunit(dev), i);
943 		}
944 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
945 		sc->hn_tx_taskqs = hn_tx_taskque;
946 	}
947 
948 	/*
949 	 * Setup taskqueue for management tasks, e.g. link status.
950 	 */
951 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
952 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
953 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
954 	    device_get_nameunit(dev));
955 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
956 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
957 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
958 	    hn_netchg_status_taskfunc, sc);
959 
960 	/*
961 	 * Allocate ifnet and setup its name earlier, so that if_printf
962 	 * can be used by functions which will be called after
963 	 * ether_ifattach().
964 	 */
965 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
966 	ifp->if_softc = sc;
967 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
968 
969 	/*
970 	 * Initialize ifmedia earlier so that it can be unconditionally
971 	 * destroyed, if an error happens later on.
972 	 */
973 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
974 
975 	/*
976 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
977 	 * to use (tx_ring_cnt).
978 	 *
979 	 * NOTE:
980 	 * The # of RX rings to use is same as the # of channels to use.
981 	 */
982 	ring_cnt = hn_chan_cnt;
983 	if (ring_cnt <= 0) {
984 		/* Default */
985 		ring_cnt = mp_ncpus;
986 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
987 			ring_cnt = HN_RING_CNT_DEF_MAX;
988 	} else if (ring_cnt > mp_ncpus) {
989 		ring_cnt = mp_ncpus;
990 	}
991 #ifdef RSS
992 	if (ring_cnt > rss_getnumbuckets())
993 		ring_cnt = rss_getnumbuckets();
994 #endif
995 
996 	tx_ring_cnt = hn_tx_ring_cnt;
997 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
998 		tx_ring_cnt = ring_cnt;
999 #ifdef HN_IFSTART_SUPPORT
1000 	if (hn_use_if_start) {
1001 		/* ifnet.if_start only needs one TX ring. */
1002 		tx_ring_cnt = 1;
1003 	}
1004 #endif
1005 
1006 	/*
1007 	 * Set the leader CPU for channels.
1008 	 */
1009 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1010 
1011 	/*
1012 	 * Create enough TX/RX rings, even if only limited number of
1013 	 * channels can be allocated.
1014 	 */
1015 	error = hn_create_tx_data(sc, tx_ring_cnt);
1016 	if (error)
1017 		goto failed;
1018 	error = hn_create_rx_data(sc, ring_cnt);
1019 	if (error)
1020 		goto failed;
1021 
1022 	/*
1023 	 * Create transaction context for NVS and RNDIS transactions.
1024 	 */
1025 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1026 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1027 	if (sc->hn_xact == NULL) {
1028 		error = ENXIO;
1029 		goto failed;
1030 	}
1031 
1032 	/*
1033 	 * Install orphan handler for the revocation of this device's
1034 	 * primary channel.
1035 	 *
1036 	 * NOTE:
1037 	 * The processing order is critical here:
1038 	 * Install the orphan handler, _before_ testing whether this
1039 	 * device's primary channel has been revoked or not.
1040 	 */
1041 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1042 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1043 		error = ENXIO;
1044 		goto failed;
1045 	}
1046 
1047 	/*
1048 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1049 	 */
1050 	error = hn_synth_attach(sc, ETHERMTU);
1051 	if (error)
1052 		goto failed;
1053 
1054 	error = hn_rndis_get_eaddr(sc, eaddr);
1055 	if (error)
1056 		goto failed;
1057 
1058 #if __FreeBSD_version >= 1100099
1059 	if (sc->hn_rx_ring_inuse > 1) {
1060 		/*
1061 		 * Reduce TCP segment aggregation limit for multiple
1062 		 * RX rings to increase ACK timeliness.
1063 		 */
1064 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1065 	}
1066 #endif
1067 
1068 	/*
1069 	 * Fixup TX settings after the synthetic parts are attached.
1070 	 */
1071 	hn_fixup_tx_data(sc);
1072 
1073 	ctx = device_get_sysctl_ctx(dev);
1074 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1075 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1076 	    &sc->hn_nvs_ver, 0, "NVS version");
1077 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1078 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1079 	    hn_ndis_version_sysctl, "A", "NDIS version");
1080 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1081 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1082 	    hn_caps_sysctl, "A", "capabilities");
1083 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1084 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1085 	    hn_hwassist_sysctl, "A", "hwassist");
1086 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1087 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1088 	    hn_rxfilter_sysctl, "A", "rxfilter");
1089 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1090 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1091 	    hn_rss_hash_sysctl, "A", "RSS hash");
1092 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1093 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1094 #ifndef RSS
1095 	/*
1096 	 * Don't allow RSS key/indirect table changes if RSS is defined.
1097 	 */
1098 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1099 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1100 	    hn_rss_key_sysctl, "IU", "RSS key");
1101 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1102 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1103 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1104 #endif
1105 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1106 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1107 	    "RNDIS offered packet transmission aggregation size limit");
1108 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1109 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1110 	    "RNDIS offered packet transmission aggregation count limit");
1111 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1112 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1113 	    "RNDIS packet transmission aggregation alignment");
1114 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1115 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1116 	    hn_txagg_size_sysctl, "I",
1117 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1118 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1119 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1120 	    hn_txagg_pkts_sysctl, "I",
1121 	    "Packet transmission aggregation packets, "
1122 	    "0 -- disable, -1 -- auto");
1123 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1124 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1125 	    hn_polling_sysctl, "I",
1126 	    "Polling frequency: [100,1000000], 0 disable polling");
1127 
1128 	/*
1129 	 * Setup the ifmedia, which has been initialized earlier.
1130 	 */
1131 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1132 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1133 	/* XXX ifmedia_set really should do this for us */
1134 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1135 
1136 	/*
1137 	 * Setup the ifnet for this interface.
1138 	 */
1139 
1140 	ifp->if_baudrate = IF_Gbps(10);
1141 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1142 	ifp->if_ioctl = hn_ioctl;
1143 	ifp->if_init = hn_init;
1144 #ifdef HN_IFSTART_SUPPORT
1145 	if (hn_use_if_start) {
1146 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1147 
1148 		ifp->if_start = hn_start;
1149 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1150 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1151 		IFQ_SET_READY(&ifp->if_snd);
1152 	} else
1153 #endif
1154 	{
1155 		ifp->if_transmit = hn_transmit;
1156 		ifp->if_qflush = hn_xmit_qflush;
1157 	}
1158 
1159 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1160 #ifdef foo
1161 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1162 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1163 #endif
1164 	if (sc->hn_caps & HN_CAP_VLAN) {
1165 		/* XXX not sure about VLAN_MTU. */
1166 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1167 	}
1168 
1169 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1170 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1171 		ifp->if_capabilities |= IFCAP_TXCSUM;
1172 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1173 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1174 	if (sc->hn_caps & HN_CAP_TSO4) {
1175 		ifp->if_capabilities |= IFCAP_TSO4;
1176 		ifp->if_hwassist |= CSUM_IP_TSO;
1177 	}
1178 	if (sc->hn_caps & HN_CAP_TSO6) {
1179 		ifp->if_capabilities |= IFCAP_TSO6;
1180 		ifp->if_hwassist |= CSUM_IP6_TSO;
1181 	}
1182 
1183 	/* Enable all available capabilities by default. */
1184 	ifp->if_capenable = ifp->if_capabilities;
1185 
1186 	/*
1187 	 * Disable IPv6 TSO and TXCSUM by default, they still can
1188 	 * be enabled through SIOCSIFCAP.
1189 	 */
1190 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1191 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1192 
1193 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1194 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1195 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1196 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1197 	}
1198 
1199 	ether_ifattach(ifp, eaddr);
1200 
1201 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1202 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1203 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1204 	}
1205 
1206 	/* Inform the upper layer about the long frame support. */
1207 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1208 
1209 	/*
1210 	 * Kick off link status check.
1211 	 */
1212 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1213 	hn_update_link_status(sc);
1214 
1215 	return (0);
1216 failed:
1217 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1218 		hn_synth_detach(sc);
1219 	hn_detach(dev);
1220 	return (error);
1221 }
1222 
1223 static int
1224 hn_detach(device_t dev)
1225 {
1226 	struct hn_softc *sc = device_get_softc(dev);
1227 	struct ifnet *ifp = sc->hn_ifp;
1228 
1229 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1230 		/*
1231 		 * In case the vmbus missed the orphan handler
1232 		 * installation.
1233 		 */
1234 		vmbus_xact_ctx_orphan(sc->hn_xact);
1235 	}
1236 
1237 	if (device_is_attached(dev)) {
1238 		HN_LOCK(sc);
1239 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1240 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1241 				hn_stop(sc);
1242 			/*
1243 			 * NOTE:
1244 			 * hn_stop() only suspends data, so management
1245 			 * tasks have to be suspended manually here.
1246 			 */
1247 			hn_suspend_mgmt(sc);
1248 			hn_synth_detach(sc);
1249 		}
1250 		HN_UNLOCK(sc);
1251 		ether_ifdetach(ifp);
1252 	}
1253 
1254 	ifmedia_removeall(&sc->hn_media);
1255 	hn_destroy_rx_data(sc);
1256 	hn_destroy_tx_data(sc);
1257 
1258 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1259 		int i;
1260 
1261 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1262 			taskqueue_free(sc->hn_tx_taskqs[i]);
1263 		free(sc->hn_tx_taskqs, M_DEVBUF);
1264 	}
1265 	taskqueue_free(sc->hn_mgmt_taskq0);
1266 
1267 	if (sc->hn_xact != NULL) {
1268 		/*
1269 		 * Uninstall the orphan handler _before_ the xact is
1270 		 * destructed.
1271 		 */
1272 		vmbus_chan_unset_orphan(sc->hn_prichan);
1273 		vmbus_xact_ctx_destroy(sc->hn_xact);
1274 	}
1275 
1276 	if_free(ifp);
1277 
1278 	HN_LOCK_DESTROY(sc);
1279 	return (0);
1280 }
1281 
1282 static int
1283 hn_shutdown(device_t dev)
1284 {
1285 
1286 	return (0);
1287 }
1288 
1289 static void
1290 hn_link_status(struct hn_softc *sc)
1291 {
1292 	uint32_t link_status;
1293 	int error;
1294 
1295 	error = hn_rndis_get_linkstatus(sc, &link_status);
1296 	if (error) {
1297 		/* XXX what to do? */
1298 		return;
1299 	}
1300 
1301 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1302 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1303 	else
1304 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1305 	if_link_state_change(sc->hn_ifp,
1306 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1307 	    LINK_STATE_UP : LINK_STATE_DOWN);
1308 }
1309 
1310 static void
1311 hn_link_taskfunc(void *xsc, int pending __unused)
1312 {
1313 	struct hn_softc *sc = xsc;
1314 
1315 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1316 		return;
1317 	hn_link_status(sc);
1318 }
1319 
1320 static void
1321 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1322 {
1323 	struct hn_softc *sc = xsc;
1324 
1325 	/* Prevent any link status checks from running. */
1326 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1327 
1328 	/*
1329 	 * Fake up a [link down --> link up] state change; a 5 second
1330 	 * delay is used, which closely simulates the miibus reaction
1331 	 * upon a link down event.
1332 	 */
1333 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1334 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1335 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1336 	    &sc->hn_netchg_status, 5 * hz);
1337 }
1338 
1339 static void
1340 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1341 {
1342 	struct hn_softc *sc = xsc;
1343 
1344 	/* Re-allow link status checks. */
1345 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1346 	hn_link_status(sc);
1347 }
1348 
1349 static void
1350 hn_update_link_status(struct hn_softc *sc)
1351 {
1352 
1353 	if (sc->hn_mgmt_taskq != NULL)
1354 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1355 }
1356 
1357 static void
1358 hn_change_network(struct hn_softc *sc)
1359 {
1360 
1361 	if (sc->hn_mgmt_taskq != NULL)
1362 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1363 }
1364 
1365 static __inline int
1366 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1367     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1368 {
1369 	struct mbuf *m = *m_head;
1370 	int error;
1371 
1372 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1373 
1374 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1375 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1376 	if (error == EFBIG) {
1377 		struct mbuf *m_new;
1378 
1379 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1380 		if (m_new == NULL)
1381 			return ENOBUFS;
1382 		else
1383 			*m_head = m = m_new;
1384 		txr->hn_tx_collapsed++;
1385 
1386 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1387 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1388 	}
1389 	if (!error) {
1390 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1391 		    BUS_DMASYNC_PREWRITE);
1392 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1393 	}
1394 	return error;
1395 }
1396 
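/*
 * Drop a reference on a txdesc.  When the last reference goes away,
 * free any txdescs aggregated onto it, release its chimney sending
 * buffer slot or unload its DMA map, free the mbuf, and put the
 * txdesc back onto the free list (or buf_ring).  Returns 1 if the
 * txdesc was freed, 0 otherwise.
 */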
1397 static __inline int
1398 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1399 {
1400 
1401 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1402 	    ("put an onlist txd %#x", txd->flags));
1403 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1404 	    ("put an onagg txd %#x", txd->flags));
1405 
1406 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1407 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1408 		return 0;
1409 
1410 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1411 		struct hn_txdesc *tmp_txd;
1412 
1413 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1414 			int freed;
1415 
1416 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1417 			    ("recursive aggregation on aggregated txdesc"),
1418 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1419 			    ("not aggregated txdesc"));
1420 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1421 			    ("aggregated txdesc uses dmamap"));
1422 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1423 			    ("aggregated txdesc consumes "
1424 			     "chimney sending buffer"));
1425 			KASSERT(tmp_txd->chim_size == 0,
1426 			    ("aggregated txdesc has non-zero "
1427 			     "chimney sending size"));
1428 
1429 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1430 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1431 			freed = hn_txdesc_put(txr, tmp_txd);
1432 			KASSERT(freed, ("failed to free aggregated txdesc"));
1433 		}
1434 	}
1435 
1436 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1437 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1438 		    ("chim txd uses dmamap"));
1439 		hn_chim_free(txr->hn_sc, txd->chim_index);
1440 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1441 		txd->chim_size = 0;
1442 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1443 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1444 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1445 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1446 		    txd->data_dmap);
1447 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1448 	}
1449 
1450 	if (txd->m != NULL) {
1451 		m_freem(txd->m);
1452 		txd->m = NULL;
1453 	}
1454 
1455 	txd->flags |= HN_TXD_FLAG_ONLIST;
1456 #ifndef HN_USE_TXDESC_BUFRING
1457 	mtx_lock_spin(&txr->hn_txlist_spin);
1458 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1459 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1460 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1461 	txr->hn_txdesc_avail++;
1462 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1463 	mtx_unlock_spin(&txr->hn_txlist_spin);
1464 #else	/* HN_USE_TXDESC_BUFRING */
1465 #ifdef HN_DEBUG
1466 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1467 #endif
1468 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1469 #endif	/* !HN_USE_TXDESC_BUFRING */
1470 
1471 	return 1;
1472 }
1473 
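/*
 * Grab a free txdesc from the per-ring free list (or buf_ring) and
 * initialize its reference count.  Returns NULL if none is available.
 */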
1474 static __inline struct hn_txdesc *
1475 hn_txdesc_get(struct hn_tx_ring *txr)
1476 {
1477 	struct hn_txdesc *txd;
1478 
1479 #ifndef HN_USE_TXDESC_BUFRING
1480 	mtx_lock_spin(&txr->hn_txlist_spin);
1481 	txd = SLIST_FIRST(&txr->hn_txlist);
1482 	if (txd != NULL) {
1483 		KASSERT(txr->hn_txdesc_avail > 0,
1484 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1485 		txr->hn_txdesc_avail--;
1486 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1487 	}
1488 	mtx_unlock_spin(&txr->hn_txlist_spin);
1489 #else
1490 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1491 #endif
1492 
1493 	if (txd != NULL) {
1494 #ifdef HN_USE_TXDESC_BUFRING
1495 #ifdef HN_DEBUG
1496 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1497 #endif
1498 #endif	/* HN_USE_TXDESC_BUFRING */
1499 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1500 		    STAILQ_EMPTY(&txd->agg_list) &&
1501 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1502 		    txd->chim_size == 0 &&
1503 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1504 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1505 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1506 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1507 		txd->refs = 1;
1508 	}
1509 	return txd;
1510 }
1511 
1512 static __inline void
1513 hn_txdesc_hold(struct hn_txdesc *txd)
1514 {
1515 
1516 	/* 0->1 transition will never work */
1517 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1518 	atomic_add_int(&txd->refs, 1);
1519 }
1520 
1521 static __inline void
1522 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1523 {
1524 
1525 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1526 	    ("recursive aggregation on aggregating txdesc"));
1527 
1528 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1529 	    ("already aggregated"));
1530 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1531 	    ("recursive aggregation on to-be-aggregated txdesc"));
1532 
1533 	txd->flags |= HN_TXD_FLAG_ONAGG;
1534 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1535 }
1536 
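/*
 * Return true if any txdesc of this TX ring is still outstanding,
 * i.e. not all descriptors are back on the free list (or buf_ring).
 */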
1537 static bool
1538 hn_tx_ring_pending(struct hn_tx_ring *txr)
1539 {
1540 	bool pending = false;
1541 
1542 #ifndef HN_USE_TXDESC_BUFRING
1543 	mtx_lock_spin(&txr->hn_txlist_spin);
1544 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1545 		pending = true;
1546 	mtx_unlock_spin(&txr->hn_txlist_spin);
1547 #else
1548 	if (!buf_ring_full(txr->hn_txdesc_br))
1549 		pending = true;
1550 #endif
1551 	return (pending);
1552 }
1553 
1554 static __inline void
1555 hn_txeof(struct hn_tx_ring *txr)
1556 {
1557 	txr->hn_has_txeof = 0;
1558 	txr->hn_txeof(txr);
1559 }
1560 
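/*
 * NVS send-completion callback: release the txdesc and, once enough
 * completions have accumulated while the ring is oactive, run the TX
 * EOF handler early to unstall the transmit path.
 */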
1561 static void
1562 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1563     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1564 {
1565 	struct hn_txdesc *txd = sndc->hn_cbarg;
1566 	struct hn_tx_ring *txr;
1567 
1568 	txr = txd->txr;
1569 	KASSERT(txr->hn_chan == chan,
1570 	    ("channel mismatch, on chan%u, should be chan%u",
1571 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1572 
1573 	txr->hn_has_txeof = 1;
1574 	hn_txdesc_put(txr, txd);
1575 
1576 	++txr->hn_txdone_cnt;
1577 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1578 		txr->hn_txdone_cnt = 0;
1579 		if (txr->hn_oactive)
1580 			hn_txeof(txr);
1581 	}
1582 }
1583 
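/*
 * Per-channel batch rollup: flush LRO and run any pending TX EOF
 * processing for the TX ring associated with this channel.
 */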
1584 static void
1585 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1586 {
1587 #if defined(INET) || defined(INET6)
1588 	tcp_lro_flush_all(&rxr->hn_lro);
1589 #endif
1590 
1591 	/*
1592 	 * NOTE:
1593 	 * 'txr' could be NULL if multiple channels and the
1594 	 * ifnet.if_start method are enabled.
1595 	 */
1596 	if (txr == NULL || !txr->hn_has_txeof)
1597 		return;
1598 
1599 	txr->hn_txdone_cnt = 0;
1600 	hn_txeof(txr);
1601 }
1602 
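/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into the RNDIS on-the-wire convention, which counts from
 * the rm_dataoffset field.
 */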
1603 static __inline uint32_t
1604 hn_rndis_pktmsg_offset(uint32_t ofs)
1605 {
1606 
1607 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1608 	    ("invalid RNDIS packet msg offset %u", ofs));
1609 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1610 }
1611 
1612 static __inline void *
1613 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1614     size_t pi_dlen, uint32_t pi_type)
1615 {
1616 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1617 	struct rndis_pktinfo *pi;
1618 
1619 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1620 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1621 
1622 	/*
1623 	 * Per-packet-info does not move; it only grows.
1624 	 *
1625 	 * NOTE:
1626 	 * rm_pktinfooffset in this phase counts from the beginning
1627 	 * of rndis_packet_msg.
1628 	 */
1629 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1630 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1631 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1632 	    pkt->rm_pktinfolen);
1633 	pkt->rm_pktinfolen += pi_size;
1634 
1635 	pi->rm_size = pi_size;
1636 	pi->rm_type = pi_type;
1637 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1638 
1639 	/* Data immediately follow per-packet-info. */
1640 	pkt->rm_dataoffset += pi_size;
1641 
1642 	/* Update RNDIS packet msg length */
1643 	pkt->rm_len += pi_size;
1644 
1645 	return (pi->rm_data);
1646 }
1647 
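/*
 * Send the currently aggregating txdesc and reset the TX ring's
 * aggregation state.  On failure the aggregated mbuf chain is freed
 * here and oerrors is charged with the number of aggregated packets.
 */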
1648 static __inline int
1649 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1650 {
1651 	struct hn_txdesc *txd;
1652 	struct mbuf *m;
1653 	int error, pkts;
1654 
1655 	txd = txr->hn_agg_txd;
1656 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1657 
1658 	/*
1659 	 * Since hn_txpkt() will reset this temporary stat, save
1660 	 * it now, so that oerrors can be updated properly, if
1661 	 * hn_txpkt() ever fails.
1662 	 */
1663 	pkts = txr->hn_stat_pkts;
1664 
1665 	/*
1666 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1667 	 * failure, save it for later freeing, if hn_txpkt() ever
1668 	 * fails.
1669 	 */
1670 	m = txd->m;
1671 	error = hn_txpkt(ifp, txr, txd);
1672 	if (__predict_false(error)) {
1673 		/* txd is freed, but m is not. */
1674 		m_freem(m);
1675 
1676 		txr->hn_flush_failed++;
1677 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1678 	}
1679 
1680 	/* Reset all aggregation states. */
1681 	txr->hn_agg_txd = NULL;
1682 	txr->hn_agg_szleft = 0;
1683 	txr->hn_agg_pktleft = 0;
1684 	txr->hn_agg_prevpkt = NULL;
1685 
1686 	return (error);
1687 }
1688 
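/*
 * Try to append this packet to the TX ring's pending aggregation; if
 * that is not possible, flush the pending aggregation and try to start
 * a new one.  Returns a pointer into the chimney sending buffer where
 * the RNDIS packet should be built, or NULL if no chimney buffer is
 * available, in which case the caller falls back to sglist transmission.
 */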
1689 static void *
1690 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1691     int pktsize)
1692 {
1693 	void *chim;
1694 
1695 	if (txr->hn_agg_txd != NULL) {
1696 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1697 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1698 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1699 			int olen;
1700 
1701 			/*
1702 			 * Update the previous RNDIS packet's total length;
1703 			 * it can be increased due to the mandatory alignment
1704 			 * padding for this RNDIS packet.  And update the
1705 			 * aggregating txdesc's chimney sending buffer size
1706 			 * accordingly.
1707 			 *
1708 			 * XXX
1709 			 * Zero-out the padding, as required by the RNDIS spec.
1710 			 */
1711 			olen = pkt->rm_len;
1712 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1713 			agg_txd->chim_size += pkt->rm_len - olen;
1714 
1715 			/* Link this txdesc to the parent. */
1716 			hn_txdesc_agg(agg_txd, txd);
1717 
1718 			chim = (uint8_t *)pkt + pkt->rm_len;
1719 			/* Save the current packet for later fixup. */
1720 			txr->hn_agg_prevpkt = chim;
1721 
1722 			txr->hn_agg_pktleft--;
1723 			txr->hn_agg_szleft -= pktsize;
1724 			if (txr->hn_agg_szleft <=
1725 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1726 				/*
1727 				 * Probably can't aggregate more packets,
1728 				 * flush this aggregating txdesc proactively.
1729 				 */
1730 				txr->hn_agg_pktleft = 0;
1731 			}
1732 			/* Done! */
1733 			return (chim);
1734 		}
1735 		hn_flush_txagg(ifp, txr);
1736 	}
1737 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1738 
1739 	txr->hn_tx_chimney_tried++;
1740 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1741 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1742 		return (NULL);
1743 	txr->hn_tx_chimney++;
1744 
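	/*
	 * Point at this txdesc's slot in the shared chimney sending
	 * buffer; each slot is hn_chim_szmax bytes.
	 */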
1745 	chim = txr->hn_sc->hn_chim +
1746 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1747 
1748 	if (txr->hn_agg_pktmax > 1 &&
1749 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1750 		txr->hn_agg_txd = txd;
1751 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1752 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1753 		txr->hn_agg_prevpkt = chim;
1754 	}
1755 	return (chim);
1756 }
1757 
1758 /*
1759  * NOTE:
1760  * If this function fails, then both txd and m_head0 will be freed.
1761  */
1762 static int
1763 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1764     struct mbuf **m_head0)
1765 {
1766 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1767 	int error, nsegs, i;
1768 	struct mbuf *m_head = *m_head0;
1769 	struct rndis_packet_msg *pkt;
1770 	uint32_t *pi_data;
1771 	void *chim = NULL;
1772 	int pkt_hlen, pkt_size;
1773 
1774 	pkt = txd->rndis_pkt;
1775 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1776 	if (pkt_size < txr->hn_chim_size) {
1777 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1778 		if (chim != NULL)
1779 			pkt = chim;
1780 	} else {
1781 		if (txr->hn_agg_txd != NULL)
1782 			hn_flush_txagg(ifp, txr);
1783 	}
1784 
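	/*
	 * Fill in the fixed part of the RNDIS packet message; the
	 * per-packet-info fields are appended below and the offsets
	 * are converted afterwards.
	 */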
1785 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1786 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1787 	pkt->rm_dataoffset = sizeof(*pkt);
1788 	pkt->rm_datalen = m_head->m_pkthdr.len;
1789 	pkt->rm_oobdataoffset = 0;
1790 	pkt->rm_oobdatalen = 0;
1791 	pkt->rm_oobdataelements = 0;
1792 	pkt->rm_pktinfooffset = sizeof(*pkt);
1793 	pkt->rm_pktinfolen = 0;
1794 	pkt->rm_vchandle = 0;
1795 	pkt->rm_reserved = 0;
1796 
1797 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1798 		/*
1799 		 * Set the hash value for this packet, so that the host can
1800 		 * dispatch the TX done event for this packet back to this TX
1801 		 * ring's channel.
1802 		 */
1803 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1804 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1805 		*pi_data = txr->hn_tx_idx;
1806 	}
1807 
1808 	if (m_head->m_flags & M_VLANTAG) {
1809 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1810 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1811 		*pi_data = NDIS_VLAN_INFO_MAKE(
1812 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1813 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1814 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1815 	}
1816 
1817 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1818 #if defined(INET6) || defined(INET)
1819 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1820 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1821 #ifdef INET
1822 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1823 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1824 			    m_head->m_pkthdr.tso_segsz);
1825 		}
1826 #endif
1827 #if defined(INET6) && defined(INET)
1828 		else
1829 #endif
1830 #ifdef INET6
1831 		{
1832 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1833 			    m_head->m_pkthdr.tso_segsz);
1834 		}
1835 #endif
1836 #endif	/* INET6 || INET */
1837 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1838 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1839 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1840 		if (m_head->m_pkthdr.csum_flags &
1841 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1842 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1843 		} else {
1844 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1845 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1846 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1847 		}
1848 
1849 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1850 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1851 		else if (m_head->m_pkthdr.csum_flags &
1852 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1853 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1854 	}
1855 
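	/*
	 * Total RNDIS header length: the fixed message header plus all
	 * per-packet-info appended above.
	 */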
1856 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1857 	/* Convert RNDIS packet message offsets */
1858 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1859 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1860 
1861 	/*
1862 	 * Fast path: Chimney sending.
1863 	 */
1864 	if (chim != NULL) {
1865 		struct hn_txdesc *tgt_txd = txd;
1866 
1867 		if (txr->hn_agg_txd != NULL) {
1868 			tgt_txd = txr->hn_agg_txd;
1869 #ifdef INVARIANTS
1870 			*m_head0 = NULL;
1871 #endif
1872 		}
1873 
1874 		KASSERT(pkt == chim,
1875 		    ("RNDIS pkt not in chimney sending buffer"));
1876 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1877 		    ("chimney sending buffer is not used"));
1878 		tgt_txd->chim_size += pkt->rm_len;
1879 
1880 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
1881 		    ((uint8_t *)chim) + pkt_hlen);
1882 
1883 		txr->hn_gpa_cnt = 0;
1884 		txr->hn_sendpkt = hn_txpkt_chim;
1885 		goto done;
1886 	}
1887 
1888 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1889 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1890 	    ("chimney buffer is used"));
1891 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1892 
1893 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1894 	if (__predict_false(error)) {
1895 		int freed;
1896 
1897 		/*
1898 		 * This mbuf is not linked w/ the txd yet, so free it now.
1899 		 */
1900 		m_freem(m_head);
1901 		*m_head0 = NULL;
1902 
1903 		freed = hn_txdesc_put(txr, txd);
1904 		KASSERT(freed != 0,
1905 		    ("fail to free txd upon txdma error"));
1906 
1907 		txr->hn_txdma_failed++;
1908 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1909 		return error;
1910 	}
1911 	*m_head0 = m_head;
1912 
1913 	/* +1 RNDIS packet message */
1914 	txr->hn_gpa_cnt = nsegs + 1;
1915 
1916 	/* send packet with page buffer */
1917 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1918 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1919 	txr->hn_gpa[0].gpa_len = pkt_hlen;
1920 
1921 	/*
1922 	 * Fill the page buffers with mbuf info, after the page
1923 	 * buffer for the RNDIS packet message.
1924 	 */
1925 	for (i = 0; i < nsegs; ++i) {
1926 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1927 
1928 		gpa->gpa_page = atop(segs[i].ds_addr);
1929 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1930 		gpa->gpa_len = segs[i].ds_len;
1931 	}
1932 
1933 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1934 	txd->chim_size = 0;
1935 	txr->hn_sendpkt = hn_txpkt_sglist;
1936 done:
1937 	txd->m = m_head;
1938 
1939 	/* Set the completion routine */
1940 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1941 
1942 	/* Update temporary stats for later use. */
1943 	txr->hn_stat_pkts++;
1944 	txr->hn_stat_size += m_head->m_pkthdr.len;
1945 	if (m_head->m_flags & M_MCAST)
1946 		txr->hn_stat_mcasts++;
1947 
1948 	return 0;
1949 }
1950 
1951 /*
1952  * NOTE:
1953  * If this function fails, then txd will be freed, but the mbuf
1954  * associated w/ the txd will _not_ be freed.
1955  */
1956 static int
1957 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1958 {
1959 	int error, send_failed = 0, has_bpf;
1960 
1961 again:
1962 	has_bpf = bpf_peers_present(ifp->if_bpf);
1963 	if (has_bpf) {
1964 		/*
1965 		 * Make sure that this txd and any aggregated txds are not
1966 		 * freed before ETHER_BPF_MTAP.
1967 		 */
1968 		hn_txdesc_hold(txd);
1969 	}
1970 	error = txr->hn_sendpkt(txr, txd);
1971 	if (!error) {
1972 		if (has_bpf) {
1973 			const struct hn_txdesc *tmp_txd;
1974 
1975 			ETHER_BPF_MTAP(ifp, txd->m);
1976 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1977 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
1978 		}
1979 
1980 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1981 #ifdef HN_IFSTART_SUPPORT
1982 		if (!hn_use_if_start)
1983 #endif
1984 		{
1985 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1986 			    txr->hn_stat_size);
1987 			if (txr->hn_stat_mcasts != 0) {
1988 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1989 				    txr->hn_stat_mcasts);
1990 			}
1991 		}
1992 		txr->hn_pkts += txr->hn_stat_pkts;
1993 		txr->hn_sends++;
1994 	}
1995 	if (has_bpf)
1996 		hn_txdesc_put(txr, txd);
1997 
1998 	if (__predict_false(error)) {
1999 		int freed;
2000 
2001 		/*
2002 		 * This should "really rarely" happen.
2003 		 *
2004 		 * XXX Too many RX to be acked or too many sideband
2005 		 * commands to run?  Ask netvsc_channel_rollup()
2006 		 * to kick start later.
2007 		 */
2008 		txr->hn_has_txeof = 1;
2009 		if (!send_failed) {
2010 			txr->hn_send_failed++;
2011 			send_failed = 1;
2012 			/*
2013 			 * Try sending again after setting hn_has_txeof,
2014 			 * in case we missed the last
2015 			 * netvsc_channel_rollup().
2016 			 */
2017 			goto again;
2018 		}
2019 		if_printf(ifp, "send failed\n");
2020 
2021 		/*
2022 		 * Caller will perform further processing on the
2023 		 * associated mbuf, so don't free it in hn_txdesc_put();
2024 		 * only unload it from the DMA map in hn_txdesc_put(),
2025 		 * if it was loaded.
2026 		 */
2027 		txd->m = NULL;
2028 		freed = hn_txdesc_put(txr, txd);
2029 		KASSERT(freed != 0,
2030 		    ("fail to free txd upon send error"));
2031 
2032 		txr->hn_send_failed++;
2033 	}
2034 
2035 	/* Reset temporary stats, after this sending is done. */
2036 	txr->hn_stat_size = 0;
2037 	txr->hn_stat_pkts = 0;
2038 	txr->hn_stat_mcasts = 0;
2039 
2040 	return (error);
2041 }
2042 
2043 /*
2044  * Append the specified data to the indicated mbuf chain.
2045  * Extend the mbuf chain if the new data does not fit in the
2046  * existing space.
2047  *
2048  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2049  * There should be an equivalent in the kernel mbuf code,
2050  * but there does not appear to be one yet.
2051  *
2052  * Differs from m_append() in that additional mbufs are
2053  * allocated with cluster size MJUMPAGESIZE, and filled
2054  * accordingly.
2055  *
2056  * Return 1 if able to complete the job; otherwise 0.
2057  */
2058 static int
2059 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2060 {
2061 	struct mbuf *m, *n;
2062 	int remainder, space;
2063 
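	/* Walk to the last mbuf in the chain. */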
2064 	for (m = m0; m->m_next != NULL; m = m->m_next)
2065 		;
2066 	remainder = len;
2067 	space = M_TRAILINGSPACE(m);
2068 	if (space > 0) {
2069 		/*
2070 		 * Copy into available space.
2071 		 */
2072 		if (space > remainder)
2073 			space = remainder;
2074 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2075 		m->m_len += space;
2076 		cp += space;
2077 		remainder -= space;
2078 	}
2079 	while (remainder > 0) {
2080 		/*
2081 		 * Allocate a new mbuf; could check space
2082 		 * and allocate a cluster instead.
2083 		 */
2084 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2085 		if (n == NULL)
2086 			break;
2087 		n->m_len = min(MJUMPAGESIZE, remainder);
2088 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2089 		cp += n->m_len;
2090 		remainder -= n->m_len;
2091 		m->m_next = n;
2092 		m = n;
2093 	}
2094 	if (m0->m_flags & M_PKTHDR)
2095 		m0->m_pkthdr.len += len - remainder;
2096 
2097 	return (remainder == 0);
2098 }
2099 
2100 #if defined(INET) || defined(INET6)
2101 static __inline int
2102 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2103 {
2104 #if __FreeBSD_version >= 1100095
2105 	if (hn_lro_mbufq_depth) {
2106 		tcp_lro_queue_mbuf(lc, m);
2107 		return 0;
2108 	}
2109 #endif
2110 	return tcp_lro_rx(lc, m, 0);
2111 }
2112 #endif
2113 
2114 static int
2115 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2116     const struct hn_rxinfo *info)
2117 {
2118 	struct ifnet *ifp = rxr->hn_ifp;
2119 	struct mbuf *m_new;
2120 	int size, do_lro = 0, do_csum = 1;
2121 	int hash_type;
2122 
2123 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2124 		return (0);
2125 
2126 	/*
2127 	 * Bail out if the packet contains more data than the configured MTU.
2128 	 */
2129 	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2130 		return (0);
2131 	} else if (dlen <= MHLEN) {
2132 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2133 		if (m_new == NULL) {
2134 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2135 			return (0);
2136 		}
2137 		memcpy(mtod(m_new, void *), data, dlen);
2138 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2139 		rxr->hn_small_pkts++;
2140 	} else {
2141 		/*
2142 		 * Get an mbuf with a cluster.  For packets 2K or less,
2143 		 * get a standard 2K cluster.  For anything larger, get a
2144 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2145 		 * if looped around to the Hyper-V TX channel, so avoid them.
2146 		 */
2147 		size = MCLBYTES;
2148 		if (dlen > MCLBYTES) {
2149 			/* 4096 */
2150 			size = MJUMPAGESIZE;
2151 		}
2152 
2153 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2154 		if (m_new == NULL) {
2155 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2156 			return (0);
2157 		}
2158 
2159 		hv_m_append(m_new, dlen, data);
2160 	}
2161 	m_new->m_pkthdr.rcvif = ifp;
2162 
2163 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2164 		do_csum = 0;
2165 
2166 	/* receive side checksum offload */
2167 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2168 		/* IP csum offload */
2169 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2170 			m_new->m_pkthdr.csum_flags |=
2171 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2172 			rxr->hn_csum_ip++;
2173 		}
2174 
2175 		/* TCP/UDP csum offload */
2176 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2177 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2178 			m_new->m_pkthdr.csum_flags |=
2179 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2180 			m_new->m_pkthdr.csum_data = 0xffff;
2181 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2182 				rxr->hn_csum_tcp++;
2183 			else
2184 				rxr->hn_csum_udp++;
2185 		}
2186 
2187 		/*
2188 		 * XXX
2189 		 * As of this writing (Oct 28th, 2016), the host will turn
2190 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2191 		 * the do_lro setting here is actually _not_ accurate.  We
2192 		 * depend on the RSS hash type check to reset do_lro.
2193 		 */
2194 		if ((info->csum_info &
2195 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2196 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2197 			do_lro = 1;
2198 	} else {
2199 		const struct ether_header *eh;
2200 		uint16_t etype;
2201 		int hoff;
2202 
2203 		hoff = sizeof(*eh);
2204 		if (m_new->m_len < hoff)
2205 			goto skip;
2206 		eh = mtod(m_new, struct ether_header *);
2207 		etype = ntohs(eh->ether_type);
2208 		if (etype == ETHERTYPE_VLAN) {
2209 			const struct ether_vlan_header *evl;
2210 
2211 			hoff = sizeof(*evl);
2212 			if (m_new->m_len < hoff)
2213 				goto skip;
2214 			evl = mtod(m_new, struct ether_vlan_header *);
2215 			etype = ntohs(evl->evl_proto);
2216 		}
2217 
2218 		if (etype == ETHERTYPE_IP) {
2219 			int pr;
2220 
2221 			pr = hn_check_iplen(m_new, hoff);
2222 			if (pr == IPPROTO_TCP) {
2223 				if (do_csum &&
2224 				    (rxr->hn_trust_hcsum &
2225 				     HN_TRUST_HCSUM_TCP)) {
2226 					rxr->hn_csum_trusted++;
2227 					m_new->m_pkthdr.csum_flags |=
2228 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2229 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2230 					m_new->m_pkthdr.csum_data = 0xffff;
2231 				}
2232 				do_lro = 1;
2233 			} else if (pr == IPPROTO_UDP) {
2234 				if (do_csum &&
2235 				    (rxr->hn_trust_hcsum &
2236 				     HN_TRUST_HCSUM_UDP)) {
2237 					rxr->hn_csum_trusted++;
2238 					m_new->m_pkthdr.csum_flags |=
2239 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2240 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2241 					m_new->m_pkthdr.csum_data = 0xffff;
2242 				}
2243 			} else if (pr != IPPROTO_DONE && do_csum &&
2244 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2245 				rxr->hn_csum_trusted++;
2246 				m_new->m_pkthdr.csum_flags |=
2247 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2248 			}
2249 		}
2250 	}
2251 skip:
2252 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2253 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2254 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2255 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2256 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2257 		m_new->m_flags |= M_VLANTAG;
2258 	}
2259 
2260 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2261 		rxr->hn_rss_pkts++;
2262 		m_new->m_pkthdr.flowid = info->hash_value;
2263 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2264 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2265 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2266 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2267 
2268 			/*
2269 			 * NOTE:
2270 			 * do_lro is reset if the hash type is not TCP
2271 			 * related.  See the comment in the above csum_flags
2272 			 * setup section.
2273 			 */
2274 			switch (type) {
2275 			case NDIS_HASH_IPV4:
2276 				hash_type = M_HASHTYPE_RSS_IPV4;
2277 				do_lro = 0;
2278 				break;
2279 
2280 			case NDIS_HASH_TCP_IPV4:
2281 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2282 				break;
2283 
2284 			case NDIS_HASH_IPV6:
2285 				hash_type = M_HASHTYPE_RSS_IPV6;
2286 				do_lro = 0;
2287 				break;
2288 
2289 			case NDIS_HASH_IPV6_EX:
2290 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2291 				do_lro = 0;
2292 				break;
2293 
2294 			case NDIS_HASH_TCP_IPV6:
2295 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2296 				break;
2297 
2298 			case NDIS_HASH_TCP_IPV6_EX:
2299 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2300 				break;
2301 			}
2302 		}
2303 	} else {
2304 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2305 		hash_type = M_HASHTYPE_OPAQUE;
2306 	}
2307 	M_HASHTYPE_SET(m_new, hash_type);
2308 
2309 	/*
2310 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2311 	 * messages (not just data messages) will trigger a response.
2312 	 */
2313 
2314 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2315 	rxr->hn_pkts++;
2316 
2317 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2318 #if defined(INET) || defined(INET6)
2319 		struct lro_ctrl *lro = &rxr->hn_lro;
2320 
2321 		if (lro->lro_cnt) {
2322 			rxr->hn_lro_tried++;
2323 			if (hn_lro_rx(lro, m_new) == 0) {
2324 				/* DONE! */
2325 				return 0;
2326 			}
2327 		}
2328 #endif
2329 	}
2330 
2331 	/* We're not holding the lock here, so don't release it */
2332 	(*ifp->if_input)(ifp, m_new);
2333 
2334 	return (0);
2335 }
2336 
2337 static int
2338 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2339 {
2340 	struct hn_softc *sc = ifp->if_softc;
2341 	struct ifreq *ifr = (struct ifreq *)data;
2342 	int mask, error = 0;
2343 
2344 	switch (cmd) {
2345 	case SIOCSIFMTU:
2346 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2347 			error = EINVAL;
2348 			break;
2349 		}
2350 
2351 		HN_LOCK(sc);
2352 
2353 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2354 			HN_UNLOCK(sc);
2355 			break;
2356 		}
2357 
2358 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2359 			/* Can't change MTU */
2360 			HN_UNLOCK(sc);
2361 			error = EOPNOTSUPP;
2362 			break;
2363 		}
2364 
2365 		if (ifp->if_mtu == ifr->ifr_mtu) {
2366 			HN_UNLOCK(sc);
2367 			break;
2368 		}
2369 
2370 		/* Disable polling. */
2371 		hn_polling(sc, 0);
2372 
2373 		/*
2374 		 * Suspend this interface before the synthetic parts
2375 		 * are ripped.
2376 		 */
2377 		hn_suspend(sc);
2378 
2379 		/*
2380 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2381 		 */
2382 		hn_synth_detach(sc);
2383 
2384 		/*
2385 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2386 		 * with the new MTU setting.
2387 		 */
2388 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2389 		if (error) {
2390 			HN_UNLOCK(sc);
2391 			break;
2392 		}
2393 
2394 		/*
2395 		 * Commit the requested MTU, after the synthetic parts
2396 		 * have been successfully attached.
2397 		 */
2398 		ifp->if_mtu = ifr->ifr_mtu;
2399 
2400 		/*
2401 		 * Make sure that various parameters based on MTU are
2402 		 * still valid, after the MTU change.
2403 		 */
2404 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2405 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2406 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2407 #if __FreeBSD_version >= 1100099
2408 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2409 		    HN_LRO_LENLIM_MIN(ifp))
2410 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2411 #endif
2412 
2413 		/*
2414 		 * All done!  Resume the interface now.
2415 		 */
2416 		hn_resume(sc);
2417 
2418 		/*
2419 		 * Re-enable polling if this interface is running and
2420 		 * the polling is requested.
2421 		 */
2422 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
2423 			hn_polling(sc, sc->hn_pollhz);
2424 
2425 		HN_UNLOCK(sc);
2426 		break;
2427 
2428 	case SIOCSIFFLAGS:
2429 		HN_LOCK(sc);
2430 
2431 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2432 			HN_UNLOCK(sc);
2433 			break;
2434 		}
2435 
2436 		if (ifp->if_flags & IFF_UP) {
2437 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2438 				/*
2439 				 * Caller might hold a mutex, e.g.
2440 				 * bpf; use busy-wait for the RNDIS
2441 				 * reply.
2442 				 */
2443 				HN_NO_SLEEPING(sc);
2444 				hn_set_rxfilter(sc);
2445 				HN_SLEEPING_OK(sc);
2446 			} else {
2447 				hn_init_locked(sc);
2448 			}
2449 		} else {
2450 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2451 				hn_stop(sc);
2452 		}
2453 		sc->hn_if_flags = ifp->if_flags;
2454 
2455 		HN_UNLOCK(sc);
2456 		break;
2457 
2458 	case SIOCSIFCAP:
2459 		HN_LOCK(sc);
2460 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2461 
2462 		if (mask & IFCAP_TXCSUM) {
2463 			ifp->if_capenable ^= IFCAP_TXCSUM;
2464 			if (ifp->if_capenable & IFCAP_TXCSUM)
2465 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2466 			else
2467 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2468 		}
2469 		if (mask & IFCAP_TXCSUM_IPV6) {
2470 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2471 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2472 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2473 			else
2474 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2475 		}
2476 
2477 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2478 		if (mask & IFCAP_RXCSUM)
2479 			ifp->if_capenable ^= IFCAP_RXCSUM;
2480 #ifdef foo
2481 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2482 		if (mask & IFCAP_RXCSUM_IPV6)
2483 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2484 #endif
2485 
2486 		if (mask & IFCAP_LRO)
2487 			ifp->if_capenable ^= IFCAP_LRO;
2488 
2489 		if (mask & IFCAP_TSO4) {
2490 			ifp->if_capenable ^= IFCAP_TSO4;
2491 			if (ifp->if_capenable & IFCAP_TSO4)
2492 				ifp->if_hwassist |= CSUM_IP_TSO;
2493 			else
2494 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2495 		}
2496 		if (mask & IFCAP_TSO6) {
2497 			ifp->if_capenable ^= IFCAP_TSO6;
2498 			if (ifp->if_capenable & IFCAP_TSO6)
2499 				ifp->if_hwassist |= CSUM_IP6_TSO;
2500 			else
2501 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2502 		}
2503 
2504 		HN_UNLOCK(sc);
2505 		break;
2506 
2507 	case SIOCADDMULTI:
2508 	case SIOCDELMULTI:
2509 		HN_LOCK(sc);
2510 
2511 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2512 			HN_UNLOCK(sc);
2513 			break;
2514 		}
2515 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2516 			/*
2517 			 * Multicast handling uses a mutex; use busy-wait for
2518 			 * the RNDIS reply.
2519 			 */
2520 			HN_NO_SLEEPING(sc);
2521 			hn_set_rxfilter(sc);
2522 			HN_SLEEPING_OK(sc);
2523 		}
2524 
2525 		HN_UNLOCK(sc);
2526 		break;
2527 
2528 	case SIOCSIFMEDIA:
2529 	case SIOCGIFMEDIA:
2530 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2531 		break;
2532 
2533 	default:
2534 		error = ether_ioctl(ifp, cmd, data);
2535 		break;
2536 	}
2537 	return (error);
2538 }
2539 
2540 static void
2541 hn_stop(struct hn_softc *sc)
2542 {
2543 	struct ifnet *ifp = sc->hn_ifp;
2544 	int i;
2545 
2546 	HN_LOCK_ASSERT(sc);
2547 
2548 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2549 	    ("synthetic parts were not attached"));
2550 
2551 	/* Disable polling. */
2552 	hn_polling(sc, 0);
2553 
2554 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2555 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2556 	hn_suspend_data(sc);
2557 
2558 	/* Clear OACTIVE bit. */
2559 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2560 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2561 		sc->hn_tx_ring[i].hn_oactive = 0;
2562 }
2563 
2564 static void
2565 hn_init_locked(struct hn_softc *sc)
2566 {
2567 	struct ifnet *ifp = sc->hn_ifp;
2568 	int i;
2569 
2570 	HN_LOCK_ASSERT(sc);
2571 
2572 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2573 		return;
2574 
2575 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2576 		return;
2577 
2578 	/* Configure RX filter */
2579 	hn_set_rxfilter(sc);
2580 
2581 	/* Clear OACTIVE bit. */
2582 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2583 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2584 		sc->hn_tx_ring[i].hn_oactive = 0;
2585 
2586 	/* Clear TX 'suspended' bit. */
2587 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2588 
2589 	/* Everything is ready; unleash! */
2590 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2591 
2592 	/* Re-enable polling if requested. */
2593 	if (sc->hn_pollhz > 0)
2594 		hn_polling(sc, sc->hn_pollhz);
2595 }
2596 
2597 static void
2598 hn_init(void *xsc)
2599 {
2600 	struct hn_softc *sc = xsc;
2601 
2602 	HN_LOCK(sc);
2603 	hn_init_locked(sc);
2604 	HN_UNLOCK(sc);
2605 }
2606 
2607 #if __FreeBSD_version >= 1100099
2608 
2609 static int
2610 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2611 {
2612 	struct hn_softc *sc = arg1;
2613 	unsigned int lenlim;
2614 	int error;
2615 
2616 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2617 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2618 	if (error || req->newptr == NULL)
2619 		return error;
2620 
2621 	HN_LOCK(sc);
2622 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2623 	    lenlim > TCP_LRO_LENGTH_MAX) {
2624 		HN_UNLOCK(sc);
2625 		return EINVAL;
2626 	}
2627 	hn_set_lro_lenlim(sc, lenlim);
2628 	HN_UNLOCK(sc);
2629 
2630 	return 0;
2631 }
2632 
2633 static int
2634 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2635 {
2636 	struct hn_softc *sc = arg1;
2637 	int ackcnt, error, i;
2638 
2639 	/*
2640 	 * lro_ackcnt_lim is the append count limit;
2641 	 * +1 turns it into the aggregation limit.
2642 	 */
2643 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2644 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2645 	if (error || req->newptr == NULL)
2646 		return error;
2647 
2648 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2649 		return EINVAL;
2650 
2651 	/*
2652 	 * Convert aggregation limit back to append
2653 	 * count limit.
2654 	 */
2655 	--ackcnt;
2656 	HN_LOCK(sc);
2657 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2658 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2659 	HN_UNLOCK(sc);
2660 	return 0;
2661 }
2662 
2663 #endif
2664 
2665 static int
2666 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2667 {
2668 	struct hn_softc *sc = arg1;
2669 	int hcsum = arg2;
2670 	int on, error, i;
2671 
2672 	on = 0;
2673 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2674 		on = 1;
2675 
2676 	error = sysctl_handle_int(oidp, &on, 0, req);
2677 	if (error || req->newptr == NULL)
2678 		return error;
2679 
2680 	HN_LOCK(sc);
2681 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2682 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2683 
2684 		if (on)
2685 			rxr->hn_trust_hcsum |= hcsum;
2686 		else
2687 			rxr->hn_trust_hcsum &= ~hcsum;
2688 	}
2689 	HN_UNLOCK(sc);
2690 	return 0;
2691 }
2692 
2693 static int
2694 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2695 {
2696 	struct hn_softc *sc = arg1;
2697 	int chim_size, error;
2698 
2699 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2700 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2701 	if (error || req->newptr == NULL)
2702 		return error;
2703 
2704 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2705 		return EINVAL;
2706 
2707 	HN_LOCK(sc);
2708 	hn_set_chim_size(sc, chim_size);
2709 	HN_UNLOCK(sc);
2710 	return 0;
2711 }
2712 
2713 #if __FreeBSD_version < 1100095
2714 static int
2715 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2716 {
2717 	struct hn_softc *sc = arg1;
2718 	int ofs = arg2, i, error;
2719 	struct hn_rx_ring *rxr;
2720 	uint64_t stat;
2721 
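	/*
	 * Sum this statistic across all RX rings; a write of any value
	 * through this sysctl resets the per-ring counters below.
	 */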
2722 	stat = 0;
2723 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2724 		rxr = &sc->hn_rx_ring[i];
2725 		stat += *((int *)((uint8_t *)rxr + ofs));
2726 	}
2727 
2728 	error = sysctl_handle_64(oidp, &stat, 0, req);
2729 	if (error || req->newptr == NULL)
2730 		return error;
2731 
2732 	/* Zero out this stat. */
2733 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2734 		rxr = &sc->hn_rx_ring[i];
2735 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2736 	}
2737 	return 0;
2738 }
2739 #else
2740 static int
2741 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2742 {
2743 	struct hn_softc *sc = arg1;
2744 	int ofs = arg2, i, error;
2745 	struct hn_rx_ring *rxr;
2746 	uint64_t stat;
2747 
2748 	stat = 0;
2749 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2750 		rxr = &sc->hn_rx_ring[i];
2751 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2752 	}
2753 
2754 	error = sysctl_handle_64(oidp, &stat, 0, req);
2755 	if (error || req->newptr == NULL)
2756 		return error;
2757 
2758 	/* Zero out this stat. */
2759 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2760 		rxr = &sc->hn_rx_ring[i];
2761 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2762 	}
2763 	return 0;
2764 }
2765 
2766 #endif
2767 
2768 static int
2769 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2770 {
2771 	struct hn_softc *sc = arg1;
2772 	int ofs = arg2, i, error;
2773 	struct hn_rx_ring *rxr;
2774 	u_long stat;
2775 
2776 	stat = 0;
2777 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2778 		rxr = &sc->hn_rx_ring[i];
2779 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2780 	}
2781 
2782 	error = sysctl_handle_long(oidp, &stat, 0, req);
2783 	if (error || req->newptr == NULL)
2784 		return error;
2785 
2786 	/* Zero out this stat. */
2787 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2788 		rxr = &sc->hn_rx_ring[i];
2789 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2790 	}
2791 	return 0;
2792 }
2793 
2794 static int
2795 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2796 {
2797 	struct hn_softc *sc = arg1;
2798 	int ofs = arg2, i, error;
2799 	struct hn_tx_ring *txr;
2800 	u_long stat;
2801 
2802 	stat = 0;
2803 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2804 		txr = &sc->hn_tx_ring[i];
2805 		stat += *((u_long *)((uint8_t *)txr + ofs));
2806 	}
2807 
2808 	error = sysctl_handle_long(oidp, &stat, 0, req);
2809 	if (error || req->newptr == NULL)
2810 		return error;
2811 
2812 	/* Zero out this stat. */
2813 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2814 		txr = &sc->hn_tx_ring[i];
2815 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2816 	}
2817 	return 0;
2818 }
2819 
2820 static int
2821 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2822 {
2823 	struct hn_softc *sc = arg1;
2824 	int ofs = arg2, i, error, conf;
2825 	struct hn_tx_ring *txr;
2826 
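	/*
	 * Report the first TX ring's value; a write applies the new
	 * value to all TX rings.
	 */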
2827 	txr = &sc->hn_tx_ring[0];
2828 	conf = *((int *)((uint8_t *)txr + ofs));
2829 
2830 	error = sysctl_handle_int(oidp, &conf, 0, req);
2831 	if (error || req->newptr == NULL)
2832 		return error;
2833 
2834 	HN_LOCK(sc);
2835 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2836 		txr = &sc->hn_tx_ring[i];
2837 		*((int *)((uint8_t *)txr + ofs)) = conf;
2838 	}
2839 	HN_UNLOCK(sc);
2840 
2841 	return 0;
2842 }
2843 
2844 static int
2845 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2846 {
2847 	struct hn_softc *sc = arg1;
2848 	int error, size;
2849 
2850 	size = sc->hn_agg_size;
2851 	error = sysctl_handle_int(oidp, &size, 0, req);
2852 	if (error || req->newptr == NULL)
2853 		return (error);
2854 
2855 	HN_LOCK(sc);
2856 	sc->hn_agg_size = size;
2857 	hn_set_txagg(sc);
2858 	HN_UNLOCK(sc);
2859 
2860 	return (0);
2861 }
2862 
2863 static int
2864 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2865 {
2866 	struct hn_softc *sc = arg1;
2867 	int error, pkts;
2868 
2869 	pkts = sc->hn_agg_pkts;
2870 	error = sysctl_handle_int(oidp, &pkts, 0, req);
2871 	if (error || req->newptr == NULL)
2872 		return (error);
2873 
2874 	HN_LOCK(sc);
2875 	sc->hn_agg_pkts = pkts;
2876 	hn_set_txagg(sc);
2877 	HN_UNLOCK(sc);
2878 
2879 	return (0);
2880 }
2881 
2882 static int
2883 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2884 {
2885 	struct hn_softc *sc = arg1;
2886 	int pkts;
2887 
2888 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2889 	return (sysctl_handle_int(oidp, &pkts, 0, req));
2890 }
2891 
2892 static int
2893 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2894 {
2895 	struct hn_softc *sc = arg1;
2896 	int align;
2897 
2898 	align = sc->hn_tx_ring[0].hn_agg_align;
2899 	return (sysctl_handle_int(oidp, &align, 0, req));
2900 }
2901 
2902 static void
2903 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
2904 {
2905 	if (pollhz == 0)
2906 		vmbus_chan_poll_disable(chan);
2907 	else
2908 		vmbus_chan_poll_enable(chan, pollhz);
2909 }
2910 
2911 static void
2912 hn_polling(struct hn_softc *sc, u_int pollhz)
2913 {
2914 	int nsubch = sc->hn_rx_ring_inuse - 1;
2915 
2916 	HN_LOCK_ASSERT(sc);
2917 
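	/*
	 * Apply the polling frequency to all sub-channels first, then
	 * to the primary channel.
	 */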
2918 	if (nsubch > 0) {
2919 		struct vmbus_channel **subch;
2920 		int i;
2921 
2922 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
2923 		for (i = 0; i < nsubch; ++i)
2924 			hn_chan_polling(subch[i], pollhz);
2925 		vmbus_subchan_rel(subch, nsubch);
2926 	}
2927 	hn_chan_polling(sc->hn_prichan, pollhz);
2928 }
2929 
2930 static int
2931 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
2932 {
2933 	struct hn_softc *sc = arg1;
2934 	int pollhz, error;
2935 
2936 	pollhz = sc->hn_pollhz;
2937 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
2938 	if (error || req->newptr == NULL)
2939 		return (error);
2940 
2941 	if (pollhz != 0 &&
2942 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
2943 		return (EINVAL);
2944 
2945 	HN_LOCK(sc);
2946 	if (sc->hn_pollhz != pollhz) {
2947 		sc->hn_pollhz = pollhz;
2948 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
2949 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
2950 			hn_polling(sc, sc->hn_pollhz);
2951 	}
2952 	HN_UNLOCK(sc);
2953 
2954 	return (0);
2955 }
2956 
2957 static int
2958 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2959 {
2960 	struct hn_softc *sc = arg1;
2961 	char verstr[16];
2962 
2963 	snprintf(verstr, sizeof(verstr), "%u.%u",
2964 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2965 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2966 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2967 }
2968 
2969 static int
2970 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2971 {
2972 	struct hn_softc *sc = arg1;
2973 	char caps_str[128];
2974 	uint32_t caps;
2975 
2976 	HN_LOCK(sc);
2977 	caps = sc->hn_caps;
2978 	HN_UNLOCK(sc);
2979 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2980 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2981 }
2982 
2983 static int
2984 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2985 {
2986 	struct hn_softc *sc = arg1;
2987 	char assist_str[128];
2988 	uint32_t hwassist;
2989 
2990 	HN_LOCK(sc);
2991 	hwassist = sc->hn_ifp->if_hwassist;
2992 	HN_UNLOCK(sc);
2993 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2994 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2995 }
2996 
2997 static int
2998 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2999 {
3000 	struct hn_softc *sc = arg1;
3001 	char filter_str[128];
3002 	uint32_t filter;
3003 
3004 	HN_LOCK(sc);
3005 	filter = sc->hn_rx_filter;
3006 	HN_UNLOCK(sc);
3007 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3008 	    NDIS_PACKET_TYPES);
3009 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3010 }
3011 
3012 #ifndef RSS
3013 
3014 static int
3015 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3016 {
3017 	struct hn_softc *sc = arg1;
3018 	int error;
3019 
3020 	HN_LOCK(sc);
3021 
3022 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3023 	if (error || req->newptr == NULL)
3024 		goto back;
3025 
3026 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3027 	if (error)
3028 		goto back;
3029 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3030 
3031 	if (sc->hn_rx_ring_inuse > 1) {
3032 		error = hn_rss_reconfig(sc);
3033 	} else {
3034 		/* Not RSS capable, at least for now; just save the RSS key. */
3035 		error = 0;
3036 	}
3037 back:
3038 	HN_UNLOCK(sc);
3039 	return (error);
3040 }
3041 
3042 static int
3043 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3044 {
3045 	struct hn_softc *sc = arg1;
3046 	int error;
3047 
3048 	HN_LOCK(sc);
3049 
3050 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3051 	if (error || req->newptr == NULL)
3052 		goto back;
3053 
3054 	/*
3055 	 * Don't allow changing the RSS indirect table if this interface
3056 	 * is not currently RSS capable.
3057 	 */
3058 	if (sc->hn_rx_ring_inuse == 1) {
3059 		error = EOPNOTSUPP;
3060 		goto back;
3061 	}
3062 
3063 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3064 	if (error)
3065 		goto back;
3066 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3067 
3068 	hn_rss_ind_fixup(sc);
3069 	error = hn_rss_reconfig(sc);
3070 back:
3071 	HN_UNLOCK(sc);
3072 	return (error);
3073 }
3074 
3075 #endif	/* !RSS */
3076 
3077 static int
3078 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3079 {
3080 	struct hn_softc *sc = arg1;
3081 	char hash_str[128];
3082 	uint32_t hash;
3083 
3084 	HN_LOCK(sc);
3085 	hash = sc->hn_rss_hash;
3086 	HN_UNLOCK(sc);
3087 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3088 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3089 }
3090 
3091 static int
3092 hn_check_iplen(const struct mbuf *m, int hoff)
3093 {
3094 	const struct ip *ip;
3095 	int len, iphlen, iplen;
3096 	const struct tcphdr *th;
3097 	int thoff;				/* TCP data offset */
3098 
3099 	len = hoff + sizeof(struct ip);
3100 
3101 	/* The packet must be at least the size of an IP header. */
3102 	if (m->m_pkthdr.len < len)
3103 		return IPPROTO_DONE;
3104 
3105 	/* The fixed IP header must reside completely in the first mbuf. */
3106 	if (m->m_len < len)
3107 		return IPPROTO_DONE;
3108 
3109 	ip = mtodo(m, hoff);
3110 
3111 	/* Bound check the packet's stated IP header length. */
3112 	iphlen = ip->ip_hl << 2;
3113 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3114 		return IPPROTO_DONE;
3115 
3116 	/* The full IP header must reside completely in the one mbuf. */
3117 	if (m->m_len < hoff + iphlen)
3118 		return IPPROTO_DONE;
3119 
3120 	iplen = ntohs(ip->ip_len);
3121 
3122 	/*
3123 	 * Check that the amount of data in the buffers is at
3124 	 * least as much as the IP header would have us expect.
3125 	 */
3126 	if (m->m_pkthdr.len < hoff + iplen)
3127 		return IPPROTO_DONE;
3128 
3129 	/*
3130 	 * Ignore IP fragments.
3131 	 */
3132 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3133 		return IPPROTO_DONE;
3134 
3135 	/*
3136 	 * The TCP/IP or UDP/IP header must be entirely contained within
3137 	 * the first fragment of a packet.
3138 	 */
3139 	switch (ip->ip_p) {
3140 	case IPPROTO_TCP:
3141 		if (iplen < iphlen + sizeof(struct tcphdr))
3142 			return IPPROTO_DONE;
3143 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3144 			return IPPROTO_DONE;
3145 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3146 		thoff = th->th_off << 2;
3147 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3148 			return IPPROTO_DONE;
3149 		if (m->m_len < hoff + iphlen + thoff)
3150 			return IPPROTO_DONE;
3151 		break;
3152 	case IPPROTO_UDP:
3153 		if (iplen < iphlen + sizeof(struct udphdr))
3154 			return IPPROTO_DONE;
3155 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3156 			return IPPROTO_DONE;
3157 		break;
3158 	default:
3159 		if (iplen < iphlen)
3160 			return IPPROTO_DONE;
3161 		break;
3162 	}
3163 	return ip->ip_p;
3164 }
3165 
3166 static int
3167 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3168 {
3169 	struct sysctl_oid_list *child;
3170 	struct sysctl_ctx_list *ctx;
3171 	device_t dev = sc->hn_dev;
3172 #if defined(INET) || defined(INET6)
3173 #if __FreeBSD_version >= 1100095
3174 	int lroent_cnt;
3175 #endif
3176 #endif
3177 	int i;
3178 
3179 	/*
3180 	 * Create RXBUF for reception.
3181 	 *
3182 	 * NOTE:
3183 	 * - It is shared by all channels.
3184 	 * - A large enough buffer is allocated; certain versions of NVS
3185 	 *   may further limit the usable space.
3186 	 */
3187 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3188 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3189 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3190 	if (sc->hn_rxbuf == NULL) {
3191 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3192 		return (ENOMEM);
3193 	}
3194 
3195 	sc->hn_rx_ring_cnt = ring_cnt;
3196 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3197 
3198 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3199 	    M_DEVBUF, M_WAITOK | M_ZERO);
3200 
3201 #if defined(INET) || defined(INET6)
3202 #if __FreeBSD_version >= 1100095
3203 	lroent_cnt = hn_lro_entry_count;
3204 	if (lroent_cnt < TCP_LRO_ENTRIES)
3205 		lroent_cnt = TCP_LRO_ENTRIES;
3206 	if (bootverbose)
3207 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3208 #endif
3209 #endif	/* INET || INET6 */
3210 
3211 	ctx = device_get_sysctl_ctx(dev);
3212 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3213 
3214 	/* Create dev.hn.UNIT.rx sysctl tree */
3215 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3216 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3217 
3218 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3219 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3220 
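		/*
		 * Allocate one contiguous buffer holding both the TX and
		 * the RX bufring for this channel.
		 */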
3221 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3222 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3223 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3224 		if (rxr->hn_br == NULL) {
3225 			device_printf(dev, "allocate bufring failed\n");
3226 			return (ENOMEM);
3227 		}
3228 
3229 		if (hn_trust_hosttcp)
3230 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3231 		if (hn_trust_hostudp)
3232 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3233 		if (hn_trust_hostip)
3234 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3235 		rxr->hn_ifp = sc->hn_ifp;
3236 		if (i < sc->hn_tx_ring_cnt)
3237 			rxr->hn_txr = &sc->hn_tx_ring[i];
3238 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3239 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3240 		rxr->hn_rx_idx = i;
3241 		rxr->hn_rxbuf = sc->hn_rxbuf;
3242 
3243 		/*
3244 		 * Initialize LRO.
3245 		 */
3246 #if defined(INET) || defined(INET6)
3247 #if __FreeBSD_version >= 1100095
3248 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3249 		    hn_lro_mbufq_depth);
3250 #else
3251 		tcp_lro_init(&rxr->hn_lro);
3252 		rxr->hn_lro.ifp = sc->hn_ifp;
3253 #endif
3254 #if __FreeBSD_version >= 1100099
3255 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3256 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3257 #endif
3258 #endif	/* INET || INET6 */
3259 
3260 		if (sc->hn_rx_sysctl_tree != NULL) {
3261 			char name[16];
3262 
3263 			/*
3264 			 * Create per RX ring sysctl tree:
3265 			 * dev.hn.UNIT.rx.RINGID
3266 			 */
3267 			snprintf(name, sizeof(name), "%d", i);
3268 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3269 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3270 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3271 
3272 			if (rxr->hn_rx_sysctl_tree != NULL) {
3273 				SYSCTL_ADD_ULONG(ctx,
3274 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3275 				    OID_AUTO, "packets", CTLFLAG_RW,
3276 				    &rxr->hn_pkts, "# of packets received");
3277 				SYSCTL_ADD_ULONG(ctx,
3278 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3279 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3280 				    &rxr->hn_rss_pkts,
3281 				    "# of packets w/ RSS info received");
3282 				SYSCTL_ADD_INT(ctx,
3283 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3284 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3285 				    &rxr->hn_pktbuf_len, 0,
3286 				    "Temporary channel packet buffer length");
3287 			}
3288 		}
3289 	}
3290 
3291 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3292 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3293 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3294 #if __FreeBSD_version < 1100095
3295 	    hn_rx_stat_int_sysctl,
3296 #else
3297 	    hn_rx_stat_u64_sysctl,
3298 #endif
3299 	    "LU", "LRO queued");
3300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3301 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3302 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3303 #if __FreeBSD_version < 1100095
3304 	    hn_rx_stat_int_sysctl,
3305 #else
3306 	    hn_rx_stat_u64_sysctl,
3307 #endif
3308 	    "LU", "LRO flushed");
3309 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3310 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3311 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3312 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3313 #if __FreeBSD_version >= 1100099
3314 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3315 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3316 	    hn_lro_lenlim_sysctl, "IU",
3317 	    "Max # of data bytes to be aggregated by LRO");
3318 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3319 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3320 	    hn_lro_ackcnt_sysctl, "I",
3321 	    "Max # of ACKs to be aggregated by LRO");
3322 #endif
3323 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3324 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3325 	    hn_trust_hcsum_sysctl, "I",
3326 	    "Trust tcp segment verification on host side, "
3327 	    "when csum info is missing");
3328 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3329 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3330 	    hn_trust_hcsum_sysctl, "I",
3331 	    "Trust udp datagram verification on host side, "
3332 	    "when csum info is missing");
3333 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3334 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3335 	    hn_trust_hcsum_sysctl, "I",
3336 	    "Trust ip packet verification on host side, "
3337 	    "when csum info is missing");
3338 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3339 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3340 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3341 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3342 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3343 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3344 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3345 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3346 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3347 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3348 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3349 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3350 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3351 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3352 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3353 	    hn_rx_stat_ulong_sysctl, "LU",
3354 	    "# of packets that we trust host's csum verification");
3355 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3356 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3357 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3358 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3359 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3360 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3361 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3362 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3363 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3364 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3365 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3366 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3367 
3368 	return (0);
3369 }
3370 
3371 static void
3372 hn_destroy_rx_data(struct hn_softc *sc)
3373 {
3374 	int i;
3375 
3376 	if (sc->hn_rxbuf != NULL) {
3377 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3378 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3379 		else
3380 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3381 		sc->hn_rxbuf = NULL;
3382 	}
3383 
3384 	if (sc->hn_rx_ring_cnt == 0)
3385 		return;
3386 
3387 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3388 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3389 
3390 		if (rxr->hn_br == NULL)
3391 			continue;
3392 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3393 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3394 		} else {
3395 			device_printf(sc->hn_dev,
3396 			    "%dth channel bufring is referenced\n", i);
3397 		}
3398 		rxr->hn_br = NULL;
3399 
3400 #if defined(INET) || defined(INET6)
3401 		tcp_lro_free(&rxr->hn_lro);
3402 #endif
3403 		free(rxr->hn_pktbuf, M_DEVBUF);
3404 	}
3405 	free(sc->hn_rx_ring, M_DEVBUF);
3406 	sc->hn_rx_ring = NULL;
3407 
3408 	sc->hn_rx_ring_cnt = 0;
3409 	sc->hn_rx_ring_inuse = 0;
3410 }
3411 
3412 static int
3413 hn_tx_ring_create(struct hn_softc *sc, int id)
3414 {
3415 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3416 	device_t dev = sc->hn_dev;
3417 	bus_dma_tag_t parent_dtag;
3418 	int error, i;
3419 
3420 	txr->hn_sc = sc;
3421 	txr->hn_tx_idx = id;
3422 
3423 #ifndef HN_USE_TXDESC_BUFRING
3424 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3425 #endif
3426 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3427 
3428 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3429 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3430 	    M_DEVBUF, M_WAITOK | M_ZERO);
3431 #ifndef HN_USE_TXDESC_BUFRING
3432 	SLIST_INIT(&txr->hn_txlist);
3433 #else
3434 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3435 	    M_WAITOK, &txr->hn_tx_lock);
3436 #endif
3437 
3438 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3439 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3440 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3441 	} else {
3442 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3443 	}
3444 
3445 #ifdef HN_IFSTART_SUPPORT
3446 	if (hn_use_if_start) {
3447 		txr->hn_txeof = hn_start_txeof;
3448 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3449 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3450 	} else
3451 #endif
3452 	{
3453 		int br_depth;
3454 
3455 		txr->hn_txeof = hn_xmit_txeof;
3456 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3457 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3458 
3459 		br_depth = hn_get_txswq_depth(txr);
3460 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3461 		    M_WAITOK, &txr->hn_tx_lock);
3462 	}
3463 
3464 	txr->hn_direct_tx_size = hn_direct_tx_size;
3465 
3466 	/*
3467 	 * Always schedule transmission instead of trying to do direct
3468 	 * transmission.  This gives the best performance so far.
3469 	 */
3470 	txr->hn_sched_tx = 1;
3471 
3472 	parent_dtag = bus_get_dma_tag(dev);
3473 
3474 	/* DMA tag for RNDIS packet messages. */
3475 	error = bus_dma_tag_create(parent_dtag, /* parent */
3476 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3477 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3478 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3479 	    BUS_SPACE_MAXADDR,		/* highaddr */
3480 	    NULL, NULL,			/* filter, filterarg */
3481 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3482 	    1,				/* nsegments */
3483 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3484 	    0,				/* flags */
3485 	    NULL,			/* lockfunc */
3486 	    NULL,			/* lockfuncarg */
3487 	    &txr->hn_tx_rndis_dtag);
3488 	if (error) {
3489 		device_printf(dev, "failed to create rndis dmatag\n");
3490 		return error;
3491 	}
3492 
3493 	/* DMA tag for data. */
3494 	error = bus_dma_tag_create(parent_dtag, /* parent */
3495 	    1,				/* alignment */
3496 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3497 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3498 	    BUS_SPACE_MAXADDR,		/* highaddr */
3499 	    NULL, NULL,			/* filter, filterarg */
3500 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3501 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3502 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3503 	    0,				/* flags */
3504 	    NULL,			/* lockfunc */
3505 	    NULL,			/* lockfuncarg */
3506 	    &txr->hn_tx_data_dtag);
3507 	if (error) {
3508 		device_printf(dev, "failed to create data dmatag\n");
3509 		return error;
3510 	}
3511 
3512 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3513 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3514 
3515 		txd->txr = txr;
3516 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3517 		STAILQ_INIT(&txd->agg_list);
3518 
3519 		/*
3520 		 * Allocate and load RNDIS packet message.
3521 		 */
3522 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3523 		    (void **)&txd->rndis_pkt,
3524 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3525 		    &txd->rndis_pkt_dmap);
3526 		if (error) {
3527 			device_printf(dev,
3528 			    "failed to allocate rndis_packet_msg, %d\n", i);
3529 			return error;
3530 		}
3531 
3532 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3533 		    txd->rndis_pkt_dmap,
3534 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3535 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3536 		    BUS_DMA_NOWAIT);
3537 		if (error) {
3538 			device_printf(dev,
3539 			    "failed to load rndis_packet_msg, %d\n", i);
3540 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3541 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3542 			return error;
3543 		}
3544 
3545 		/* DMA map for TX data. */
3546 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3547 		    &txd->data_dmap);
3548 		if (error) {
3549 			device_printf(dev,
3550 			    "failed to allocate tx data dmamap\n");
3551 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3552 			    txd->rndis_pkt_dmap);
3553 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3554 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3555 			return error;
3556 		}
3557 
3558 		/* All set, put it to list */
3559 		txd->flags |= HN_TXD_FLAG_ONLIST;
3560 #ifndef HN_USE_TXDESC_BUFRING
3561 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3562 #else
3563 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3564 #endif
3565 	}
3566 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3567 
3568 	if (sc->hn_tx_sysctl_tree != NULL) {
3569 		struct sysctl_oid_list *child;
3570 		struct sysctl_ctx_list *ctx;
3571 		char name[16];
3572 
3573 		/*
3574 		 * Create per TX ring sysctl tree:
3575 		 * dev.hn.UNIT.tx.RINGID
3576 		 */
3577 		ctx = device_get_sysctl_ctx(dev);
3578 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3579 
3580 		snprintf(name, sizeof(name), "%d", id);
3581 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3582 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3583 
3584 		if (txr->hn_tx_sysctl_tree != NULL) {
3585 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3586 
3587 #ifdef HN_DEBUG
3588 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3589 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3590 			    "# of available TX descs");
3591 #endif
3592 #ifdef HN_IFSTART_SUPPORT
3593 			if (!hn_use_if_start)
3594 #endif
3595 			{
3596 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3597 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3598 				    "over active");
3599 			}
3600 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3601 			    CTLFLAG_RW, &txr->hn_pkts,
3602 			    "# of packets transmitted");
3603 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3604 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3605 		}
3606 	}
3607 
3608 	return 0;
3609 }
3610 
3611 static void
3612 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3613 {
3614 	struct hn_tx_ring *txr = txd->txr;
3615 
3616 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3617 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3618 
3619 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3620 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3621 	    txd->rndis_pkt_dmap);
3622 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3623 }
3624 
3625 static void
3626 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3627 {
3628 
3629 	KASSERT(txd->refs == 0 || txd->refs == 1,
3630 	    ("invalid txd refs %d", txd->refs));
3631 
3632 	/* Aggregated txds will be freed by their aggregating txd. */
3633 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3634 		int freed;
3635 
3636 		freed = hn_txdesc_put(txr, txd);
3637 		KASSERT(freed, ("can't free txdesc"));
3638 	}
3639 }
3640 
3641 static void
3642 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3643 {
3644 	int i;
3645 
3646 	if (txr->hn_txdesc == NULL)
3647 		return;
3648 
3649 	/*
3650 	 * NOTE:
3651 	 * Because the freeing of aggregated txds will be deferred
3652 	 * to the aggregating txd, two passes are used here:
3653 	 * - The first pass GCes any pending txds.  This GC is necessary,
3654 	 *   since if the channels are revoked, the hypervisor will not
3655 	 *   deliver send-done for all pending txds.
3656 	 * - The second pass frees the busdma resources, i.e. it runs after
3657 	 *   all txds have been freed.
3658 	 */
3659 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3660 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3661 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3662 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3663 
3664 	if (txr->hn_tx_data_dtag != NULL)
3665 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3666 	if (txr->hn_tx_rndis_dtag != NULL)
3667 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3668 
3669 #ifdef HN_USE_TXDESC_BUFRING
3670 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3671 #endif
3672 
3673 	free(txr->hn_txdesc, M_DEVBUF);
3674 	txr->hn_txdesc = NULL;
3675 
3676 	if (txr->hn_mbuf_br != NULL)
3677 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3678 
3679 #ifndef HN_USE_TXDESC_BUFRING
3680 	mtx_destroy(&txr->hn_txlist_spin);
3681 #endif
3682 	mtx_destroy(&txr->hn_tx_lock);
3683 }
3684 
3685 static int
3686 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3687 {
3688 	struct sysctl_oid_list *child;
3689 	struct sysctl_ctx_list *ctx;
3690 	int i;
3691 
3692 	/*
3693 	 * Create TXBUF for chimney sending.
3694 	 *
3695 	 * NOTE: It is shared by all channels.
3696 	 */
3697 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3698 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3699 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3700 	if (sc->hn_chim == NULL) {
3701 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3702 		return (ENOMEM);
3703 	}
3704 
3705 	sc->hn_tx_ring_cnt = ring_cnt;
3706 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3707 
3708 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3709 	    M_DEVBUF, M_WAITOK | M_ZERO);
3710 
3711 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3712 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3713 
3714 	/* Create dev.hn.UNIT.tx sysctl tree */
3715 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3716 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3717 
3718 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3719 		int error;
3720 
3721 		error = hn_tx_ring_create(sc, i);
3722 		if (error)
3723 			return error;
3724 	}
3725 
3726 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3727 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3728 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3729 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3730 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3731 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3732 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3733 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3734 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3735 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3736 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3737 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3738 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3739 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3740 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3741 	    hn_tx_stat_ulong_sysctl, "LU",
3742 	    "# of packet transmission aggregation flush failures");
3743 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3744 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3745 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3746 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3747 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3748 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3749 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3750 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3751 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3752 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3753 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3754 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3755 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3756 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3757 	    "# of total TX descs");
3758 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3759 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3760 	    "Chimney send packet size upper boundary");
3761 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3762 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3763 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3764 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3765 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3766 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3767 	    hn_tx_conf_int_sysctl, "I",
3768 	    "Size of the packet for direct transmission");
3769 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3770 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3771 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3772 	    hn_tx_conf_int_sysctl, "I",
3773 	    "Always schedule transmission "
3774 	    "instead of doing direct transmission");
3775 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3776 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3777 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3778 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3779 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3780 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3781 	    "Applied packet transmission aggregation size");
3782 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3783 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3784 	    hn_txagg_pktmax_sysctl, "I",
3785 	    "Applied packet transmission aggregation packets");
3786 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3787 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3788 	    hn_txagg_align_sysctl, "I",
3789 	    "Applied packet transmission aggregation alignment");
3790 
3791 	return 0;
3792 }
3793 
3794 static void
3795 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3796 {
3797 	int i;
3798 
3799 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3800 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3801 }
3802 
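/*
 * Clamp the requested TSO size limit into the range allowed by the
 * NDIS-reported minimum SG count and maximum TSO size, then apply it
 * to the interface, excluding the Ethernet/VLAN header overhead.
 */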
3803 static void
3804 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3805 {
3806 	struct ifnet *ifp = sc->hn_ifp;
3807 	int tso_minlen;
3808 
3809 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3810 		return;
3811 
3812 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3813 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3814 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3815 
3816 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3817 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3818 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3819 
3820 	if (tso_maxlen < tso_minlen)
3821 		tso_maxlen = tso_minlen;
3822 	else if (tso_maxlen > IP_MAXPACKET)
3823 		tso_maxlen = IP_MAXPACKET;
3824 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3825 		tso_maxlen = sc->hn_ndis_tso_szmax;
3826 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3827 	if (bootverbose)
3828 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3829 }
3830 
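/*
 * Propagate the negotiated chimney size and checksum offload
 * capabilities to all TX rings, and mark the rings for HASHVAL
 * pktinfo if the device supports it.
 */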
3831 static void
3832 hn_fixup_tx_data(struct hn_softc *sc)
3833 {
3834 	uint64_t csum_assist;
3835 	int i;
3836 
3837 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3838 	if (hn_tx_chimney_size > 0 &&
3839 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3840 		hn_set_chim_size(sc, hn_tx_chimney_size);
3841 
3842 	csum_assist = 0;
3843 	if (sc->hn_caps & HN_CAP_IPCS)
3844 		csum_assist |= CSUM_IP;
3845 	if (sc->hn_caps & HN_CAP_TCP4CS)
3846 		csum_assist |= CSUM_IP_TCP;
3847 	if (sc->hn_caps & HN_CAP_UDP4CS)
3848 		csum_assist |= CSUM_IP_UDP;
3849 	if (sc->hn_caps & HN_CAP_TCP6CS)
3850 		csum_assist |= CSUM_IP6_TCP;
3851 	if (sc->hn_caps & HN_CAP_UDP6CS)
3852 		csum_assist |= CSUM_IP6_UDP;
3853 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3854 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3855 
3856 	if (sc->hn_caps & HN_CAP_HASHVAL) {
3857 		/*
3858 		 * Support HASHVAL pktinfo on TX path.
3859 		 */
3860 		if (bootverbose)
3861 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3862 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3863 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3864 	}
3865 }
3866 
3867 static void
3868 hn_destroy_tx_data(struct hn_softc *sc)
3869 {
3870 	int i;
3871 
3872 	if (sc->hn_chim != NULL) {
3873 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3874 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3875 		} else {
3876 			device_printf(sc->hn_dev,
3877 			    "chimney sending buffer is referenced\n");
3878 		}
3879 		sc->hn_chim = NULL;
3880 	}
3881 
3882 	if (sc->hn_tx_ring_cnt == 0)
3883 		return;
3884 
3885 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3886 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3887 
3888 	free(sc->hn_tx_ring, M_DEVBUF);
3889 	sc->hn_tx_ring = NULL;
3890 
3891 	sc->hn_tx_ring_cnt = 0;
3892 	sc->hn_tx_ring_inuse = 0;
3893 }
3894 
3895 #ifdef HN_IFSTART_SUPPORT
3896 
3897 static void
3898 hn_start_taskfunc(void *xtxr, int pending __unused)
3899 {
3900 	struct hn_tx_ring *txr = xtxr;
3901 
3902 	mtx_lock(&txr->hn_tx_lock);
3903 	hn_start_locked(txr, 0);
3904 	mtx_unlock(&txr->hn_tx_lock);
3905 }
3906 
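/*
 * Transmit packets dequeued from if_snd.  A positive 'len' limits
 * direct transmission to packets no longer than 'len' bytes; a larger
 * packet is put back and 1 is returned, asking the caller to dispatch
 * the remaining packets to the TX taskqueue.
 */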
3907 static int
3908 hn_start_locked(struct hn_tx_ring *txr, int len)
3909 {
3910 	struct hn_softc *sc = txr->hn_sc;
3911 	struct ifnet *ifp = sc->hn_ifp;
3912 	int sched = 0;
3913 
3914 	KASSERT(hn_use_if_start,
3915 	    ("hn_start_locked is called when if_start is disabled"));
3916 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3917 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3918 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3919 
3920 	if (__predict_false(txr->hn_suspended))
3921 		return (0);
3922 
3923 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3924 	    IFF_DRV_RUNNING)
3925 		return (0);
3926 
3927 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3928 		struct hn_txdesc *txd;
3929 		struct mbuf *m_head;
3930 		int error;
3931 
3932 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3933 		if (m_head == NULL)
3934 			break;
3935 
3936 		if (len > 0 && m_head->m_pkthdr.len > len) {
3937 			/*
3938 			 * This send could be time consuming; let callers
3939 			 * dispatch this packet (and any following packets)
3940 			 * to the TX taskqueue.
3941 			 */
3942 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3943 			sched = 1;
3944 			break;
3945 		}
3946 
3947 #if defined(INET6) || defined(INET)
3948 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3949 			m_head = hn_tso_fixup(m_head);
3950 			if (__predict_false(m_head == NULL)) {
3951 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3952 				continue;
3953 			}
3954 		}
3955 #endif
3956 
3957 		txd = hn_txdesc_get(txr);
3958 		if (txd == NULL) {
3959 			txr->hn_no_txdescs++;
3960 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3961 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3962 			break;
3963 		}
3964 
3965 		error = hn_encap(ifp, txr, txd, &m_head);
3966 		if (error) {
3967 			/* Both txd and m_head are freed */
3968 			KASSERT(txr->hn_agg_txd == NULL,
3969 			    ("encap failed w/ pending aggregating txdesc"));
3970 			continue;
3971 		}
3972 
3973 		if (txr->hn_agg_pktleft == 0) {
3974 			if (txr->hn_agg_txd != NULL) {
3975 				KASSERT(m_head == NULL,
3976 				    ("pending mbuf for aggregating txdesc"));
3977 				error = hn_flush_txagg(ifp, txr);
3978 				if (__predict_false(error)) {
3979 					atomic_set_int(&ifp->if_drv_flags,
3980 					    IFF_DRV_OACTIVE);
3981 					break;
3982 				}
3983 			} else {
3984 				KASSERT(m_head != NULL, ("mbuf was freed"));
3985 				error = hn_txpkt(ifp, txr, txd);
3986 				if (__predict_false(error)) {
3987 					/* txd is freed, but m_head is not */
3988 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3989 					atomic_set_int(&ifp->if_drv_flags,
3990 					    IFF_DRV_OACTIVE);
3991 					break;
3992 				}
3993 			}
3994 		}
3995 #ifdef INVARIANTS
3996 		else {
3997 			KASSERT(txr->hn_agg_txd != NULL,
3998 			    ("no aggregating txdesc"));
3999 			KASSERT(m_head == NULL,
4000 			    ("pending mbuf for aggregating txdesc"));
4001 		}
4002 #endif
4003 	}
4004 
4005 	/* Flush pending aggregated transmission. */
4006 	if (txr->hn_agg_txd != NULL)
4007 		hn_flush_txagg(ifp, txr);
4008 	return (sched);
4009 }
4010 
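/*
 * if_start method: try direct transmission on the first TX ring and
 * fall back to the TX taskqueue if the ring lock is contended or
 * direct transmission is cut short.
 */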
4011 static void
4012 hn_start(struct ifnet *ifp)
4013 {
4014 	struct hn_softc *sc = ifp->if_softc;
4015 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4016 
4017 	if (txr->hn_sched_tx)
4018 		goto do_sched;
4019 
4020 	if (mtx_trylock(&txr->hn_tx_lock)) {
4021 		int sched;
4022 
4023 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4024 		mtx_unlock(&txr->hn_tx_lock);
4025 		if (!sched)
4026 			return;
4027 	}
4028 do_sched:
4029 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4030 }
4031 
4032 static void
4033 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4034 {
4035 	struct hn_tx_ring *txr = xtxr;
4036 
4037 	mtx_lock(&txr->hn_tx_lock);
4038 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4039 	hn_start_locked(txr, 0);
4040 	mtx_unlock(&txr->hn_tx_lock);
4041 }
4042 
4043 static void
4044 hn_start_txeof(struct hn_tx_ring *txr)
4045 {
4046 	struct hn_softc *sc = txr->hn_sc;
4047 	struct ifnet *ifp = sc->hn_ifp;
4048 
4049 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4050 
4051 	if (txr->hn_sched_tx)
4052 		goto do_sched;
4053 
4054 	if (mtx_trylock(&txr->hn_tx_lock)) {
4055 		int sched;
4056 
4057 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4058 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4059 		mtx_unlock(&txr->hn_tx_lock);
4060 		if (sched) {
4061 			taskqueue_enqueue(txr->hn_tx_taskq,
4062 			    &txr->hn_tx_task);
4063 		}
4064 	} else {
4065 do_sched:
4066 		/*
4067 		 * Release OACTIVE early, in the hope that others
4068 		 * could catch up.  The task will clear the flag
4069 		 * again while holding hn_tx_lock to avoid possible
4070 		 * races.
4071 		 */
4072 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4073 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4074 	}
4075 }
4076 
4077 #endif	/* HN_IFSTART_SUPPORT */
4078 
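/*
 * Multiqueue counterpart of hn_start_locked(): drain this ring's mbuf
 * buf_ring.  A positive 'len' limits direct transmission; a larger
 * packet is put back and 1 is returned, asking the caller to dispatch
 * the rest to the TX taskqueue.
 */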
4079 static int
4080 hn_xmit(struct hn_tx_ring *txr, int len)
4081 {
4082 	struct hn_softc *sc = txr->hn_sc;
4083 	struct ifnet *ifp = sc->hn_ifp;
4084 	struct mbuf *m_head;
4085 	int sched = 0;
4086 
4087 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4088 #ifdef HN_IFSTART_SUPPORT
4089 	KASSERT(hn_use_if_start == 0,
4090 	    ("hn_xmit is called when if_start is enabled"));
4091 #endif
4092 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4093 
4094 	if (__predict_false(txr->hn_suspended))
4095 		return (0);
4096 
4097 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4098 		return (0);
4099 
4100 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4101 		struct hn_txdesc *txd;
4102 		int error;
4103 
4104 		if (len > 0 && m_head->m_pkthdr.len > len) {
4105 			/*
4106 			 * This send could be time consuming; let callers
4107 			 * dispatch this packet (and any following packets)
4108 			 * to the TX taskqueue.
4109 			 */
4110 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4111 			sched = 1;
4112 			break;
4113 		}
4114 
4115 		txd = hn_txdesc_get(txr);
4116 		if (txd == NULL) {
4117 			txr->hn_no_txdescs++;
4118 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4119 			txr->hn_oactive = 1;
4120 			break;
4121 		}
4122 
4123 		error = hn_encap(ifp, txr, txd, &m_head);
4124 		if (error) {
4125 			/* Both txd and m_head are freed; discard */
4126 			KASSERT(txr->hn_agg_txd == NULL,
4127 			    ("encap failed w/ pending aggregating txdesc"));
4128 			drbr_advance(ifp, txr->hn_mbuf_br);
4129 			continue;
4130 		}
4131 
4132 		if (txr->hn_agg_pktleft == 0) {
4133 			if (txr->hn_agg_txd != NULL) {
4134 				KASSERT(m_head == NULL,
4135 				    ("pending mbuf for aggregating txdesc"));
4136 				error = hn_flush_txagg(ifp, txr);
4137 				if (__predict_false(error)) {
4138 					txr->hn_oactive = 1;
4139 					break;
4140 				}
4141 			} else {
4142 				KASSERT(m_head != NULL, ("mbuf was freed"));
4143 				error = hn_txpkt(ifp, txr, txd);
4144 				if (__predict_false(error)) {
4145 					/* txd is freed, but m_head is not */
4146 					drbr_putback(ifp, txr->hn_mbuf_br,
4147 					    m_head);
4148 					txr->hn_oactive = 1;
4149 					break;
4150 				}
4151 			}
4152 		}
4153 #ifdef INVARIANTS
4154 		else {
4155 			KASSERT(txr->hn_agg_txd != NULL,
4156 			    ("no aggregating txdesc"));
4157 			KASSERT(m_head == NULL,
4158 			    ("pending mbuf for aggregating txdesc"));
4159 		}
4160 #endif
4161 
4162 		/* Sent */
4163 		drbr_advance(ifp, txr->hn_mbuf_br);
4164 	}
4165 
4166 	/* Flush pending aggregated transmission. */
4167 	if (txr->hn_agg_txd != NULL)
4168 		hn_flush_txagg(ifp, txr);
4169 	return (sched);
4170 }
4171 
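/*
 * if_transmit method: select a TX ring from the mbuf's flow id,
 * enqueue the mbuf onto that ring's buf_ring, and transmit directly
 * when the ring lock is uncontended; otherwise defer to the ring's
 * TX taskqueue.
 */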
4172 static int
4173 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4174 {
4175 	struct hn_softc *sc = ifp->if_softc;
4176 	struct hn_tx_ring *txr;
4177 	int error, idx = 0;
4178 
4179 #if defined(INET6) || defined(INET)
4180 	/*
4181 	 * Perform TSO packet header fixup now, since the TSO
4182 	 * packet header should be cache-hot.
4183 	 */
4184 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4185 		m = hn_tso_fixup(m);
4186 		if (__predict_false(m == NULL)) {
4187 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4188 			return EIO;
4189 		}
4190 	}
4191 #endif
4192 
4193 	/*
4194 	 * Select the TX ring based on flowid
4195 	 */
4196 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4197 #ifdef RSS
4198 		uint32_t bid;
4199 
4200 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4201 		    &bid) == 0)
4202 			idx = bid % sc->hn_tx_ring_inuse;
4203 		else
4204 #endif
4205 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4206 	}
4207 	txr = &sc->hn_tx_ring[idx];
4208 
4209 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4210 	if (error) {
4211 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4212 		return error;
4213 	}
4214 
4215 	if (txr->hn_oactive)
4216 		return 0;
4217 
4218 	if (txr->hn_sched_tx)
4219 		goto do_sched;
4220 
4221 	if (mtx_trylock(&txr->hn_tx_lock)) {
4222 		int sched;
4223 
4224 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4225 		mtx_unlock(&txr->hn_tx_lock);
4226 		if (!sched)
4227 			return 0;
4228 	}
4229 do_sched:
4230 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4231 	return 0;
4232 }
4233 
4234 static void
4235 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4236 {
4237 	struct mbuf *m;
4238 
4239 	mtx_lock(&txr->hn_tx_lock);
4240 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4241 		m_freem(m);
4242 	mtx_unlock(&txr->hn_tx_lock);
4243 }
4244 
4245 static void
4246 hn_xmit_qflush(struct ifnet *ifp)
4247 {
4248 	struct hn_softc *sc = ifp->if_softc;
4249 	int i;
4250 
4251 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4252 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4253 	if_qflush(ifp);
4254 }
4255 
4256 static void
4257 hn_xmit_txeof(struct hn_tx_ring *txr)
4258 {
4259 
4260 	if (txr->hn_sched_tx)
4261 		goto do_sched;
4262 
4263 	if (mtx_trylock(&txr->hn_tx_lock)) {
4264 		int sched;
4265 
4266 		txr->hn_oactive = 0;
4267 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4268 		mtx_unlock(&txr->hn_tx_lock);
4269 		if (sched) {
4270 			taskqueue_enqueue(txr->hn_tx_taskq,
4271 			    &txr->hn_tx_task);
4272 		}
4273 	} else {
4274 do_sched:
4275 		/*
4276 		 * Release oactive early, in the hope that others
4277 		 * could catch up.  The task will clear oactive
4278 		 * again while holding hn_tx_lock to avoid possible
4279 		 * races.
4280 		 */
4281 		txr->hn_oactive = 0;
4282 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4283 	}
4284 }
4285 
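/*
 * Deferred transmission task: drain this ring's mbuf buf_ring under
 * the TX lock, without any direct transmission size limit.
 */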
4286 static void
4287 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4288 {
4289 	struct hn_tx_ring *txr = xtxr;
4290 
4291 	mtx_lock(&txr->hn_tx_lock);
4292 	hn_xmit(txr, 0);
4293 	mtx_unlock(&txr->hn_tx_lock);
4294 }
4295 
4296 static void
4297 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4298 {
4299 	struct hn_tx_ring *txr = xtxr;
4300 
4301 	mtx_lock(&txr->hn_tx_lock);
4302 	txr->hn_oactive = 0;
4303 	hn_xmit(txr, 0);
4304 	mtx_unlock(&txr->hn_tx_lock);
4305 }
4306 
4307 static int
4308 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4309 {
4310 	struct vmbus_chan_br cbr;
4311 	struct hn_rx_ring *rxr;
4312 	struct hn_tx_ring *txr = NULL;
4313 	int idx, error;
4314 
4315 	idx = vmbus_chan_subidx(chan);
4316 
4317 	/*
4318 	 * Link this channel to RX/TX ring.
4319 	 */
4320 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4321 	    ("invalid channel index %d, should be >= 0 && < %d",
4322 	     idx, sc->hn_rx_ring_inuse));
4323 	rxr = &sc->hn_rx_ring[idx];
4324 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4325 	    ("RX ring %d already attached", idx));
4326 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4327 
4328 	if (bootverbose) {
4329 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4330 		    idx, vmbus_chan_id(chan));
4331 	}
4332 
4333 	if (idx < sc->hn_tx_ring_inuse) {
4334 		txr = &sc->hn_tx_ring[idx];
4335 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4336 		    ("TX ring %d already attached", idx));
4337 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4338 
4339 		txr->hn_chan = chan;
4340 		if (bootverbose) {
4341 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4342 			    idx, vmbus_chan_id(chan));
4343 		}
4344 	}
4345 
4346 	/* Bind this channel to a proper CPU. */
4347 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4348 
4349 	/*
4350 	 * Open this channel
4351 	 */
4352 	cbr.cbr = rxr->hn_br;
4353 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4354 	cbr.cbr_txsz = HN_TXBR_SIZE;
4355 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4356 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4357 	if (error) {
4358 		if (error == EISCONN) {
4359 			if_printf(sc->hn_ifp, "bufring is connected after "
4360 			    "chan%u open failure\n", vmbus_chan_id(chan));
4361 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4362 		} else {
4363 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4364 			    vmbus_chan_id(chan), error);
4365 		}
4366 	}
4367 	return (error);
4368 }
4369 
4370 static void
4371 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4372 {
4373 	struct hn_rx_ring *rxr;
4374 	int idx, error;
4375 
4376 	idx = vmbus_chan_subidx(chan);
4377 
4378 	/*
4379 	 * Link this channel to RX/TX ring.
4380 	 */
4381 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4382 	    ("invalid channel index %d, should be >= 0 && < %d",
4383 	     idx, sc->hn_rx_ring_inuse));
4384 	rxr = &sc->hn_rx_ring[idx];
4385 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4386 	    ("RX ring %d is not attached", idx));
4387 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4388 
4389 	if (idx < sc->hn_tx_ring_inuse) {
4390 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4391 
4392 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4393 		    ("TX ring %d is not attached", idx));
4394 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4395 	}
4396 
4397 	/*
4398 	 * Close this channel.
4399 	 *
4400 	 * NOTE:
4401 	 * Channel closing does _not_ destroy the target channel.
4402 	 */
4403 	error = vmbus_chan_close_direct(chan);
4404 	if (error == EISCONN) {
4405 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4406 		    "after being closed\n", vmbus_chan_id(chan));
4407 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4408 	} else if (error) {
4409 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4410 		    vmbus_chan_id(chan), error);
4411 	}
4412 }
4413 
4414 static int
4415 hn_attach_subchans(struct hn_softc *sc)
4416 {
4417 	struct vmbus_channel **subchans;
4418 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4419 	int i, error = 0;
4420 
4421 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4422 
4423 	/* Attach the sub-channels. */
4424 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4425 	for (i = 0; i < subchan_cnt; ++i) {
4426 		int error1;
4427 
4428 		error1 = hn_chan_attach(sc, subchans[i]);
4429 		if (error1) {
4430 			error = error1;
4431 			/* Move on; all channels will be detached later. */
4432 		}
4433 	}
4434 	vmbus_subchan_rel(subchans, subchan_cnt);
4435 
4436 	if (error) {
4437 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4438 	} else {
4439 		if (bootverbose) {
4440 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4441 			    subchan_cnt);
4442 		}
4443 	}
4444 	return (error);
4445 }
4446 
4447 static void
4448 hn_detach_allchans(struct hn_softc *sc)
4449 {
4450 	struct vmbus_channel **subchans;
4451 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4452 	int i;
4453 
4454 	if (subchan_cnt == 0)
4455 		goto back;
4456 
4457 	/* Detach the sub-channels. */
4458 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4459 	for (i = 0; i < subchan_cnt; ++i)
4460 		hn_chan_detach(sc, subchans[i]);
4461 	vmbus_subchan_rel(subchans, subchan_cnt);
4462 
4463 back:
4464 	/*
4465 	 * Detach the primary channel, _after_ all sub-channels
4466 	 * are detached.
4467 	 */
4468 	hn_chan_detach(sc, sc->hn_prichan);
4469 
4470 	/* Wait for sub-channels to be destroyed, if any. */
4471 	vmbus_subchan_drain(sc->hn_prichan);
4472 
4473 #ifdef INVARIANTS
4474 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4475 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4476 		    HN_RX_FLAG_ATTACHED) == 0,
4477 		    ("%dth RX ring is still attached", i));
4478 	}
4479 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4480 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4481 		    HN_TX_FLAG_ATTACHED) == 0,
4482 		    ("%dth TX ring is still attached", i));
4483 	}
4484 #endif
4485 }
4486 
4487 static int
4488 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4489 {
4490 	struct vmbus_channel **subchans;
4491 	int nchan, rxr_cnt, error;
4492 
4493 	nchan = *nsubch + 1;
4494 	if (nchan == 1) {
4495 		/*
4496 		 * Multiple RX/TX rings are not requested.
4497 		 */
4498 		*nsubch = 0;
4499 		return (0);
4500 	}
4501 
4502 	/*
4503 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4504 	 * table entries.
4505 	 */
4506 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4507 	if (error) {
4508 		/* No RSS; this is benign. */
4509 		*nsubch = 0;
4510 		return (0);
4511 	}
4512 	if (bootverbose) {
4513 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4514 		    rxr_cnt, nchan);
4515 	}
4516 
4517 	if (nchan > rxr_cnt)
4518 		nchan = rxr_cnt;
4519 	if (nchan == 1) {
4520 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4521 		*nsubch = 0;
4522 		return (0);
4523 	}
4524 
4525 	/*
4526 	 * Allocate sub-channels from NVS.
4527 	 */
4528 	*nsubch = nchan - 1;
4529 	error = hn_nvs_alloc_subchans(sc, nsubch);
4530 	if (error || *nsubch == 0) {
4531 		/* Failed to allocate sub-channels. */
4532 		*nsubch = 0;
4533 		return (0);
4534 	}
4535 
4536 	/*
4537 	 * Wait for all sub-channels to become ready before moving on.
4538 	 */
4539 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4540 	vmbus_subchan_rel(subchans, *nsubch);
4541 	return (0);
4542 }
4543 
4544 static bool
4545 hn_synth_attachable(const struct hn_softc *sc)
4546 {
4547 	int i;
4548 
4549 	if (sc->hn_flags & HN_FLAG_ERRORS)
4550 		return (false);
4551 
4552 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4553 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4554 
4555 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4556 			return (false);
4557 	}
4558 	return (true);
4559 }
4560 
4561 static int
4562 hn_synth_attach(struct hn_softc *sc, int mtu)
4563 {
4564 #define ATTACHED_NVS		0x0002
4565 #define ATTACHED_RNDIS		0x0004
4566 
4567 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4568 	int error, nsubch, nchan, i;
4569 	uint32_t old_caps, attached = 0;
4570 
4571 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4572 	    ("synthetic parts were attached"));
4573 
4574 	if (!hn_synth_attachable(sc))
4575 		return (ENXIO);
4576 
4577 	/* Save capabilities for later verification. */
4578 	old_caps = sc->hn_caps;
4579 	sc->hn_caps = 0;
4580 
4581 	/* Clear RSS state. */
4582 	sc->hn_rss_ind_size = 0;
4583 	sc->hn_rss_hash = 0;
4584 
4585 	/*
4586 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4587 	 */
4588 	error = hn_chan_attach(sc, sc->hn_prichan);
4589 	if (error)
4590 		goto failed;
4591 
4592 	/*
4593 	 * Attach NVS.
4594 	 */
4595 	error = hn_nvs_attach(sc, mtu);
4596 	if (error)
4597 		goto failed;
4598 	attached |= ATTACHED_NVS;
4599 
4600 	/*
4601 	 * Attach RNDIS _after_ NVS is attached.
4602 	 */
4603 	error = hn_rndis_attach(sc, mtu);
4604 	if (error)
4605 		goto failed;
4606 	attached |= ATTACHED_RNDIS;
4607 
4608 	/*
4609 	 * Make sure capabilities are not changed.
4610 	 */
4611 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4612 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4613 		    old_caps, sc->hn_caps);
4614 		error = ENXIO;
4615 		goto failed;
4616 	}
4617 
4618 	/*
4619 	 * Allocate sub-channels for multi-TX/RX rings.
4620 	 *
4621 	 * NOTE:
4622 	 * The # of RX rings that can be used is equivalent to the # of
4623 	 * channels to be requested.
4624 	 */
4625 	nsubch = sc->hn_rx_ring_cnt - 1;
4626 	error = hn_synth_alloc_subchans(sc, &nsubch);
4627 	if (error)
4628 		goto failed;
4629 	/* NOTE: _Full_ synthetic parts detach is required now. */
4630 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4631 
4632 	/*
4633 	 * Set the # of TX/RX rings that could be used according to
4634 	 * the # of channels that NVS offered.
4635 	 */
4636 	nchan = nsubch + 1;
4637 	hn_set_ring_inuse(sc, nchan);
4638 	if (nchan == 1) {
4639 		/* Only the primary channel can be used; done */
4640 		goto back;
4641 	}
4642 
4643 	/*
4644 	 * Attach the sub-channels.
4645 	 *
4646 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4647 	 */
4648 	error = hn_attach_subchans(sc);
4649 	if (error)
4650 		goto failed;
4651 
4652 	/*
4653 	 * Configure RSS key and indirect table _after_ all sub-channels
4654 	 * are attached.
4655 	 */
4656 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4657 		/*
4658 		 * RSS key is not set yet; set it to the default RSS key.
4659 		 */
4660 		if (bootverbose)
4661 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4662 #ifdef RSS
4663 		rss_getkey(rss->rss_key);
4664 #else
4665 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4666 #endif
4667 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4668 	}
4669 
4670 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4671 		/*
4672 		 * RSS indirect table is not set yet; set it up in round-
4673 		 * robin fashion.
4674 		 */
4675 		if (bootverbose) {
4676 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4677 			    "table\n");
4678 		}
4679 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4680 			uint32_t subidx;
4681 
4682 #ifdef RSS
4683 			subidx = rss_get_indirection_to_bucket(i);
4684 #else
4685 			subidx = i;
4686 #endif
4687 			rss->rss_ind[i] = subidx % nchan;
4688 		}
4689 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4690 	} else {
4691 		/*
4692 		 * # of usable channels may be changed, so we have to
4693 		 * make sure that all entries in RSS indirect table
4694 		 * are valid.
4695 		 *
4696 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4697 		 */
4698 		hn_rss_ind_fixup(sc);
4699 	}
4700 
4701 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4702 	if (error)
4703 		goto failed;
4704 back:
4705 	/*
4706 	 * Fixup transmission aggregation setup.
4707 	 */
4708 	hn_set_txagg(sc);
4709 	return (0);
4710 
4711 failed:
4712 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4713 		hn_synth_detach(sc);
4714 	} else {
4715 		if (attached & ATTACHED_RNDIS)
4716 			hn_rndis_detach(sc);
4717 		if (attached & ATTACHED_NVS)
4718 			hn_nvs_detach(sc);
4719 		hn_chan_detach(sc, sc->hn_prichan);
4720 		/* Restore old capabilities. */
4721 		sc->hn_caps = old_caps;
4722 	}
4723 	return (error);
4724 
4725 #undef ATTACHED_RNDIS
4726 #undef ATTACHED_NVS
4727 }
4728 
4729 /*
4730  * NOTE:
4731  * The interface must have been suspended through hn_suspend(), before
4732  * this function gets called.
4733  */
4734 static void
4735 hn_synth_detach(struct hn_softc *sc)
4736 {
4737 
4738 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4739 	    ("synthetic parts were not attached"));
4740 
4741 	/* Detach the RNDIS first. */
4742 	hn_rndis_detach(sc);
4743 
4744 	/* Detach NVS. */
4745 	hn_nvs_detach(sc);
4746 
4747 	/* Detach all of the channels. */
4748 	hn_detach_allchans(sc);
4749 
4750 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4751 }
4752 
4753 static void
4754 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4755 {
4756 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4757 	    ("invalid ring count %d", ring_cnt));
4758 
4759 	if (sc->hn_tx_ring_cnt > ring_cnt)
4760 		sc->hn_tx_ring_inuse = ring_cnt;
4761 	else
4762 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4763 	sc->hn_rx_ring_inuse = ring_cnt;
4764 
4765 #ifdef RSS
4766 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4767 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4768 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4769 		    rss_getnumbuckets());
4770 	}
4771 #endif
4772 
4773 	if (bootverbose) {
4774 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4775 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4776 	}
4777 }
4778 
4779 static void
4780 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4781 {
4782 
4783 	/*
4784 	 * NOTE:
4785 	 * The TX bufring will not be drained by the hypervisor
4786 	 * if the primary channel is revoked.
4787 	 */
4788 	while (!vmbus_chan_rx_empty(chan) ||
4789 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4790 	     !vmbus_chan_tx_empty(chan)))
4791 		pause("waitch", 1);
4792 	vmbus_chan_intr_drain(chan);
4793 }
4794 
4795 static void
4796 hn_suspend_data(struct hn_softc *sc)
4797 {
4798 	struct vmbus_channel **subch = NULL;
4799 	struct hn_tx_ring *txr;
4800 	int i, nsubch;
4801 
4802 	HN_LOCK_ASSERT(sc);
4803 
4804 	/*
4805 	 * Suspend TX.
4806 	 */
4807 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4808 		txr = &sc->hn_tx_ring[i];
4809 
4810 		mtx_lock(&txr->hn_tx_lock);
4811 		txr->hn_suspended = 1;
4812 		mtx_unlock(&txr->hn_tx_lock);
4813 		/* No one is able to send more packets now. */
4814 
4815 		/*
4816 		 * Wait for all pending sends to finish.
4817 		 *
4818 		 * NOTE:
4819 		 * We will _not_ receive all pending send-dones if the
4820 		 * primary channel is revoked.
4821 		 */
4822 		while (hn_tx_ring_pending(txr) &&
4823 		    !vmbus_chan_is_revoked(sc->hn_prichan))
4824 			pause("hnwtx", 1 /* 1 tick */);
4825 	}
4826 
4827 	/*
4828 	 * Disable RX by clearing RX filter.
4829 	 */
4830 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4831 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4832 
4833 	/*
4834 	 * Give RNDIS enough time to flush all pending data packets.
4835 	 */
4836 	pause("waitrx", (200 * hz) / 1000);
4837 
4838 	/*
4839 	 * Drain RX/TX bufrings and interrupts.
4840 	 */
4841 	nsubch = sc->hn_rx_ring_inuse - 1;
4842 	if (nsubch > 0)
4843 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4844 
4845 	if (subch != NULL) {
4846 		for (i = 0; i < nsubch; ++i)
4847 			hn_chan_drain(sc, subch[i]);
4848 	}
4849 	hn_chan_drain(sc, sc->hn_prichan);
4850 
4851 	if (subch != NULL)
4852 		vmbus_subchan_rel(subch, nsubch);
4853 
4854 	/*
4855 	 * Drain any pending TX tasks.
4856 	 *
4857 	 * NOTE:
4858 	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4859 	 * tasks will have to be drained _after_ the above hn_chan_drain()
4860 	 * calls.
4861 	 */
4862 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4863 		txr = &sc->hn_tx_ring[i];
4864 
4865 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4866 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4867 	}
4868 }
4869 
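/*
 * Executed on the primary channel's task context (see
 * hn_suspend_mgmt()) to make hn_mgmt_taskq unreachable.
 */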
4870 static void
4871 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4872 {
4873 
4874 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4875 }
4876 
4877 static void
4878 hn_suspend_mgmt(struct hn_softc *sc)
4879 {
4880 	struct task task;
4881 
4882 	HN_LOCK_ASSERT(sc);
4883 
4884 	/*
4885 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4886 	 * through hn_mgmt_taskq.
4887 	 */
4888 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4889 	vmbus_chan_run_task(sc->hn_prichan, &task);
4890 
4891 	/*
4892 	 * Make sure that all pending management tasks are completed.
4893 	 */
4894 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4895 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4896 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4897 }
4898 
4899 static void
4900 hn_suspend(struct hn_softc *sc)
4901 {
4902 
4903 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4904 		hn_suspend_data(sc);
4905 	hn_suspend_mgmt(sc);
4906 }
4907 
4908 static void
4909 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4910 {
4911 	int i;
4912 
4913 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4914 	    ("invalid TX ring count %d", tx_ring_cnt));
4915 
4916 	for (i = 0; i < tx_ring_cnt; ++i) {
4917 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4918 
4919 		mtx_lock(&txr->hn_tx_lock);
4920 		txr->hn_suspended = 0;
4921 		mtx_unlock(&txr->hn_tx_lock);
4922 	}
4923 }
4924 
4925 static void
4926 hn_resume_data(struct hn_softc *sc)
4927 {
4928 	int i;
4929 
4930 	HN_LOCK_ASSERT(sc);
4931 
4932 	/*
4933 	 * Re-enable RX.
4934 	 */
4935 	hn_set_rxfilter(sc);
4936 
4937 	/*
4938 	 * Make sure to clear suspend status on "all" TX rings,
4939 	 * since hn_tx_ring_inuse can be changed after
4940 	 * hn_suspend_data().
4941 	 */
4942 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4943 
4944 #ifdef HN_IFSTART_SUPPORT
4945 	if (!hn_use_if_start)
4946 #endif
4947 	{
4948 		/*
4949 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4950 		 * reduced.
4951 		 */
4952 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4953 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4954 	}
4955 
4956 	/*
4957 	 * Kick start TX.
4958 	 */
4959 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4960 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4961 
4962 		/*
4963 		 * Use txeof task, so that any pending oactive can be
4964 		 * cleared properly.
4965 		 */
4966 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4967 	}
4968 }
4969 
4970 static void
4971 hn_resume_mgmt(struct hn_softc *sc)
4972 {
4973 
4974 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4975 
4976 	/*
4977 	 * Kick off network change detection, if it was pending.
4978 	 * If no network change was pending, start link status
4979 	 * checks, which is more lightweight than network change
4980 	 * detection.
4981 	 */
4982 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4983 		hn_change_network(sc);
4984 	else
4985 		hn_update_link_status(sc);
4986 }
4987 
4988 static void
4989 hn_resume(struct hn_softc *sc)
4990 {
4991 
4992 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4993 		hn_resume_data(sc);
4994 	hn_resume_mgmt(sc);
4995 }
4996 
4997 static void
4998 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4999 {
5000 	const struct rndis_status_msg *msg;
5001 	int ofs;
5002 
5003 	if (dlen < sizeof(*msg)) {
5004 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5005 		return;
5006 	}
5007 	msg = data;
5008 
5009 	switch (msg->rm_status) {
5010 	case RNDIS_STATUS_MEDIA_CONNECT:
5011 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5012 		hn_update_link_status(sc);
5013 		break;
5014 
5015 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5016 		/* Not really useful; ignore. */
5017 		break;
5018 
5019 	case RNDIS_STATUS_NETWORK_CHANGE:
5020 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5021 		if (dlen < ofs + msg->rm_stbuflen ||
5022 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5023 			if_printf(sc->hn_ifp, "network changed\n");
5024 		} else {
5025 			uint32_t change;
5026 
5027 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5028 			    sizeof(change));
5029 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5030 			    change);
5031 		}
5032 		hn_change_network(sc);
5033 		break;
5034 
5035 	default:
5036 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5037 		    msg->rm_status);
5038 		break;
5039 	}
5040 }
5041 
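/*
 * Walk the RNDIS per-packet-info list and record the VLAN, checksum
 * and hash information in 'info'.  Stop early once all known types
 * have been found.
 */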
5042 static int
5043 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5044 {
5045 	const struct rndis_pktinfo *pi = info_data;
5046 	uint32_t mask = 0;
5047 
5048 	while (info_dlen != 0) {
5049 		const void *data;
5050 		uint32_t dlen;
5051 
5052 		if (__predict_false(info_dlen < sizeof(*pi)))
5053 			return (EINVAL);
5054 		if (__predict_false(info_dlen < pi->rm_size))
5055 			return (EINVAL);
5056 		info_dlen -= pi->rm_size;
5057 
5058 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5059 			return (EINVAL);
5060 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5061 			return (EINVAL);
5062 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5063 		data = pi->rm_data;
5064 
5065 		switch (pi->rm_type) {
5066 		case NDIS_PKTINFO_TYPE_VLAN:
5067 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5068 				return (EINVAL);
5069 			info->vlan_info = *((const uint32_t *)data);
5070 			mask |= HN_RXINFO_VLAN;
5071 			break;
5072 
5073 		case NDIS_PKTINFO_TYPE_CSUM:
5074 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5075 				return (EINVAL);
5076 			info->csum_info = *((const uint32_t *)data);
5077 			mask |= HN_RXINFO_CSUM;
5078 			break;
5079 
5080 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5081 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5082 				return (EINVAL);
5083 			info->hash_value = *((const uint32_t *)data);
5084 			mask |= HN_RXINFO_HASHVAL;
5085 			break;
5086 
5087 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5088 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5089 				return (EINVAL);
5090 			info->hash_info = *((const uint32_t *)data);
5091 			mask |= HN_RXINFO_HASHINF;
5092 			break;
5093 
5094 		default:
5095 			goto next;
5096 		}
5097 
5098 		if (mask == HN_RXINFO_ALL) {
5099 			/* All found; done */
5100 			break;
5101 		}
5102 next:
5103 		pi = (const struct rndis_pktinfo *)
5104 		    ((const uint8_t *)pi + pi->rm_size);
5105 	}
5106 
5107 	/*
5108 	 * Final fixup.
5109 	 * - If there is no hash value, invalidate the hash info.
5110 	 */
5111 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5112 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5113 	return (0);
5114 }
5115 
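/*
 * Return true if [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */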
5116 static __inline bool
5117 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5118 {
5119 
5120 	if (off < check_off) {
5121 		if (__predict_true(off + len <= check_off))
5122 			return (false);
5123 	} else if (off > check_off) {
5124 		if (__predict_true(check_off + check_len <= off))
5125 			return (false);
5126 	}
5127 	return (true);
5128 }
5129 
5130 static void
5131 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5132 {
5133 	const struct rndis_packet_msg *pkt;
5134 	struct hn_rxinfo info;
5135 	int data_off, pktinfo_off, data_len, pktinfo_len;
5136 
5137 	/*
5138 	 * Check length.
5139 	 */
5140 	if (__predict_false(dlen < sizeof(*pkt))) {
5141 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5142 		return;
5143 	}
5144 	pkt = data;
5145 
5146 	if (__predict_false(dlen < pkt->rm_len)) {
5147 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5148 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5149 		return;
5150 	}
5151 	if (__predict_false(pkt->rm_len <
5152 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5153 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5154 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5155 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5156 		    pkt->rm_pktinfolen);
5157 		return;
5158 	}
5159 	if (__predict_false(pkt->rm_datalen == 0)) {
5160 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5161 		return;
5162 	}
5163 
5164 	/*
5165 	 * Check offsets.
5166 	 */
5167 #define IS_OFFSET_INVALID(ofs)			\
5168 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5169 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5170 
5171 	/* XXX Hyper-V does not meet data offset alignment requirement */
5172 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5173 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5174 		    "data offset %u\n", pkt->rm_dataoffset);
5175 		return;
5176 	}
5177 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5178 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5179 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5180 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5181 		return;
5182 	}
5183 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5184 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5185 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5186 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5187 		return;
5188 	}
5189 
5190 #undef IS_OFFSET_INVALID
5191 
5192 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5193 	data_len = pkt->rm_datalen;
5194 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5195 	pktinfo_len = pkt->rm_pktinfolen;
5196 
5197 	/*
5198 	 * Check OOB coverage.
5199 	 */
5200 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5201 		int oob_off, oob_len;
5202 
5203 		if_printf(rxr->hn_ifp, "got oobdata\n");
5204 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5205 		oob_len = pkt->rm_oobdatalen;
5206 
5207 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5208 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5209 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5210 			    pkt->rm_len, oob_off, oob_len);
5211 			return;
5212 		}
5213 
5214 		/*
5215 		 * Check against data.
5216 		 */
5217 		if (hn_rndis_check_overlap(oob_off, oob_len,
5218 		    data_off, data_len)) {
5219 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5220 			    "oob overlaps data, oob abs %d len %d, "
5221 			    "data abs %d len %d\n",
5222 			    oob_off, oob_len, data_off, data_len);
5223 			return;
5224 		}
5225 
5226 		/*
5227 		 * Check against pktinfo.
5228 		 */
5229 		if (pktinfo_len != 0 &&
5230 		    hn_rndis_check_overlap(oob_off, oob_len,
5231 		    pktinfo_off, pktinfo_len)) {
5232 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5233 			    "oob overlaps pktinfo, oob abs %d len %d, "
5234 			    "pktinfo abs %d len %d\n",
5235 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5236 			return;
5237 		}
5238 	}
5239 
5240 	/*
5241 	 * Check per-packet-info coverage and find useful per-packet-info.
5242 	 */
5243 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5244 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5245 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5246 	if (__predict_true(pktinfo_len != 0)) {
5247 		bool overlap;
5248 		int error;
5249 
5250 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5251 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5252 			    "pktinfo overflow, msglen %u, "
5253 			    "pktinfo abs %d len %d\n",
5254 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5255 			return;
5256 		}
5257 
5258 		/*
5259 		 * Check packet info coverage.
5260 		 */
5261 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5262 		    data_off, data_len);
5263 		if (__predict_false(overlap)) {
5264 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5265 			    "pktinfo overlap data, pktinfo abs %d len %d, "
5266 			    "data abs %d len %d\n",
5267 			    pktinfo_off, pktinfo_len, data_off, data_len);
5268 			return;
5269 		}
5270 
5271 		/*
5272 		 * Find useful per-packet-info.
5273 		 */
5274 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5275 		    pktinfo_len, &info);
5276 		if (__predict_false(error)) {
5277 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5278 			    "pktinfo\n");
5279 			return;
5280 		}
5281 	}
5282 
5283 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5284 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5285 		    "data overflow, msglen %u, data abs %d len %d\n",
5286 		    pkt->rm_len, data_off, data_len);
5287 		return;
5288 	}
5289 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5290 }
5291 
5292 static __inline void
5293 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5294 {
5295 	const struct rndis_msghdr *hdr;
5296 
5297 	if (__predict_false(dlen < sizeof(*hdr))) {
5298 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5299 		return;
5300 	}
5301 	hdr = data;
5302 
5303 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5304 		/* Hot data path. */
5305 		hn_rndis_rx_data(rxr, data, dlen);
5306 		/* Done! */
5307 		return;
5308 	}
5309 
5310 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5311 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5312 	else
5313 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5314 }
5315 
5316 static void
5317 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5318 {
5319 	const struct hn_nvs_hdr *hdr;
5320 
5321 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5322 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5323 		return;
5324 	}
5325 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5326 
5327 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5328 		/* Useless; ignore */
5329 		return;
5330 	}
5331 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5332 }
5333 
5334 static void
5335 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5336     const struct vmbus_chanpkt_hdr *pkt)
5337 {
5338 	struct hn_nvs_sendctx *sndc;
5339 
5340 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5341 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5342 	    VMBUS_CHANPKT_DATALEN(pkt));
5343 	/*
5344 	 * NOTE:
5345 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5346 	 * its callback.
5347 	 */
5348 }
5349 
5350 static void
5351 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5352     const struct vmbus_chanpkt_hdr *pkthdr)
5353 {
5354 	const struct vmbus_chanpkt_rxbuf *pkt;
5355 	const struct hn_nvs_hdr *nvs_hdr;
5356 	int count, i, hlen;
5357 
5358 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5359 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5360 		return;
5361 	}
5362 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5363 
5364 	/* Make sure that this is an RNDIS message. */
5365 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5366 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5367 		    nvs_hdr->nvs_type);
5368 		return;
5369 	}
5370 
5371 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5372 	if (__predict_false(hlen < sizeof(*pkt))) {
5373 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5374 		return;
5375 	}
5376 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5377 
5378 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5379 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5380 		    pkt->cp_rxbuf_id);
5381 		return;
5382 	}
5383 
5384 	count = pkt->cp_rxbuf_cnt;
5385 	if (__predict_false(hlen <
5386 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5387 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5388 		return;
5389 	}
5390 
5391 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5392 	for (i = 0; i < count; ++i) {
5393 		int ofs, len;
5394 
5395 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5396 		len = pkt->cp_rxbuf[i].rb_len;
5397 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5398 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5399 			    "ofs %d, len %d\n", i, ofs, len);
5400 			continue;
5401 		}
5402 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5403 	}
5404 
5405 	/*
5406 	 * Ack the consumed RXBUF associated w/ this channel packet,
5407 	 * so that this RXBUF can be recycled by the hypervisor.
5408 	 */
5409 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5410 }
5411 
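/*
 * Ack the given RXBUF transaction so that the hypervisor can recycle
 * the RXBUF; retry a limited number of times if the TX bufring is
 * transiently full.
 */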
5412 static void
5413 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5414     uint64_t tid)
5415 {
5416 	struct hn_nvs_rndis_ack ack;
5417 	int retries, error;
5418 
5419 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5420 	ack.nvs_status = HN_NVS_STATUS_OK;
5421 
5422 	retries = 0;
5423 again:
5424 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5425 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5426 	if (__predict_false(error == EAGAIN)) {
5427 		/*
5428 		 * NOTE:
5429 		 * This should _not_ happen in the real world, since the
5430 		 * consumption of the TX bufring from the TX path is
5431 		 * controlled.
5432 		 */
5433 		if (rxr->hn_ack_failed == 0)
5434 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5435 		rxr->hn_ack_failed++;
5436 		retries++;
5437 		if (retries < 10) {
5438 			DELAY(100);
5439 			goto again;
5440 		}
5441 		/* RXBUF leaks! */
5442 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5443 	}
5444 }
5445 
5446 static void
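/*
 * Channel callback: pull channel packets until the bufring is empty,
 * growing the packet buffer on ENOBUFS, and dispatch each packet
 * according to its type.
 */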
5447 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5448 {
5449 	struct hn_rx_ring *rxr = xrxr;
5450 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5451 
5452 	for (;;) {
5453 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5454 		int error, pktlen;
5455 
5456 		pktlen = rxr->hn_pktbuf_len;
5457 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5458 		if (__predict_false(error == ENOBUFS)) {
5459 			void *nbuf;
5460 			int nlen;
5461 
5462 			/*
5463 			 * Expand channel packet buffer.
5464 			 *
5465 			 * XXX
5466 			 * Use M_WAITOK here, since allocation failure
5467 			 * is fatal.
5468 			 */
5469 			nlen = rxr->hn_pktbuf_len * 2;
5470 			while (nlen < pktlen)
5471 				nlen *= 2;
5472 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5473 
5474 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5475 			    rxr->hn_pktbuf_len, nlen);
5476 
5477 			free(rxr->hn_pktbuf, M_DEVBUF);
5478 			rxr->hn_pktbuf = nbuf;
5479 			rxr->hn_pktbuf_len = nlen;
5480 			/* Retry! */
5481 			continue;
5482 		} else if (__predict_false(error == EAGAIN)) {
5483 			/* No more channel packets; done! */
5484 			break;
5485 		}
5486 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5487 
5488 		switch (pkt->cph_type) {
5489 		case VMBUS_CHANPKT_TYPE_COMP:
5490 			hn_nvs_handle_comp(sc, chan, pkt);
5491 			break;
5492 
5493 		case VMBUS_CHANPKT_TYPE_RXBUF:
5494 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5495 			break;
5496 
5497 		case VMBUS_CHANPKT_TYPE_INBAND:
5498 			hn_nvs_handle_notify(sc, pkt);
5499 			break;
5500 
5501 		default:
5502 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5503 			    pkt->cph_type);
5504 			break;
5505 		}
5506 	}
5507 	hn_chan_rollup(rxr, rxr->hn_txr);
5508 }
5509 
5510 static void
5511 hn_tx_taskq_create(void *arg __unused)
5512 {
5513 	int i;
5514 
5515 	/*
5516 	 * Fix the # of TX taskqueues.
5517 	 */
5518 	if (hn_tx_taskq_cnt <= 0)
5519 		hn_tx_taskq_cnt = 1;
5520 	else if (hn_tx_taskq_cnt > mp_ncpus)
5521 		hn_tx_taskq_cnt = mp_ncpus;
5522 
5523 	/*
5524 	 * Fix the TX taskqueue mode.
5525 	 */
5526 	switch (hn_tx_taskq_mode) {
5527 	case HN_TX_TASKQ_M_INDEP:
5528 	case HN_TX_TASKQ_M_GLOBAL:
5529 	case HN_TX_TASKQ_M_EVTTQ:
5530 		break;
5531 	default:
5532 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5533 		break;
5534 	}
5535 
5536 	if (vm_guest != VM_GUEST_HV)
5537 		return;
5538 
5539 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5540 		return;
5541 
5542 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5543 	    M_DEVBUF, M_WAITOK);
5544 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5545 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5546 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5547 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5548 		    "hn tx%d", i);
5549 	}
5550 }
5551 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5552     hn_tx_taskq_create, NULL);
5553 
5554 static void
5555 hn_tx_taskq_destroy(void *arg __unused)
5556 {
5557 
5558 	if (hn_tx_taskque != NULL) {
5559 		int i;
5560 
5561 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5562 			taskqueue_free(hn_tx_taskque[i]);
5563 		free(hn_tx_taskque, M_DEVBUF);
5564 	}
5565 }
5566 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5567     hn_tx_taskq_destroy, NULL);
5568