1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 #include <sys/eventhandler.h>
81 
82 #include <machine/atomic.h>
83 #include <machine/in_cksum.h>
84 
85 #include <net/bpf.h>
86 #include <net/ethernet.h>
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_media.h>
90 #include <net/if_types.h>
91 #include <net/if_var.h>
92 #include <net/rndis.h>
93 #ifdef RSS
94 #include <net/rss_config.h>
95 #endif
96 
97 #include <netinet/in_systm.h>
98 #include <netinet/in.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip6.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_lro.h>
103 #include <netinet/udp.h>
104 
105 #include <dev/hyperv/include/hyperv.h>
106 #include <dev/hyperv/include/hyperv_busdma.h>
107 #include <dev/hyperv/include/vmbus.h>
108 #include <dev/hyperv/include/vmbus_xact.h>
109 
110 #include <dev/hyperv/netvsc/ndis.h>
111 #include <dev/hyperv/netvsc/if_hnreg.h>
112 #include <dev/hyperv/netvsc/if_hnvar.h>
113 #include <dev/hyperv/netvsc/hn_nvs.h>
114 #include <dev/hyperv/netvsc/hn_rndis.h>
115 
116 #include "vmbus_if.h"
117 
118 #define HN_IFSTART_SUPPORT
119 
120 #define HN_RING_CNT_DEF_MAX		8
121 
122 /* YYY should get it from the underlying channel */
123 #define HN_TX_DESC_CNT			512
124 
125 #define HN_RNDIS_PKT_LEN					\
126 	(sizeof(struct rndis_packet_msg) +			\
127 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
129 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
130 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
131 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
132 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
133 
134 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
135 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
136 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
137 /* -1 for RNDIS packet message */
138 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
139 
140 #define HN_DIRECT_TX_SIZE_DEF		128
141 
142 #define HN_EARLY_TXEOF_THRESH		8
143 
144 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
145 
146 #define HN_LROENT_CNT_DEF		128
147 
148 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
149 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
150 /* YYY 2*MTU is a bit rough, but should be good enough. */
151 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
152 
153 #define HN_LRO_ACKCNT_DEF		1
154 
155 #define HN_LOCK_INIT(sc)		\
156 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
157 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
158 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
159 #define HN_LOCK(sc)					\
160 do {							\
161 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
162 		DELAY(1000);				\
163 } while (0)
164 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
165 
166 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
167 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
168 #define HN_CSUM_IP_HWASSIST(sc)		\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
170 #define HN_CSUM_IP6_HWASSIST(sc)	\
171 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
172 
173 #define HN_PKTSIZE_MIN(align)		\
174 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
175 	    HN_RNDIS_PKT_LEN, (align))
176 #define HN_PKTSIZE(m, align)		\
177 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
178 
179 #ifdef RSS
180 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
181 #else
182 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
183 #endif
184 
185 struct hn_txdesc {
186 #ifndef HN_USE_TXDESC_BUFRING
187 	SLIST_ENTRY(hn_txdesc)		link;
188 #endif
189 	STAILQ_ENTRY(hn_txdesc)		agg_link;
190 
191 	/* Aggregated txdescs, in sending order. */
192 	STAILQ_HEAD(, hn_txdesc)	agg_list;
193 
194 	/* The oldest packet, if transmission aggregation happens. */
195 	struct mbuf			*m;
196 	struct hn_tx_ring		*txr;
197 	int				refs;
198 	uint32_t			flags;	/* HN_TXD_FLAG_ */
199 	struct hn_nvs_sendctx		send_ctx;
200 	uint32_t			chim_index;
201 	int				chim_size;
202 
203 	bus_dmamap_t			data_dmap;
204 
205 	bus_addr_t			rndis_pkt_paddr;
206 	struct rndis_packet_msg		*rndis_pkt;
207 	bus_dmamap_t			rndis_pkt_dmap;
208 };
209 
210 #define HN_TXD_FLAG_ONLIST		0x0001
211 #define HN_TXD_FLAG_DMAMAP		0x0002
212 #define HN_TXD_FLAG_ONAGG		0x0004
213 
214 struct hn_rxinfo {
215 	uint32_t			vlan_info;
216 	uint32_t			csum_info;
217 	uint32_t			hash_info;
218 	uint32_t			hash_value;
219 };
220 
221 struct hn_update_vf {
222 	struct hn_rx_ring	*rxr;
223 	struct ifnet		*vf;
224 };
225 
226 #define HN_RXINFO_VLAN			0x0001
227 #define HN_RXINFO_CSUM			0x0002
228 #define HN_RXINFO_HASHINF		0x0004
229 #define HN_RXINFO_HASHVAL		0x0008
230 #define HN_RXINFO_ALL			\
231 	(HN_RXINFO_VLAN |		\
232 	 HN_RXINFO_CSUM |		\
233 	 HN_RXINFO_HASHINF |		\
234 	 HN_RXINFO_HASHVAL)
235 
236 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
237 #define HN_NDIS_RXCSUM_INFO_INVALID	0
238 #define HN_NDIS_HASH_INFO_INVALID	0
239 
240 static int			hn_probe(device_t);
241 static int			hn_attach(device_t);
242 static int			hn_detach(device_t);
243 static int			hn_shutdown(device_t);
244 static void			hn_chan_callback(struct vmbus_channel *,
245 				    void *);
246 
247 static void			hn_init(void *);
248 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
249 #ifdef HN_IFSTART_SUPPORT
250 static void			hn_start(struct ifnet *);
251 #endif
252 static int			hn_transmit(struct ifnet *, struct mbuf *);
253 static void			hn_xmit_qflush(struct ifnet *);
254 static int			hn_ifmedia_upd(struct ifnet *);
255 static void			hn_ifmedia_sts(struct ifnet *,
256 				    struct ifmediareq *);
257 
258 static int			hn_rndis_rxinfo(const void *, int,
259 				    struct hn_rxinfo *);
260 static void			hn_rndis_rx_data(struct hn_rx_ring *,
261 				    const void *, int);
262 static void			hn_rndis_rx_status(struct hn_softc *,
263 				    const void *, int);
264 
265 static void			hn_nvs_handle_notify(struct hn_softc *,
266 				    const struct vmbus_chanpkt_hdr *);
267 static void			hn_nvs_handle_comp(struct hn_softc *,
268 				    struct vmbus_channel *,
269 				    const struct vmbus_chanpkt_hdr *);
270 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
271 				    struct vmbus_channel *,
272 				    const struct vmbus_chanpkt_hdr *);
273 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
274 				    struct vmbus_channel *, uint64_t);
275 
276 #if __FreeBSD_version >= 1100099
277 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
278 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
279 #endif
280 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
282 #if __FreeBSD_version < 1100095
283 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
284 #else
285 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
286 #endif
287 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
288 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
290 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
291 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
294 #ifndef RSS
295 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
296 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
297 #endif
298 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
299 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
300 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
301 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
302 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
303 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
304 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
305 
306 static void			hn_stop(struct hn_softc *, bool);
307 static void			hn_init_locked(struct hn_softc *);
308 static int			hn_chan_attach(struct hn_softc *,
309 				    struct vmbus_channel *);
310 static void			hn_chan_detach(struct hn_softc *,
311 				    struct vmbus_channel *);
312 static int			hn_attach_subchans(struct hn_softc *);
313 static void			hn_detach_allchans(struct hn_softc *);
314 static void			hn_chan_rollup(struct hn_rx_ring *,
315 				    struct hn_tx_ring *);
316 static void			hn_set_ring_inuse(struct hn_softc *, int);
317 static int			hn_synth_attach(struct hn_softc *, int);
318 static void			hn_synth_detach(struct hn_softc *);
319 static int			hn_synth_alloc_subchans(struct hn_softc *,
320 				    int *);
321 static bool			hn_synth_attachable(const struct hn_softc *);
322 static void			hn_suspend(struct hn_softc *);
323 static void			hn_suspend_data(struct hn_softc *);
324 static void			hn_suspend_mgmt(struct hn_softc *);
325 static void			hn_resume(struct hn_softc *);
326 static void			hn_resume_data(struct hn_softc *);
327 static void			hn_resume_mgmt(struct hn_softc *);
328 static void			hn_suspend_mgmt_taskfunc(void *, int);
329 static void			hn_chan_drain(struct hn_softc *,
330 				    struct vmbus_channel *);
331 static void			hn_polling(struct hn_softc *, u_int);
332 static void			hn_chan_polling(struct vmbus_channel *, u_int);
333 
334 static void			hn_update_link_status(struct hn_softc *);
335 static void			hn_change_network(struct hn_softc *);
336 static void			hn_link_taskfunc(void *, int);
337 static void			hn_netchg_init_taskfunc(void *, int);
338 static void			hn_netchg_status_taskfunc(void *, int);
339 static void			hn_link_status(struct hn_softc *);
340 
341 static int			hn_create_rx_data(struct hn_softc *, int);
342 static void			hn_destroy_rx_data(struct hn_softc *);
343 static int			hn_check_iplen(const struct mbuf *, int);
344 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
345 static int			hn_rxfilter_config(struct hn_softc *);
346 #ifndef RSS
347 static int			hn_rss_reconfig(struct hn_softc *);
348 #endif
349 static void			hn_rss_ind_fixup(struct hn_softc *);
350 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
351 				    int, const struct hn_rxinfo *);
352 
353 static int			hn_tx_ring_create(struct hn_softc *, int);
354 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
355 static int			hn_create_tx_data(struct hn_softc *, int);
356 static void			hn_fixup_tx_data(struct hn_softc *);
357 static void			hn_destroy_tx_data(struct hn_softc *);
358 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
359 static void			hn_txdesc_gc(struct hn_tx_ring *,
360 				    struct hn_txdesc *);
361 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
362 				    struct hn_txdesc *, struct mbuf **);
363 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
364 				    struct hn_txdesc *);
365 static void			hn_set_chim_size(struct hn_softc *, int);
366 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
367 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
368 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
369 static void			hn_resume_tx(struct hn_softc *, int);
370 static void			hn_set_txagg(struct hn_softc *);
371 static void			*hn_try_txagg(struct ifnet *,
372 				    struct hn_tx_ring *, struct hn_txdesc *,
373 				    int);
374 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
375 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
376 				    struct hn_softc *, struct vmbus_channel *,
377 				    const void *, int);
378 static int			hn_txpkt_sglist(struct hn_tx_ring *,
379 				    struct hn_txdesc *);
380 static int			hn_txpkt_chim(struct hn_tx_ring *,
381 				    struct hn_txdesc *);
382 static int			hn_xmit(struct hn_tx_ring *, int);
383 static void			hn_xmit_taskfunc(void *, int);
384 static void			hn_xmit_txeof(struct hn_tx_ring *);
385 static void			hn_xmit_txeof_taskfunc(void *, int);
386 #ifdef HN_IFSTART_SUPPORT
387 static int			hn_start_locked(struct hn_tx_ring *, int);
388 static void			hn_start_taskfunc(void *, int);
389 static void			hn_start_txeof(struct hn_tx_ring *);
390 static void			hn_start_txeof_taskfunc(void *, int);
391 #endif
392 
393 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
394     "Hyper-V network interface");
395 
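/*
 * Most of the hw.hn.* knobs below are boot-time tunables (CTLFLAG_RDTUN);
 * they can be set from /boot/loader.conf, e.g.:
 *
 *	hw.hn.trust_hosttcp=0
 *	hw.hn.chan_cnt=4
 *
 * (Example values only.)
 */
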
396 /* Trust tcp segment verification on host side. */
397 static int			hn_trust_hosttcp = 1;
398 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
399     &hn_trust_hosttcp, 0,
400     "Trust tcp segement verification on host side, "
401     "when csum info is missing (global setting)");
402 
403 /* Trust udp datagram verification on host side. */
404 static int			hn_trust_hostudp = 1;
405 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
406     &hn_trust_hostudp, 0,
407     "Trust udp datagram verification on host side, "
408     "when csum info is missing (global setting)");
409 
410 /* Trust ip packet verification on host side. */
411 static int			hn_trust_hostip = 1;
412 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
413     &hn_trust_hostip, 0,
414     "Trust ip packet verification on host side, "
415     "when csum info is missing (global setting)");
416 
417 /* Limit TSO burst size */
418 static int			hn_tso_maxlen = IP_MAXPACKET;
419 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
420     &hn_tso_maxlen, 0, "TSO burst limit");
421 
422 /* Limit chimney send size */
423 static int			hn_tx_chimney_size = 0;
424 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
425     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
426 
427 /* Limit the size of packet for direct transmission */
428 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
429 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
430     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
431 
432 /* # of LRO entries per RX ring */
433 #if defined(INET) || defined(INET6)
434 #if __FreeBSD_version >= 1100095
435 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
436 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
437     &hn_lro_entry_count, 0, "LRO entry count");
438 #endif
439 #endif
440 
441 static int			hn_tx_taskq_cnt = 1;
442 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
443     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
444 
445 #define HN_TX_TASKQ_M_INDEP	0
446 #define HN_TX_TASKQ_M_GLOBAL	1
447 #define HN_TX_TASKQ_M_EVTTQ	2
448 
449 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
450 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
451     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
452     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
453 
454 #ifndef HN_USE_TXDESC_BUFRING
455 static int			hn_use_txdesc_bufring = 0;
456 #else
457 static int			hn_use_txdesc_bufring = 1;
458 #endif
459 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
460     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
461 
462 #ifdef HN_IFSTART_SUPPORT
463 /* Use ifnet.if_start instead of ifnet.if_transmit */
464 static int			hn_use_if_start = 0;
465 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
466     &hn_use_if_start, 0, "Use if_start TX method");
467 #endif
468 
469 /* # of channels to use */
470 static int			hn_chan_cnt = 0;
471 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
472     &hn_chan_cnt, 0,
473     "# of channels to use; each channel has one RX ring and one TX ring");
474 
475 /* # of transmit rings to use */
476 static int			hn_tx_ring_cnt = 0;
477 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
478     &hn_tx_ring_cnt, 0, "# of TX rings to use");
479 
480 /* Software TX ring depth */
481 static int			hn_tx_swq_depth = 0;
482 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
483     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
484 
485 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
486 #if __FreeBSD_version >= 1100095
487 static u_int			hn_lro_mbufq_depth = 0;
488 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
489     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
490 #endif
491 
492 /* Packet transmission aggregation size limit */
493 static int			hn_tx_agg_size = -1;
494 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
495     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
496 
497 /* Packet transmission aggregation count limit */
498 static int			hn_tx_agg_pkts = -1;
499 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
500     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
501 
502 static u_int			hn_cpu_index;	/* next CPU for channel */
503 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
504 
505 #ifndef RSS
506 static const uint8_t
507 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
508 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
509 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
510 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
511 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
512 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
513 };
514 #endif	/* !RSS */
515 
516 static device_method_t hn_methods[] = {
517 	/* Device interface */
518 	DEVMETHOD(device_probe,		hn_probe),
519 	DEVMETHOD(device_attach,	hn_attach),
520 	DEVMETHOD(device_detach,	hn_detach),
521 	DEVMETHOD(device_shutdown,	hn_shutdown),
522 	DEVMETHOD_END
523 };
524 
525 static driver_t hn_driver = {
526 	"hn",
527 	hn_methods,
528 	sizeof(struct hn_softc)
529 };
530 
531 static devclass_t hn_devclass;
532 
533 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
534 MODULE_VERSION(hn, 1);
535 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
536 
537 #if __FreeBSD_version >= 1100099
538 static void
539 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
540 {
541 	int i;
542 
543 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
544 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
545 }
546 #endif
547 
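/*
 * Send an RNDIS data packet through the NVS SGL path; the payload is
 * described by the per-ring GPA list (txr->hn_gpa).
 */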
548 static int
549 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
550 {
551 
552 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
553 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
554 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
555 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
556 }
557 
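/*
 * Send an RNDIS data packet that has been copied into a chimney sending
 * buffer; only the buffer index and size are handed to the host.
 */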
558 static int
559 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
560 {
561 	struct hn_nvs_rndis rndis;
562 
563 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
564 	    txd->chim_size > 0, ("invalid rndis chim txd"));
565 
566 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
567 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
568 	rndis.nvs_chim_idx = txd->chim_index;
569 	rndis.nvs_chim_sz = txd->chim_size;
570 
571 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
572 	    &rndis, sizeof(rndis), &txd->send_ctx));
573 }
574 
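/*
 * Allocate a chimney sending buffer by atomically setting the first clear
 * bit in the bitmap; returns HN_NVS_CHIM_IDX_INVALID if all buffers are
 * in use.
 */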
575 static __inline uint32_t
576 hn_chim_alloc(struct hn_softc *sc)
577 {
578 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
579 	u_long *bmap = sc->hn_chim_bmap;
580 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
581 
582 	for (i = 0; i < bmap_cnt; ++i) {
583 		int idx;
584 
585 		idx = ffsl(~bmap[i]);
586 		if (idx == 0)
587 			continue;
588 
589 		--idx; /* ffsl is 1-based */
590 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
591 		    ("invalid i %d and idx %d", i, idx));
592 
593 		if (atomic_testandset_long(&bmap[i], idx))
594 			continue;
595 
596 		ret = i * LONG_BIT + idx;
597 		break;
598 	}
599 	return (ret);
600 }
601 
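/* Return a chimney sending buffer to the bitmap. */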
602 static __inline void
603 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
604 {
605 	u_long mask;
606 	uint32_t idx;
607 
608 	idx = chim_idx / LONG_BIT;
609 	KASSERT(idx < sc->hn_chim_bmap_cnt,
610 	    ("invalid chimney index 0x%x", chim_idx));
611 
612 	mask = 1UL << (chim_idx % LONG_BIT);
613 	KASSERT(sc->hn_chim_bmap[idx] & mask,
614 	    ("index bitmap 0x%lx, chimney index %u, "
615 	     "bitmap idx %d, bitmask 0x%lx",
616 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
617 
618 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
619 }
620 
621 #if defined(INET6) || defined(INET)
622 /*
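 * Pull up the Ethernet, IP/IPv6 and TCP headers of a TSO mbuf, zero
 * ip_len/ip_sum (or ip6_plen) and seed th_sum with the pseudo-header
 * checksum.
 *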
623  * NOTE: If this function fails, m_head is freed.
624  */
625 static __inline struct mbuf *
626 hn_tso_fixup(struct mbuf *m_head)
627 {
628 	struct ether_vlan_header *evl;
629 	struct tcphdr *th;
630 	int ehlen;
631 
632 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
633 
634 #define PULLUP_HDR(m, len)				\
635 do {							\
636 	if (__predict_false((m)->m_len < (len))) {	\
637 		(m) = m_pullup((m), (len));		\
638 		if ((m) == NULL)			\
639 			return (NULL);			\
640 	}						\
641 } while (0)
642 
643 	PULLUP_HDR(m_head, sizeof(*evl));
644 	evl = mtod(m_head, struct ether_vlan_header *);
645 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
646 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
647 	else
648 		ehlen = ETHER_HDR_LEN;
649 
650 #ifdef INET
651 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
652 		struct ip *ip;
653 		int iphlen;
654 
655 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
656 		ip = mtodo(m_head, ehlen);
657 		iphlen = ip->ip_hl << 2;
658 
659 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
660 		th = mtodo(m_head, ehlen + iphlen);
661 
662 		ip->ip_len = 0;
663 		ip->ip_sum = 0;
664 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
665 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
666 	}
667 #endif
668 #if defined(INET6) && defined(INET)
669 	else
670 #endif
671 #ifdef INET6
672 	{
673 		struct ip6_hdr *ip6;
674 
675 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
676 		ip6 = mtodo(m_head, ehlen);
677 		if (ip6->ip6_nxt != IPPROTO_TCP) {
678 			m_freem(m_head);
679 			return (NULL);
680 		}
681 
682 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
683 		th = mtodo(m_head, ehlen + sizeof(*ip6));
684 
685 		ip6->ip6_plen = 0;
686 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
687 	}
688 #endif
689 	return (m_head);
690 
691 #undef PULLUP_HDR
692 }
693 #endif	/* INET6 || INET */
694 
695 static int
696 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
697 {
698 	int error = 0;
699 
700 	HN_LOCK_ASSERT(sc);
701 
702 	if (sc->hn_rx_filter != filter) {
703 		error = hn_rndis_set_rxfilter(sc, filter);
704 		if (!error)
705 			sc->hn_rx_filter = filter;
706 	}
707 	return (error);
708 }
709 
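/*
 * Derive the RNDIS RX filter from the ifnet flags: promiscuous mode (or
 * an active VF) selects the promiscuous filter; otherwise directed frames
 * plus broadcast and, when requested, all-multicast are accepted.
 */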
710 static int
711 hn_rxfilter_config(struct hn_softc *sc)
712 {
713 	struct ifnet *ifp = sc->hn_ifp;
714 	uint32_t filter;
715 
716 	HN_LOCK_ASSERT(sc);
717 
718 	if ((ifp->if_flags & IFF_PROMISC) ||
719 	    (sc->hn_flags & HN_FLAG_VF)) {
720 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
721 	} else {
722 		filter = NDIS_PACKET_TYPE_DIRECTED;
723 		if (ifp->if_flags & IFF_BROADCAST)
724 			filter |= NDIS_PACKET_TYPE_BROADCAST;
725 		/* TODO: support multicast list */
726 		if ((ifp->if_flags & IFF_ALLMULTI) ||
727 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
728 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
729 	}
730 	return (hn_set_rxfilter(sc, filter));
731 }
732 
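/*
 * Compute the per-TX-ring aggregation limits from the administrative
 * settings (hn_agg_size/hn_agg_pkts), the limits offered by RNDIS and the
 * chimney buffer size, then propagate the result to all TX rings.
 */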
733 static void
734 hn_set_txagg(struct hn_softc *sc)
735 {
736 	uint32_t size, pkts;
737 	int i;
738 
739 	/*
740 	 * Setup aggregation size.
741 	 */
742 	if (sc->hn_agg_size < 0)
743 		size = UINT32_MAX;
744 	else
745 		size = sc->hn_agg_size;
746 
747 	if (sc->hn_rndis_agg_size < size)
748 		size = sc->hn_rndis_agg_size;
749 
750 	/* NOTE: We only aggregate packets using chimney sending buffers. */
751 	if (size > (uint32_t)sc->hn_chim_szmax)
752 		size = sc->hn_chim_szmax;
753 
754 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
755 		/* Disable */
756 		size = 0;
757 		pkts = 0;
758 		goto done;
759 	}
760 
761 	/* NOTE: Type of the per TX ring setting is 'int'. */
762 	if (size > INT_MAX)
763 		size = INT_MAX;
764 
765 	/*
766 	 * Setup aggregation packet count.
767 	 */
768 	if (sc->hn_agg_pkts < 0)
769 		pkts = UINT32_MAX;
770 	else
771 		pkts = sc->hn_agg_pkts;
772 
773 	if (sc->hn_rndis_agg_pkts < pkts)
774 		pkts = sc->hn_rndis_agg_pkts;
775 
776 	if (pkts <= 1) {
777 		/* Disable */
778 		size = 0;
779 		pkts = 0;
780 		goto done;
781 	}
782 
783 	/* NOTE: Type of the per TX ring setting is 'short'. */
784 	if (pkts > SHRT_MAX)
785 		pkts = SHRT_MAX;
786 
787 done:
788 	/* NOTE: Type of the per TX ring setting is 'short'. */
789 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
790 		/* Disable */
791 		size = 0;
792 		pkts = 0;
793 	}
794 
795 	if (bootverbose) {
796 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
797 		    size, pkts, sc->hn_rndis_agg_align);
798 	}
799 
800 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
801 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
802 
803 		mtx_lock(&txr->hn_tx_lock);
804 		txr->hn_agg_szmax = size;
805 		txr->hn_agg_pktmax = pkts;
806 		txr->hn_agg_align = sc->hn_rndis_agg_align;
807 		mtx_unlock(&txr->hn_tx_lock);
808 	}
809 }
810 
811 static int
812 hn_get_txswq_depth(const struct hn_tx_ring *txr)
813 {
814 
815 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
816 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
817 		return txr->hn_txdesc_cnt;
818 	return hn_tx_swq_depth;
819 }
820 
821 #ifndef RSS
822 static int
823 hn_rss_reconfig(struct hn_softc *sc)
824 {
825 	int error;
826 
827 	HN_LOCK_ASSERT(sc);
828 
829 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
830 		return (ENXIO);
831 
832 	/*
833 	 * Disable RSS first.
834 	 *
835 	 * NOTE:
836 	 * Direct reconfiguration by setting the UNCHG flags does
837 	 * _not_ work properly.
838 	 */
839 	if (bootverbose)
840 		if_printf(sc->hn_ifp, "disable RSS\n");
841 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
842 	if (error) {
843 		if_printf(sc->hn_ifp, "RSS disable failed\n");
844 		return (error);
845 	}
846 
847 	/*
848 	 * Reenable the RSS w/ the updated RSS key or indirect
849 	 * table.
850 	 */
851 	if (bootverbose)
852 		if_printf(sc->hn_ifp, "reconfig RSS\n");
853 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
854 	if (error) {
855 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
856 		return (error);
857 	}
858 	return (0);
859 }
860 #endif	/* !RSS */
861 
862 static void
863 hn_rss_ind_fixup(struct hn_softc *sc)
864 {
865 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
866 	int i, nchan;
867 
868 	nchan = sc->hn_rx_ring_inuse;
869 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
870 
871 	/*
872 	 * Check indirect table to make sure that all channels in it
873 	 * can be used.
874 	 */
875 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
876 		if (rss->rss_ind[i] >= nchan) {
877 			if_printf(sc->hn_ifp,
878 			    "RSS indirect table %d fixup: %u -> %d\n",
879 			    i, rss->rss_ind[i], nchan - 1);
880 			rss->rss_ind[i] = nchan - 1;
881 		}
882 	}
883 }
884 
885 static int
886 hn_ifmedia_upd(struct ifnet *ifp __unused)
887 {
888 
889 	return EOPNOTSUPP;
890 }
891 
892 static void
893 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
894 {
895 	struct hn_softc *sc = ifp->if_softc;
896 
897 	ifmr->ifm_status = IFM_AVALID;
898 	ifmr->ifm_active = IFM_ETHER;
899 
900 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
901 		ifmr->ifm_active |= IFM_NONE;
902 		return;
903 	}
904 	ifmr->ifm_status |= IFM_ACTIVE;
905 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
906 }
907 
908 static void
909 hn_update_vf_task(void *arg, int pending __unused)
910 {
911 	struct hn_update_vf *uv = arg;
912 
913 	uv->rxr->hn_vf = uv->vf;
914 }
915 
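/*
 * Point every RX ring at the given VF ifnet (or clear it when vf is NULL).
 * Rings that are in use are updated by a task run on the ring's channel;
 * idle rings are updated directly.
 */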
916 static void
917 hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
918 {
919 	struct hn_rx_ring *rxr;
920 	struct hn_update_vf uv;
921 	struct task task;
922 	int i;
923 
924 	HN_LOCK_ASSERT(sc);
925 
926 	TASK_INIT(&task, 0, hn_update_vf_task, &uv);
927 
928 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
929 		rxr = &sc->hn_rx_ring[i];
930 
931 		if (i < sc->hn_rx_ring_inuse) {
932 			uv.rxr = rxr;
933 			uv.vf = vf;
934 			vmbus_chan_run_task(rxr->hn_chan, &task);
935 		} else {
936 			rxr->hn_vf = vf;
937 		}
938 	}
939 }
940 
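/*
 * Switch the data path between the synthetic device and a matching VF
 * ifnet.  The VF is recognized by comparing link-level addresses; lagg
 * and vlan pseudo interfaces are ignored.
 */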
941 static void
942 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
943 {
944 	struct ifnet *hn_ifp;
945 
946 	HN_LOCK(sc);
947 
948 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
949 		goto out;
950 
951 	hn_ifp = sc->hn_ifp;
952 
953 	if (ifp == hn_ifp)
954 		goto out;
955 
956 	if (ifp->if_alloctype != IFT_ETHER)
957 		goto out;
958 
959 	/* Ignore lagg/vlan interfaces */
960 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
961 	    strcmp(ifp->if_dname, "vlan") == 0)
962 		goto out;
963 
964 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
965 		goto out;
966 
967 	/* Now we're sure 'ifp' is a real VF device. */
968 	if (vf) {
969 		if (sc->hn_flags & HN_FLAG_VF)
970 			goto out;
971 
972 		sc->hn_flags |= HN_FLAG_VF;
973 		hn_rxfilter_config(sc);
974 	} else {
975 		if (!(sc->hn_flags & HN_FLAG_VF))
976 			goto out;
977 
978 		sc->hn_flags &= ~HN_FLAG_VF;
979 		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
980 			hn_rxfilter_config(sc);
981 		else
982 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
983 	}
984 
985 	hn_nvs_set_datapath(sc,
986 	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
987 
988 	hn_update_vf(sc, vf ? ifp : NULL);
989 
990 	if (vf) {
991 		hn_suspend_mgmt(sc);
992 		sc->hn_link_flags &=
993 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
994 		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
995 	} else {
996 		hn_resume_mgmt(sc);
997 	}
998 
999 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1000 	    vf ? "VF_UP" : "VF_DOWN", NULL);
1001 
1002 	if (bootverbose)
1003 		if_printf(hn_ifp, "Data path is switched %s %s\n",
1004 		    vf ? "to" : "from", if_name(ifp));
1005 out:
1006 	HN_UNLOCK(sc);
1007 }
1008 
1009 static void
1010 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1011 {
1012 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1013 		return;
1014 
1015 	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
1016 }
1017 
1018 static void
1019 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1020 {
1021 	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
1022 }
1023 
1024 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1025 static const struct hyperv_guid g_net_vsc_device_type = {
1026 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1027 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1028 };
1029 
1030 static int
1031 hn_probe(device_t dev)
1032 {
1033 
1034 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1035 	    &g_net_vsc_device_type) == 0) {
1036 		device_set_desc(dev, "Hyper-V Network Interface");
1037 		return BUS_PROBE_DEFAULT;
1038 	}
1039 	return ENXIO;
1040 }
1041 
1042 static int
1043 hn_attach(device_t dev)
1044 {
1045 	struct hn_softc *sc = device_get_softc(dev);
1046 	struct sysctl_oid_list *child;
1047 	struct sysctl_ctx_list *ctx;
1048 	uint8_t eaddr[ETHER_ADDR_LEN];
1049 	struct ifnet *ifp = NULL;
1050 	int error, ring_cnt, tx_ring_cnt;
1051 
1052 	sc->hn_dev = dev;
1053 	sc->hn_prichan = vmbus_get_channel(dev);
1054 	HN_LOCK_INIT(sc);
1055 
1056 	/*
1057 	 * Initialize these tunables once.
1058 	 */
1059 	sc->hn_agg_size = hn_tx_agg_size;
1060 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1061 
1062 	/*
1063 	 * Setup taskqueue for transmission.
1064 	 */
1065 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1066 		int i;
1067 
1068 		sc->hn_tx_taskqs =
1069 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1070 		    M_DEVBUF, M_WAITOK);
1071 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1072 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1073 			    M_WAITOK, taskqueue_thread_enqueue,
1074 			    &sc->hn_tx_taskqs[i]);
1075 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1076 			    "%s tx%d", device_get_nameunit(dev), i);
1077 		}
1078 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1079 		sc->hn_tx_taskqs = hn_tx_taskque;
1080 	}
1081 
1082 	/*
1083 	 * Setup taskqueue for management tasks, e.g. link status.
1084 	 */
1085 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1086 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1087 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1088 	    device_get_nameunit(dev));
1089 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1090 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1091 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1092 	    hn_netchg_status_taskfunc, sc);
1093 
1094 	/*
1095 	 * Allocate the ifnet and set up its name early, so that if_printf
1096 	 * can be used by functions which will be called after
1097 	 * ether_ifattach().
1098 	 */
1099 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1100 	ifp->if_softc = sc;
1101 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1102 
1103 	/*
1104 	 * Initialize ifmedia early so that it can be unconditionally
1105 	 * destroyed if an error happens later on.
1106 	 */
1107 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1108 
1109 	/*
1110 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1111 	 * to use (tx_ring_cnt).
1112 	 *
1113 	 * NOTE:
1114 	 * The # of RX rings to use is same as the # of channels to use.
1115 	 */
1116 	ring_cnt = hn_chan_cnt;
1117 	if (ring_cnt <= 0) {
1118 		/* Default */
1119 		ring_cnt = mp_ncpus;
1120 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1121 			ring_cnt = HN_RING_CNT_DEF_MAX;
1122 	} else if (ring_cnt > mp_ncpus) {
1123 		ring_cnt = mp_ncpus;
1124 	}
1125 #ifdef RSS
1126 	if (ring_cnt > rss_getnumbuckets())
1127 		ring_cnt = rss_getnumbuckets();
1128 #endif
1129 
1130 	tx_ring_cnt = hn_tx_ring_cnt;
1131 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1132 		tx_ring_cnt = ring_cnt;
1133 #ifdef HN_IFSTART_SUPPORT
1134 	if (hn_use_if_start) {
1135 		/* ifnet.if_start only needs one TX ring. */
1136 		tx_ring_cnt = 1;
1137 	}
1138 #endif
1139 
1140 	/*
1141 	 * Set the leader CPU for channels.
1142 	 */
1143 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1144 
1145 	/*
1146 	 * Create enough TX/RX rings, even if only a limited number of
1147 	 * channels can be allocated.
1148 	 */
1149 	error = hn_create_tx_data(sc, tx_ring_cnt);
1150 	if (error)
1151 		goto failed;
1152 	error = hn_create_rx_data(sc, ring_cnt);
1153 	if (error)
1154 		goto failed;
1155 
1156 	/*
1157 	 * Create transaction context for NVS and RNDIS transactions.
1158 	 */
1159 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1160 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1161 	if (sc->hn_xact == NULL) {
1162 		error = ENXIO;
1163 		goto failed;
1164 	}
1165 
1166 	/*
1167 	 * Install orphan handler for the revocation of this device's
1168 	 * primary channel.
1169 	 *
1170 	 * NOTE:
1171 	 * The processing order is critical here:
1172 	 * Install the orphan handler, _before_ testing whether this
1173 	 * device's primary channel has been revoked or not.
1174 	 */
1175 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1176 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1177 		error = ENXIO;
1178 		goto failed;
1179 	}
1180 
1181 	/*
1182 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1183 	 */
1184 	error = hn_synth_attach(sc, ETHERMTU);
1185 	if (error)
1186 		goto failed;
1187 
1188 	error = hn_rndis_get_eaddr(sc, eaddr);
1189 	if (error)
1190 		goto failed;
1191 
1192 #if __FreeBSD_version >= 1100099
1193 	if (sc->hn_rx_ring_inuse > 1) {
1194 		/*
1195 		 * Reduce TCP segment aggregation limit for multiple
1196 		 * RX rings to increase ACK timeliness.
1197 		 */
1198 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1199 	}
1200 #endif
1201 
1202 	/*
1203 	 * Fix up TX settings after the synthetic parts are attached.
1204 	 */
1205 	hn_fixup_tx_data(sc);
1206 
1207 	ctx = device_get_sysctl_ctx(dev);
1208 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1209 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1210 	    &sc->hn_nvs_ver, 0, "NVS version");
1211 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1212 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1213 	    hn_ndis_version_sysctl, "A", "NDIS version");
1214 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1215 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1216 	    hn_caps_sysctl, "A", "capabilities");
1217 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1218 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1219 	    hn_hwassist_sysctl, "A", "hwassist");
1220 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1221 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1222 	    hn_rxfilter_sysctl, "A", "rxfilter");
1223 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1224 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1225 	    hn_rss_hash_sysctl, "A", "RSS hash");
1226 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1227 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1228 #ifndef RSS
1229 	/*
1230 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
1231 	 */
1232 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1233 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1234 	    hn_rss_key_sysctl, "IU", "RSS key");
1235 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1236 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1237 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1238 #endif
1239 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1240 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1241 	    "RNDIS offered packet transmission aggregation size limit");
1242 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1243 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1244 	    "RNDIS offered packet transmission aggregation count limit");
1245 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1246 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1247 	    "RNDIS packet transmission aggregation alignment");
1248 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1249 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1250 	    hn_txagg_size_sysctl, "I",
1251 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1252 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1253 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1254 	    hn_txagg_pkts_sysctl, "I",
1255 	    "Packet transmission aggregation packets, "
1256 	    "0 -- disable, -1 -- auto");
1257 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1258 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1259 	    hn_polling_sysctl, "I",
1260 	    "Polling frequency: [100,1000000], 0 disable polling");
1261 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1262 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1263 	    hn_vf_sysctl, "A", "Virtual Function's name");
1264 
1265 	/*
1266 	 * Setup the ifmedia, which has been initialized earlier.
1267 	 */
1268 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1269 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1270 	/* XXX ifmedia_set really should do this for us */
1271 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1272 
1273 	/*
1274 	 * Setup the ifnet for this interface.
1275 	 */
1276 
1277 	ifp->if_baudrate = IF_Gbps(10);
1278 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1279 	ifp->if_ioctl = hn_ioctl;
1280 	ifp->if_init = hn_init;
1281 #ifdef HN_IFSTART_SUPPORT
1282 	if (hn_use_if_start) {
1283 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1284 
1285 		ifp->if_start = hn_start;
1286 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1287 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1288 		IFQ_SET_READY(&ifp->if_snd);
1289 	} else
1290 #endif
1291 	{
1292 		ifp->if_transmit = hn_transmit;
1293 		ifp->if_qflush = hn_xmit_qflush;
1294 	}
1295 
1296 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1297 #ifdef foo
1298 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1299 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1300 #endif
1301 	if (sc->hn_caps & HN_CAP_VLAN) {
1302 		/* XXX not sure about VLAN_MTU. */
1303 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1304 	}
1305 
1306 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1307 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1308 		ifp->if_capabilities |= IFCAP_TXCSUM;
1309 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1310 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1311 	if (sc->hn_caps & HN_CAP_TSO4) {
1312 		ifp->if_capabilities |= IFCAP_TSO4;
1313 		ifp->if_hwassist |= CSUM_IP_TSO;
1314 	}
1315 	if (sc->hn_caps & HN_CAP_TSO6) {
1316 		ifp->if_capabilities |= IFCAP_TSO6;
1317 		ifp->if_hwassist |= CSUM_IP6_TSO;
1318 	}
1319 
1320 	/* Enable all available capabilities by default. */
1321 	ifp->if_capenable = ifp->if_capabilities;
1322 
1323 	/*
1324 	 * Disable IPv6 TSO and TXCSUM by default, they still can
1325 	 * be enabled through SIOCSIFCAP.
1326 	 */
1327 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1328 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1329 
1330 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1331 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1332 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1333 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1334 	}
1335 
1336 	ether_ifattach(ifp, eaddr);
1337 
1338 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1339 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1340 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1341 	}
1342 
1343 	/* Inform the upper layer about the long frame support. */
1344 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1345 
1346 	/*
1347 	 * Kick off link status check.
1348 	 */
1349 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1350 	hn_update_link_status(sc);
1351 
1352 	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
1353 	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
1354 
1355 	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
1356 	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
1357 
1358 	return (0);
1359 failed:
1360 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1361 		hn_synth_detach(sc);
1362 	hn_detach(dev);
1363 	return (error);
1364 }
1365 
1366 static int
1367 hn_detach(device_t dev)
1368 {
1369 	struct hn_softc *sc = device_get_softc(dev);
1370 	struct ifnet *ifp = sc->hn_ifp;
1371 
1372 	if (sc->hn_ifaddr_evthand != NULL)
1373 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
1374 	if (sc->hn_ifnet_evthand != NULL)
1375 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
1376 
1377 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1378 		/*
1379 		 * In case the vmbus missed the orphan handler
1380 		 * installation.
1381 		 */
1382 		vmbus_xact_ctx_orphan(sc->hn_xact);
1383 	}
1384 
1385 	if (device_is_attached(dev)) {
1386 		HN_LOCK(sc);
1387 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1388 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1389 				hn_stop(sc, true);
1390 			/*
1391 			 * NOTE:
1392 			 * hn_stop() only suspends the data path, so the
1393 			 * management tasks have to be suspended manually here.
1394 			 */
1395 			hn_suspend_mgmt(sc);
1396 			hn_synth_detach(sc);
1397 		}
1398 		HN_UNLOCK(sc);
1399 		ether_ifdetach(ifp);
1400 	}
1401 
1402 	ifmedia_removeall(&sc->hn_media);
1403 	hn_destroy_rx_data(sc);
1404 	hn_destroy_tx_data(sc);
1405 
1406 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1407 		int i;
1408 
1409 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1410 			taskqueue_free(sc->hn_tx_taskqs[i]);
1411 		free(sc->hn_tx_taskqs, M_DEVBUF);
1412 	}
1413 	taskqueue_free(sc->hn_mgmt_taskq0);
1414 
1415 	if (sc->hn_xact != NULL) {
1416 		/*
1417 		 * Uninstall the orphan handler _before_ the xact is
1418 		 * destructed.
1419 		 */
1420 		vmbus_chan_unset_orphan(sc->hn_prichan);
1421 		vmbus_xact_ctx_destroy(sc->hn_xact);
1422 	}
1423 
1424 	if_free(ifp);
1425 
1426 	HN_LOCK_DESTROY(sc);
1427 	return (0);
1428 }
1429 
1430 static int
1431 hn_shutdown(device_t dev)
1432 {
1433 
1434 	return (0);
1435 }
1436 
1437 static void
1438 hn_link_status(struct hn_softc *sc)
1439 {
1440 	uint32_t link_status;
1441 	int error;
1442 
1443 	error = hn_rndis_get_linkstatus(sc, &link_status);
1444 	if (error) {
1445 		/* XXX what to do? */
1446 		return;
1447 	}
1448 
1449 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1450 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1451 	else
1452 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1453 	if_link_state_change(sc->hn_ifp,
1454 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1455 	    LINK_STATE_UP : LINK_STATE_DOWN);
1456 }
1457 
1458 static void
1459 hn_link_taskfunc(void *xsc, int pending __unused)
1460 {
1461 	struct hn_softc *sc = xsc;
1462 
1463 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1464 		return;
1465 	hn_link_status(sc);
1466 }
1467 
1468 static void
1469 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1470 {
1471 	struct hn_softc *sc = xsc;
1472 
1473 	/* Prevent any link status checks from running. */
1474 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1475 
1476 	/*
1477 	 * Fake up a [link down --> link up] state change; a 5 second
1478 	 * delay is used, which closely simulates the miibus reaction
1479 	 * to a link down event.
1480 	 */
1481 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1482 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1483 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1484 	    &sc->hn_netchg_status, 5 * hz);
1485 }
1486 
1487 static void
1488 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1489 {
1490 	struct hn_softc *sc = xsc;
1491 
1492 	/* Re-allow link status checks. */
1493 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1494 	hn_link_status(sc);
1495 }
1496 
1497 static void
1498 hn_update_link_status(struct hn_softc *sc)
1499 {
1500 
1501 	if (sc->hn_mgmt_taskq != NULL)
1502 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1503 }
1504 
1505 static void
1506 hn_change_network(struct hn_softc *sc)
1507 {
1508 
1509 	if (sc->hn_mgmt_taskq != NULL)
1510 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1511 }
1512 
1513 static __inline int
1514 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1515     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1516 {
1517 	struct mbuf *m = *m_head;
1518 	int error;
1519 
1520 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1521 
1522 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1523 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1524 	if (error == EFBIG) {
1525 		struct mbuf *m_new;
1526 
1527 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1528 		if (m_new == NULL)
1529 			return ENOBUFS;
1530 		else
1531 			*m_head = m = m_new;
1532 		txr->hn_tx_collapsed++;
1533 
1534 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1535 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1536 	}
1537 	if (!error) {
1538 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1539 		    BUS_DMASYNC_PREWRITE);
1540 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1541 	}
1542 	return error;
1543 }
1544 
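/*
 * Drop a reference on a TX descriptor.  When the last reference goes away,
 * free any aggregated descriptors, release the chimney buffer or DMA map,
 * free the mbuf and put the descriptor back on the free list.  Returns 1
 * if the descriptor was freed, 0 otherwise.
 */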
1545 static __inline int
1546 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1547 {
1548 
1549 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1550 	    ("put an onlist txd %#x", txd->flags));
1551 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1552 	    ("put an onagg txd %#x", txd->flags));
1553 
1554 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1555 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1556 		return 0;
1557 
1558 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1559 		struct hn_txdesc *tmp_txd;
1560 
1561 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1562 			int freed;
1563 
1564 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1565 			    ("recursive aggregation on aggregated txdesc"),
1566 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1567 			    ("not aggregated txdesc"));
1568 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1569 			    ("aggregated txdesc uses dmamap"));
1570 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1571 			    ("aggregated txdesc consumes "
1572 			     "chimney sending buffer"));
1573 			KASSERT(tmp_txd->chim_size == 0,
1574 			    ("aggregated txdesc has non-zero "
1575 			     "chimney sending size"));
1576 
1577 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1578 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1579 			freed = hn_txdesc_put(txr, tmp_txd);
1580 			KASSERT(freed, ("failed to free aggregated txdesc"));
1581 		}
1582 	}
1583 
1584 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1585 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1586 		    ("chim txd uses dmamap"));
1587 		hn_chim_free(txr->hn_sc, txd->chim_index);
1588 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1589 		txd->chim_size = 0;
1590 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1591 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1592 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1593 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1594 		    txd->data_dmap);
1595 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1596 	}
1597 
1598 	if (txd->m != NULL) {
1599 		m_freem(txd->m);
1600 		txd->m = NULL;
1601 	}
1602 
1603 	txd->flags |= HN_TXD_FLAG_ONLIST;
1604 #ifndef HN_USE_TXDESC_BUFRING
1605 	mtx_lock_spin(&txr->hn_txlist_spin);
1606 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1607 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1608 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1609 	txr->hn_txdesc_avail++;
1610 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1611 	mtx_unlock_spin(&txr->hn_txlist_spin);
1612 #else	/* HN_USE_TXDESC_BUFRING */
1613 #ifdef HN_DEBUG
1614 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1615 #endif
1616 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1617 #endif	/* !HN_USE_TXDESC_BUFRING */
1618 
1619 	return 1;
1620 }
1621 
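/*
 * Grab a free TX descriptor from the per-ring free list (or buf_ring) and
 * take the initial reference on it.
 */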
1622 static __inline struct hn_txdesc *
1623 hn_txdesc_get(struct hn_tx_ring *txr)
1624 {
1625 	struct hn_txdesc *txd;
1626 
1627 #ifndef HN_USE_TXDESC_BUFRING
1628 	mtx_lock_spin(&txr->hn_txlist_spin);
1629 	txd = SLIST_FIRST(&txr->hn_txlist);
1630 	if (txd != NULL) {
1631 		KASSERT(txr->hn_txdesc_avail > 0,
1632 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1633 		txr->hn_txdesc_avail--;
1634 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1635 	}
1636 	mtx_unlock_spin(&txr->hn_txlist_spin);
1637 #else
1638 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1639 #endif
1640 
1641 	if (txd != NULL) {
1642 #ifdef HN_USE_TXDESC_BUFRING
1643 #ifdef HN_DEBUG
1644 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1645 #endif
1646 #endif	/* HN_USE_TXDESC_BUFRING */
1647 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1648 		    STAILQ_EMPTY(&txd->agg_list) &&
1649 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1650 		    txd->chim_size == 0 &&
1651 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1652 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1653 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1654 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1655 		txd->refs = 1;
1656 	}
1657 	return txd;
1658 }
1659 
1660 static __inline void
1661 hn_txdesc_hold(struct hn_txdesc *txd)
1662 {
1663 
1664 	/* 0->1 transition will never work */
1665 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1666 	atomic_add_int(&txd->refs, 1);
1667 }
1668 
1669 static __inline void
1670 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1671 {
1672 
1673 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1674 	    ("recursive aggregation on aggregating txdesc"));
1675 
1676 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1677 	    ("already aggregated"));
1678 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1679 	    ("recursive aggregation on to-be-aggregated txdesc"));
1680 
1681 	txd->flags |= HN_TXD_FLAG_ONAGG;
1682 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1683 }
1684 
1685 static bool
1686 hn_tx_ring_pending(struct hn_tx_ring *txr)
1687 {
1688 	bool pending = false;
1689 
1690 #ifndef HN_USE_TXDESC_BUFRING
1691 	mtx_lock_spin(&txr->hn_txlist_spin);
1692 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1693 		pending = true;
1694 	mtx_unlock_spin(&txr->hn_txlist_spin);
1695 #else
1696 	if (!buf_ring_full(txr->hn_txdesc_br))
1697 		pending = true;
1698 #endif
1699 	return (pending);
1700 }
1701 
1702 static __inline void
1703 hn_txeof(struct hn_tx_ring *txr)
1704 {
1705 	txr->hn_has_txeof = 0;
1706 	txr->hn_txeof(txr);
1707 }
1708 
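/*
 * NVS send-completion callback: drop the descriptor reference and, after
 * HN_EARLY_TXEOF_THRESH completions, run hn_txeof() if the ring is marked
 * oactive.
 */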
1709 static void
1710 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1711     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1712 {
1713 	struct hn_txdesc *txd = sndc->hn_cbarg;
1714 	struct hn_tx_ring *txr;
1715 
1716 	txr = txd->txr;
1717 	KASSERT(txr->hn_chan == chan,
1718 	    ("channel mismatch, on chan%u, should be chan%u",
1719 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1720 
1721 	txr->hn_has_txeof = 1;
1722 	hn_txdesc_put(txr, txd);
1723 
1724 	++txr->hn_txdone_cnt;
1725 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1726 		txr->hn_txdone_cnt = 0;
1727 		if (txr->hn_oactive)
1728 			hn_txeof(txr);
1729 	}
1730 }
1731 
1732 static void
1733 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1734 {
1735 #if defined(INET) || defined(INET6)
1736 	tcp_lro_flush_all(&rxr->hn_lro);
1737 #endif
1738 
1739 	/*
1740 	 * NOTE:
1741 	 * 'txr' could be NULL if multiple channels are used and
1742 	 * the ifnet.if_start method is enabled.
1743 	 */
1744 	if (txr == NULL || !txr->hn_has_txeof)
1745 		return;
1746 
1747 	txr->hn_txdone_cnt = 0;
1748 	hn_txeof(txr);
1749 }
1750 
1751 static __inline uint32_t
1752 hn_rndis_pktmsg_offset(uint32_t ofs)
1753 {
1754 
1755 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1756 	    ("invalid RNDIS packet msg offset %u", ofs));
1757 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1758 }
1759 
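/*
 * Append a per-packet-info (PPI) record of type 'pi_type' to the RNDIS
 * packet message and return a pointer to its data area, which the
 * caller is expected to fill in.  'pktsize' bounds the total message
 * size, including all PPIs.
 */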
1760 static __inline void *
1761 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1762     size_t pi_dlen, uint32_t pi_type)
1763 {
1764 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1765 	struct rndis_pktinfo *pi;
1766 
1767 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1768 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1769 
1770 	/*
1771 	 * Per-packet-info does not move; it only grows.
1772 	 *
1773 	 * NOTE:
1774 	 * rm_pktinfooffset in this phase counts from the beginning
1775 	 * of rndis_packet_msg.
1776 	 */
1777 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1778 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1779 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1780 	    pkt->rm_pktinfolen);
1781 	pkt->rm_pktinfolen += pi_size;
1782 
1783 	pi->rm_size = pi_size;
1784 	pi->rm_type = pi_type;
1785 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1786 
1787 	/* Update RNDIS packet msg length */
1788 	pkt->rm_len += pi_size;
1789 
1790 	return (pi->rm_data);
1791 }
1792 
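/*
 * Send the aggregating txdesc, i.e. all packets accumulated in its
 * chimney sending buffer, then reset the TX ring's aggregation state.
 * Returns the hn_txpkt() error, if any.
 */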
1793 static __inline int
1794 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1795 {
1796 	struct hn_txdesc *txd;
1797 	struct mbuf *m;
1798 	int error, pkts;
1799 
1800 	txd = txr->hn_agg_txd;
1801 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1802 
1803 	/*
1804 	 * Since hn_txpkt() will reset this temporary stat, save
1805 	 * it now, so that oerrors can be updated properly, if
1806 	 * hn_txpkt() ever fails.
1807 	 */
1808 	pkts = txr->hn_stat_pkts;
1809 
1810 	/*
1811 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1812 	 * failure, save it for later freeing, if hn_txpkt() ever
1813 	 * fails.
1814 	 */
1815 	m = txd->m;
1816 	error = hn_txpkt(ifp, txr, txd);
1817 	if (__predict_false(error)) {
1818 		/* txd is freed, but m is not. */
1819 		m_freem(m);
1820 
1821 		txr->hn_flush_failed++;
1822 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1823 	}
1824 
1825 	/* Reset all aggregation states. */
1826 	txr->hn_agg_txd = NULL;
1827 	txr->hn_agg_szleft = 0;
1828 	txr->hn_agg_pktleft = 0;
1829 	txr->hn_agg_prevpkt = NULL;
1830 
1831 	return (error);
1832 }
1833 
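/*
 * Try to place this packet into the chimney sending buffer of the
 * currently aggregating txdesc.  If that is not possible, flush any
 * pending aggregation, allocate a fresh chimney buffer for 'txd' and,
 * if the aggregation limits allow, make 'txd' the new aggregating
 * txdesc.  Returns a pointer into the chimney buffer where the RNDIS
 * packet message should be built, or NULL if chimney sending cannot
 * be used.
 */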
1834 static void *
1835 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1836     int pktsize)
1837 {
1838 	void *chim;
1839 
1840 	if (txr->hn_agg_txd != NULL) {
1841 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1842 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1843 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1844 			int olen;
1845 
1846 			/*
1847 			 * Update the previous RNDIS packet's total length;
1848 			 * it can be increased due to the mandatory alignment
1849 			 * padding for this RNDIS packet.  Also update the
1850 			 * aggregating txdesc's chimney sending buffer size
1851 			 * accordingly.
1852 			 *
1853 			 * XXX
1854 			 * Zero-out the padding, as required by the RNDIS spec.
1855 			 */
1856 			olen = pkt->rm_len;
1857 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1858 			agg_txd->chim_size += pkt->rm_len - olen;
1859 
1860 			/* Link this txdesc to the parent. */
1861 			hn_txdesc_agg(agg_txd, txd);
1862 
1863 			chim = (uint8_t *)pkt + pkt->rm_len;
1864 			/* Save the current packet for later fixup. */
1865 			txr->hn_agg_prevpkt = chim;
1866 
1867 			txr->hn_agg_pktleft--;
1868 			txr->hn_agg_szleft -= pktsize;
1869 			if (txr->hn_agg_szleft <=
1870 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1871 				/*
1872 				 * Probably can't aggregate more packets,
1873 				 * flush this aggregating txdesc proactively.
1874 				 */
1875 				txr->hn_agg_pktleft = 0;
1876 			}
1877 			/* Done! */
1878 			return (chim);
1879 		}
1880 		hn_flush_txagg(ifp, txr);
1881 	}
1882 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1883 
1884 	txr->hn_tx_chimney_tried++;
1885 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1886 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1887 		return (NULL);
1888 	txr->hn_tx_chimney++;
1889 
1890 	chim = txr->hn_sc->hn_chim +
1891 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1892 
1893 	if (txr->hn_agg_pktmax > 1 &&
1894 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1895 		txr->hn_agg_txd = txd;
1896 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1897 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1898 		txr->hn_agg_prevpkt = chim;
1899 	}
1900 	return (chim);
1901 }
1902 
1903 /*
1904  * NOTE:
1905  * If this function fails, then both txd and m_head0 will be freed.
1906  */
1907 static int
1908 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1909     struct mbuf **m_head0)
1910 {
1911 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1912 	int error, nsegs, i;
1913 	struct mbuf *m_head = *m_head0;
1914 	struct rndis_packet_msg *pkt;
1915 	uint32_t *pi_data;
1916 	void *chim = NULL;
1917 	int pkt_hlen, pkt_size;
1918 
1919 	pkt = txd->rndis_pkt;
1920 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1921 	if (pkt_size < txr->hn_chim_size) {
1922 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1923 		if (chim != NULL)
1924 			pkt = chim;
1925 	} else {
1926 		if (txr->hn_agg_txd != NULL)
1927 			hn_flush_txagg(ifp, txr);
1928 	}
1929 
1930 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1931 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1932 	pkt->rm_dataoffset = 0;
1933 	pkt->rm_datalen = m_head->m_pkthdr.len;
1934 	pkt->rm_oobdataoffset = 0;
1935 	pkt->rm_oobdatalen = 0;
1936 	pkt->rm_oobdataelements = 0;
1937 	pkt->rm_pktinfooffset = sizeof(*pkt);
1938 	pkt->rm_pktinfolen = 0;
1939 	pkt->rm_vchandle = 0;
1940 	pkt->rm_reserved = 0;
1941 
1942 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1943 		/*
1944 		 * Set the hash value for this packet, so that the host could
1945 		 * dispatch the TX done event for this packet back to this TX
1946 		 * ring's channel.
1947 		 */
1948 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1949 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1950 		*pi_data = txr->hn_tx_idx;
1951 	}
1952 
1953 	if (m_head->m_flags & M_VLANTAG) {
1954 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1955 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1956 		*pi_data = NDIS_VLAN_INFO_MAKE(
1957 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1958 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1959 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1960 	}
1961 
1962 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1963 #if defined(INET6) || defined(INET)
1964 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1965 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1966 #ifdef INET
1967 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1968 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1969 			    m_head->m_pkthdr.tso_segsz);
1970 		}
1971 #endif
1972 #if defined(INET6) && defined(INET)
1973 		else
1974 #endif
1975 #ifdef INET6
1976 		{
1977 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1978 			    m_head->m_pkthdr.tso_segsz);
1979 		}
1980 #endif
1981 #endif	/* INET6 || INET */
1982 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1983 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1984 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1985 		if (m_head->m_pkthdr.csum_flags &
1986 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1987 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1988 		} else {
1989 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1990 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1991 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1992 		}
1993 
1994 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1995 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1996 		else if (m_head->m_pkthdr.csum_flags &
1997 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1998 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1999 	}
2000 
2001 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2002 	/* Convert RNDIS packet message offsets */
2003 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2004 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2005 
2006 	/*
2007 	 * Fast path: Chimney sending.
2008 	 */
2009 	if (chim != NULL) {
2010 		struct hn_txdesc *tgt_txd = txd;
2011 
2012 		if (txr->hn_agg_txd != NULL) {
2013 			tgt_txd = txr->hn_agg_txd;
2014 #ifdef INVARIANTS
2015 			*m_head0 = NULL;
2016 #endif
2017 		}
2018 
2019 		KASSERT(pkt == chim,
2020 		    ("RNDIS pkt not in chimney sending buffer"));
2021 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2022 		    ("chimney sending buffer is not used"));
2023 		tgt_txd->chim_size += pkt->rm_len;
2024 
2025 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2026 		    ((uint8_t *)chim) + pkt_hlen);
2027 
2028 		txr->hn_gpa_cnt = 0;
2029 		txr->hn_sendpkt = hn_txpkt_chim;
2030 		goto done;
2031 	}
2032 
2033 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2034 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2035 	    ("chimney buffer is used"));
2036 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2037 
2038 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2039 	if (__predict_false(error)) {
2040 		int freed;
2041 
2042 		/*
2043 		 * This mbuf is not linked w/ the txd yet, so free it now.
2044 		 */
2045 		m_freem(m_head);
2046 		*m_head0 = NULL;
2047 
2048 		freed = hn_txdesc_put(txr, txd);
2049 		KASSERT(freed != 0,
2050 		    ("fail to free txd upon txdma error"));
2051 
2052 		txr->hn_txdma_failed++;
2053 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2054 		return error;
2055 	}
2056 	*m_head0 = m_head;
2057 
2058 	/* +1 RNDIS packet message */
2059 	txr->hn_gpa_cnt = nsegs + 1;
2060 
2061 	/* send packet with page buffer */
2062 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2063 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2064 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2065 
2066 	/*
2067 	 * Fill the page buffers with mbuf info after the page
2068 	 * buffer for RNDIS packet message.
2069 	 */
2070 	for (i = 0; i < nsegs; ++i) {
2071 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2072 
2073 		gpa->gpa_page = atop(segs[i].ds_addr);
2074 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2075 		gpa->gpa_len = segs[i].ds_len;
2076 	}
2077 
2078 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2079 	txd->chim_size = 0;
2080 	txr->hn_sendpkt = hn_txpkt_sglist;
2081 done:
2082 	txd->m = m_head;
2083 
2084 	/* Set the completion routine */
2085 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2086 
2087 	/* Update temporary stats for later use. */
2088 	txr->hn_stat_pkts++;
2089 	txr->hn_stat_size += m_head->m_pkthdr.len;
2090 	if (m_head->m_flags & M_MCAST)
2091 		txr->hn_stat_mcasts++;
2092 
2093 	return 0;
2094 }
2095 
2096 /*
2097  * NOTE:
2098  * If this function fails, then txd will be freed, but the mbuf
2099  * associated w/ the txd will _not_ be freed.
2100  */
2101 static int
2102 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2103 {
2104 	int error, send_failed = 0, has_bpf;
2105 
2106 again:
2107 	has_bpf = bpf_peers_present(ifp->if_bpf);
2108 	if (has_bpf) {
2109 		/*
2110 		 * Make sure that this txd and any aggregated txds are not
2111 		 * freed before ETHER_BPF_MTAP.
2112 		 */
2113 		hn_txdesc_hold(txd);
2114 	}
2115 	error = txr->hn_sendpkt(txr, txd);
2116 	if (!error) {
2117 		if (has_bpf) {
2118 			const struct hn_txdesc *tmp_txd;
2119 
2120 			ETHER_BPF_MTAP(ifp, txd->m);
2121 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2122 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2123 		}
2124 
2125 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2126 #ifdef HN_IFSTART_SUPPORT
2127 		if (!hn_use_if_start)
2128 #endif
2129 		{
2130 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2131 			    txr->hn_stat_size);
2132 			if (txr->hn_stat_mcasts != 0) {
2133 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2134 				    txr->hn_stat_mcasts);
2135 			}
2136 		}
2137 		txr->hn_pkts += txr->hn_stat_pkts;
2138 		txr->hn_sends++;
2139 	}
2140 	if (has_bpf)
2141 		hn_txdesc_put(txr, txd);
2142 
2143 	if (__predict_false(error)) {
2144 		int freed;
2145 
2146 		/*
2147 		 * This should "really rarely" happen.
2148 		 *
2149 		 * XXX Too many RX to be acked or too many sideband
2150 		 * commands to run?  Ask netvsc_channel_rollup()
2151 		 * to kick start later.
2152 		 */
2153 		txr->hn_has_txeof = 1;
2154 		if (!send_failed) {
2155 			txr->hn_send_failed++;
2156 			send_failed = 1;
2157 			/*
2158 			 * Try sending again after setting hn_has_txeof,
2159 			 * in case we missed the last
2160 			 * netvsc_channel_rollup().
2161 			 */
2162 			goto again;
2163 		}
2164 		if_printf(ifp, "send failed\n");
2165 
2166 		/*
2167 		 * Caller will perform further processing on the
2168 		 * associated mbuf, so don't free it in hn_txdesc_put();
2169 		 * only unload it from the DMA map in hn_txdesc_put(),
2170 		 * if it was loaded.
2171 		 */
2172 		txd->m = NULL;
2173 		freed = hn_txdesc_put(txr, txd);
2174 		KASSERT(freed != 0,
2175 		    ("fail to free txd upon send error"));
2176 
2177 		txr->hn_send_failed++;
2178 	}
2179 
2180 	/* Reset temporary stats, after this sending is done. */
2181 	txr->hn_stat_size = 0;
2182 	txr->hn_stat_pkts = 0;
2183 	txr->hn_stat_mcasts = 0;
2184 
2185 	return (error);
2186 }
2187 
2188 /*
2189  * Append the specified data to the indicated mbuf chain.
2190  * Extend the mbuf chain if the new data does not fit in
2191  * existing space.
2192  *
2193  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2194  * There should be an equivalent in the kernel mbuf code,
2195  * but there does not appear to be one yet.
2196  *
2197  * Differs from m_append() in that additional mbufs are
2198  * allocated with cluster size MJUMPAGESIZE, and filled
2199  * accordingly.
2200  *
2201  * Return 1 if able to complete the job; otherwise 0.
2202  */
2203 static int
2204 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2205 {
2206 	struct mbuf *m, *n;
2207 	int remainder, space;
2208 
2209 	for (m = m0; m->m_next != NULL; m = m->m_next)
2210 		;
2211 	remainder = len;
2212 	space = M_TRAILINGSPACE(m);
2213 	if (space > 0) {
2214 		/*
2215 		 * Copy into available space.
2216 		 */
2217 		if (space > remainder)
2218 			space = remainder;
2219 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2220 		m->m_len += space;
2221 		cp += space;
2222 		remainder -= space;
2223 	}
2224 	while (remainder > 0) {
2225 		/*
2226 		 * Allocate a new mbuf; could check space
2227 		 * and allocate a cluster instead.
2228 		 */
2229 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2230 		if (n == NULL)
2231 			break;
2232 		n->m_len = min(MJUMPAGESIZE, remainder);
2233 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2234 		cp += n->m_len;
2235 		remainder -= n->m_len;
2236 		m->m_next = n;
2237 		m = n;
2238 	}
2239 	if (m0->m_flags & M_PKTHDR)
2240 		m0->m_pkthdr.len += len - remainder;
2241 
2242 	return (remainder == 0);
2243 }
2244 
2245 #if defined(INET) || defined(INET6)
2246 static __inline int
2247 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2248 {
2249 #if __FreeBSD_version >= 1100095
2250 	if (hn_lro_mbufq_depth) {
2251 		tcp_lro_queue_mbuf(lc, m);
2252 		return 0;
2253 	}
2254 #endif
2255 	return tcp_lro_rx(lc, m, 0);
2256 }
2257 #endif
2258 
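/*
 * Deliver one received packet: copy the data into a newly allocated
 * mbuf, apply the RX checksum offload and host-checksum-trust policy,
 * attach VLAN and RSS hash information, then hand the mbuf to LRO or
 * directly to if_input.
 */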
2259 static int
2260 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2261     const struct hn_rxinfo *info)
2262 {
2263 	struct ifnet *ifp;
2264 	struct mbuf *m_new;
2265 	int size, do_lro = 0, do_csum = 1;
2266 	int hash_type;
2267 
2268 	/* If the VF is active, inject the packet through the VF */
2269 	ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp;
2270 
2271 	if (dlen <= MHLEN) {
2272 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2273 		if (m_new == NULL) {
2274 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2275 			return (0);
2276 		}
2277 		memcpy(mtod(m_new, void *), data, dlen);
2278 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2279 		rxr->hn_small_pkts++;
2280 	} else {
2281 		/*
2282 		 * Get an mbuf with a cluster.  For packets 2K or less,
2283 		 * get a standard 2K cluster.  For anything larger, get a
2284 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2285 		 * if looped around to the Hyper-V TX channel, so avoid them.
2286 		 */
2287 		size = MCLBYTES;
2288 		if (dlen > MCLBYTES) {
2289 			/* 4096 */
2290 			size = MJUMPAGESIZE;
2291 		}
2292 
2293 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2294 		if (m_new == NULL) {
2295 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2296 			return (0);
2297 		}
2298 
2299 		hv_m_append(m_new, dlen, data);
2300 	}
2301 	m_new->m_pkthdr.rcvif = ifp;
2302 
2303 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2304 		do_csum = 0;
2305 
2306 	/* receive side checksum offload */
2307 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2308 		/* IP csum offload */
2309 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2310 			m_new->m_pkthdr.csum_flags |=
2311 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2312 			rxr->hn_csum_ip++;
2313 		}
2314 
2315 		/* TCP/UDP csum offload */
2316 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2317 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2318 			m_new->m_pkthdr.csum_flags |=
2319 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2320 			m_new->m_pkthdr.csum_data = 0xffff;
2321 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2322 				rxr->hn_csum_tcp++;
2323 			else
2324 				rxr->hn_csum_udp++;
2325 		}
2326 
2327 		/*
2328 		 * XXX
2329 		 * As of this writing (Oct 28th, 2016), the host side will turn
2330 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2331 		 * the do_lro setting here is actually _not_ accurate.  We
2332 		 * depend on the RSS hash type check to reset do_lro.
2333 		 */
2334 		if ((info->csum_info &
2335 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2336 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2337 			do_lro = 1;
2338 	} else {
2339 		const struct ether_header *eh;
2340 		uint16_t etype;
2341 		int hoff;
2342 
2343 		hoff = sizeof(*eh);
2344 		if (m_new->m_len < hoff)
2345 			goto skip;
2346 		eh = mtod(m_new, struct ether_header *);
2347 		etype = ntohs(eh->ether_type);
2348 		if (etype == ETHERTYPE_VLAN) {
2349 			const struct ether_vlan_header *evl;
2350 
2351 			hoff = sizeof(*evl);
2352 			if (m_new->m_len < hoff)
2353 				goto skip;
2354 			evl = mtod(m_new, struct ether_vlan_header *);
2355 			etype = ntohs(evl->evl_proto);
2356 		}
2357 
2358 		if (etype == ETHERTYPE_IP) {
2359 			int pr;
2360 
2361 			pr = hn_check_iplen(m_new, hoff);
2362 			if (pr == IPPROTO_TCP) {
2363 				if (do_csum &&
2364 				    (rxr->hn_trust_hcsum &
2365 				     HN_TRUST_HCSUM_TCP)) {
2366 					rxr->hn_csum_trusted++;
2367 					m_new->m_pkthdr.csum_flags |=
2368 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2369 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2370 					m_new->m_pkthdr.csum_data = 0xffff;
2371 				}
2372 				do_lro = 1;
2373 			} else if (pr == IPPROTO_UDP) {
2374 				if (do_csum &&
2375 				    (rxr->hn_trust_hcsum &
2376 				     HN_TRUST_HCSUM_UDP)) {
2377 					rxr->hn_csum_trusted++;
2378 					m_new->m_pkthdr.csum_flags |=
2379 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2380 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2381 					m_new->m_pkthdr.csum_data = 0xffff;
2382 				}
2383 			} else if (pr != IPPROTO_DONE && do_csum &&
2384 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2385 				rxr->hn_csum_trusted++;
2386 				m_new->m_pkthdr.csum_flags |=
2387 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2388 			}
2389 		}
2390 	}
2391 skip:
2392 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2393 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2394 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2395 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2396 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2397 		m_new->m_flags |= M_VLANTAG;
2398 	}
2399 
2400 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2401 		rxr->hn_rss_pkts++;
2402 		m_new->m_pkthdr.flowid = info->hash_value;
2403 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2404 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2405 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2406 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2407 
2408 			/*
2409 			 * NOTE:
2410 			 * do_lro is reset if the hash types are not TCP
2411 			 * related.  See the comment in the above csum_flags
2412 			 * setup section.
2413 			 */
2414 			switch (type) {
2415 			case NDIS_HASH_IPV4:
2416 				hash_type = M_HASHTYPE_RSS_IPV4;
2417 				do_lro = 0;
2418 				break;
2419 
2420 			case NDIS_HASH_TCP_IPV4:
2421 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2422 				break;
2423 
2424 			case NDIS_HASH_IPV6:
2425 				hash_type = M_HASHTYPE_RSS_IPV6;
2426 				do_lro = 0;
2427 				break;
2428 
2429 			case NDIS_HASH_IPV6_EX:
2430 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2431 				do_lro = 0;
2432 				break;
2433 
2434 			case NDIS_HASH_TCP_IPV6:
2435 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2436 				break;
2437 
2438 			case NDIS_HASH_TCP_IPV6_EX:
2439 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2440 				break;
2441 			}
2442 		}
2443 	} else {
2444 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2445 		hash_type = M_HASHTYPE_OPAQUE;
2446 	}
2447 	M_HASHTYPE_SET(m_new, hash_type);
2448 
2449 	/*
2450 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2451 	 * messages (not just data messages) will trigger a response.
2452 	 */
2453 
2454 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2455 	rxr->hn_pkts++;
2456 
2457 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2458 #if defined(INET) || defined(INET6)
2459 		struct lro_ctrl *lro = &rxr->hn_lro;
2460 
2461 		if (lro->lro_cnt) {
2462 			rxr->hn_lro_tried++;
2463 			if (hn_lro_rx(lro, m_new) == 0) {
2464 				/* DONE! */
2465 				return 0;
2466 			}
2467 		}
2468 #endif
2469 	}
2470 
2471 	/* We're not holding the lock here, so don't release it */
2472 	(*ifp->if_input)(ifp, m_new);
2473 
2474 	return (0);
2475 }
2476 
2477 static int
2478 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2479 {
2480 	struct hn_softc *sc = ifp->if_softc;
2481 	struct ifreq *ifr = (struct ifreq *)data;
2482 	int mask, error = 0;
2483 
2484 	switch (cmd) {
2485 	case SIOCSIFMTU:
2486 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2487 			error = EINVAL;
2488 			break;
2489 		}
2490 
2491 		HN_LOCK(sc);
2492 
2493 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2494 			HN_UNLOCK(sc);
2495 			break;
2496 		}
2497 
2498 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2499 			/* Can't change MTU */
2500 			HN_UNLOCK(sc);
2501 			error = EOPNOTSUPP;
2502 			break;
2503 		}
2504 
2505 		if (ifp->if_mtu == ifr->ifr_mtu) {
2506 			HN_UNLOCK(sc);
2507 			break;
2508 		}
2509 
2510 		/*
2511 		 * Suspend this interface before the synthetic parts
2512 		 * are ripped.
2513 		 */
2514 		hn_suspend(sc);
2515 
2516 		/*
2517 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2518 		 */
2519 		hn_synth_detach(sc);
2520 
2521 		/*
2522 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2523 		 * with the new MTU setting.
2524 		 */
2525 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2526 		if (error) {
2527 			HN_UNLOCK(sc);
2528 			break;
2529 		}
2530 
2531 		/*
2532 		 * Commit the requested MTU, after the synthetic parts
2533 		 * have been successfully attached.
2534 		 */
2535 		ifp->if_mtu = ifr->ifr_mtu;
2536 
2537 		/*
2538 		 * Make sure that various parameters based on MTU are
2539 		 * still valid, after the MTU change.
2540 		 */
2541 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2542 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2543 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2544 #if __FreeBSD_version >= 1100099
2545 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2546 		    HN_LRO_LENLIM_MIN(ifp))
2547 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2548 #endif
2549 
2550 		/*
2551 		 * All done!  Resume the interface now.
2552 		 */
2553 		hn_resume(sc);
2554 
2555 		HN_UNLOCK(sc);
2556 		break;
2557 
2558 	case SIOCSIFFLAGS:
2559 		HN_LOCK(sc);
2560 
2561 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2562 			HN_UNLOCK(sc);
2563 			break;
2564 		}
2565 
2566 		if (ifp->if_flags & IFF_UP) {
2567 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2568 				/*
2569 				 * Caller might hold a mutex, e.g.
2570 				 * bpf; use busy-wait for the RNDIS
2571 				 * reply.
2572 				 */
2573 				HN_NO_SLEEPING(sc);
2574 				hn_rxfilter_config(sc);
2575 				HN_SLEEPING_OK(sc);
2576 			} else {
2577 				hn_init_locked(sc);
2578 			}
2579 		} else {
2580 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2581 				hn_stop(sc, false);
2582 		}
2583 		sc->hn_if_flags = ifp->if_flags;
2584 
2585 		HN_UNLOCK(sc);
2586 		break;
2587 
2588 	case SIOCSIFCAP:
2589 		HN_LOCK(sc);
2590 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2591 
2592 		if (mask & IFCAP_TXCSUM) {
2593 			ifp->if_capenable ^= IFCAP_TXCSUM;
2594 			if (ifp->if_capenable & IFCAP_TXCSUM)
2595 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2596 			else
2597 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2598 		}
2599 		if (mask & IFCAP_TXCSUM_IPV6) {
2600 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2601 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2602 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2603 			else
2604 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2605 		}
2606 
2607 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2608 		if (mask & IFCAP_RXCSUM)
2609 			ifp->if_capenable ^= IFCAP_RXCSUM;
2610 #ifdef foo
2611 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2612 		if (mask & IFCAP_RXCSUM_IPV6)
2613 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2614 #endif
2615 
2616 		if (mask & IFCAP_LRO)
2617 			ifp->if_capenable ^= IFCAP_LRO;
2618 
2619 		if (mask & IFCAP_TSO4) {
2620 			ifp->if_capenable ^= IFCAP_TSO4;
2621 			if (ifp->if_capenable & IFCAP_TSO4)
2622 				ifp->if_hwassist |= CSUM_IP_TSO;
2623 			else
2624 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2625 		}
2626 		if (mask & IFCAP_TSO6) {
2627 			ifp->if_capenable ^= IFCAP_TSO6;
2628 			if (ifp->if_capenable & IFCAP_TSO6)
2629 				ifp->if_hwassist |= CSUM_IP6_TSO;
2630 			else
2631 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2632 		}
2633 
2634 		HN_UNLOCK(sc);
2635 		break;
2636 
2637 	case SIOCADDMULTI:
2638 	case SIOCDELMULTI:
2639 		HN_LOCK(sc);
2640 
2641 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2642 			HN_UNLOCK(sc);
2643 			break;
2644 		}
2645 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2646 			/*
2647 			 * Multicast uses mutex; use busy-wait for
2648 			 * the RNDIS reply.
2649 			 */
2650 			HN_NO_SLEEPING(sc);
2651 			hn_rxfilter_config(sc);
2652 			HN_SLEEPING_OK(sc);
2653 		}
2654 
2655 		HN_UNLOCK(sc);
2656 		break;
2657 
2658 	case SIOCSIFMEDIA:
2659 	case SIOCGIFMEDIA:
2660 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2661 		break;
2662 
2663 	default:
2664 		error = ether_ioctl(ifp, cmd, data);
2665 		break;
2666 	}
2667 	return (error);
2668 }
2669 
2670 static void
2671 hn_stop(struct hn_softc *sc, bool detaching)
2672 {
2673 	struct ifnet *ifp = sc->hn_ifp;
2674 	int i;
2675 
2676 	HN_LOCK_ASSERT(sc);
2677 
2678 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2679 	    ("synthetic parts were not attached"));
2680 
2681 	/* Disable polling. */
2682 	hn_polling(sc, 0);
2683 
2684 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2685 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2686 	hn_suspend_data(sc);
2687 
2688 	/* Clear OACTIVE bit. */
2689 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2690 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2691 		sc->hn_tx_ring[i].hn_oactive = 0;
2692 
2693 	/*
2694 	 * If the VF is active, make sure the filter is not 0, even if
2695 	 * the synthetic NIC is down.
2696 	 */
2697 	if (!detaching && (sc->hn_flags & HN_FLAG_VF))
2698 		hn_rxfilter_config(sc);
2699 }
2700 
2701 static void
2702 hn_init_locked(struct hn_softc *sc)
2703 {
2704 	struct ifnet *ifp = sc->hn_ifp;
2705 	int i;
2706 
2707 	HN_LOCK_ASSERT(sc);
2708 
2709 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2710 		return;
2711 
2712 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2713 		return;
2714 
2715 	/* Configure RX filter */
2716 	hn_rxfilter_config(sc);
2717 
2718 	/* Clear OACTIVE bit. */
2719 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2720 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2721 		sc->hn_tx_ring[i].hn_oactive = 0;
2722 
2723 	/* Clear TX 'suspended' bit. */
2724 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2725 
2726 	/* Everything is ready; unleash! */
2727 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2728 
2729 	/* Re-enable polling if requested. */
2730 	if (sc->hn_pollhz > 0)
2731 		hn_polling(sc, sc->hn_pollhz);
2732 }
2733 
2734 static void
2735 hn_init(void *xsc)
2736 {
2737 	struct hn_softc *sc = xsc;
2738 
2739 	HN_LOCK(sc);
2740 	hn_init_locked(sc);
2741 	HN_UNLOCK(sc);
2742 }
2743 
2744 #if __FreeBSD_version >= 1100099
2745 
2746 static int
2747 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2748 {
2749 	struct hn_softc *sc = arg1;
2750 	unsigned int lenlim;
2751 	int error;
2752 
2753 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2754 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2755 	if (error || req->newptr == NULL)
2756 		return error;
2757 
2758 	HN_LOCK(sc);
2759 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2760 	    lenlim > TCP_LRO_LENGTH_MAX) {
2761 		HN_UNLOCK(sc);
2762 		return EINVAL;
2763 	}
2764 	hn_set_lro_lenlim(sc, lenlim);
2765 	HN_UNLOCK(sc);
2766 
2767 	return 0;
2768 }
2769 
2770 static int
2771 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2772 {
2773 	struct hn_softc *sc = arg1;
2774 	int ackcnt, error, i;
2775 
2776 	/*
2777 	 * lro_ackcnt_lim is the append count limit;
2778 	 * +1 turns it into the aggregation limit.
2779 	 */
2780 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2781 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2782 	if (error || req->newptr == NULL)
2783 		return error;
2784 
2785 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2786 		return EINVAL;
2787 
2788 	/*
2789 	 * Convert aggregation limit back to append
2790 	 * count limit.
2791 	 */
2792 	--ackcnt;
2793 	HN_LOCK(sc);
2794 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2795 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2796 	HN_UNLOCK(sc);
2797 	return 0;
2798 }
2799 
2800 #endif
2801 
2802 static int
2803 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2804 {
2805 	struct hn_softc *sc = arg1;
2806 	int hcsum = arg2;
2807 	int on, error, i;
2808 
2809 	on = 0;
2810 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2811 		on = 1;
2812 
2813 	error = sysctl_handle_int(oidp, &on, 0, req);
2814 	if (error || req->newptr == NULL)
2815 		return error;
2816 
2817 	HN_LOCK(sc);
2818 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2819 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2820 
2821 		if (on)
2822 			rxr->hn_trust_hcsum |= hcsum;
2823 		else
2824 			rxr->hn_trust_hcsum &= ~hcsum;
2825 	}
2826 	HN_UNLOCK(sc);
2827 	return 0;
2828 }
2829 
2830 static int
2831 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2832 {
2833 	struct hn_softc *sc = arg1;
2834 	int chim_size, error;
2835 
2836 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2837 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2838 	if (error || req->newptr == NULL)
2839 		return error;
2840 
2841 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2842 		return EINVAL;
2843 
2844 	HN_LOCK(sc);
2845 	hn_set_chim_size(sc, chim_size);
2846 	HN_UNLOCK(sc);
2847 	return 0;
2848 }
2849 
2850 #if __FreeBSD_version < 1100095
2851 static int
2852 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2853 {
2854 	struct hn_softc *sc = arg1;
2855 	int ofs = arg2, i, error;
2856 	struct hn_rx_ring *rxr;
2857 	uint64_t stat;
2858 
2859 	stat = 0;
2860 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2861 		rxr = &sc->hn_rx_ring[i];
2862 		stat += *((int *)((uint8_t *)rxr + ofs));
2863 	}
2864 
2865 	error = sysctl_handle_64(oidp, &stat, 0, req);
2866 	if (error || req->newptr == NULL)
2867 		return error;
2868 
2869 	/* Zero out this stat. */
2870 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2871 		rxr = &sc->hn_rx_ring[i];
2872 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2873 	}
2874 	return 0;
2875 }
2876 #else
2877 static int
2878 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2879 {
2880 	struct hn_softc *sc = arg1;
2881 	int ofs = arg2, i, error;
2882 	struct hn_rx_ring *rxr;
2883 	uint64_t stat;
2884 
2885 	stat = 0;
2886 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2887 		rxr = &sc->hn_rx_ring[i];
2888 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2889 	}
2890 
2891 	error = sysctl_handle_64(oidp, &stat, 0, req);
2892 	if (error || req->newptr == NULL)
2893 		return error;
2894 
2895 	/* Zero out this stat. */
2896 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2897 		rxr = &sc->hn_rx_ring[i];
2898 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2899 	}
2900 	return 0;
2901 }
2902 
2903 #endif
2904 
2905 static int
2906 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2907 {
2908 	struct hn_softc *sc = arg1;
2909 	int ofs = arg2, i, error;
2910 	struct hn_rx_ring *rxr;
2911 	u_long stat;
2912 
2913 	stat = 0;
2914 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2915 		rxr = &sc->hn_rx_ring[i];
2916 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2917 	}
2918 
2919 	error = sysctl_handle_long(oidp, &stat, 0, req);
2920 	if (error || req->newptr == NULL)
2921 		return error;
2922 
2923 	/* Zero out this stat. */
2924 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2925 		rxr = &sc->hn_rx_ring[i];
2926 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2927 	}
2928 	return 0;
2929 }
2930 
2931 static int
2932 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2933 {
2934 	struct hn_softc *sc = arg1;
2935 	int ofs = arg2, i, error;
2936 	struct hn_tx_ring *txr;
2937 	u_long stat;
2938 
2939 	stat = 0;
2940 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2941 		txr = &sc->hn_tx_ring[i];
2942 		stat += *((u_long *)((uint8_t *)txr + ofs));
2943 	}
2944 
2945 	error = sysctl_handle_long(oidp, &stat, 0, req);
2946 	if (error || req->newptr == NULL)
2947 		return error;
2948 
2949 	/* Zero out this stat. */
2950 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2951 		txr = &sc->hn_tx_ring[i];
2952 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2953 	}
2954 	return 0;
2955 }
2956 
2957 static int
2958 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2959 {
2960 	struct hn_softc *sc = arg1;
2961 	int ofs = arg2, i, error, conf;
2962 	struct hn_tx_ring *txr;
2963 
2964 	txr = &sc->hn_tx_ring[0];
2965 	conf = *((int *)((uint8_t *)txr + ofs));
2966 
2967 	error = sysctl_handle_int(oidp, &conf, 0, req);
2968 	if (error || req->newptr == NULL)
2969 		return error;
2970 
2971 	HN_LOCK(sc);
2972 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2973 		txr = &sc->hn_tx_ring[i];
2974 		*((int *)((uint8_t *)txr + ofs)) = conf;
2975 	}
2976 	HN_UNLOCK(sc);
2977 
2978 	return 0;
2979 }
2980 
2981 static int
2982 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2983 {
2984 	struct hn_softc *sc = arg1;
2985 	int error, size;
2986 
2987 	size = sc->hn_agg_size;
2988 	error = sysctl_handle_int(oidp, &size, 0, req);
2989 	if (error || req->newptr == NULL)
2990 		return (error);
2991 
2992 	HN_LOCK(sc);
2993 	sc->hn_agg_size = size;
2994 	hn_set_txagg(sc);
2995 	HN_UNLOCK(sc);
2996 
2997 	return (0);
2998 }
2999 
3000 static int
3001 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3002 {
3003 	struct hn_softc *sc = arg1;
3004 	int error, pkts;
3005 
3006 	pkts = sc->hn_agg_pkts;
3007 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3008 	if (error || req->newptr == NULL)
3009 		return (error);
3010 
3011 	HN_LOCK(sc);
3012 	sc->hn_agg_pkts = pkts;
3013 	hn_set_txagg(sc);
3014 	HN_UNLOCK(sc);
3015 
3016 	return (0);
3017 }
3018 
3019 static int
3020 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3021 {
3022 	struct hn_softc *sc = arg1;
3023 	int pkts;
3024 
3025 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3026 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3027 }
3028 
3029 static int
3030 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3031 {
3032 	struct hn_softc *sc = arg1;
3033 	int align;
3034 
3035 	align = sc->hn_tx_ring[0].hn_agg_align;
3036 	return (sysctl_handle_int(oidp, &align, 0, req));
3037 }
3038 
3039 static void
3040 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3041 {
3042 	if (pollhz == 0)
3043 		vmbus_chan_poll_disable(chan);
3044 	else
3045 		vmbus_chan_poll_enable(chan, pollhz);
3046 }
3047 
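/*
 * Apply the requested polling frequency to the primary channel and all
 * sub-channels; a pollhz of 0 switches the channels back to
 * interrupt-driven mode.
 */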
3048 static void
3049 hn_polling(struct hn_softc *sc, u_int pollhz)
3050 {
3051 	int nsubch = sc->hn_rx_ring_inuse - 1;
3052 
3053 	HN_LOCK_ASSERT(sc);
3054 
3055 	if (nsubch > 0) {
3056 		struct vmbus_channel **subch;
3057 		int i;
3058 
3059 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3060 		for (i = 0; i < nsubch; ++i)
3061 			hn_chan_polling(subch[i], pollhz);
3062 		vmbus_subchan_rel(subch, nsubch);
3063 	}
3064 	hn_chan_polling(sc->hn_prichan, pollhz);
3065 }
3066 
3067 static int
3068 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3069 {
3070 	struct hn_softc *sc = arg1;
3071 	int pollhz, error;
3072 
3073 	pollhz = sc->hn_pollhz;
3074 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3075 	if (error || req->newptr == NULL)
3076 		return (error);
3077 
3078 	if (pollhz != 0 &&
3079 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3080 		return (EINVAL);
3081 
3082 	HN_LOCK(sc);
3083 	if (sc->hn_pollhz != pollhz) {
3084 		sc->hn_pollhz = pollhz;
3085 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3086 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3087 			hn_polling(sc, sc->hn_pollhz);
3088 	}
3089 	HN_UNLOCK(sc);
3090 
3091 	return (0);
3092 }
3093 
3094 static int
3095 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3096 {
3097 	struct hn_softc *sc = arg1;
3098 	char verstr[16];
3099 
3100 	snprintf(verstr, sizeof(verstr), "%u.%u",
3101 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3102 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3103 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3104 }
3105 
3106 static int
3107 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3108 {
3109 	struct hn_softc *sc = arg1;
3110 	char caps_str[128];
3111 	uint32_t caps;
3112 
3113 	HN_LOCK(sc);
3114 	caps = sc->hn_caps;
3115 	HN_UNLOCK(sc);
3116 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3117 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3118 }
3119 
3120 static int
3121 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3122 {
3123 	struct hn_softc *sc = arg1;
3124 	char assist_str[128];
3125 	uint32_t hwassist;
3126 
3127 	HN_LOCK(sc);
3128 	hwassist = sc->hn_ifp->if_hwassist;
3129 	HN_UNLOCK(sc);
3130 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3131 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3132 }
3133 
3134 static int
3135 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3136 {
3137 	struct hn_softc *sc = arg1;
3138 	char filter_str[128];
3139 	uint32_t filter;
3140 
3141 	HN_LOCK(sc);
3142 	filter = sc->hn_rx_filter;
3143 	HN_UNLOCK(sc);
3144 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3145 	    NDIS_PACKET_TYPES);
3146 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3147 }
3148 
3149 #ifndef RSS
3150 
3151 static int
3152 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3153 {
3154 	struct hn_softc *sc = arg1;
3155 	int error;
3156 
3157 	HN_LOCK(sc);
3158 
3159 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3160 	if (error || req->newptr == NULL)
3161 		goto back;
3162 
3163 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3164 	if (error)
3165 		goto back;
3166 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3167 
3168 	if (sc->hn_rx_ring_inuse > 1) {
3169 		error = hn_rss_reconfig(sc);
3170 	} else {
3171 		/* Not RSS capable, at least for now; just save the RSS key. */
3172 		error = 0;
3173 	}
3174 back:
3175 	HN_UNLOCK(sc);
3176 	return (error);
3177 }
3178 
3179 static int
3180 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3181 {
3182 	struct hn_softc *sc = arg1;
3183 	int error;
3184 
3185 	HN_LOCK(sc);
3186 
3187 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3188 	if (error || req->newptr == NULL)
3189 		goto back;
3190 
3191 	/*
3192 	 * Don't allow RSS indirect table changes, if this interface is
3193 	 * not currently RSS capable.
3194 	 */
3195 	if (sc->hn_rx_ring_inuse == 1) {
3196 		error = EOPNOTSUPP;
3197 		goto back;
3198 	}
3199 
3200 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3201 	if (error)
3202 		goto back;
3203 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3204 
3205 	hn_rss_ind_fixup(sc);
3206 	error = hn_rss_reconfig(sc);
3207 back:
3208 	HN_UNLOCK(sc);
3209 	return (error);
3210 }
3211 
3212 #endif	/* !RSS */
3213 
3214 static int
3215 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3216 {
3217 	struct hn_softc *sc = arg1;
3218 	char hash_str[128];
3219 	uint32_t hash;
3220 
3221 	HN_LOCK(sc);
3222 	hash = sc->hn_rss_hash;
3223 	HN_UNLOCK(sc);
3224 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3225 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3226 }
3227 
3228 static int
3229 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3230 {
3231 	struct hn_softc *sc = arg1;
3232 	char vf_name[128];
3233 	struct ifnet *vf;
3234 
3235 	HN_LOCK(sc);
3236 	vf_name[0] = '\0';
3237 	vf = sc->hn_rx_ring[0].hn_vf;
3238 	if (vf != NULL)
3239 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3240 	HN_UNLOCK(sc);
3241 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3242 }
3243 
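/*
 * Validate the IPv4 packet starting at byte offset 'hoff' within the
 * mbuf: the full IP header must reside in the first mbuf, fragments
 * are rejected, and for TCP/UDP the entire protocol header must be
 * present.  Returns the IP protocol number on success, or
 * IPPROTO_DONE if the packet should not be considered for LRO or
 * checksum trust.
 */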
3244 static int
3245 hn_check_iplen(const struct mbuf *m, int hoff)
3246 {
3247 	const struct ip *ip;
3248 	int len, iphlen, iplen;
3249 	const struct tcphdr *th;
3250 	int thoff;				/* TCP data offset */
3251 
3252 	len = hoff + sizeof(struct ip);
3253 
3254 	/* The packet must be at least the size of an IP header. */
3255 	if (m->m_pkthdr.len < len)
3256 		return IPPROTO_DONE;
3257 
3258 	/* The fixed IP header must reside completely in the first mbuf. */
3259 	if (m->m_len < len)
3260 		return IPPROTO_DONE;
3261 
3262 	ip = mtodo(m, hoff);
3263 
3264 	/* Bound check the packet's stated IP header length. */
3265 	iphlen = ip->ip_hl << 2;
3266 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3267 		return IPPROTO_DONE;
3268 
3269 	/* The full IP header must reside completely in the one mbuf. */
3270 	if (m->m_len < hoff + iphlen)
3271 		return IPPROTO_DONE;
3272 
3273 	iplen = ntohs(ip->ip_len);
3274 
3275 	/*
3276 	 * Check that the amount of data in the buffers is at
3277 	 * least as much as the IP header would have us expect.
3278 	 */
3279 	if (m->m_pkthdr.len < hoff + iplen)
3280 		return IPPROTO_DONE;
3281 
3282 	/*
3283 	 * Ignore IP fragments.
3284 	 */
3285 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3286 		return IPPROTO_DONE;
3287 
3288 	/*
3289 	 * The TCP/IP or UDP/IP header must be entirely contained within
3290 	 * the first fragment of a packet.
3291 	 */
3292 	switch (ip->ip_p) {
3293 	case IPPROTO_TCP:
3294 		if (iplen < iphlen + sizeof(struct tcphdr))
3295 			return IPPROTO_DONE;
3296 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3297 			return IPPROTO_DONE;
3298 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3299 		thoff = th->th_off << 2;
3300 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3301 			return IPPROTO_DONE;
3302 		if (m->m_len < hoff + iphlen + thoff)
3303 			return IPPROTO_DONE;
3304 		break;
3305 	case IPPROTO_UDP:
3306 		if (iplen < iphlen + sizeof(struct udphdr))
3307 			return IPPROTO_DONE;
3308 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3309 			return IPPROTO_DONE;
3310 		break;
3311 	default:
3312 		if (iplen < iphlen)
3313 			return IPPROTO_DONE;
3314 		break;
3315 	}
3316 	return ip->ip_p;
3317 }
3318 
3319 static int
3320 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3321 {
3322 	struct sysctl_oid_list *child;
3323 	struct sysctl_ctx_list *ctx;
3324 	device_t dev = sc->hn_dev;
3325 #if defined(INET) || defined(INET6)
3326 #if __FreeBSD_version >= 1100095
3327 	int lroent_cnt;
3328 #endif
3329 #endif
3330 	int i;
3331 
3332 	/*
3333 	 * Create RXBUF for reception.
3334 	 *
3335 	 * NOTE:
3336 	 * - It is shared by all channels.
3337  * - A large enough buffer is allocated; certain versions of NVS
3338  *   may further limit the usable space.
3339 	 */
3340 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3341 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3342 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3343 	if (sc->hn_rxbuf == NULL) {
3344 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3345 		return (ENOMEM);
3346 	}
3347 
3348 	sc->hn_rx_ring_cnt = ring_cnt;
3349 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3350 
3351 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3352 	    M_DEVBUF, M_WAITOK | M_ZERO);
3353 
3354 #if defined(INET) || defined(INET6)
3355 #if __FreeBSD_version >= 1100095
3356 	lroent_cnt = hn_lro_entry_count;
3357 	if (lroent_cnt < TCP_LRO_ENTRIES)
3358 		lroent_cnt = TCP_LRO_ENTRIES;
3359 	if (bootverbose)
3360 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3361 #endif
3362 #endif	/* INET || INET6 */
3363 
3364 	ctx = device_get_sysctl_ctx(dev);
3365 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3366 
3367 	/* Create dev.hn.UNIT.rx sysctl tree */
3368 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3369 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3370 
3371 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3372 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3373 
3374 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3375 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3376 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3377 		if (rxr->hn_br == NULL) {
3378 			device_printf(dev, "allocate bufring failed\n");
3379 			return (ENOMEM);
3380 		}
3381 
3382 		if (hn_trust_hosttcp)
3383 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3384 		if (hn_trust_hostudp)
3385 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3386 		if (hn_trust_hostip)
3387 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3388 		rxr->hn_ifp = sc->hn_ifp;
3389 		if (i < sc->hn_tx_ring_cnt)
3390 			rxr->hn_txr = &sc->hn_tx_ring[i];
3391 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3392 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3393 		rxr->hn_rx_idx = i;
3394 		rxr->hn_rxbuf = sc->hn_rxbuf;
3395 
3396 		/*
3397 		 * Initialize LRO.
3398 		 */
3399 #if defined(INET) || defined(INET6)
3400 #if __FreeBSD_version >= 1100095
3401 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3402 		    hn_lro_mbufq_depth);
3403 #else
3404 		tcp_lro_init(&rxr->hn_lro);
3405 		rxr->hn_lro.ifp = sc->hn_ifp;
3406 #endif
3407 #if __FreeBSD_version >= 1100099
3408 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3409 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3410 #endif
3411 #endif	/* INET || INET6 */
3412 
3413 		if (sc->hn_rx_sysctl_tree != NULL) {
3414 			char name[16];
3415 
3416 			/*
3417 			 * Create per RX ring sysctl tree:
3418 			 * dev.hn.UNIT.rx.RINGID
3419 			 */
3420 			snprintf(name, sizeof(name), "%d", i);
3421 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3422 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3423 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3424 
3425 			if (rxr->hn_rx_sysctl_tree != NULL) {
3426 				SYSCTL_ADD_ULONG(ctx,
3427 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3428 				    OID_AUTO, "packets", CTLFLAG_RW,
3429 				    &rxr->hn_pkts, "# of packets received");
3430 				SYSCTL_ADD_ULONG(ctx,
3431 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3432 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3433 				    &rxr->hn_rss_pkts,
3434 				    "# of packets w/ RSS info received");
3435 				SYSCTL_ADD_INT(ctx,
3436 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3437 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3438 				    &rxr->hn_pktbuf_len, 0,
3439 				    "Temporary channel packet buffer length");
3440 			}
3441 		}
3442 	}
3443 
3444 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3445 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3446 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3447 #if __FreeBSD_version < 1100095
3448 	    hn_rx_stat_int_sysctl,
3449 #else
3450 	    hn_rx_stat_u64_sysctl,
3451 #endif
3452 	    "LU", "LRO queued");
3453 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3454 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3455 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3456 #if __FreeBSD_version < 1100095
3457 	    hn_rx_stat_int_sysctl,
3458 #else
3459 	    hn_rx_stat_u64_sysctl,
3460 #endif
3461 	    "LU", "LRO flushed");
3462 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3463 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3464 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3465 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3466 #if __FreeBSD_version >= 1100099
3467 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3468 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3469 	    hn_lro_lenlim_sysctl, "IU",
3470 	    "Max # of data bytes to be aggregated by LRO");
3471 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3472 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3473 	    hn_lro_ackcnt_sysctl, "I",
3474 	    "Max # of ACKs to be aggregated by LRO");
3475 #endif
3476 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3477 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3478 	    hn_trust_hcsum_sysctl, "I",
3479 	    "Trust tcp segment verification on host side, "
3480 	    "when csum info is missing");
3481 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3482 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3483 	    hn_trust_hcsum_sysctl, "I",
3484 	    "Trust udp datagram verification on host side, "
3485 	    "when csum info is missing");
3486 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3487 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3488 	    hn_trust_hcsum_sysctl, "I",
3489 	    "Trust ip packet verification on host side, "
3490 	    "when csum info is missing");
3491 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3492 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3493 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3494 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3495 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3496 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3497 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3498 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3499 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3500 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3501 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3502 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3503 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3504 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3505 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3506 	    hn_rx_stat_ulong_sysctl, "LU",
3507 	    "# of packets that we trust host's csum verification");
3508 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3509 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3510 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3511 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3512 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3513 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3514 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3515 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3516 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3517 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3518 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3519 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3520 
3521 	return (0);
3522 }
3523 
3524 static void
3525 hn_destroy_rx_data(struct hn_softc *sc)
3526 {
3527 	int i;
3528 
3529 	if (sc->hn_rxbuf != NULL) {
3530 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3531 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3532 		else
3533 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3534 		sc->hn_rxbuf = NULL;
3535 	}
3536 
3537 	if (sc->hn_rx_ring_cnt == 0)
3538 		return;
3539 
3540 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3541 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3542 
3543 		if (rxr->hn_br == NULL)
3544 			continue;
3545 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3546 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3547 		} else {
3548 			device_printf(sc->hn_dev,
3549 			    "%dth channel bufring is referenced\n", i);
3550 		}
3551 		rxr->hn_br = NULL;
3552 
3553 #if defined(INET) || defined(INET6)
3554 		tcp_lro_free(&rxr->hn_lro);
3555 #endif
3556 		free(rxr->hn_pktbuf, M_DEVBUF);
3557 	}
3558 	free(sc->hn_rx_ring, M_DEVBUF);
3559 	sc->hn_rx_ring = NULL;
3560 
3561 	sc->hn_rx_ring_cnt = 0;
3562 	sc->hn_rx_ring_inuse = 0;
3563 }
3564 
3565 static int
3566 hn_tx_ring_create(struct hn_softc *sc, int id)
3567 {
3568 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3569 	device_t dev = sc->hn_dev;
3570 	bus_dma_tag_t parent_dtag;
3571 	int error, i;
3572 
3573 	txr->hn_sc = sc;
3574 	txr->hn_tx_idx = id;
3575 
3576 #ifndef HN_USE_TXDESC_BUFRING
3577 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3578 #endif
3579 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3580 
3581 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3582 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3583 	    M_DEVBUF, M_WAITOK | M_ZERO);
3584 #ifndef HN_USE_TXDESC_BUFRING
3585 	SLIST_INIT(&txr->hn_txlist);
3586 #else
3587 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3588 	    M_WAITOK, &txr->hn_tx_lock);
3589 #endif
3590 
3591 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3592 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3593 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3594 	} else {
3595 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3596 	}
3597 
3598 #ifdef HN_IFSTART_SUPPORT
3599 	if (hn_use_if_start) {
3600 		txr->hn_txeof = hn_start_txeof;
3601 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3602 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3603 	} else
3604 #endif
3605 	{
3606 		int br_depth;
3607 
3608 		txr->hn_txeof = hn_xmit_txeof;
3609 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3610 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3611 
3612 		br_depth = hn_get_txswq_depth(txr);
3613 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3614 		    M_WAITOK, &txr->hn_tx_lock);
3615 	}
3616 
3617 	txr->hn_direct_tx_size = hn_direct_tx_size;
3618 
3619 	/*
3620 	 * Always schedule transmission instead of trying to do direct
3621 	 * transmission.  This one gives the best performance so far.
3622 	 */
3623 	txr->hn_sched_tx = 1;
3624 
3625 	parent_dtag = bus_get_dma_tag(dev);
3626 
3627 	/* DMA tag for RNDIS packet messages. */
3628 	error = bus_dma_tag_create(parent_dtag, /* parent */
3629 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3630 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3631 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3632 	    BUS_SPACE_MAXADDR,		/* highaddr */
3633 	    NULL, NULL,			/* filter, filterarg */
3634 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3635 	    1,				/* nsegments */
3636 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3637 	    0,				/* flags */
3638 	    NULL,			/* lockfunc */
3639 	    NULL,			/* lockfuncarg */
3640 	    &txr->hn_tx_rndis_dtag);
3641 	if (error) {
3642 		device_printf(dev, "failed to create rndis dmatag\n");
3643 		return error;
3644 	}
3645 
3646 	/* DMA tag for data. */
3647 	error = bus_dma_tag_create(parent_dtag, /* parent */
3648 	    1,				/* alignment */
3649 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3650 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3651 	    BUS_SPACE_MAXADDR,		/* highaddr */
3652 	    NULL, NULL,			/* filter, filterarg */
3653 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3654 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3655 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3656 	    0,				/* flags */
3657 	    NULL,			/* lockfunc */
3658 	    NULL,			/* lockfuncarg */
3659 	    &txr->hn_tx_data_dtag);
3660 	if (error) {
3661 		device_printf(dev, "failed to create data dmatag\n");
3662 		return error;
3663 	}
3664 
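	/*
	 * Set up the per-txdesc resources: a preallocated RNDIS packet
	 * message (plus its DMA map) and a DMA map for the mbuf data.
	 */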
3665 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3666 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3667 
3668 		txd->txr = txr;
3669 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3670 		STAILQ_INIT(&txd->agg_list);
3671 
3672 		/*
3673 		 * Allocate and load RNDIS packet message.
3674 		 */
3675 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3676 		    (void **)&txd->rndis_pkt,
3677 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3678 		    &txd->rndis_pkt_dmap);
3679 		if (error) {
3680 			device_printf(dev,
3681 			    "failed to allocate rndis_packet_msg, %d\n", i);
3682 			return error;
3683 		}
3684 
3685 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3686 		    txd->rndis_pkt_dmap,
3687 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3688 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3689 		    BUS_DMA_NOWAIT);
3690 		if (error) {
3691 			device_printf(dev,
3692 			    "failed to load rndis_packet_msg, %d\n", i);
3693 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3694 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3695 			return error;
3696 		}
3697 
3698 		/* DMA map for TX data. */
3699 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3700 		    &txd->data_dmap);
3701 		if (error) {
3702 			device_printf(dev,
3703 			    "failed to allocate tx data dmamap\n");
3704 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3705 			    txd->rndis_pkt_dmap);
3706 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3707 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3708 			return error;
3709 		}
3710 
3711 		/* All set; put it on the list. */
3712 		txd->flags |= HN_TXD_FLAG_ONLIST;
3713 #ifndef HN_USE_TXDESC_BUFRING
3714 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3715 #else
3716 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3717 #endif
3718 	}
3719 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3720 
3721 	if (sc->hn_tx_sysctl_tree != NULL) {
3722 		struct sysctl_oid_list *child;
3723 		struct sysctl_ctx_list *ctx;
3724 		char name[16];
3725 
3726 		/*
3727 		 * Create per TX ring sysctl tree:
3728 		 * dev.hn.UNIT.tx.RINGID
3729 		 */
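		/* E.g. dev.hn.0.tx.1 for the second TX ring of hn0. */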
3730 		ctx = device_get_sysctl_ctx(dev);
3731 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3732 
3733 		snprintf(name, sizeof(name), "%d", id);
3734 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3735 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3736 
3737 		if (txr->hn_tx_sysctl_tree != NULL) {
3738 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3739 
3740 #ifdef HN_DEBUG
3741 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3742 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3743 			    "# of available TX descs");
3744 #endif
3745 #ifdef HN_IFSTART_SUPPORT
3746 			if (!hn_use_if_start)
3747 #endif
3748 			{
3749 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3750 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3751 				    "over-active");
3752 			}
3753 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3754 			    CTLFLAG_RW, &txr->hn_pkts,
3755 			    "# of packets transmitted");
3756 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3757 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3758 		}
3759 	}
3760 
3761 	return 0;
3762 }
3763 
3764 static void
3765 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3766 {
3767 	struct hn_tx_ring *txr = txd->txr;
3768 
3769 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3770 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3771 
3772 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3773 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3774 	    txd->rndis_pkt_dmap);
3775 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3776 }
3777 
3778 static void
3779 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3780 {
3781 
3782 	KASSERT(txd->refs == 0 || txd->refs == 1,
3783 	    ("invalid txd refs %d", txd->refs));
3784 
3785 	/* Aggregated txds will be freed by their aggregating txd. */
3786 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3787 		int freed;
3788 
3789 		freed = hn_txdesc_put(txr, txd);
3790 		KASSERT(freed, ("can't free txdesc"));
3791 	}
3792 }
3793 
3794 static void
3795 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3796 {
3797 	int i;
3798 
3799 	if (txr->hn_txdesc == NULL)
3800 		return;
3801 
3802 	/*
3803 	 * NOTE:
3804 	 * Because the freeing of aggregated txds will be deferred
3805 	 * to the aggregating txd, two passes are used here:
3806 	 * - The first pass GCes any pending txds.  This GC is necessary,
3807 	 *   since if the channels are revoked, the hypervisor will not
3808 	 *   deliver send-done for all pending txds.
3809 	 * - The second pass frees the busdma resources, i.e. after all
3810 	 *   txds have been freed.
3811 	 */
3812 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3813 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3814 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3815 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3816 
3817 	if (txr->hn_tx_data_dtag != NULL)
3818 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3819 	if (txr->hn_tx_rndis_dtag != NULL)
3820 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3821 
3822 #ifdef HN_USE_TXDESC_BUFRING
3823 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3824 #endif
3825 
3826 	free(txr->hn_txdesc, M_DEVBUF);
3827 	txr->hn_txdesc = NULL;
3828 
3829 	if (txr->hn_mbuf_br != NULL)
3830 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3831 
3832 #ifndef HN_USE_TXDESC_BUFRING
3833 	mtx_destroy(&txr->hn_txlist_spin);
3834 #endif
3835 	mtx_destroy(&txr->hn_tx_lock);
3836 }
3837 
3838 static int
3839 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3840 {
3841 	struct sysctl_oid_list *child;
3842 	struct sysctl_ctx_list *ctx;
3843 	int i;
3844 
3845 	/*
3846 	 * Create TXBUF for chimney sending.
3847 	 *
3848 	 * NOTE: It is shared by all channels.
3849 	 */
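	/*
	 * "Chimney" sending copies small packets into this preallocated,
	 * host-visible buffer instead of setting up scatter/gather DMA
	 * for each packet.
	 */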
3850 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3851 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3852 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3853 	if (sc->hn_chim == NULL) {
3854 		device_printf(sc->hn_dev, "failed to allocate txbuf\n");
3855 		return (ENOMEM);
3856 	}
3857 
3858 	sc->hn_tx_ring_cnt = ring_cnt;
3859 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3860 
3861 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3862 	    M_DEVBUF, M_WAITOK | M_ZERO);
3863 
3864 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3865 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3866 
3867 	/* Create dev.hn.UNIT.tx sysctl tree */
3868 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3869 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3870 
3871 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3872 		int error;
3873 
3874 		error = hn_tx_ring_create(sc, i);
3875 		if (error)
3876 			return error;
3877 	}
3878 
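	/*
	 * The following sysctls report per-ring counters, taken at the
	 * given struct hn_tx_ring offset and aggregated over all TX rings.
	 */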
3879 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3880 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3881 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3882 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3883 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3884 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3885 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3886 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3887 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3888 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3889 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3890 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3891 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3892 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3893 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3894 	    hn_tx_stat_ulong_sysctl, "LU",
3895 	    "# of packet transmission aggregation flush failures");
3896 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3897 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3898 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3899 	    hn_tx_stat_ulong_sysctl, "LU", "# of collapsed TX mbufs");
3900 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3901 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3902 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3903 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
3904 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3905 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3906 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3907 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3908 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3909 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3910 	    "# of total TX descs");
3911 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3912 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3913 	    "Chimney send packet size upper boundary");
3914 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3915 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3916 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3917 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3918 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3919 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3920 	    hn_tx_conf_int_sysctl, "I",
3921 	    "Size of the packet for direct transmission");
3922 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3923 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3924 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3925 	    hn_tx_conf_int_sysctl, "I",
3926 	    "Always schedule transmission "
3927 	    "instead of doing direct transmission");
3928 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3929 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# of created TX rings");
3930 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3931 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# of TX rings in use");
3932 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3933 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3934 	    "Applied packet transmission aggregation size");
3935 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3936 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3937 	    hn_txagg_pktmax_sysctl, "I",
3938 	    "Applied packet transmission aggregation packets");
3939 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3940 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3941 	    hn_txagg_align_sysctl, "I",
3942 	    "Applied packet transmission aggregation alignment");
3943 
3944 	return 0;
3945 }
3946 
3947 static void
3948 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3949 {
3950 	int i;
3951 
3952 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3953 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3954 }
3955 
3956 static void
3957 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3958 {
3959 	struct ifnet *ifp = sc->hn_ifp;
3960 	int tso_minlen;
3961 
3962 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3963 		return;
3964 
3965 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3966 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3967 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3968 
3969 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3970 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3971 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3972 
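	/*
	 * Clamp tso_maxlen into [tso_minlen, min(IP_MAXPACKET,
	 * hn_ndis_tso_szmax)] before advertising it via if_hw_tsomax.
	 */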
3973 	if (tso_maxlen < tso_minlen)
3974 		tso_maxlen = tso_minlen;
3975 	else if (tso_maxlen > IP_MAXPACKET)
3976 		tso_maxlen = IP_MAXPACKET;
3977 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3978 		tso_maxlen = sc->hn_ndis_tso_szmax;
3979 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3980 	if (bootverbose)
3981 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3982 }
3983 
3984 static void
3985 hn_fixup_tx_data(struct hn_softc *sc)
3986 {
3987 	uint64_t csum_assist;
3988 	int i;
3989 
3990 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3991 	if (hn_tx_chimney_size > 0 &&
3992 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3993 		hn_set_chim_size(sc, hn_tx_chimney_size);
3994 
3995 	csum_assist = 0;
3996 	if (sc->hn_caps & HN_CAP_IPCS)
3997 		csum_assist |= CSUM_IP;
3998 	if (sc->hn_caps & HN_CAP_TCP4CS)
3999 		csum_assist |= CSUM_IP_TCP;
4000 	if (sc->hn_caps & HN_CAP_UDP4CS)
4001 		csum_assist |= CSUM_IP_UDP;
4002 	if (sc->hn_caps & HN_CAP_TCP6CS)
4003 		csum_assist |= CSUM_IP6_TCP;
4004 	if (sc->hn_caps & HN_CAP_UDP6CS)
4005 		csum_assist |= CSUM_IP6_UDP;
4006 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4007 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4008 
4009 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4010 		/*
4011 		 * Support HASHVAL pktinfo on TX path.
4012 		 */
4013 		if (bootverbose)
4014 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4015 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4016 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4017 	}
4018 }
4019 
4020 static void
4021 hn_destroy_tx_data(struct hn_softc *sc)
4022 {
4023 	int i;
4024 
4025 	if (sc->hn_chim != NULL) {
4026 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4027 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4028 		} else {
4029 			device_printf(sc->hn_dev,
4030 			    "chimney sending buffer is referenced");
4031 		}
4032 		sc->hn_chim = NULL;
4033 	}
4034 
4035 	if (sc->hn_tx_ring_cnt == 0)
4036 		return;
4037 
4038 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4039 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4040 
4041 	free(sc->hn_tx_ring, M_DEVBUF);
4042 	sc->hn_tx_ring = NULL;
4043 
4044 	sc->hn_tx_ring_cnt = 0;
4045 	sc->hn_tx_ring_inuse = 0;
4046 }
4047 
4048 #ifdef HN_IFSTART_SUPPORT
4049 
4050 static void
4051 hn_start_taskfunc(void *xtxr, int pending __unused)
4052 {
4053 	struct hn_tx_ring *txr = xtxr;
4054 
4055 	mtx_lock(&txr->hn_tx_lock);
4056 	hn_start_locked(txr, 0);
4057 	mtx_unlock(&txr->hn_tx_lock);
4058 }
4059 
4060 static int
4061 hn_start_locked(struct hn_tx_ring *txr, int len)
4062 {
4063 	struct hn_softc *sc = txr->hn_sc;
4064 	struct ifnet *ifp = sc->hn_ifp;
4065 	int sched = 0;
4066 
4067 	KASSERT(hn_use_if_start,
4068 	    ("hn_start_locked is called when if_start is disabled"));
4069 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4070 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4071 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4072 
4073 	if (__predict_false(txr->hn_suspended))
4074 		return (0);
4075 
4076 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4077 	    IFF_DRV_RUNNING)
4078 		return (0);
4079 
4080 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4081 		struct hn_txdesc *txd;
4082 		struct mbuf *m_head;
4083 		int error;
4084 
4085 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4086 		if (m_head == NULL)
4087 			break;
4088 
4089 		if (len > 0 && m_head->m_pkthdr.len > len) {
4090 			/*
4091 			 * This send could be time-consuming; let callers
4092 			 * dispatch this packet (and any following packets)
4093 			 * to the TX taskqueue.
4094 			 */
4095 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4096 			sched = 1;
4097 			break;
4098 		}
4099 
4100 #if defined(INET6) || defined(INET)
4101 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4102 			m_head = hn_tso_fixup(m_head);
4103 			if (__predict_false(m_head == NULL)) {
4104 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4105 				continue;
4106 			}
4107 		}
4108 #endif
4109 
4110 		txd = hn_txdesc_get(txr);
4111 		if (txd == NULL) {
4112 			txr->hn_no_txdescs++;
4113 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4114 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4115 			break;
4116 		}
4117 
4118 		error = hn_encap(ifp, txr, txd, &m_head);
4119 		if (error) {
4120 			/* Both txd and m_head are freed */
4121 			KASSERT(txr->hn_agg_txd == NULL,
4122 			    ("encap failed w/ pending aggregating txdesc"));
4123 			continue;
4124 		}
4125 
4126 		if (txr->hn_agg_pktleft == 0) {
4127 			if (txr->hn_agg_txd != NULL) {
4128 				KASSERT(m_head == NULL,
4129 				    ("pending mbuf for aggregating txdesc"));
4130 				error = hn_flush_txagg(ifp, txr);
4131 				if (__predict_false(error)) {
4132 					atomic_set_int(&ifp->if_drv_flags,
4133 					    IFF_DRV_OACTIVE);
4134 					break;
4135 				}
4136 			} else {
4137 				KASSERT(m_head != NULL, ("mbuf was freed"));
4138 				error = hn_txpkt(ifp, txr, txd);
4139 				if (__predict_false(error)) {
4140 					/* txd is freed, but m_head is not */
4141 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4142 					atomic_set_int(&ifp->if_drv_flags,
4143 					    IFF_DRV_OACTIVE);
4144 					break;
4145 				}
4146 			}
4147 		}
4148 #ifdef INVARIANTS
4149 		else {
4150 			KASSERT(txr->hn_agg_txd != NULL,
4151 			    ("no aggregating txdesc"));
4152 			KASSERT(m_head == NULL,
4153 			    ("pending mbuf for aggregating txdesc"));
4154 		}
4155 #endif
4156 	}
4157 
4158 	/* Flush pending aggregated transmission. */
4159 	if (txr->hn_agg_txd != NULL)
4160 		hn_flush_txagg(ifp, txr);
4161 	return (sched);
4162 }
4163 
4164 static void
4165 hn_start(struct ifnet *ifp)
4166 {
4167 	struct hn_softc *sc = ifp->if_softc;
4168 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4169 
4170 	if (txr->hn_sched_tx)
4171 		goto do_sched;
4172 
4173 	if (mtx_trylock(&txr->hn_tx_lock)) {
4174 		int sched;
4175 
4176 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4177 		mtx_unlock(&txr->hn_tx_lock);
4178 		if (!sched)
4179 			return;
4180 	}
4181 do_sched:
4182 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4183 }
4184 
4185 static void
4186 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4187 {
4188 	struct hn_tx_ring *txr = xtxr;
4189 
4190 	mtx_lock(&txr->hn_tx_lock);
4191 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4192 	hn_start_locked(txr, 0);
4193 	mtx_unlock(&txr->hn_tx_lock);
4194 }
4195 
4196 static void
4197 hn_start_txeof(struct hn_tx_ring *txr)
4198 {
4199 	struct hn_softc *sc = txr->hn_sc;
4200 	struct ifnet *ifp = sc->hn_ifp;
4201 
4202 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4203 
4204 	if (txr->hn_sched_tx)
4205 		goto do_sched;
4206 
4207 	if (mtx_trylock(&txr->hn_tx_lock)) {
4208 		int sched;
4209 
4210 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4211 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4212 		mtx_unlock(&txr->hn_tx_lock);
4213 		if (sched) {
4214 			taskqueue_enqueue(txr->hn_tx_taskq,
4215 			    &txr->hn_tx_task);
4216 		}
4217 	} else {
4218 do_sched:
4219 		/*
4220 		 * Release OACTIVE early, in the hope that others can
4221 		 * catch up.  The task will clear the flag again with
4222 		 * the hn_tx_lock held to avoid possible
4223 		 * races.
4224 		 */
4225 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4226 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4227 	}
4228 }
4229 
4230 #endif	/* HN_IFSTART_SUPPORT */
4231 
4232 static int
4233 hn_xmit(struct hn_tx_ring *txr, int len)
4234 {
4235 	struct hn_softc *sc = txr->hn_sc;
4236 	struct ifnet *ifp = sc->hn_ifp;
4237 	struct mbuf *m_head;
4238 	int sched = 0;
4239 
4240 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4241 #ifdef HN_IFSTART_SUPPORT
4242 	KASSERT(hn_use_if_start == 0,
4243 	    ("hn_xmit is called when if_start is enabled"));
4244 #endif
4245 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4246 
4247 	if (__predict_false(txr->hn_suspended))
4248 		return (0);
4249 
4250 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4251 		return (0);
4252 
4253 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4254 		struct hn_txdesc *txd;
4255 		int error;
4256 
4257 		if (len > 0 && m_head->m_pkthdr.len > len) {
4258 			/*
4259 			 * This send could be time-consuming; let callers
4260 			 * dispatch this packet (and any following packets)
4261 			 * to the TX taskqueue.
4262 			 */
4263 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4264 			sched = 1;
4265 			break;
4266 		}
4267 
4268 		txd = hn_txdesc_get(txr);
4269 		if (txd == NULL) {
4270 			txr->hn_no_txdescs++;
4271 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4272 			txr->hn_oactive = 1;
4273 			break;
4274 		}
4275 
4276 		error = hn_encap(ifp, txr, txd, &m_head);
4277 		if (error) {
4278 			/* Both txd and m_head are freed; discard */
4279 			KASSERT(txr->hn_agg_txd == NULL,
4280 			    ("encap failed w/ pending aggregating txdesc"));
4281 			drbr_advance(ifp, txr->hn_mbuf_br);
4282 			continue;
4283 		}
4284 
4285 		if (txr->hn_agg_pktleft == 0) {
4286 			if (txr->hn_agg_txd != NULL) {
4287 				KASSERT(m_head == NULL,
4288 				    ("pending mbuf for aggregating txdesc"));
4289 				error = hn_flush_txagg(ifp, txr);
4290 				if (__predict_false(error)) {
4291 					txr->hn_oactive = 1;
4292 					break;
4293 				}
4294 			} else {
4295 				KASSERT(m_head != NULL, ("mbuf was freed"));
4296 				error = hn_txpkt(ifp, txr, txd);
4297 				if (__predict_false(error)) {
4298 					/* txd is freed, but m_head is not */
4299 					drbr_putback(ifp, txr->hn_mbuf_br,
4300 					    m_head);
4301 					txr->hn_oactive = 1;
4302 					break;
4303 				}
4304 			}
4305 		}
4306 #ifdef INVARIANTS
4307 		else {
4308 			KASSERT(txr->hn_agg_txd != NULL,
4309 			    ("no aggregating txdesc"));
4310 			KASSERT(m_head == NULL,
4311 			    ("pending mbuf for aggregating txdesc"));
4312 		}
4313 #endif
4314 
4315 		/* Sent */
4316 		drbr_advance(ifp, txr->hn_mbuf_br);
4317 	}
4318 
4319 	/* Flush pending aggregated transmission. */
4320 	if (txr->hn_agg_txd != NULL)
4321 		hn_flush_txagg(ifp, txr);
4322 	return (sched);
4323 }
4324 
4325 static int
4326 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4327 {
4328 	struct hn_softc *sc = ifp->if_softc;
4329 	struct hn_tx_ring *txr;
4330 	int error, idx = 0;
4331 
4332 #if defined(INET6) || defined(INET)
4333 	/*
4334 	 * Perform TSO packet header fixup now, since the TSO
4335 	 * packet header should be cache-hot.
4336 	 */
4337 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4338 		m = hn_tso_fixup(m);
4339 		if (__predict_false(m == NULL)) {
4340 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4341 			return EIO;
4342 		}
4343 	}
4344 #endif
4345 
4346 	/*
4347 	 * Select the TX ring based on flowid
4348 	 */
4349 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4350 #ifdef RSS
4351 		uint32_t bid;
4352 
4353 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4354 		    &bid) == 0)
4355 			idx = bid % sc->hn_tx_ring_inuse;
4356 		else
4357 #endif
4358 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4359 	}
4360 	txr = &sc->hn_tx_ring[idx];
4361 
4362 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4363 	if (error) {
4364 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4365 		return error;
4366 	}
4367 
4368 	if (txr->hn_oactive)
4369 		return 0;
4370 
4371 	if (txr->hn_sched_tx)
4372 		goto do_sched;
4373 
4374 	if (mtx_trylock(&txr->hn_tx_lock)) {
4375 		int sched;
4376 
4377 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4378 		mtx_unlock(&txr->hn_tx_lock);
4379 		if (!sched)
4380 			return 0;
4381 	}
4382 do_sched:
4383 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4384 	return 0;
4385 }
4386 
4387 static void
4388 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4389 {
4390 	struct mbuf *m;
4391 
4392 	mtx_lock(&txr->hn_tx_lock);
4393 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4394 		m_freem(m);
4395 	mtx_unlock(&txr->hn_tx_lock);
4396 }
4397 
4398 static void
4399 hn_xmit_qflush(struct ifnet *ifp)
4400 {
4401 	struct hn_softc *sc = ifp->if_softc;
4402 	int i;
4403 
4404 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4405 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4406 	if_qflush(ifp);
4407 }
4408 
4409 static void
4410 hn_xmit_txeof(struct hn_tx_ring *txr)
4411 {
4412 
4413 	if (txr->hn_sched_tx)
4414 		goto do_sched;
4415 
4416 	if (mtx_trylock(&txr->hn_tx_lock)) {
4417 		int sched;
4418 
4419 		txr->hn_oactive = 0;
4420 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4421 		mtx_unlock(&txr->hn_tx_lock);
4422 		if (sched) {
4423 			taskqueue_enqueue(txr->hn_tx_taskq,
4424 			    &txr->hn_tx_task);
4425 		}
4426 	} else {
4427 do_sched:
4428 		/*
4429 		 * Release the oactive early, in the hope that others
4430 		 * can catch up.  The task will clear the
4431 		 * oactive again with the hn_tx_lock held to avoid possible
4432 		 * races.
4433 		 */
4434 		txr->hn_oactive = 0;
4435 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4436 	}
4437 }
4438 
4439 static void
4440 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4441 {
4442 	struct hn_tx_ring *txr = xtxr;
4443 
4444 	mtx_lock(&txr->hn_tx_lock);
4445 	hn_xmit(txr, 0);
4446 	mtx_unlock(&txr->hn_tx_lock);
4447 }
4448 
4449 static void
4450 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4451 {
4452 	struct hn_tx_ring *txr = xtxr;
4453 
4454 	mtx_lock(&txr->hn_tx_lock);
4455 	txr->hn_oactive = 0;
4456 	hn_xmit(txr, 0);
4457 	mtx_unlock(&txr->hn_tx_lock);
4458 }
4459 
4460 static int
4461 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4462 {
4463 	struct vmbus_chan_br cbr;
4464 	struct hn_rx_ring *rxr;
4465 	struct hn_tx_ring *txr = NULL;
4466 	int idx, error;
4467 
4468 	idx = vmbus_chan_subidx(chan);
4469 
4470 	/*
4471 	 * Link this channel to RX/TX ring.
4472 	 */
4473 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4474 	    ("invalid channel index %d, should be >= 0 && < %d",
4475 	     idx, sc->hn_rx_ring_inuse));
4476 	rxr = &sc->hn_rx_ring[idx];
4477 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4478 	    ("RX ring %d already attached", idx));
4479 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4480 	rxr->hn_chan = chan;
4481 
4482 	if (bootverbose) {
4483 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4484 		    idx, vmbus_chan_id(chan));
4485 	}
4486 
4487 	if (idx < sc->hn_tx_ring_inuse) {
4488 		txr = &sc->hn_tx_ring[idx];
4489 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4490 		    ("TX ring %d already attached", idx));
4491 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4492 
4493 		txr->hn_chan = chan;
4494 		if (bootverbose) {
4495 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4496 			    idx, vmbus_chan_id(chan));
4497 		}
4498 	}
4499 
4500 	/* Bind this channel to a proper CPU. */
4501 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4502 
4503 	/*
4504 	 * Open this channel
4505 	 */
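	/* hn_br backs both bufring halves: TX half first, then RX half. */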
4506 	cbr.cbr = rxr->hn_br;
4507 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4508 	cbr.cbr_txsz = HN_TXBR_SIZE;
4509 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4510 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4511 	if (error) {
4512 		if (error == EISCONN) {
4513 			if_printf(sc->hn_ifp, "bufring is connected after "
4514 			    "chan%u open failure\n", vmbus_chan_id(chan));
4515 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4516 		} else {
4517 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4518 			    vmbus_chan_id(chan), error);
4519 		}
4520 	}
4521 	return (error);
4522 }
4523 
4524 static void
4525 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4526 {
4527 	struct hn_rx_ring *rxr;
4528 	int idx, error;
4529 
4530 	idx = vmbus_chan_subidx(chan);
4531 
4532 	/*
4533 	 * Unlink this channel from the RX/TX ring.
4534 	 */
4535 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4536 	    ("invalid channel index %d, should be >= 0 && < %d",
4537 	     idx, sc->hn_rx_ring_inuse));
4538 	rxr = &sc->hn_rx_ring[idx];
4539 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4540 	    ("RX ring %d is not attached", idx));
4541 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4542 
4543 	if (idx < sc->hn_tx_ring_inuse) {
4544 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4545 
4546 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4547 		    ("TX ring %d is not attached", idx));
4548 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4549 	}
4550 
4551 	/*
4552 	 * Close this channel.
4553 	 *
4554 	 * NOTE:
4555 	 * Channel closing does _not_ destroy the target channel.
4556 	 */
4557 	error = vmbus_chan_close_direct(chan);
4558 	if (error == EISCONN) {
4559 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4560 		    "after being closed\n", vmbus_chan_id(chan));
4561 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4562 	} else if (error) {
4563 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4564 		    vmbus_chan_id(chan), error);
4565 	}
4566 }
4567 
4568 static int
4569 hn_attach_subchans(struct hn_softc *sc)
4570 {
4571 	struct vmbus_channel **subchans;
4572 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4573 	int i, error = 0;
4574 
4575 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4576 
4577 	/* Attach the sub-channels. */
4578 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4579 	for (i = 0; i < subchan_cnt; ++i) {
4580 		int error1;
4581 
4582 		error1 = hn_chan_attach(sc, subchans[i]);
4583 		if (error1) {
4584 			error = error1;
4585 			/* Move on; all channels will be detached later. */
4586 		}
4587 	}
4588 	vmbus_subchan_rel(subchans, subchan_cnt);
4589 
4590 	if (error) {
4591 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4592 	} else {
4593 		if (bootverbose) {
4594 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4595 			    subchan_cnt);
4596 		}
4597 	}
4598 	return (error);
4599 }
4600 
4601 static void
4602 hn_detach_allchans(struct hn_softc *sc)
4603 {
4604 	struct vmbus_channel **subchans;
4605 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4606 	int i;
4607 
4608 	if (subchan_cnt == 0)
4609 		goto back;
4610 
4611 	/* Detach the sub-channels. */
4612 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4613 	for (i = 0; i < subchan_cnt; ++i)
4614 		hn_chan_detach(sc, subchans[i]);
4615 	vmbus_subchan_rel(subchans, subchan_cnt);
4616 
4617 back:
4618 	/*
4619 	 * Detach the primary channel, _after_ all sub-channels
4620 	 * are detached.
4621 	 */
4622 	hn_chan_detach(sc, sc->hn_prichan);
4623 
4624 	/* Wait for sub-channels to be destroyed, if any. */
4625 	vmbus_subchan_drain(sc->hn_prichan);
4626 
4627 #ifdef INVARIANTS
4628 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4629 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4630 		    HN_RX_FLAG_ATTACHED) == 0,
4631 		    ("%dth RX ring is still attached", i));
4632 	}
4633 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4634 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4635 		    HN_TX_FLAG_ATTACHED) == 0,
4636 		    ("%dth TX ring is still attached", i));
4637 	}
4638 #endif
4639 }
4640 
4641 static int
4642 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4643 {
4644 	struct vmbus_channel **subchans;
4645 	int nchan, rxr_cnt, error;
4646 
4647 	nchan = *nsubch + 1;
4648 	if (nchan == 1) {
4649 		/*
4650 		 * Multiple RX/TX rings are not requested.
4651 		 */
4652 		*nsubch = 0;
4653 		return (0);
4654 	}
4655 
4656 	/*
4657 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4658 	 * table entries.
4659 	 */
4660 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4661 	if (error) {
4662 		/* No RSS; this is benign. */
4663 		*nsubch = 0;
4664 		return (0);
4665 	}
4666 	if (bootverbose) {
4667 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4668 		    rxr_cnt, nchan);
4669 	}
4670 
4671 	if (nchan > rxr_cnt)
4672 		nchan = rxr_cnt;
4673 	if (nchan == 1) {
4674 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4675 		*nsubch = 0;
4676 		return (0);
4677 	}
4678 
4679 	/*
4680 	 * Allocate sub-channels from NVS.
4681 	 */
4682 	*nsubch = nchan - 1;
4683 	error = hn_nvs_alloc_subchans(sc, nsubch);
4684 	if (error || *nsubch == 0) {
4685 		/* Failed to allocate sub-channels. */
4686 		*nsubch = 0;
4687 		return (0);
4688 	}
4689 
4690 	/*
4691 	 * Wait for all sub-channels to become ready before moving on.
4692 	 */
4693 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4694 	vmbus_subchan_rel(subchans, *nsubch);
4695 	return (0);
4696 }
4697 
4698 static bool
4699 hn_synth_attachable(const struct hn_softc *sc)
4700 {
4701 	int i;
4702 
4703 	if (sc->hn_flags & HN_FLAG_ERRORS)
4704 		return (false);
4705 
4706 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4707 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4708 
4709 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4710 			return (false);
4711 	}
4712 	return (true);
4713 }
4714 
4715 static int
4716 hn_synth_attach(struct hn_softc *sc, int mtu)
4717 {
4718 #define ATTACHED_NVS		0x0002
4719 #define ATTACHED_RNDIS		0x0004
4720 
4721 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4722 	int error, nsubch, nchan, i;
4723 	uint32_t old_caps, attached = 0;
4724 
4725 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4726 	    ("synthetic parts were attached"));
4727 
4728 	if (!hn_synth_attachable(sc))
4729 		return (ENXIO);
4730 
4731 	/* Save capabilities for later verification. */
4732 	old_caps = sc->hn_caps;
4733 	sc->hn_caps = 0;
4734 
4735 	/* Clear RSS state. */
4736 	sc->hn_rss_ind_size = 0;
4737 	sc->hn_rss_hash = 0;
4738 
4739 	/*
4740 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4741 	 */
4742 	error = hn_chan_attach(sc, sc->hn_prichan);
4743 	if (error)
4744 		goto failed;
4745 
4746 	/*
4747 	 * Attach NVS.
4748 	 */
4749 	error = hn_nvs_attach(sc, mtu);
4750 	if (error)
4751 		goto failed;
4752 	attached |= ATTACHED_NVS;
4753 
4754 	/*
4755 	 * Attach RNDIS _after_ NVS is attached.
4756 	 */
4757 	error = hn_rndis_attach(sc, mtu);
4758 	if (error)
4759 		goto failed;
4760 	attached |= ATTACHED_RNDIS;
4761 
4762 	/*
4763 	 * Make sure capabilities are not changed.
4764 	 */
4765 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4766 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4767 		    old_caps, sc->hn_caps);
4768 		error = ENXIO;
4769 		goto failed;
4770 	}
4771 
4772 	/*
4773 	 * Allocate sub-channels for multi-TX/RX rings.
4774 	 *
4775 	 * NOTE:
4776 	 * The # of RX rings that can be used is equivalent to the # of
4777 	 * channels to be requested.
4778 	 */
4779 	nsubch = sc->hn_rx_ring_cnt - 1;
4780 	error = hn_synth_alloc_subchans(sc, &nsubch);
4781 	if (error)
4782 		goto failed;
4783 	/* NOTE: _Full_ synthetic parts detach is required now. */
4784 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4785 
4786 	/*
4787 	 * Set the # of TX/RX rings that could be used according to
4788 	 * the # of channels that NVS offered.
4789 	 */
4790 	nchan = nsubch + 1;
4791 	hn_set_ring_inuse(sc, nchan);
4792 	if (nchan == 1) {
4793 		/* Only the primary channel can be used; done */
4794 		goto back;
4795 	}
4796 
4797 	/*
4798 	 * Attach the sub-channels.
4799 	 *
4800 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4801 	 */
4802 	error = hn_attach_subchans(sc);
4803 	if (error)
4804 		goto failed;
4805 
4806 	/*
4807 	 * Configure RSS key and indirect table _after_ all sub-channels
4808 	 * are attached.
4809 	 */
4810 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4811 		/*
4812 		 * RSS key is not set yet; set it to the default RSS key.
4813 		 */
4814 		if (bootverbose)
4815 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4816 #ifdef RSS
4817 		rss_getkey(rss->rss_key);
4818 #else
4819 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4820 #endif
4821 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4822 	}
4823 
4824 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4825 		/*
4826 		 * RSS indirect table is not set yet; set it up in round-
4827 		 * robin fashion.
4828 		 */
4829 		if (bootverbose) {
4830 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4831 			    "table\n");
4832 		}
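		/* Without RSS, e.g. nchan == 4 yields 0, 1, 2, 3, 0, 1, ... */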
4833 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4834 			uint32_t subidx;
4835 
4836 #ifdef RSS
4837 			subidx = rss_get_indirection_to_bucket(i);
4838 #else
4839 			subidx = i;
4840 #endif
4841 			rss->rss_ind[i] = subidx % nchan;
4842 		}
4843 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4844 	} else {
4845 		/*
4846 		 * # of usable channels may be changed, so we have to
4847 		 * make sure that all entries in RSS indirect table
4848 		 * are valid.
4849 		 *
4850 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4851 		 */
4852 		hn_rss_ind_fixup(sc);
4853 	}
4854 
4855 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4856 	if (error)
4857 		goto failed;
4858 back:
4859 	/*
4860 	 * Fixup transmission aggregation setup.
4861 	 */
4862 	hn_set_txagg(sc);
4863 	return (0);
4864 
4865 failed:
4866 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4867 		hn_synth_detach(sc);
4868 	} else {
4869 		if (attached & ATTACHED_RNDIS)
4870 			hn_rndis_detach(sc);
4871 		if (attached & ATTACHED_NVS)
4872 			hn_nvs_detach(sc);
4873 		hn_chan_detach(sc, sc->hn_prichan);
4874 		/* Restore old capabilities. */
4875 		sc->hn_caps = old_caps;
4876 	}
4877 	return (error);
4878 
4879 #undef ATTACHED_RNDIS
4880 #undef ATTACHED_NVS
4881 }
4882 
4883 /*
4884  * NOTE:
4885  * The interface must have been suspended through hn_suspend() before
4886  * this function gets called.
4887  */
4888 static void
4889 hn_synth_detach(struct hn_softc *sc)
4890 {
4891 
4892 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4893 	    ("synthetic parts were not attached"));
4894 
4895 	/* Detach the RNDIS first. */
4896 	hn_rndis_detach(sc);
4897 
4898 	/* Detach NVS. */
4899 	hn_nvs_detach(sc);
4900 
4901 	/* Detach all of the channels. */
4902 	hn_detach_allchans(sc);
4903 
4904 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4905 }
4906 
4907 static void
4908 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4909 {
4910 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4911 	    ("invalid ring count %d", ring_cnt));
4912 
4913 	if (sc->hn_tx_ring_cnt > ring_cnt)
4914 		sc->hn_tx_ring_inuse = ring_cnt;
4915 	else
4916 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4917 	sc->hn_rx_ring_inuse = ring_cnt;
4918 
4919 #ifdef RSS
4920 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4921 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4922 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4923 		    rss_getnumbuckets());
4924 	}
4925 #endif
4926 
4927 	if (bootverbose) {
4928 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4929 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4930 	}
4931 }
4932 
4933 static void
4934 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4935 {
4936 
4937 	/*
4938 	 * NOTE:
4939 	 * The TX bufring will not be drained by the hypervisor
4940 	 * if the primary channel is revoked.
4941 	 */
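	/* Poll (one tick at a time) until both bufring directions are empty. */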
4942 	while (!vmbus_chan_rx_empty(chan) ||
4943 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4944 	     !vmbus_chan_tx_empty(chan)))
4945 		pause("waitch", 1);
4946 	vmbus_chan_intr_drain(chan);
4947 }
4948 
4949 static void
4950 hn_suspend_data(struct hn_softc *sc)
4951 {
4952 	struct vmbus_channel **subch = NULL;
4953 	struct hn_tx_ring *txr;
4954 	int i, nsubch;
4955 
4956 	HN_LOCK_ASSERT(sc);
4957 
4958 	/*
4959 	 * Suspend TX.
4960 	 */
4961 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4962 		txr = &sc->hn_tx_ring[i];
4963 
4964 		mtx_lock(&txr->hn_tx_lock);
4965 		txr->hn_suspended = 1;
4966 		mtx_unlock(&txr->hn_tx_lock);
4967 		/* No one is able to send more packets now. */
4968 
4969 		/*
4970 		 * Wait for all pending sends to finish.
4971 		 *
4972 		 * NOTE:
4973 		 * We will _not_ receive send-done for all pending sends
4974 		 * if the primary channel is revoked.
4975 		 */
4976 		while (hn_tx_ring_pending(txr) &&
4977 		    !vmbus_chan_is_revoked(sc->hn_prichan))
4978 			pause("hnwtx", 1 /* 1 tick */);
4979 	}
4980 
4981 	/*
4982 	 * Disable RX by clearing RX filter.
4983 	 */
4984 	hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
4985 
4986 	/*
4987 	 * Give RNDIS enough time to flush all pending data packets.
4988 	 */
4989 	pause("waitrx", (200 * hz) / 1000);
4990 
4991 	/*
4992 	 * Drain RX/TX bufrings and interrupts.
4993 	 */
4994 	nsubch = sc->hn_rx_ring_inuse - 1;
4995 	if (nsubch > 0)
4996 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4997 
4998 	if (subch != NULL) {
4999 		for (i = 0; i < nsubch; ++i)
5000 			hn_chan_drain(sc, subch[i]);
5001 	}
5002 	hn_chan_drain(sc, sc->hn_prichan);
5003 
5004 	if (subch != NULL)
5005 		vmbus_subchan_rel(subch, nsubch);
5006 
5007 	/*
5008 	 * Drain any pending TX tasks.
5009 	 *
5010 	 * NOTE:
5011 	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
5012 	 * tasks will have to be drained _after_ the above hn_chan_drain()
5013 	 * calls.
5014 	 */
5015 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5016 		txr = &sc->hn_tx_ring[i];
5017 
5018 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5019 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5020 	}
5021 }
5022 
5023 static void
5024 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5025 {
5026 
5027 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5028 }
5029 
5030 static void
5031 hn_suspend_mgmt(struct hn_softc *sc)
5032 {
5033 	struct task task;
5034 
5035 	HN_LOCK_ASSERT(sc);
5036 
5037 	/*
5038 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5039 	 * through hn_mgmt_taskq.
5040 	 */
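	/*
	 * NB: running this task on the primary channel should serialize
	 * against the channel callback, so no callback keeps using the
	 * stale hn_mgmt_taskq pointer afterwards.
	 */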
5041 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5042 	vmbus_chan_run_task(sc->hn_prichan, &task);
5043 
5044 	/*
5045 	 * Make sure that all pending management tasks are completed.
5046 	 */
5047 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5048 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5049 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
5050 }
5051 
5052 static void
5053 hn_suspend(struct hn_softc *sc)
5054 {
5055 
5056 	/* Disable polling. */
5057 	hn_polling(sc, 0);
5058 
5059 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5060 	    (sc->hn_flags & HN_FLAG_VF))
5061 		hn_suspend_data(sc);
5062 	hn_suspend_mgmt(sc);
5063 }
5064 
5065 static void
5066 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5067 {
5068 	int i;
5069 
5070 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5071 	    ("invalid TX ring count %d", tx_ring_cnt));
5072 
5073 	for (i = 0; i < tx_ring_cnt; ++i) {
5074 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5075 
5076 		mtx_lock(&txr->hn_tx_lock);
5077 		txr->hn_suspended = 0;
5078 		mtx_unlock(&txr->hn_tx_lock);
5079 	}
5080 }
5081 
5082 static void
5083 hn_resume_data(struct hn_softc *sc)
5084 {
5085 	int i;
5086 
5087 	HN_LOCK_ASSERT(sc);
5088 
5089 	/*
5090 	 * Re-enable RX.
5091 	 */
5092 	hn_rxfilter_config(sc);
5093 
5094 	/*
5095 	 * Make sure to clear suspend status on "all" TX rings,
5096 	 * since hn_tx_ring_inuse can be changed after
5097 	 * hn_suspend_data().
5098 	 */
5099 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5100 
5101 #ifdef HN_IFSTART_SUPPORT
5102 	if (!hn_use_if_start)
5103 #endif
5104 	{
5105 		/*
5106 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
5107 		 * reduced.
5108 		 */
5109 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5110 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5111 	}
5112 
5113 	/*
5114 	 * Kick start TX.
5115 	 */
5116 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5117 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5118 
5119 		/*
5120 		 * Use txeof task, so that any pending oactive can be
5121 		 * cleared properly.
5122 		 */
5123 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5124 	}
5125 }
5126 
5127 static void
5128 hn_resume_mgmt(struct hn_softc *sc)
5129 {
5130 
5131 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5132 
5133 	/*
5134 	 * Kick off network change detection, if it was pending.
5135 	 * If no network change was pending, start link status
5136 	 * checks, which is more lightweight than network change
5137 	 * detection.
5138 	 */
5139 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5140 		hn_change_network(sc);
5141 	else
5142 		hn_update_link_status(sc);
5143 }
5144 
5145 static void
5146 hn_resume(struct hn_softc *sc)
5147 {
5148 
5149 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5150 	    (sc->hn_flags & HN_FLAG_VF))
5151 		hn_resume_data(sc);
5152 
5153 	/*
5154 	 * When the VF is activated, the synthetic interface is changed
5155 	 * to DOWN in hn_set_vf(). Here, if the VF is still active, we
5156 	 * don't call hn_resume_mgmt() until the VF is deactivated in
5157 	 * hn_set_vf().
5158 	 */
5159 	if (!(sc->hn_flags & HN_FLAG_VF))
5160 		hn_resume_mgmt(sc);
5161 
5162 	/*
5163 	 * Re-enable polling if this interface is running and
5164 	 * the polling is requested.
5165 	 */
5166 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5167 		hn_polling(sc, sc->hn_pollhz);
5168 }
5169 
5170 static void
5171 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5172 {
5173 	const struct rndis_status_msg *msg;
5174 	int ofs;
5175 
5176 	if (dlen < sizeof(*msg)) {
5177 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5178 		return;
5179 	}
5180 	msg = data;
5181 
5182 	switch (msg->rm_status) {
5183 	case RNDIS_STATUS_MEDIA_CONNECT:
5184 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5185 		hn_update_link_status(sc);
5186 		break;
5187 
5188 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5189 		/* Not really useful; ignore. */
5190 		break;
5191 
5192 	case RNDIS_STATUS_NETWORK_CHANGE:
5193 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5194 		if (dlen < ofs + msg->rm_stbuflen ||
5195 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5196 			if_printf(sc->hn_ifp, "network changed\n");
5197 		} else {
5198 			uint32_t change;
5199 
5200 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5201 			    sizeof(change));
5202 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5203 			    change);
5204 		}
5205 		hn_change_network(sc);
5206 		break;
5207 
5208 	default:
5209 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5210 		    msg->rm_status);
5211 		break;
5212 	}
5213 }
5214 
5215 static int
5216 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5217 {
5218 	const struct rndis_pktinfo *pi = info_data;
5219 	uint32_t mask = 0;
5220 
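	/*
	 * Walk the chain of rndis_pktinfo elements (each carries rm_size,
	 * rm_type, rm_pktinfooffset and its data) until all interesting
	 * types (VLAN, csum, hash value/info) have been collected.
	 */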
5221 	while (info_dlen != 0) {
5222 		const void *data;
5223 		uint32_t dlen;
5224 
5225 		if (__predict_false(info_dlen < sizeof(*pi)))
5226 			return (EINVAL);
5227 		if (__predict_false(info_dlen < pi->rm_size))
5228 			return (EINVAL);
5229 		info_dlen -= pi->rm_size;
5230 
5231 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5232 			return (EINVAL);
5233 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5234 			return (EINVAL);
5235 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5236 		data = pi->rm_data;
5237 
5238 		switch (pi->rm_type) {
5239 		case NDIS_PKTINFO_TYPE_VLAN:
5240 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5241 				return (EINVAL);
5242 			info->vlan_info = *((const uint32_t *)data);
5243 			mask |= HN_RXINFO_VLAN;
5244 			break;
5245 
5246 		case NDIS_PKTINFO_TYPE_CSUM:
5247 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5248 				return (EINVAL);
5249 			info->csum_info = *((const uint32_t *)data);
5250 			mask |= HN_RXINFO_CSUM;
5251 			break;
5252 
5253 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5254 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5255 				return (EINVAL);
5256 			info->hash_value = *((const uint32_t *)data);
5257 			mask |= HN_RXINFO_HASHVAL;
5258 			break;
5259 
5260 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5261 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5262 				return (EINVAL);
5263 			info->hash_info = *((const uint32_t *)data);
5264 			mask |= HN_RXINFO_HASHINF;
5265 			break;
5266 
5267 		default:
5268 			goto next;
5269 		}
5270 
5271 		if (mask == HN_RXINFO_ALL) {
5272 			/* All found; done */
5273 			break;
5274 		}
5275 next:
5276 		pi = (const struct rndis_pktinfo *)
5277 		    ((const uint8_t *)pi + pi->rm_size);
5278 	}
5279 
5280 	/*
5281 	 * Final fixup.
5282 	 * - If there is no hash value, invalidate the hash info.
5283 	 */
5284 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5285 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5286 	return (0);
5287 }
5288 
5289 static __inline bool
5290 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5291 {
5292 
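	/*
	 * True if [off, off + len) intersects
	 * [check_off, check_off + check_len).
	 */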
5293 	if (off < check_off) {
5294 		if (__predict_true(off + len <= check_off))
5295 			return (false);
5296 	} else if (off > check_off) {
5297 		if (__predict_true(check_off + check_len <= off))
5298 			return (false);
5299 	}
5300 	return (true);
5301 }
5302 
5303 static void
5304 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5305 {
5306 	const struct rndis_packet_msg *pkt;
5307 	struct hn_rxinfo info;
5308 	int data_off, pktinfo_off, data_len, pktinfo_len;
5309 
5310 	/*
5311 	 * Check length.
5312 	 */
5313 	if (__predict_false(dlen < sizeof(*pkt))) {
5314 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5315 		return;
5316 	}
5317 	pkt = data;
5318 
5319 	if (__predict_false(dlen < pkt->rm_len)) {
5320 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5321 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5322 		return;
5323 	}
5324 	if (__predict_false(pkt->rm_len <
5325 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5326 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5327 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5328 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5329 		    pkt->rm_pktinfolen);
5330 		return;
5331 	}
5332 	if (__predict_false(pkt->rm_datalen == 0)) {
5333 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5334 		return;
5335 	}
5336 
5337 	/*
5338 	 * Check offsets.
5339 	 */
5340 #define IS_OFFSET_INVALID(ofs)			\
5341 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5342 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5343 
5344 	/* XXX Hyper-V does not meet data offset alignment requirement */
5345 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5346 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5347 		    "data offset %u\n", pkt->rm_dataoffset);
5348 		return;
5349 	}
5350 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5351 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5352 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5353 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5354 		return;
5355 	}
5356 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5357 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5358 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5359 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5360 		return;
5361 	}
5362 
5363 #undef IS_OFFSET_INVALID
5364 
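	/*
	 * NOTE: rm_dataoffset, rm_oobdataoffset and rm_pktinfooffset are
	 * relative offsets; RNDIS_PACKET_MSG_OFFSET_ABS() converts them
	 * into offsets from the start of the message.
	 */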
5365 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5366 	data_len = pkt->rm_datalen;
5367 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5368 	pktinfo_len = pkt->rm_pktinfolen;
5369 
5370 	/*
5371 	 * Check OOB coverage.
5372 	 */
5373 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5374 		int oob_off, oob_len;
5375 
5376 		if_printf(rxr->hn_ifp, "got oobdata\n");
5377 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5378 		oob_len = pkt->rm_oobdatalen;
5379 
5380 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5381 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5382 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5383 			    pkt->rm_len, oob_off, oob_len);
5384 			return;
5385 		}
5386 
5387 		/*
5388 		 * Check against data.
5389 		 */
5390 		if (hn_rndis_check_overlap(oob_off, oob_len,
5391 		    data_off, data_len)) {
5392 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5393 			    "oob overlaps data, oob abs %d len %d, "
5394 			    "data abs %d len %d\n",
5395 			    oob_off, oob_len, data_off, data_len);
5396 			return;
5397 		}
5398 
5399 		/*
5400 		 * Check against pktinfo.
5401 		 */
5402 		if (pktinfo_len != 0 &&
5403 		    hn_rndis_check_overlap(oob_off, oob_len,
5404 		    pktinfo_off, pktinfo_len)) {
5405 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5406 			    "oob overlaps pktinfo, oob abs %d len %d, "
5407 			    "pktinfo abs %d len %d\n",
5408 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5409 			return;
5410 		}
5411 	}
5412 
5413 	/*
5414 	 * Check per-packet-info coverage and find useful per-packet-info.
5415 	 */
5416 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5417 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5418 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5419 	if (__predict_true(pktinfo_len != 0)) {
5420 		bool overlap;
5421 		int error;
5422 
5423 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5424 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5425 			    "pktinfo overflow, msglen %u, "
5426 			    "pktinfo abs %d len %d\n",
5427 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5428 			return;
5429 		}
5430 
5431 		/*
5432 		 * Check packet info coverage.
5433 		 */
5434 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5435 		    data_off, data_len);
5436 		if (__predict_false(overlap)) {
5437 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5438 			    "pktinfo overlaps data, pktinfo abs %d len %d, "
5439 			    "data abs %d len %d\n",
5440 			    pktinfo_off, pktinfo_len, data_off, data_len);
5441 			return;
5442 		}
5443 
5444 		/*
5445 		 * Find useful per-packet-info.
5446 		 */
5447 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5448 		    pktinfo_len, &info);
5449 		if (__predict_false(error)) {
5450 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5451 			    "pktinfo\n");
5452 			return;
5453 		}
5454 	}
5455 
5456 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5457 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5458 		    "data overflow, msglen %u, data abs %d len %d\n",
5459 		    pkt->rm_len, data_off, data_len);
5460 		return;
5461 	}
5462 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5463 }
5464 
5465 static __inline void
5466 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5467 {
5468 	const struct rndis_msghdr *hdr;
5469 
5470 	if (__predict_false(dlen < sizeof(*hdr))) {
5471 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5472 		return;
5473 	}
5474 	hdr = data;
5475 
5476 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5477 		/* Hot data path. */
5478 		hn_rndis_rx_data(rxr, data, dlen);
5479 		/* Done! */
5480 		return;
5481 	}
5482 
5483 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5484 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5485 	else
5486 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5487 }
5488 
5489 static void
5490 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5491 {
5492 	const struct hn_nvs_hdr *hdr;
5493 
5494 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5495 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5496 		return;
5497 	}
5498 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5499 
5500 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5501 		/* Useless; ignore */
5502 		return;
5503 	}
5504 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5505 }
5506 
5507 static void
5508 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5509     const struct vmbus_chanpkt_hdr *pkt)
5510 {
5511 	struct hn_nvs_sendctx *sndc;
5512 
5513 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5514 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5515 	    VMBUS_CHANPKT_DATALEN(pkt));
5516 	/*
5517 	 * NOTE:
5518 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5519 	 * its callback.
5520 	 */
5521 }
5522 
5523 static void
5524 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5525     const struct vmbus_chanpkt_hdr *pkthdr)
5526 {
5527 	const struct vmbus_chanpkt_rxbuf *pkt;
5528 	const struct hn_nvs_hdr *nvs_hdr;
5529 	int count, i, hlen;
5530 
5531 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5532 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5533 		return;
5534 	}
5535 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5536 
5537 	/* Make sure that this is a RNDIS message. */
5538 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5539 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5540 		    nvs_hdr->nvs_type);
5541 		return;
5542 	}
5543 
5544 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5545 	if (__predict_false(hlen < sizeof(*pkt))) {
5546 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5547 		return;
5548 	}
5549 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5550 
5551 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5552 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5553 		    pkt->cp_rxbuf_id);
5554 		return;
5555 	}
5556 
5557 	count = pkt->cp_rxbuf_cnt;
5558 	if (__predict_false(hlen <
5559 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5560 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5561 		return;
5562 	}
5563 
5564 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5565 	for (i = 0; i < count; ++i) {
5566 		int ofs, len;
5567 
5568 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5569 		len = pkt->cp_rxbuf[i].rb_len;
5570 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5571 			if_printf(rxr->hn_ifp, "RNDIS msg %d overflows rxbuf, "
5572 			    "ofs %d, len %d\n", i, ofs, len);
5573 			continue;
5574 		}
5575 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5576 	}
5577 
5578 	/*
5579 	 * Ack the consumed RXBUF associated w/ this channel packet,
5580 	 * so that this RXBUF can be recycled by the hypervisor.
5581 	 */
5582 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5583 }
5584 
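/*
 * Ack a consumed RXBUF by sending an RNDIS_ACK completion on the channel,
 * retrying up to 10 times with a short delay if the bufring is
 * temporarily full (EAGAIN).
 */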
5585 static void
5586 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5587     uint64_t tid)
5588 {
5589 	struct hn_nvs_rndis_ack ack;
5590 	int retries, error;
5591 
5592 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5593 	ack.nvs_status = HN_NVS_STATUS_OK;
5594 
5595 	retries = 0;
5596 again:
5597 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5598 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5599 	if (__predict_false(error == EAGAIN)) {
5600 		/*
5601 		 * NOTE:
5602 		 * This should _not_ happen in the real world, since the
5603 		 * consumption of the TX bufring from the TX path is
5604 		 * controlled.
5605 		 */
5606 		if (rxr->hn_ack_failed == 0)
5607 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5608 		rxr->hn_ack_failed++;
5609 		retries++;
5610 		if (retries < 10) {
5611 			DELAY(100);
5612 			goto again;
5613 		}
5614 		/* RXBUF leaks! */
5615 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5616 	}
5617 }
5618 
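/*
 * Per-channel receive callback: drain all pending channel packets,
 * growing the packet buffer on ENOBUFS, dispatch each packet by type,
 * and finally call hn_chan_rollup() on the associated RX/TX rings.
 */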
5619 static void
5620 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5621 {
5622 	struct hn_rx_ring *rxr = xrxr;
5623 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5624 
5625 	for (;;) {
5626 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5627 		int error, pktlen;
5628 
5629 		pktlen = rxr->hn_pktbuf_len;
5630 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5631 		if (__predict_false(error == ENOBUFS)) {
5632 			void *nbuf;
5633 			int nlen;
5634 
5635 			/*
5636 			 * Expand channel packet buffer.
5637 			 *
5638 			 * XXX
5639 			 * Use M_WAITOK here, since allocation failure
5640 			 * is fatal.
5641 			 */
5642 			nlen = rxr->hn_pktbuf_len * 2;
5643 			while (nlen < pktlen)
5644 				nlen *= 2;
5645 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5646 
5647 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5648 			    rxr->hn_pktbuf_len, nlen);
5649 
5650 			free(rxr->hn_pktbuf, M_DEVBUF);
5651 			rxr->hn_pktbuf = nbuf;
5652 			rxr->hn_pktbuf_len = nlen;
5653 			/* Retry! */
5654 			continue;
5655 		} else if (__predict_false(error == EAGAIN)) {
5656 			/* No more channel packets; done! */
5657 			break;
5658 		}
5659 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5660 
5661 		switch (pkt->cph_type) {
5662 		case VMBUS_CHANPKT_TYPE_COMP:
5663 			hn_nvs_handle_comp(sc, chan, pkt);
5664 			break;
5665 
5666 		case VMBUS_CHANPKT_TYPE_RXBUF:
5667 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5668 			break;
5669 
5670 		case VMBUS_CHANPKT_TYPE_INBAND:
5671 			hn_nvs_handle_notify(sc, pkt);
5672 			break;
5673 
5674 		default:
5675 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5676 			    pkt->cph_type);
5677 			break;
5678 		}
5679 	}
5680 	hn_chan_rollup(rxr, rxr->hn_txr);
5681 }
5682 
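/*
 * Create the shared TX taskqueues at boot time when running as a Hyper-V
 * guest and the global taskqueue mode is selected; the taskqueue count
 * and mode tunables are sanitized first.
 */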
5683 static void
5684 hn_tx_taskq_create(void *arg __unused)
5685 {
5686 	int i;
5687 
5688 	/*
5689 	 * Clamp the number of TX taskqueues to [1, mp_ncpus].
5690 	 */
5691 	if (hn_tx_taskq_cnt <= 0)
5692 		hn_tx_taskq_cnt = 1;
5693 	else if (hn_tx_taskq_cnt > mp_ncpus)
5694 		hn_tx_taskq_cnt = mp_ncpus;
5695 
5696 	/*
5697 	 * Validate the TX taskqueue mode; fall back to independent mode.
5698 	 */
5699 	switch (hn_tx_taskq_mode) {
5700 	case HN_TX_TASKQ_M_INDEP:
5701 	case HN_TX_TASKQ_M_GLOBAL:
5702 	case HN_TX_TASKQ_M_EVTTQ:
5703 		break;
5704 	default:
5705 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5706 		break;
5707 	}
5708 
5709 	if (vm_guest != VM_GUEST_HV)
5710 		return;
5711 
5712 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5713 		return;
5714 
5715 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5716 	    M_DEVBUF, M_WAITOK);
5717 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5718 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5719 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5720 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5721 		    "hn tx%d", i);
5722 	}
5723 }
5724 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5725     hn_tx_taskq_create, NULL);
5726 
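/*
 * Free the shared TX taskqueues, if any, created by hn_tx_taskq_create().
 */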
5727 static void
5728 hn_tx_taskq_destroy(void *arg __unused)
5729 {
5730 
5731 	if (hn_tx_taskque != NULL) {
5732 		int i;
5733 
5734 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5735 			taskqueue_free(hn_tx_taskque[i]);
5736 		free(hn_tx_taskque, M_DEVBUF);
5737 	}
5738 }
5739 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5740     hn_tx_taskq_destroy, NULL);
5741