xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 69718b786d3943ea9a99eeeb5f5f6162f11c78b7)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 
81 #include <machine/atomic.h>
82 #include <machine/in_cksum.h>
83 
84 #include <net/bpf.h>
85 #include <net/ethernet.h>
86 #include <net/if.h>
87 #include <net/if_media.h>
88 #include <net/if_types.h>
89 #include <net/if_var.h>
90 #include <net/rndis.h>
91 #ifdef RSS
92 #include <net/rss_config.h>
93 #endif
94 
95 #include <netinet/in_systm.h>
96 #include <netinet/in.h>
97 #include <netinet/ip.h>
98 #include <netinet/ip6.h>
99 #include <netinet/tcp.h>
100 #include <netinet/tcp_lro.h>
101 #include <netinet/udp.h>
102 
103 #include <dev/hyperv/include/hyperv.h>
104 #include <dev/hyperv/include/hyperv_busdma.h>
105 #include <dev/hyperv/include/vmbus.h>
106 #include <dev/hyperv/include/vmbus_xact.h>
107 
108 #include <dev/hyperv/netvsc/ndis.h>
109 #include <dev/hyperv/netvsc/if_hnreg.h>
110 #include <dev/hyperv/netvsc/if_hnvar.h>
111 #include <dev/hyperv/netvsc/hn_nvs.h>
112 #include <dev/hyperv/netvsc/hn_rndis.h>
113 
114 #include "vmbus_if.h"
115 
116 #define HN_IFSTART_SUPPORT
117 
118 #define HN_RING_CNT_DEF_MAX		8
119 
120 /* YYY should get it from the underlying channel */
121 #define HN_TX_DESC_CNT			512
122 
123 #define HN_RNDIS_PKT_LEN					\
124 	(sizeof(struct rndis_packet_msg) +			\
125 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
126 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
127 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
129 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
130 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
131 
132 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
133 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
134 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
135 /* -1 for RNDIS packet message */
136 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
137 
138 #define HN_DIRECT_TX_SIZE_DEF		128
139 
140 #define HN_EARLY_TXEOF_THRESH		8
141 
142 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
143 
144 #define HN_LROENT_CNT_DEF		128
145 
146 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
147 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
148 /* YYY 2*MTU is a bit rough, but should be good enough. */
149 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
150 
151 #define HN_LRO_ACKCNT_DEF		1
152 
153 #define HN_LOCK_INIT(sc)		\
154 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
155 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
156 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
157 #define HN_LOCK(sc)					\
158 do {							\
159 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
160 		DELAY(1000);				\
161 } while (0)
162 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
163 
164 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
165 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
166 #define HN_CSUM_IP_HWASSIST(sc)		\
167 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
168 #define HN_CSUM_IP6_HWASSIST(sc)	\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
170 
171 #define HN_PKTSIZE_MIN(align)		\
172 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
173 	    HN_RNDIS_PKT_LEN, (align))
174 #define HN_PKTSIZE(m, align)		\
175 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
176 
177 #ifdef RSS
178 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
179 #else
180 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
181 #endif
182 
183 struct hn_txdesc {
184 #ifndef HN_USE_TXDESC_BUFRING
185 	SLIST_ENTRY(hn_txdesc)		link;
186 #endif
187 	STAILQ_ENTRY(hn_txdesc)		agg_link;
188 
189 	/* Aggregated txdescs, in sending order. */
190 	STAILQ_HEAD(, hn_txdesc)	agg_list;
191 
192 	/* The oldest packet, if transmission aggregation happens. */
193 	struct mbuf			*m;
194 	struct hn_tx_ring		*txr;
195 	int				refs;
196 	uint32_t			flags;	/* HN_TXD_FLAG_ */
197 	struct hn_nvs_sendctx		send_ctx;
198 	uint32_t			chim_index;
199 	int				chim_size;
200 
201 	bus_dmamap_t			data_dmap;
202 
203 	bus_addr_t			rndis_pkt_paddr;
204 	struct rndis_packet_msg		*rndis_pkt;
205 	bus_dmamap_t			rndis_pkt_dmap;
206 };
207 
208 #define HN_TXD_FLAG_ONLIST		0x0001
209 #define HN_TXD_FLAG_DMAMAP		0x0002
210 #define HN_TXD_FLAG_ONAGG		0x0004
211 
212 struct hn_rxinfo {
213 	uint32_t			vlan_info;
214 	uint32_t			csum_info;
215 	uint32_t			hash_info;
216 	uint32_t			hash_value;
217 };
218 
219 #define HN_RXINFO_VLAN			0x0001
220 #define HN_RXINFO_CSUM			0x0002
221 #define HN_RXINFO_HASHINF		0x0004
222 #define HN_RXINFO_HASHVAL		0x0008
223 #define HN_RXINFO_ALL			\
224 	(HN_RXINFO_VLAN |		\
225 	 HN_RXINFO_CSUM |		\
226 	 HN_RXINFO_HASHINF |		\
227 	 HN_RXINFO_HASHVAL)
228 
229 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
230 #define HN_NDIS_RXCSUM_INFO_INVALID	0
231 #define HN_NDIS_HASH_INFO_INVALID	0
232 
233 static int			hn_probe(device_t);
234 static int			hn_attach(device_t);
235 static int			hn_detach(device_t);
236 static int			hn_shutdown(device_t);
237 static void			hn_chan_callback(struct vmbus_channel *,
238 				    void *);
239 
240 static void			hn_init(void *);
241 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
242 #ifdef HN_IFSTART_SUPPORT
243 static void			hn_start(struct ifnet *);
244 #endif
245 static int			hn_transmit(struct ifnet *, struct mbuf *);
246 static void			hn_xmit_qflush(struct ifnet *);
247 static int			hn_ifmedia_upd(struct ifnet *);
248 static void			hn_ifmedia_sts(struct ifnet *,
249 				    struct ifmediareq *);
250 
251 static int			hn_rndis_rxinfo(const void *, int,
252 				    struct hn_rxinfo *);
253 static void			hn_rndis_rx_data(struct hn_rx_ring *,
254 				    const void *, int);
255 static void			hn_rndis_rx_status(struct hn_softc *,
256 				    const void *, int);
257 
258 static void			hn_nvs_handle_notify(struct hn_softc *,
259 				    const struct vmbus_chanpkt_hdr *);
260 static void			hn_nvs_handle_comp(struct hn_softc *,
261 				    struct vmbus_channel *,
262 				    const struct vmbus_chanpkt_hdr *);
263 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
264 				    struct vmbus_channel *,
265 				    const struct vmbus_chanpkt_hdr *);
266 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
267 				    struct vmbus_channel *, uint64_t);
268 
269 #if __FreeBSD_version >= 1100099
270 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
271 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
272 #endif
273 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
274 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
275 #if __FreeBSD_version < 1100095
276 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
277 #else
278 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
279 #endif
280 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
282 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
283 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
284 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
285 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
286 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
287 #ifndef RSS
288 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
290 #endif
291 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
294 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
295 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
296 
297 static void			hn_stop(struct hn_softc *);
298 static void			hn_init_locked(struct hn_softc *);
299 static int			hn_chan_attach(struct hn_softc *,
300 				    struct vmbus_channel *);
301 static void			hn_chan_detach(struct hn_softc *,
302 				    struct vmbus_channel *);
303 static int			hn_attach_subchans(struct hn_softc *);
304 static void			hn_detach_allchans(struct hn_softc *);
305 static void			hn_chan_rollup(struct hn_rx_ring *,
306 				    struct hn_tx_ring *);
307 static void			hn_set_ring_inuse(struct hn_softc *, int);
308 static int			hn_synth_attach(struct hn_softc *, int);
309 static void			hn_synth_detach(struct hn_softc *);
310 static int			hn_synth_alloc_subchans(struct hn_softc *,
311 				    int *);
312 static bool			hn_synth_attachable(const struct hn_softc *);
313 static void			hn_suspend(struct hn_softc *);
314 static void			hn_suspend_data(struct hn_softc *);
315 static void			hn_suspend_mgmt(struct hn_softc *);
316 static void			hn_resume(struct hn_softc *);
317 static void			hn_resume_data(struct hn_softc *);
318 static void			hn_resume_mgmt(struct hn_softc *);
319 static void			hn_suspend_mgmt_taskfunc(void *, int);
320 static void			hn_chan_drain(struct hn_softc *,
321 				    struct vmbus_channel *);
322 
323 static void			hn_update_link_status(struct hn_softc *);
324 static void			hn_change_network(struct hn_softc *);
325 static void			hn_link_taskfunc(void *, int);
326 static void			hn_netchg_init_taskfunc(void *, int);
327 static void			hn_netchg_status_taskfunc(void *, int);
328 static void			hn_link_status(struct hn_softc *);
329 
330 static int			hn_create_rx_data(struct hn_softc *, int);
331 static void			hn_destroy_rx_data(struct hn_softc *);
332 static int			hn_check_iplen(const struct mbuf *, int);
333 static int			hn_set_rxfilter(struct hn_softc *);
334 #ifndef RSS
335 static int			hn_rss_reconfig(struct hn_softc *);
336 #endif
337 static void			hn_rss_ind_fixup(struct hn_softc *);
338 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
339 				    int, const struct hn_rxinfo *);
340 
341 static int			hn_tx_ring_create(struct hn_softc *, int);
342 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
343 static int			hn_create_tx_data(struct hn_softc *, int);
344 static void			hn_fixup_tx_data(struct hn_softc *);
345 static void			hn_destroy_tx_data(struct hn_softc *);
346 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
347 static void			hn_txdesc_gc(struct hn_tx_ring *,
348 				    struct hn_txdesc *);
349 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
350 				    struct hn_txdesc *, struct mbuf **);
351 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
352 				    struct hn_txdesc *);
353 static void			hn_set_chim_size(struct hn_softc *, int);
354 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
355 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
356 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
357 static void			hn_resume_tx(struct hn_softc *, int);
358 static void			hn_set_txagg(struct hn_softc *);
359 static void			*hn_try_txagg(struct ifnet *,
360 				    struct hn_tx_ring *, struct hn_txdesc *,
361 				    int);
362 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
363 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
364 				    struct hn_softc *, struct vmbus_channel *,
365 				    const void *, int);
366 static int			hn_txpkt_sglist(struct hn_tx_ring *,
367 				    struct hn_txdesc *);
368 static int			hn_txpkt_chim(struct hn_tx_ring *,
369 				    struct hn_txdesc *);
370 static int			hn_xmit(struct hn_tx_ring *, int);
371 static void			hn_xmit_taskfunc(void *, int);
372 static void			hn_xmit_txeof(struct hn_tx_ring *);
373 static void			hn_xmit_txeof_taskfunc(void *, int);
374 #ifdef HN_IFSTART_SUPPORT
375 static int			hn_start_locked(struct hn_tx_ring *, int);
376 static void			hn_start_taskfunc(void *, int);
377 static void			hn_start_txeof(struct hn_tx_ring *);
378 static void			hn_start_txeof_taskfunc(void *, int);
379 #endif
380 
381 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
382     "Hyper-V network interface");
383 
384 /* Trust TCP segment verification on host side. */
385 static int			hn_trust_hosttcp = 1;
386 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
387     &hn_trust_hosttcp, 0,
388     "Trust tcp segment verification on host side, "
389     "when csum info is missing (global setting)");
390 
391 /* Trust UDP datagram verification on host side. */
392 static int			hn_trust_hostudp = 1;
393 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
394     &hn_trust_hostudp, 0,
395     "Trust udp datagram verification on host side, "
396     "when csum info is missing (global setting)");
397 
398 /* Trust IP packet verification on host side. */
399 static int			hn_trust_hostip = 1;
400 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
401     &hn_trust_hostip, 0,
402     "Trust ip packet verification on host side, "
403     "when csum info is missing (global setting)");
404 
405 /* Limit TSO burst size */
406 static int			hn_tso_maxlen = IP_MAXPACKET;
407 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
408     &hn_tso_maxlen, 0, "TSO burst limit");
409 
410 /* Limit chimney send size */
411 static int			hn_tx_chimney_size = 0;
412 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
413     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
414 
415 /* Limit the packet size for direct transmission */
416 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
417 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
418     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
419 
420 /* # of LRO entries per RX ring */
421 #if defined(INET) || defined(INET6)
422 #if __FreeBSD_version >= 1100095
423 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
424 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
425     &hn_lro_entry_count, 0, "LRO entry count");
426 #endif
427 #endif
428 
429 static int			hn_tx_taskq_cnt = 1;
430 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
431     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
432 
433 #define HN_TX_TASKQ_M_INDEP	0
434 #define HN_TX_TASKQ_M_GLOBAL	1
435 #define HN_TX_TASKQ_M_EVTTQ	2
436 
437 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
438 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
439     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
440     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
441 
442 #ifndef HN_USE_TXDESC_BUFRING
443 static int			hn_use_txdesc_bufring = 0;
444 #else
445 static int			hn_use_txdesc_bufring = 1;
446 #endif
447 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
448     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
449 
450 #ifdef HN_IFSTART_SUPPORT
451 /* Use ifnet.if_start instead of ifnet.if_transmit */
452 static int			hn_use_if_start = 0;
453 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
454     &hn_use_if_start, 0, "Use if_start TX method");
455 #endif
456 
457 /* # of channels to use */
458 static int			hn_chan_cnt = 0;
459 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
460     &hn_chan_cnt, 0,
461     "# of channels to use; each channel has one RX ring and one TX ring");
462 
463 /* # of transmit rings to use */
464 static int			hn_tx_ring_cnt = 0;
465 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
466     &hn_tx_ring_cnt, 0, "# of TX rings to use");
467 
468 /* Software TX ring depth */
469 static int			hn_tx_swq_depth = 0;
470 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
471     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
472 
473 /* Enable sorted LRO and set the depth of the per-channel mbuf queue */
474 #if __FreeBSD_version >= 1100095
475 static u_int			hn_lro_mbufq_depth = 0;
476 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
477     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
478 #endif
479 
480 /* Packet transmission aggregation size limit */
481 static int			hn_tx_agg_size = -1;
482 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
483     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
484 
485 /* Packet transmission aggregation count limit */
486 static int			hn_tx_agg_pkts = -1;
487 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
488     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
489 
490 static u_int			hn_cpu_index;	/* next CPU for channel */
491 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
492 
493 #ifndef RSS
494 static const uint8_t
495 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
496 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
497 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
498 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
499 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
500 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
501 };
502 #endif	/* !RSS */
503 
504 static device_method_t hn_methods[] = {
505 	/* Device interface */
506 	DEVMETHOD(device_probe,		hn_probe),
507 	DEVMETHOD(device_attach,	hn_attach),
508 	DEVMETHOD(device_detach,	hn_detach),
509 	DEVMETHOD(device_shutdown,	hn_shutdown),
510 	DEVMETHOD_END
511 };
512 
513 static driver_t hn_driver = {
514 	"hn",
515 	hn_methods,
516 	sizeof(struct hn_softc)
517 };
518 
519 static devclass_t hn_devclass;
520 
521 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
522 MODULE_VERSION(hn, 1);
523 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
524 
525 #if __FreeBSD_version >= 1100099
526 static void
527 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
528 {
529 	int i;
530 
531 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
532 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
533 }
534 #endif
535 
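/*
 * Send the RNDIS packet described by the txdesc's gather list (GPA array)
 * to the host through NVS; used when the packet has not been copied into
 * a chimney sending buffer.
 */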
536 static int
537 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
538 {
539 
540 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
541 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
542 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
543 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
544 }
545 
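/*
 * Send an RNDIS packet that has been copied into a chimney sending
 * buffer; only the chimney buffer index and size are passed to the host.
 */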
546 static int
547 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
548 {
549 	struct hn_nvs_rndis rndis;
550 
551 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
552 	    txd->chim_size > 0, ("invalid rndis chim txd"));
553 
554 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
555 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
556 	rndis.nvs_chim_idx = txd->chim_index;
557 	rndis.nvs_chim_sz = txd->chim_size;
558 
559 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
560 	    &rndis, sizeof(rndis), &txd->send_ctx));
561 }
562 
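/*
 * Allocate one chimney sending buffer slot by scanning the allocation
 * bitmap; returns HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 * The atomic test-and-set keeps this lock-free.
 */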
563 static __inline uint32_t
564 hn_chim_alloc(struct hn_softc *sc)
565 {
566 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
567 	u_long *bmap = sc->hn_chim_bmap;
568 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
569 
570 	for (i = 0; i < bmap_cnt; ++i) {
571 		int idx;
572 
573 		idx = ffsl(~bmap[i]);
574 		if (idx == 0)
575 			continue;
576 
577 		--idx; /* ffsl is 1-based */
578 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
579 		    ("invalid i %d and idx %d", i, idx));
580 
581 		if (atomic_testandset_long(&bmap[i], idx))
582 			continue;
583 
584 		ret = i * LONG_BIT + idx;
585 		break;
586 	}
587 	return (ret);
588 }
589 
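/*
 * Return a chimney sending buffer slot to the allocation bitmap.
 */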
590 static __inline void
591 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
592 {
593 	u_long mask;
594 	uint32_t idx;
595 
596 	idx = chim_idx / LONG_BIT;
597 	KASSERT(idx < sc->hn_chim_bmap_cnt,
598 	    ("invalid chimney index 0x%x", chim_idx));
599 
600 	mask = 1UL << (chim_idx % LONG_BIT);
601 	KASSERT(sc->hn_chim_bmap[idx] & mask,
602 	    ("index bitmap 0x%lx, chimney index %u, "
603 	     "bitmap idx %d, bitmask 0x%lx",
604 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
605 
606 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
607 }
608 
609 #if defined(INET6) || defined(INET)
610 /*
611  * NOTE: If this function fails, the m_head will be freed.
612  */
613 static __inline struct mbuf *
614 hn_tso_fixup(struct mbuf *m_head)
615 {
616 	struct ether_vlan_header *evl;
617 	struct tcphdr *th;
618 	int ehlen;
619 
620 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
621 
622 #define PULLUP_HDR(m, len)				\
623 do {							\
624 	if (__predict_false((m)->m_len < (len))) {	\
625 		(m) = m_pullup((m), (len));		\
626 		if ((m) == NULL)			\
627 			return (NULL);			\
628 	}						\
629 } while (0)
630 
631 	PULLUP_HDR(m_head, sizeof(*evl));
632 	evl = mtod(m_head, struct ether_vlan_header *);
633 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
634 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
635 	else
636 		ehlen = ETHER_HDR_LEN;
637 
638 #ifdef INET
639 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
640 		struct ip *ip;
641 		int iphlen;
642 
643 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
644 		ip = mtodo(m_head, ehlen);
645 		iphlen = ip->ip_hl << 2;
646 
647 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
648 		th = mtodo(m_head, ehlen + iphlen);
649 
650 		ip->ip_len = 0;
651 		ip->ip_sum = 0;
652 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
653 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
654 	}
655 #endif
656 #if defined(INET6) && defined(INET)
657 	else
658 #endif
659 #ifdef INET6
660 	{
661 		struct ip6_hdr *ip6;
662 
663 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
664 		ip6 = mtodo(m_head, ehlen);
665 		if (ip6->ip6_nxt != IPPROTO_TCP) {
666 			m_freem(m_head);
667 			return (NULL);
668 		}
669 
670 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
671 		th = mtodo(m_head, ehlen + sizeof(*ip6));
672 
673 		ip6->ip6_plen = 0;
674 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
675 	}
676 #endif
677 	return (m_head);
678 
679 #undef PULLUP_HDR
680 }
681 #endif	/* INET6 || INET */
682 
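/*
 * Program the RNDIS RX filter according to the current ifnet flags
 * (promiscuous, broadcast, multicast).
 */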
683 static int
684 hn_set_rxfilter(struct hn_softc *sc)
685 {
686 	struct ifnet *ifp = sc->hn_ifp;
687 	uint32_t filter;
688 	int error = 0;
689 
690 	HN_LOCK_ASSERT(sc);
691 
692 	if (ifp->if_flags & IFF_PROMISC) {
693 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
694 	} else {
695 		filter = NDIS_PACKET_TYPE_DIRECTED;
696 		if (ifp->if_flags & IFF_BROADCAST)
697 			filter |= NDIS_PACKET_TYPE_BROADCAST;
698 		/* TODO: support multicast list */
699 		if ((ifp->if_flags & IFF_ALLMULTI) ||
700 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
701 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
702 	}
703 
704 	if (sc->hn_rx_filter != filter) {
705 		error = hn_rndis_set_rxfilter(sc, filter);
706 		if (!error)
707 			sc->hn_rx_filter = filter;
708 	}
709 	return (error);
710 }
711 
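/*
 * Compute the TX aggregation limits from the administrative settings and
 * the limits offered by RNDIS, then propagate the results to all TX
 * rings.  Aggregation is disabled if the usable size or packet count is
 * too small.
 */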
712 static void
713 hn_set_txagg(struct hn_softc *sc)
714 {
715 	uint32_t size, pkts;
716 	int i;
717 
718 	/*
719 	 * Setup aggregation size.
720 	 */
721 	if (sc->hn_agg_size < 0)
722 		size = UINT32_MAX;
723 	else
724 		size = sc->hn_agg_size;
725 
726 	if (sc->hn_rndis_agg_size < size)
727 		size = sc->hn_rndis_agg_size;
728 
729 	/* NOTE: We only aggregate packets using chimney sending buffers. */
730 	if (size > (uint32_t)sc->hn_chim_szmax)
731 		size = sc->hn_chim_szmax;
732 
733 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
734 		/* Disable */
735 		size = 0;
736 		pkts = 0;
737 		goto done;
738 	}
739 
740 	/* NOTE: Type of the per TX ring setting is 'int'. */
741 	if (size > INT_MAX)
742 		size = INT_MAX;
743 
744 	/*
745 	 * Setup aggregation packet count.
746 	 */
747 	if (sc->hn_agg_pkts < 0)
748 		pkts = UINT32_MAX;
749 	else
750 		pkts = sc->hn_agg_pkts;
751 
752 	if (sc->hn_rndis_agg_pkts < pkts)
753 		pkts = sc->hn_rndis_agg_pkts;
754 
755 	if (pkts <= 1) {
756 		/* Disable */
757 		size = 0;
758 		pkts = 0;
759 		goto done;
760 	}
761 
762 	/* NOTE: Type of the per TX ring setting is 'short'. */
763 	if (pkts > SHRT_MAX)
764 		pkts = SHRT_MAX;
765 
766 done:
767 	/* NOTE: Type of the per TX ring setting is 'short'. */
768 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
769 		/* Disable */
770 		size = 0;
771 		pkts = 0;
772 	}
773 
774 	if (bootverbose) {
775 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
776 		    size, pkts, sc->hn_rndis_agg_align);
777 	}
778 
779 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
780 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
781 
782 		mtx_lock(&txr->hn_tx_lock);
783 		txr->hn_agg_szmax = size;
784 		txr->hn_agg_pktmax = pkts;
785 		txr->hn_agg_align = sc->hn_rndis_agg_align;
786 		mtx_unlock(&txr->hn_tx_lock);
787 	}
788 }
789 
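/*
 * Depth of the software TX queue (IFQ or buf_ring); never smaller than
 * the number of TX descriptors of the ring.
 */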
790 static int
791 hn_get_txswq_depth(const struct hn_tx_ring *txr)
792 {
793 
794 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
795 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
796 		return txr->hn_txdesc_cnt;
797 	return hn_tx_swq_depth;
798 }
799 
800 #ifndef RSS
801 static int
802 hn_rss_reconfig(struct hn_softc *sc)
803 {
804 	int error;
805 
806 	HN_LOCK_ASSERT(sc);
807 
808 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
809 		return (ENXIO);
810 
811 	/*
812 	 * Disable RSS first.
813 	 *
814 	 * NOTE:
815 	 * Direct reconfiguration by setting the UNCHG flags does
816 	 * _not_ work properly.
817 	 */
818 	if (bootverbose)
819 		if_printf(sc->hn_ifp, "disable RSS\n");
820 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
821 	if (error) {
822 		if_printf(sc->hn_ifp, "RSS disable failed\n");
823 		return (error);
824 	}
825 
826 	/*
827 	 * Reenable the RSS w/ the updated RSS key or indirect
828 	 * table.
829 	 */
830 	if (bootverbose)
831 		if_printf(sc->hn_ifp, "reconfig RSS\n");
832 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
833 	if (error) {
834 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
835 		return (error);
836 	}
837 	return (0);
838 }
839 #endif	/* !RSS */
840 
841 static void
842 hn_rss_ind_fixup(struct hn_softc *sc)
843 {
844 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
845 	int i, nchan;
846 
847 	nchan = sc->hn_rx_ring_inuse;
848 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
849 
850 	/*
851 	 * Check indirect table to make sure that all channels in it
852 	 * can be used.
853 	 */
854 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
855 		if (rss->rss_ind[i] >= nchan) {
856 			if_printf(sc->hn_ifp,
857 			    "RSS indirect table %d fixup: %u -> %d\n",
858 			    i, rss->rss_ind[i], nchan - 1);
859 			rss->rss_ind[i] = nchan - 1;
860 		}
861 	}
862 }
863 
864 static int
865 hn_ifmedia_upd(struct ifnet *ifp __unused)
866 {
867 
868 	return EOPNOTSUPP;
869 }
870 
871 static void
872 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
873 {
874 	struct hn_softc *sc = ifp->if_softc;
875 
876 	ifmr->ifm_status = IFM_AVALID;
877 	ifmr->ifm_active = IFM_ETHER;
878 
879 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
880 		ifmr->ifm_active |= IFM_NONE;
881 		return;
882 	}
883 	ifmr->ifm_status |= IFM_ACTIVE;
884 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
885 }
886 
887 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
888 static const struct hyperv_guid g_net_vsc_device_type = {
889 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
890 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
891 };
892 
893 static int
894 hn_probe(device_t dev)
895 {
896 
897 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
898 	    &g_net_vsc_device_type) == 0) {
899 		device_set_desc(dev, "Hyper-V Network Interface");
900 		return BUS_PROBE_DEFAULT;
901 	}
902 	return ENXIO;
903 }
904 
905 static int
906 hn_attach(device_t dev)
907 {
908 	struct hn_softc *sc = device_get_softc(dev);
909 	struct sysctl_oid_list *child;
910 	struct sysctl_ctx_list *ctx;
911 	uint8_t eaddr[ETHER_ADDR_LEN];
912 	struct ifnet *ifp = NULL;
913 	int error, ring_cnt, tx_ring_cnt;
914 
915 	sc->hn_dev = dev;
916 	sc->hn_prichan = vmbus_get_channel(dev);
917 	HN_LOCK_INIT(sc);
918 
919 	/*
920 	 * Initialize these tunables once.
921 	 */
922 	sc->hn_agg_size = hn_tx_agg_size;
923 	sc->hn_agg_pkts = hn_tx_agg_pkts;
924 
925 	/*
926 	 * Setup taskqueue for transmission.
927 	 */
928 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
929 		int i;
930 
931 		sc->hn_tx_taskqs =
932 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
933 		    M_DEVBUF, M_WAITOK);
934 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
935 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
936 			    M_WAITOK, taskqueue_thread_enqueue,
937 			    &sc->hn_tx_taskqs[i]);
938 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
939 			    "%s tx%d", device_get_nameunit(dev), i);
940 		}
941 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
942 		sc->hn_tx_taskqs = hn_tx_taskque;
943 	}
944 
945 	/*
946 	 * Setup taskqueue for management tasks, e.g. link status.
947 	 */
948 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
949 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
950 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
951 	    device_get_nameunit(dev));
952 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
953 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
954 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
955 	    hn_netchg_status_taskfunc, sc);
956 
957 	/*
958 	 * Allocate ifnet and setup its name earlier, so that if_printf
959 	 * can be used by functions that will be called after
960 	 * ether_ifattach().
961 	 */
962 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
963 	ifp->if_softc = sc;
964 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
965 
966 	/*
967 	 * Initialize ifmedia earlier so that it can be unconditionally
968 	 * destroyed if an error happens later on.
969 	 */
970 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
971 
972 	/*
973 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
974 	 * to use (tx_ring_cnt).
975 	 *
976 	 * NOTE:
977 	 * The # of RX rings to use is the same as the # of channels to use.
978 	 */
979 	ring_cnt = hn_chan_cnt;
980 	if (ring_cnt <= 0) {
981 		/* Default */
982 		ring_cnt = mp_ncpus;
983 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
984 			ring_cnt = HN_RING_CNT_DEF_MAX;
985 	} else if (ring_cnt > mp_ncpus) {
986 		ring_cnt = mp_ncpus;
987 	}
988 #ifdef RSS
989 	if (ring_cnt > rss_getnumbuckets())
990 		ring_cnt = rss_getnumbuckets();
991 #endif
992 
993 	tx_ring_cnt = hn_tx_ring_cnt;
994 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
995 		tx_ring_cnt = ring_cnt;
996 #ifdef HN_IFSTART_SUPPORT
997 	if (hn_use_if_start) {
998 		/* ifnet.if_start only needs one TX ring. */
999 		tx_ring_cnt = 1;
1000 	}
1001 #endif
1002 
1003 	/*
1004 	 * Set the leader CPU for channels.
1005 	 */
1006 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1007 
1008 	/*
1009 	 * Create enough TX/RX rings, even if only limited number of
1010 	 * channels can be allocated.
1011 	 */
1012 	error = hn_create_tx_data(sc, tx_ring_cnt);
1013 	if (error)
1014 		goto failed;
1015 	error = hn_create_rx_data(sc, ring_cnt);
1016 	if (error)
1017 		goto failed;
1018 
1019 	/*
1020 	 * Create transaction context for NVS and RNDIS transactions.
1021 	 */
1022 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1023 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1024 	if (sc->hn_xact == NULL) {
1025 		error = ENXIO;
1026 		goto failed;
1027 	}
1028 
1029 	/*
1030 	 * Install orphan handler for the revocation of this device's
1031 	 * primary channel.
1032 	 *
1033 	 * NOTE:
1034 	 * The processing order is critical here:
1035 	 * Install the orphan handler, _before_ testing whether this
1036 	 * device's primary channel has been revoked or not.
1037 	 */
1038 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1039 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1040 		error = ENXIO;
1041 		goto failed;
1042 	}
1043 
1044 	/*
1045 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1046 	 */
1047 	error = hn_synth_attach(sc, ETHERMTU);
1048 	if (error)
1049 		goto failed;
1050 
1051 	error = hn_rndis_get_eaddr(sc, eaddr);
1052 	if (error)
1053 		goto failed;
1054 
1055 #if __FreeBSD_version >= 1100099
1056 	if (sc->hn_rx_ring_inuse > 1) {
1057 		/*
1058 		 * Reduce TCP segment aggregation limit for multiple
1059 		 * RX rings to increase ACK timeliness.
1060 		 */
1061 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1062 	}
1063 #endif
1064 
1065 	/*
1066 	 * Fixup TX stuffs after synthetic parts are attached.
1067 	 */
1068 	hn_fixup_tx_data(sc);
1069 
1070 	ctx = device_get_sysctl_ctx(dev);
1071 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1072 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1073 	    &sc->hn_nvs_ver, 0, "NVS version");
1074 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1075 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1076 	    hn_ndis_version_sysctl, "A", "NDIS version");
1077 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1078 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1079 	    hn_caps_sysctl, "A", "capabilities");
1080 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1081 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1082 	    hn_hwassist_sysctl, "A", "hwassist");
1083 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1084 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1085 	    hn_rxfilter_sysctl, "A", "rxfilter");
1086 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1087 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1088 	    hn_rss_hash_sysctl, "A", "RSS hash");
1089 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1090 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1091 #ifndef RSS
1092 	/*
1093 	 * Don't allow RSS key/indirect table changes if RSS is defined.
1094 	 */
1095 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1096 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1097 	    hn_rss_key_sysctl, "IU", "RSS key");
1098 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1099 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1100 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1101 #endif
1102 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1103 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1104 	    "RNDIS offered packet transmission aggregation size limit");
1105 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1106 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1107 	    "RNDIS offered packet transmission aggregation count limit");
1108 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1109 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1110 	    "RNDIS packet transmission aggregation alignment");
1111 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1112 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1113 	    hn_txagg_size_sysctl, "I",
1114 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1115 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1116 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1117 	    hn_txagg_pkts_sysctl, "I",
1118 	    "Packet transmission aggregation packets, "
1119 	    "0 -- disable, -1 -- auto");
1120 
1121 	/*
1122 	 * Setup the ifmedia, which has been initialized earlier.
1123 	 */
1124 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1125 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1126 	/* XXX ifmedia_set really should do this for us */
1127 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1128 
1129 	/*
1130 	 * Setup the ifnet for this interface.
1131 	 */
1132 
1133 	ifp->if_baudrate = IF_Gbps(10);
1134 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1135 	ifp->if_ioctl = hn_ioctl;
1136 	ifp->if_init = hn_init;
1137 #ifdef HN_IFSTART_SUPPORT
1138 	if (hn_use_if_start) {
1139 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1140 
1141 		ifp->if_start = hn_start;
1142 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1143 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1144 		IFQ_SET_READY(&ifp->if_snd);
1145 	} else
1146 #endif
1147 	{
1148 		ifp->if_transmit = hn_transmit;
1149 		ifp->if_qflush = hn_xmit_qflush;
1150 	}
1151 
1152 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1153 #ifdef foo
1154 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1155 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1156 #endif
1157 	if (sc->hn_caps & HN_CAP_VLAN) {
1158 		/* XXX not sure about VLAN_MTU. */
1159 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1160 	}
1161 
1162 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1163 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1164 		ifp->if_capabilities |= IFCAP_TXCSUM;
1165 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1166 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1167 	if (sc->hn_caps & HN_CAP_TSO4) {
1168 		ifp->if_capabilities |= IFCAP_TSO4;
1169 		ifp->if_hwassist |= CSUM_IP_TSO;
1170 	}
1171 	if (sc->hn_caps & HN_CAP_TSO6) {
1172 		ifp->if_capabilities |= IFCAP_TSO6;
1173 		ifp->if_hwassist |= CSUM_IP6_TSO;
1174 	}
1175 
1176 	/* Enable all available capabilities by default. */
1177 	ifp->if_capenable = ifp->if_capabilities;
1178 
1179 	/*
1180 	 * Disable IPv6 TSO and TXCSUM by default, they still can
1181 	 * be enabled through SIOCSIFCAP.
1182 	 */
1183 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1184 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1185 
1186 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1187 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1188 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1189 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1190 	}
1191 
1192 	ether_ifattach(ifp, eaddr);
1193 
1194 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1195 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1196 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1197 	}
1198 
1199 	/* Inform the upper layer about the long frame support. */
1200 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1201 
1202 	/*
1203 	 * Kick off link status check.
1204 	 */
1205 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1206 	hn_update_link_status(sc);
1207 
1208 	return (0);
1209 failed:
1210 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1211 		hn_synth_detach(sc);
1212 	hn_detach(dev);
1213 	return (error);
1214 }
1215 
1216 static int
1217 hn_detach(device_t dev)
1218 {
1219 	struct hn_softc *sc = device_get_softc(dev);
1220 	struct ifnet *ifp = sc->hn_ifp;
1221 
1222 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1223 		/*
1224 		 * In case the vmbus missed the orphan handler
1225 		 * installation.
1226 		 */
1227 		vmbus_xact_ctx_orphan(sc->hn_xact);
1228 	}
1229 
1230 	if (device_is_attached(dev)) {
1231 		HN_LOCK(sc);
1232 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1233 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1234 				hn_stop(sc);
1235 			/*
1236 			 * NOTE:
1237 			 * hn_stop() only suspends data, so management
1238 			 * stuff has to be suspended manually here.
1239 			 */
1240 			hn_suspend_mgmt(sc);
1241 			hn_synth_detach(sc);
1242 		}
1243 		HN_UNLOCK(sc);
1244 		ether_ifdetach(ifp);
1245 	}
1246 
1247 	ifmedia_removeall(&sc->hn_media);
1248 	hn_destroy_rx_data(sc);
1249 	hn_destroy_tx_data(sc);
1250 
1251 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1252 		int i;
1253 
1254 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1255 			taskqueue_free(sc->hn_tx_taskqs[i]);
1256 		free(sc->hn_tx_taskqs, M_DEVBUF);
1257 	}
1258 	taskqueue_free(sc->hn_mgmt_taskq0);
1259 
1260 	if (sc->hn_xact != NULL) {
1261 		/*
1262 		 * Uninstall the orphan handler _before_ the xact is
1263 		 * destructed.
1264 		 */
1265 		vmbus_chan_unset_orphan(sc->hn_prichan);
1266 		vmbus_xact_ctx_destroy(sc->hn_xact);
1267 	}
1268 
1269 	if_free(ifp);
1270 
1271 	HN_LOCK_DESTROY(sc);
1272 	return (0);
1273 }
1274 
1275 static int
1276 hn_shutdown(device_t dev)
1277 {
1278 
1279 	return (0);
1280 }
1281 
1282 static void
1283 hn_link_status(struct hn_softc *sc)
1284 {
1285 	uint32_t link_status;
1286 	int error;
1287 
1288 	error = hn_rndis_get_linkstatus(sc, &link_status);
1289 	if (error) {
1290 		/* XXX what to do? */
1291 		return;
1292 	}
1293 
1294 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1295 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1296 	else
1297 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1298 	if_link_state_change(sc->hn_ifp,
1299 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1300 	    LINK_STATE_UP : LINK_STATE_DOWN);
1301 }
1302 
1303 static void
1304 hn_link_taskfunc(void *xsc, int pending __unused)
1305 {
1306 	struct hn_softc *sc = xsc;
1307 
1308 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1309 		return;
1310 	hn_link_status(sc);
1311 }
1312 
1313 static void
1314 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1315 {
1316 	struct hn_softc *sc = xsc;
1317 
1318 	/* Prevent any link status checks from running. */
1319 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1320 
1321 	/*
1322 	 * Fake up a [link down --> link up] state change; a 5 second
1323 	 * delay is used, which closely simulates the miibus reaction
1324 	 * to a link down event.
1325 	 */
1326 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1327 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1328 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1329 	    &sc->hn_netchg_status, 5 * hz);
1330 }
1331 
1332 static void
1333 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1334 {
1335 	struct hn_softc *sc = xsc;
1336 
1337 	/* Re-allow link status checks. */
1338 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1339 	hn_link_status(sc);
1340 }
1341 
1342 static void
1343 hn_update_link_status(struct hn_softc *sc)
1344 {
1345 
1346 	if (sc->hn_mgmt_taskq != NULL)
1347 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1348 }
1349 
1350 static void
1351 hn_change_network(struct hn_softc *sc)
1352 {
1353 
1354 	if (sc->hn_mgmt_taskq != NULL)
1355 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1356 }
1357 
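/*
 * Load the mbuf chain into the txdesc's busdma map, collapsing the chain
 * once if it has too many segments (EFBIG).
 */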
1358 static __inline int
1359 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1360     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1361 {
1362 	struct mbuf *m = *m_head;
1363 	int error;
1364 
1365 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1366 
1367 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1368 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1369 	if (error == EFBIG) {
1370 		struct mbuf *m_new;
1371 
1372 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1373 		if (m_new == NULL)
1374 			return ENOBUFS;
1375 		else
1376 			*m_head = m = m_new;
1377 		txr->hn_tx_collapsed++;
1378 
1379 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1380 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1381 	}
1382 	if (!error) {
1383 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1384 		    BUS_DMASYNC_PREWRITE);
1385 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1386 	}
1387 	return error;
1388 }
1389 
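/*
 * Drop one reference on the txdesc.  When the last reference goes away,
 * free any aggregated txdescs, the chimney buffer slot or DMA map, and
 * the mbuf, then return the txdesc to the free list.  Returns 1 if the
 * txdesc was freed, 0 otherwise.
 */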
1390 static __inline int
1391 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1392 {
1393 
1394 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1395 	    ("put an onlist txd %#x", txd->flags));
1396 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1397 	    ("put an onagg txd %#x", txd->flags));
1398 
1399 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1400 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1401 		return 0;
1402 
1403 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1404 		struct hn_txdesc *tmp_txd;
1405 
1406 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1407 			int freed;
1408 
1409 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1410 			    ("recursive aggregation on aggregated txdesc"),
1411 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1412 			    ("not aggregated txdesc"));
1413 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1414 			    ("aggregated txdesc uses dmamap"));
1415 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1416 			    ("aggregated txdesc consumes "
1417 			     "chimney sending buffer"));
1418 			KASSERT(tmp_txd->chim_size == 0,
1419 			    ("aggregated txdesc has non-zero "
1420 			     "chimney sending size"));
1421 
1422 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1423 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1424 			freed = hn_txdesc_put(txr, tmp_txd);
1425 			KASSERT(freed, ("failed to free aggregated txdesc"));
1426 		}
1427 	}
1428 
1429 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1430 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1431 		    ("chim txd uses dmamap"));
1432 		hn_chim_free(txr->hn_sc, txd->chim_index);
1433 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1434 		txd->chim_size = 0;
1435 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1436 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1437 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1438 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1439 		    txd->data_dmap);
1440 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1441 	}
1442 
1443 	if (txd->m != NULL) {
1444 		m_freem(txd->m);
1445 		txd->m = NULL;
1446 	}
1447 
1448 	txd->flags |= HN_TXD_FLAG_ONLIST;
1449 #ifndef HN_USE_TXDESC_BUFRING
1450 	mtx_lock_spin(&txr->hn_txlist_spin);
1451 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1452 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1453 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1454 	txr->hn_txdesc_avail++;
1455 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1456 	mtx_unlock_spin(&txr->hn_txlist_spin);
1457 #else	/* HN_USE_TXDESC_BUFRING */
1458 #ifdef HN_DEBUG
1459 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1460 #endif
1461 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1462 #endif	/* !HN_USE_TXDESC_BUFRING */
1463 
1464 	return 1;
1465 }
1466 
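/*
 * Get a free txdesc from the per-ring free list or buf_ring and
 * initialize its reference count.
 */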
1467 static __inline struct hn_txdesc *
1468 hn_txdesc_get(struct hn_tx_ring *txr)
1469 {
1470 	struct hn_txdesc *txd;
1471 
1472 #ifndef HN_USE_TXDESC_BUFRING
1473 	mtx_lock_spin(&txr->hn_txlist_spin);
1474 	txd = SLIST_FIRST(&txr->hn_txlist);
1475 	if (txd != NULL) {
1476 		KASSERT(txr->hn_txdesc_avail > 0,
1477 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1478 		txr->hn_txdesc_avail--;
1479 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1480 	}
1481 	mtx_unlock_spin(&txr->hn_txlist_spin);
1482 #else
1483 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1484 #endif
1485 
1486 	if (txd != NULL) {
1487 #ifdef HN_USE_TXDESC_BUFRING
1488 #ifdef HN_DEBUG
1489 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1490 #endif
1491 #endif	/* HN_USE_TXDESC_BUFRING */
1492 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1493 		    STAILQ_EMPTY(&txd->agg_list) &&
1494 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1495 		    txd->chim_size == 0 &&
1496 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1497 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1498 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1499 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1500 		txd->refs = 1;
1501 	}
1502 	return txd;
1503 }
1504 
1505 static __inline void
1506 hn_txdesc_hold(struct hn_txdesc *txd)
1507 {
1508 
1509 	/* 0->1 transition will never work */
1510 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1511 	atomic_add_int(&txd->refs, 1);
1512 }
1513 
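/*
 * Append txd to agg_txd's aggregation list; aggregation is only one
 * level deep.
 */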
1514 static __inline void
1515 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1516 {
1517 
1518 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1519 	    ("recursive aggregation on aggregating txdesc"));
1520 
1521 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1522 	    ("already aggregated"));
1523 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1524 	    ("recursive aggregation on to-be-aggregated txdesc"));
1525 
1526 	txd->flags |= HN_TXD_FLAG_ONAGG;
1527 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1528 }
1529 
1530 static bool
1531 hn_tx_ring_pending(struct hn_tx_ring *txr)
1532 {
1533 	bool pending = false;
1534 
1535 #ifndef HN_USE_TXDESC_BUFRING
1536 	mtx_lock_spin(&txr->hn_txlist_spin);
1537 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1538 		pending = true;
1539 	mtx_unlock_spin(&txr->hn_txlist_spin);
1540 #else
1541 	if (!buf_ring_full(txr->hn_txdesc_br))
1542 		pending = true;
1543 #endif
1544 	return (pending);
1545 }
1546 
1547 static __inline void
1548 hn_txeof(struct hn_tx_ring *txr)
1549 {
1550 	txr->hn_has_txeof = 0;
1551 	txr->hn_txeof(txr);
1552 }
1553 
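/*
 * NVS send-completion callback: release the txdesc and, if the ring is
 * marked oactive and enough completions have accumulated, run the TX
 * completion processing early.
 */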
1554 static void
1555 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1556     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1557 {
1558 	struct hn_txdesc *txd = sndc->hn_cbarg;
1559 	struct hn_tx_ring *txr;
1560 
1561 	txr = txd->txr;
1562 	KASSERT(txr->hn_chan == chan,
1563 	    ("channel mismatch, on chan%u, should be chan%u",
1564 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1565 
1566 	txr->hn_has_txeof = 1;
1567 	hn_txdesc_put(txr, txd);
1568 
1569 	++txr->hn_txdone_cnt;
1570 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1571 		txr->hn_txdone_cnt = 0;
1572 		if (txr->hn_oactive)
1573 			hn_txeof(txr);
1574 	}
1575 }
1576 
1577 static void
1578 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1579 {
1580 #if defined(INET) || defined(INET6)
1581 	tcp_lro_flush_all(&rxr->hn_lro);
1582 #endif
1583 
1584 	/*
1585 	 * NOTE:
1586 	 * 'txr' could be NULL if multiple channels and the
1587 	 * ifnet.if_start method are enabled.
1588 	 */
1589 	if (txr == NULL || !txr->hn_has_txeof)
1590 		return;
1591 
1592 	txr->hn_txdone_cnt = 0;
1593 	hn_txeof(txr);
1594 }
1595 
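/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into the RNDIS convention, where offsets are counted from the
 * rm_dataoffset field.
 */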
1596 static __inline uint32_t
1597 hn_rndis_pktmsg_offset(uint32_t ofs)
1598 {
1599 
1600 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1601 	    ("invalid RNDIS packet msg offset %u", ofs));
1602 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1603 }
1604 
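/*
 * Reserve space for a per-packet-info record of the given type at the
 * end of the RNDIS packet message and return a pointer to its data area;
 * the caller fills in the actual value.
 */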
1605 static __inline void *
1606 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1607     size_t pi_dlen, uint32_t pi_type)
1608 {
1609 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1610 	struct rndis_pktinfo *pi;
1611 
1612 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1613 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1614 
1615 	/*
1616 	 * Per-packet-info does not move; it only grows.
1617 	 *
1618 	 * NOTE:
1619 	 * rm_pktinfooffset in this phase counts from the beginning
1620 	 * of rndis_packet_msg.
1621 	 */
1622 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1623 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1624 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1625 	    pkt->rm_pktinfolen);
1626 	pkt->rm_pktinfolen += pi_size;
1627 
1628 	pi->rm_size = pi_size;
1629 	pi->rm_type = pi_type;
1630 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1631 
1632 	/* Data immediately follows the per-packet-info. */
1633 	pkt->rm_dataoffset += pi_size;
1634 
1635 	/* Update RNDIS packet msg length */
1636 	pkt->rm_len += pi_size;
1637 
1638 	return (pi->rm_data);
1639 }
1640 
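/*
 * Send the currently aggregating txdesc and reset the aggregation state
 * of the TX ring.
 */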
1641 static __inline int
1642 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1643 {
1644 	struct hn_txdesc *txd;
1645 	struct mbuf *m;
1646 	int error, pkts;
1647 
1648 	txd = txr->hn_agg_txd;
1649 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1650 
1651 	/*
1652 	 * Since hn_txpkt() will reset this temporary stat, save
1653 	 * it now, so that oerrors can be updated properly, if
1654 	 * hn_txpkt() ever fails.
1655 	 */
1656 	pkts = txr->hn_stat_pkts;
1657 
1658 	/*
1659 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1660 	 * failure, save it for later freeing, if hn_txpkt() ever
1661 	 * fails.
1662 	 */
1663 	m = txd->m;
1664 	error = hn_txpkt(ifp, txr, txd);
1665 	if (__predict_false(error)) {
1666 		/* txd is freed, but m is not. */
1667 		m_freem(m);
1668 
1669 		txr->hn_flush_failed++;
1670 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1671 	}
1672 
1673 	/* Reset all aggregation states. */
1674 	txr->hn_agg_txd = NULL;
1675 	txr->hn_agg_szleft = 0;
1676 	txr->hn_agg_pktleft = 0;
1677 	txr->hn_agg_prevpkt = NULL;
1678 
1679 	return (error);
1680 }
1681 
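/*
 * Try to append the packet to the TX ring's pending aggregation; if that
 * is not possible, flush the aggregation, allocate a chimney sending
 * buffer, and possibly start a new aggregation.  Returns the location
 * where the RNDIS packet should be built, or NULL if no chimney buffer
 * is available (the caller then falls back to the sglist path).
 */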
1682 static void *
1683 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1684     int pktsize)
1685 {
1686 	void *chim;
1687 
1688 	if (txr->hn_agg_txd != NULL) {
1689 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1690 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1691 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1692 			int olen;
1693 
1694 			/*
1695 			 * Update the previous RNDIS packet's total length;
1696 			 * it can be increased due to the mandatory alignment
1697 			 * padding for this RNDIS packet.  Also update the
1698 			 * aggregating txdesc's chimney sending buffer size
1699 			 * accordingly.
1700 			 *
1701 			 * XXX
1702 			 * Zero-out the padding, as required by the RNDIS spec.
1703 			 */
1704 			olen = pkt->rm_len;
1705 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1706 			agg_txd->chim_size += pkt->rm_len - olen;
1707 
1708 			/* Link this txdesc to the parent. */
1709 			hn_txdesc_agg(agg_txd, txd);
1710 
1711 			chim = (uint8_t *)pkt + pkt->rm_len;
1712 			/* Save the current packet for later fixup. */
1713 			txr->hn_agg_prevpkt = chim;
1714 
1715 			txr->hn_agg_pktleft--;
1716 			txr->hn_agg_szleft -= pktsize;
1717 			if (txr->hn_agg_szleft <=
1718 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1719 				/*
1720 				 * Probably can't aggregate more packets,
1721 				 * flush this aggregating txdesc proactively.
1722 				 */
1723 				txr->hn_agg_pktleft = 0;
1724 			}
1725 			/* Done! */
1726 			return (chim);
1727 		}
1728 		hn_flush_txagg(ifp, txr);
1729 	}
1730 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1731 
1732 	txr->hn_tx_chimney_tried++;
1733 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1734 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1735 		return (NULL);
1736 	txr->hn_tx_chimney++;
1737 
1738 	chim = txr->hn_sc->hn_chim +
1739 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1740 
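	/*
	 * Start aggregation on this txdesc, if aggregation is enabled
	 * and the remaining chimney space can hold at least one more
	 * minimum-sized packet.
	 */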
1741 	if (txr->hn_agg_pktmax > 1 &&
1742 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1743 		txr->hn_agg_txd = txd;
1744 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1745 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1746 		txr->hn_agg_prevpkt = chim;
1747 	}
1748 	return (chim);
1749 }
1750 
1751 /*
1752  * NOTE:
1753  * If this function fails, then both txd and m_head0 will be freed.
1754  */
1755 static int
1756 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1757     struct mbuf **m_head0)
1758 {
1759 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1760 	int error, nsegs, i;
1761 	struct mbuf *m_head = *m_head0;
1762 	struct rndis_packet_msg *pkt;
1763 	uint32_t *pi_data;
1764 	void *chim = NULL;
1765 	int pkt_hlen, pkt_size;
1766 
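	/*
	 * If the packet is small enough for chimney sending, try to
	 * place the RNDIS packet message directly in the chimney
	 * buffer (possibly aggregated with pending packets); otherwise
	 * flush any pending aggregation and use the txdesc's own
	 * RNDIS packet buffer.
	 */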
1767 	pkt = txd->rndis_pkt;
1768 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1769 	if (pkt_size < txr->hn_chim_size) {
1770 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1771 		if (chim != NULL)
1772 			pkt = chim;
1773 	} else {
1774 		if (txr->hn_agg_txd != NULL)
1775 			hn_flush_txagg(ifp, txr);
1776 	}
1777 
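	/* Fill in the RNDIS packet message header. */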
1778 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1779 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1780 	pkt->rm_dataoffset = sizeof(*pkt);
1781 	pkt->rm_datalen = m_head->m_pkthdr.len;
1782 	pkt->rm_oobdataoffset = 0;
1783 	pkt->rm_oobdatalen = 0;
1784 	pkt->rm_oobdataelements = 0;
1785 	pkt->rm_pktinfooffset = sizeof(*pkt);
1786 	pkt->rm_pktinfolen = 0;
1787 	pkt->rm_vchandle = 0;
1788 	pkt->rm_reserved = 0;
1789 
1790 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1791 		/*
1792 		 * Set the hash value for this packet, so that the host could
1793 		 * dispatch the TX done event for this packet back to this TX
1794 		 * ring's channel.
1795 		 */
1796 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1797 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1798 		*pi_data = txr->hn_tx_idx;
1799 	}
1800 
1801 	if (m_head->m_flags & M_VLANTAG) {
1802 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1803 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1804 		*pi_data = NDIS_VLAN_INFO_MAKE(
1805 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1806 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1807 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1808 	}
1809 
1810 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1811 #if defined(INET6) || defined(INET)
1812 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1813 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1814 #ifdef INET
1815 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1816 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1817 			    m_head->m_pkthdr.tso_segsz);
1818 		}
1819 #endif
1820 #if defined(INET6) && defined(INET)
1821 		else
1822 #endif
1823 #ifdef INET6
1824 		{
1825 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1826 			    m_head->m_pkthdr.tso_segsz);
1827 		}
1828 #endif
1829 #endif	/* INET6 || INET */
1830 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1831 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1832 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1833 		if (m_head->m_pkthdr.csum_flags &
1834 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1835 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1836 		} else {
1837 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1838 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1839 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1840 		}
1841 
1842 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1843 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1844 		else if (m_head->m_pkthdr.csum_flags &
1845 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1846 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1847 	}
1848 
1849 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1850 	/* Convert RNDIS packet message offsets */
1851 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1852 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1853 
1854 	/*
1855 	 * Fast path: Chimney sending.
1856 	 */
1857 	if (chim != NULL) {
1858 		struct hn_txdesc *tgt_txd = txd;
1859 
1860 		if (txr->hn_agg_txd != NULL) {
1861 			tgt_txd = txr->hn_agg_txd;
1862 #ifdef INVARIANTS
1863 			*m_head0 = NULL;
1864 #endif
1865 		}
1866 
1867 		KASSERT(pkt == chim,
1868 		    ("RNDIS pkt not in chimney sending buffer"));
1869 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1870 		    ("chimney sending buffer is not used"));
1871 		tgt_txd->chim_size += pkt->rm_len;
1872 
1873 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
1874 		    ((uint8_t *)chim) + pkt_hlen);
1875 
1876 		txr->hn_gpa_cnt = 0;
1877 		txr->hn_sendpkt = hn_txpkt_chim;
1878 		goto done;
1879 	}
1880 
1881 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1882 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1883 	    ("chimney buffer is used"));
1884 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1885 
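	/*
	 * Slow path: load the mbuf chain into the data DMA map and
	 * send the packet via the scatter-gather (GPA) list.
	 */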
1886 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1887 	if (__predict_false(error)) {
1888 		int freed;
1889 
1890 		/*
1891 		 * This mbuf is not linked w/ the txd yet, so free it now.
1892 		 */
1893 		m_freem(m_head);
1894 		*m_head0 = NULL;
1895 
1896 		freed = hn_txdesc_put(txr, txd);
1897 		KASSERT(freed != 0,
1898 		    ("fail to free txd upon txdma error"));
1899 
1900 		txr->hn_txdma_failed++;
1901 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1902 		return error;
1903 	}
1904 	*m_head0 = m_head;
1905 
1906 	/* +1 RNDIS packet message */
1907 	txr->hn_gpa_cnt = nsegs + 1;
1908 
1909 	/* send packet with page buffer */
1910 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1911 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1912 	txr->hn_gpa[0].gpa_len = pkt_hlen;
1913 
1914 	/*
1915 	 * Fill the page buffers with mbuf info after the page
1916 	 * buffer for RNDIS packet message.
1917 	 */
1918 	for (i = 0; i < nsegs; ++i) {
1919 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1920 
1921 		gpa->gpa_page = atop(segs[i].ds_addr);
1922 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1923 		gpa->gpa_len = segs[i].ds_len;
1924 	}
1925 
1926 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1927 	txd->chim_size = 0;
1928 	txr->hn_sendpkt = hn_txpkt_sglist;
1929 done:
1930 	txd->m = m_head;
1931 
1932 	/* Set the completion routine */
1933 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1934 
1935 	/* Update temporary stats for later use. */
1936 	txr->hn_stat_pkts++;
1937 	txr->hn_stat_size += m_head->m_pkthdr.len;
1938 	if (m_head->m_flags & M_MCAST)
1939 		txr->hn_stat_mcasts++;
1940 
1941 	return 0;
1942 }
1943 
1944 /*
1945  * NOTE:
1946  * If this function fails, then txd will be freed, but the mbuf
1947  * associated w/ the txd will _not_ be freed.
1948  */
1949 static int
1950 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1951 {
1952 	int error, send_failed = 0, has_bpf;
1953 
1954 again:
1955 	has_bpf = bpf_peers_present(ifp->if_bpf);
1956 	if (has_bpf) {
1957 		/*
1958 		 * Make sure that this txd and any aggregated txds are not
1959 		 * freed before ETHER_BPF_MTAP.
1960 		 */
1961 		hn_txdesc_hold(txd);
1962 	}
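	/* Send the packet using the method selected by hn_encap(). */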
1963 	error = txr->hn_sendpkt(txr, txd);
1964 	if (!error) {
1965 		if (has_bpf) {
1966 			const struct hn_txdesc *tmp_txd;
1967 
1968 			ETHER_BPF_MTAP(ifp, txd->m);
1969 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1970 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
1971 		}
1972 
1973 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1974 #ifdef HN_IFSTART_SUPPORT
1975 		if (!hn_use_if_start)
1976 #endif
1977 		{
1978 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1979 			    txr->hn_stat_size);
1980 			if (txr->hn_stat_mcasts != 0) {
1981 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1982 				    txr->hn_stat_mcasts);
1983 			}
1984 		}
1985 		txr->hn_pkts += txr->hn_stat_pkts;
1986 		txr->hn_sends++;
1987 	}
1988 	if (has_bpf)
1989 		hn_txdesc_put(txr, txd);
1990 
1991 	if (__predict_false(error)) {
1992 		int freed;
1993 
1994 		/*
1995 		 * This should "really rarely" happen.
1996 		 *
1997 		 * XXX Too many RX to be acked or too many sideband
1998 		 * commands to run?  Ask netvsc_channel_rollup()
1999 		 * to kick start later.
2000 		 */
2001 		txr->hn_has_txeof = 1;
2002 		if (!send_failed) {
2003 			txr->hn_send_failed++;
2004 			send_failed = 1;
2005 			/*
2006 			 * Try sending again after setting hn_has_txeof,
2007 			 * in case we missed the last
2008 			 * netvsc_channel_rollup().
2009 			 */
2010 			goto again;
2011 		}
2012 		if_printf(ifp, "send failed\n");
2013 
2014 		/*
2015 		 * Caller will perform further processing on the
2016 		 * associated mbuf, so don't free it in hn_txdesc_put();
2017 		 * only unload it from the DMA map in hn_txdesc_put(),
2018 		 * if it was loaded.
2019 		 */
2020 		txd->m = NULL;
2021 		freed = hn_txdesc_put(txr, txd);
2022 		KASSERT(freed != 0,
2023 		    ("fail to free txd upon send error"));
2024 
2025 		txr->hn_send_failed++;
2026 	}
2027 
2028 	/* Reset temporary stats, after this sending is done. */
2029 	txr->hn_stat_size = 0;
2030 	txr->hn_stat_pkts = 0;
2031 	txr->hn_stat_mcasts = 0;
2032 
2033 	return (error);
2034 }
2035 
2036 /*
2037  * Append the specified data to the indicated mbuf chain.
2038  * Extend the mbuf chain if the new data does not fit in
2039  * existing space.
2040  *
2041  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2042  * There should be an equivalent in the kernel mbuf code,
2043  * but there does not appear to be one yet.
2044  *
2045  * Differs from m_append() in that additional mbufs are
2046  * allocated with cluster size MJUMPAGESIZE, and filled
2047  * accordingly.
2048  *
2049  * Return 1 if able to complete the job; otherwise 0.
2050  */
2051 static int
2052 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2053 {
2054 	struct mbuf *m, *n;
2055 	int remainder, space;
2056 
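	/* Locate the last mbuf in the chain. */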
2057 	for (m = m0; m->m_next != NULL; m = m->m_next)
2058 		;
2059 	remainder = len;
2060 	space = M_TRAILINGSPACE(m);
2061 	if (space > 0) {
2062 		/*
2063 		 * Copy into available space.
2064 		 */
2065 		if (space > remainder)
2066 			space = remainder;
2067 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2068 		m->m_len += space;
2069 		cp += space;
2070 		remainder -= space;
2071 	}
2072 	while (remainder > 0) {
2073 		/*
2074 		 * Allocate a new mbuf backed by a page-sized
2075 		 * (MJUMPAGESIZE) cluster.
2076 		 */
2077 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2078 		if (n == NULL)
2079 			break;
2080 		n->m_len = min(MJUMPAGESIZE, remainder);
2081 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2082 		cp += n->m_len;
2083 		remainder -= n->m_len;
2084 		m->m_next = n;
2085 		m = n;
2086 	}
2087 	if (m0->m_flags & M_PKTHDR)
2088 		m0->m_pkthdr.len += len - remainder;
2089 
2090 	return (remainder == 0);
2091 }
2092 
2093 #if defined(INET) || defined(INET6)
2094 static __inline int
2095 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2096 {
2097 #if __FreeBSD_version >= 1100095
2098 	if (hn_lro_mbufq_depth) {
2099 		tcp_lro_queue_mbuf(lc, m);
2100 		return 0;
2101 	}
2102 #endif
2103 	return tcp_lro_rx(lc, m, 0);
2104 }
2105 #endif
2106 
2107 static int
2108 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2109     const struct hn_rxinfo *info)
2110 {
2111 	struct ifnet *ifp = rxr->hn_ifp;
2112 	struct mbuf *m_new;
2113 	int size, do_lro = 0, do_csum = 1;
2114 	int hash_type;
2115 
2116 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2117 		return (0);
2118 
2119 	/*
2120 	 * Bail out if the packet contains more data than the configured MTU.
2121 	 */
2122 	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2123 		return (0);
2124 	} else if (dlen <= MHLEN) {
2125 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2126 		if (m_new == NULL) {
2127 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2128 			return (0);
2129 		}
2130 		memcpy(mtod(m_new, void *), data, dlen);
2131 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2132 		rxr->hn_small_pkts++;
2133 	} else {
2134 		/*
2135 		 * Get an mbuf with a cluster.  For packets 2K or less,
2136 		 * get a standard 2K cluster.  For anything larger, get a
2137 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2138 		 * if looped around to the Hyper-V TX channel, so avoid them.
2139 		 */
2140 		size = MCLBYTES;
2141 		if (dlen > MCLBYTES) {
2142 			/* 4096 */
2143 			size = MJUMPAGESIZE;
2144 		}
2145 
2146 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2147 		if (m_new == NULL) {
2148 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2149 			return (0);
2150 		}
2151 
2152 		hv_m_append(m_new, dlen, data);
2153 	}
2154 	m_new->m_pkthdr.rcvif = ifp;
2155 
2156 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2157 		do_csum = 0;
2158 
2159 	/* receive side checksum offload */
2160 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2161 		/* IP csum offload */
2162 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2163 			m_new->m_pkthdr.csum_flags |=
2164 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2165 			rxr->hn_csum_ip++;
2166 		}
2167 
2168 		/* TCP/UDP csum offload */
2169 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2170 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2171 			m_new->m_pkthdr.csum_flags |=
2172 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2173 			m_new->m_pkthdr.csum_data = 0xffff;
2174 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2175 				rxr->hn_csum_tcp++;
2176 			else
2177 				rxr->hn_csum_udp++;
2178 		}
2179 
2180 		/*
2181 		 * XXX
2182 		 * As of this writing (Oct 28th, 2016), the host side turns
2183 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2184 		 * the do_lro setting here is actually _not_ accurate.  We
2185 		 * depend on the RSS hash type check to reset do_lro.
2186 		 */
2187 		if ((info->csum_info &
2188 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2189 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2190 			do_lro = 1;
2191 	} else {
2192 		const struct ether_header *eh;
2193 		uint16_t etype;
2194 		int hoff;
2195 
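		/*
		 * No RX checksum info from the host.  Parse the Ethernet
		 * header (skipping an optional VLAN tag) and decide from
		 * the IP protocol whether the host's verification can be
		 * trusted.
		 */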
2196 		hoff = sizeof(*eh);
2197 		if (m_new->m_len < hoff)
2198 			goto skip;
2199 		eh = mtod(m_new, struct ether_header *);
2200 		etype = ntohs(eh->ether_type);
2201 		if (etype == ETHERTYPE_VLAN) {
2202 			const struct ether_vlan_header *evl;
2203 
2204 			hoff = sizeof(*evl);
2205 			if (m_new->m_len < hoff)
2206 				goto skip;
2207 			evl = mtod(m_new, struct ether_vlan_header *);
2208 			etype = ntohs(evl->evl_proto);
2209 		}
2210 
2211 		if (etype == ETHERTYPE_IP) {
2212 			int pr;
2213 
2214 			pr = hn_check_iplen(m_new, hoff);
2215 			if (pr == IPPROTO_TCP) {
2216 				if (do_csum &&
2217 				    (rxr->hn_trust_hcsum &
2218 				     HN_TRUST_HCSUM_TCP)) {
2219 					rxr->hn_csum_trusted++;
2220 					m_new->m_pkthdr.csum_flags |=
2221 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2222 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2223 					m_new->m_pkthdr.csum_data = 0xffff;
2224 				}
2225 				do_lro = 1;
2226 			} else if (pr == IPPROTO_UDP) {
2227 				if (do_csum &&
2228 				    (rxr->hn_trust_hcsum &
2229 				     HN_TRUST_HCSUM_UDP)) {
2230 					rxr->hn_csum_trusted++;
2231 					m_new->m_pkthdr.csum_flags |=
2232 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2233 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2234 					m_new->m_pkthdr.csum_data = 0xffff;
2235 				}
2236 			} else if (pr != IPPROTO_DONE && do_csum &&
2237 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2238 				rxr->hn_csum_trusted++;
2239 				m_new->m_pkthdr.csum_flags |=
2240 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2241 			}
2242 		}
2243 	}
2244 skip:
2245 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2246 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2247 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2248 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2249 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2250 		m_new->m_flags |= M_VLANTAG;
2251 	}
2252 
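	/*
	 * Set the mbuf flowid and hash type from the RSS hash info
	 * supplied by the host, if any; otherwise fall back to the
	 * RX ring index as an opaque flowid.
	 */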
2253 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2254 		rxr->hn_rss_pkts++;
2255 		m_new->m_pkthdr.flowid = info->hash_value;
2256 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2257 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2258 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2259 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2260 
2261 			/*
2262 			 * NOTE:
2263 			 * do_lro is reset, if the hash types are not TCP
2264 			 * related.  See the comment in the above csum_flags
2265 			 * setup section.
2266 			 */
2267 			switch (type) {
2268 			case NDIS_HASH_IPV4:
2269 				hash_type = M_HASHTYPE_RSS_IPV4;
2270 				do_lro = 0;
2271 				break;
2272 
2273 			case NDIS_HASH_TCP_IPV4:
2274 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2275 				break;
2276 
2277 			case NDIS_HASH_IPV6:
2278 				hash_type = M_HASHTYPE_RSS_IPV6;
2279 				do_lro = 0;
2280 				break;
2281 
2282 			case NDIS_HASH_IPV6_EX:
2283 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2284 				do_lro = 0;
2285 				break;
2286 
2287 			case NDIS_HASH_TCP_IPV6:
2288 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2289 				break;
2290 
2291 			case NDIS_HASH_TCP_IPV6_EX:
2292 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2293 				break;
2294 			}
2295 		}
2296 	} else {
2297 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2298 		hash_type = M_HASHTYPE_OPAQUE;
2299 	}
2300 	M_HASHTYPE_SET(m_new, hash_type);
2301 
2302 	/*
2303 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2304 	 * messages (not just data messages) will trigger a response.
2305 	 */
2306 
2307 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2308 	rxr->hn_pkts++;
2309 
2310 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2311 #if defined(INET) || defined(INET6)
2312 		struct lro_ctrl *lro = &rxr->hn_lro;
2313 
2314 		if (lro->lro_cnt) {
2315 			rxr->hn_lro_tried++;
2316 			if (hn_lro_rx(lro, m_new) == 0) {
2317 				/* DONE! */
2318 				return 0;
2319 			}
2320 		}
2321 #endif
2322 	}
2323 
2324 	/* We're not holding the lock here, so don't release it */
2325 	(*ifp->if_input)(ifp, m_new);
2326 
2327 	return (0);
2328 }
2329 
2330 static int
2331 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2332 {
2333 	struct hn_softc *sc = ifp->if_softc;
2334 	struct ifreq *ifr = (struct ifreq *)data;
2335 	int mask, error = 0;
2336 
2337 	switch (cmd) {
2338 	case SIOCSIFMTU:
2339 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2340 			error = EINVAL;
2341 			break;
2342 		}
2343 
2344 		HN_LOCK(sc);
2345 
2346 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2347 			HN_UNLOCK(sc);
2348 			break;
2349 		}
2350 
2351 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2352 			/* Can't change MTU */
2353 			HN_UNLOCK(sc);
2354 			error = EOPNOTSUPP;
2355 			break;
2356 		}
2357 
2358 		if (ifp->if_mtu == ifr->ifr_mtu) {
2359 			HN_UNLOCK(sc);
2360 			break;
2361 		}
2362 
2363 		/*
2364 		 * Suspend this interface before the synthetic parts
2365 		 * are ripped.
2366 		 */
2367 		hn_suspend(sc);
2368 
2369 		/*
2370 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2371 		 */
2372 		hn_synth_detach(sc);
2373 
2374 		/*
2375 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2376 		 * with the new MTU setting.
2377 		 */
2378 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2379 		if (error) {
2380 			HN_UNLOCK(sc);
2381 			break;
2382 		}
2383 
2384 		/*
2385 		 * Commit the requested MTU, after the synthetic parts
2386 		 * have been successfully attached.
2387 		 */
2388 		ifp->if_mtu = ifr->ifr_mtu;
2389 
2390 		/*
2391 		 * Make sure that various parameters based on MTU are
2392 		 * still valid, after the MTU change.
2393 		 */
2394 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2395 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2396 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2397 #if __FreeBSD_version >= 1100099
2398 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2399 		    HN_LRO_LENLIM_MIN(ifp))
2400 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2401 #endif
2402 
2403 		/*
2404 		 * All done!  Resume the interface now.
2405 		 */
2406 		hn_resume(sc);
2407 
2408 		HN_UNLOCK(sc);
2409 		break;
2410 
2411 	case SIOCSIFFLAGS:
2412 		HN_LOCK(sc);
2413 
2414 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2415 			HN_UNLOCK(sc);
2416 			break;
2417 		}
2418 
2419 		if (ifp->if_flags & IFF_UP) {
2420 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2421 				/*
2422 				 * Caller might hold a mutex, e.g.
2423 				 * bpf; use busy-wait for the RNDIS
2424 				 * reply.
2425 				 */
2426 				HN_NO_SLEEPING(sc);
2427 				hn_set_rxfilter(sc);
2428 				HN_SLEEPING_OK(sc);
2429 			} else {
2430 				hn_init_locked(sc);
2431 			}
2432 		} else {
2433 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2434 				hn_stop(sc);
2435 		}
2436 		sc->hn_if_flags = ifp->if_flags;
2437 
2438 		HN_UNLOCK(sc);
2439 		break;
2440 
2441 	case SIOCSIFCAP:
2442 		HN_LOCK(sc);
2443 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2444 
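		/*
		 * Toggle each changed capability and keep if_hwassist in
		 * sync with the resulting TX checksum and TSO settings.
		 */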
2445 		if (mask & IFCAP_TXCSUM) {
2446 			ifp->if_capenable ^= IFCAP_TXCSUM;
2447 			if (ifp->if_capenable & IFCAP_TXCSUM)
2448 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2449 			else
2450 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2451 		}
2452 		if (mask & IFCAP_TXCSUM_IPV6) {
2453 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2454 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2455 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2456 			else
2457 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2458 		}
2459 
2460 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2461 		if (mask & IFCAP_RXCSUM)
2462 			ifp->if_capenable ^= IFCAP_RXCSUM;
2463 #ifdef foo
2464 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2465 		if (mask & IFCAP_RXCSUM_IPV6)
2466 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2467 #endif
2468 
2469 		if (mask & IFCAP_LRO)
2470 			ifp->if_capenable ^= IFCAP_LRO;
2471 
2472 		if (mask & IFCAP_TSO4) {
2473 			ifp->if_capenable ^= IFCAP_TSO4;
2474 			if (ifp->if_capenable & IFCAP_TSO4)
2475 				ifp->if_hwassist |= CSUM_IP_TSO;
2476 			else
2477 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2478 		}
2479 		if (mask & IFCAP_TSO6) {
2480 			ifp->if_capenable ^= IFCAP_TSO6;
2481 			if (ifp->if_capenable & IFCAP_TSO6)
2482 				ifp->if_hwassist |= CSUM_IP6_TSO;
2483 			else
2484 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2485 		}
2486 
2487 		HN_UNLOCK(sc);
2488 		break;
2489 
2490 	case SIOCADDMULTI:
2491 	case SIOCDELMULTI:
2492 		HN_LOCK(sc);
2493 
2494 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2495 			HN_UNLOCK(sc);
2496 			break;
2497 		}
2498 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2499 			/*
2500 			 * Multicast uses mutex; use busy-wait for
2501 			 * the RNDIS reply.
2502 			 */
2503 			HN_NO_SLEEPING(sc);
2504 			hn_set_rxfilter(sc);
2505 			HN_SLEEPING_OK(sc);
2506 		}
2507 
2508 		HN_UNLOCK(sc);
2509 		break;
2510 
2511 	case SIOCSIFMEDIA:
2512 	case SIOCGIFMEDIA:
2513 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2514 		break;
2515 
2516 	default:
2517 		error = ether_ioctl(ifp, cmd, data);
2518 		break;
2519 	}
2520 	return (error);
2521 }
2522 
2523 static void
2524 hn_stop(struct hn_softc *sc)
2525 {
2526 	struct ifnet *ifp = sc->hn_ifp;
2527 	int i;
2528 
2529 	HN_LOCK_ASSERT(sc);
2530 
2531 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2532 	    ("synthetic parts were not attached"));
2533 
2534 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2535 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2536 	hn_suspend_data(sc);
2537 
2538 	/* Clear OACTIVE bit. */
2539 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2540 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2541 		sc->hn_tx_ring[i].hn_oactive = 0;
2542 }
2543 
2544 static void
2545 hn_init_locked(struct hn_softc *sc)
2546 {
2547 	struct ifnet *ifp = sc->hn_ifp;
2548 	int i;
2549 
2550 	HN_LOCK_ASSERT(sc);
2551 
2552 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2553 		return;
2554 
2555 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2556 		return;
2557 
2558 	/* Configure RX filter */
2559 	hn_set_rxfilter(sc);
2560 
2561 	/* Clear OACTIVE bit. */
2562 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2563 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2564 		sc->hn_tx_ring[i].hn_oactive = 0;
2565 
2566 	/* Clear TX 'suspended' bit. */
2567 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2568 
2569 	/* Everything is ready; unleash! */
2570 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2571 }
2572 
2573 static void
2574 hn_init(void *xsc)
2575 {
2576 	struct hn_softc *sc = xsc;
2577 
2578 	HN_LOCK(sc);
2579 	hn_init_locked(sc);
2580 	HN_UNLOCK(sc);
2581 }
2582 
2583 #if __FreeBSD_version >= 1100099
2584 
2585 static int
2586 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2587 {
2588 	struct hn_softc *sc = arg1;
2589 	unsigned int lenlim;
2590 	int error;
2591 
2592 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2593 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2594 	if (error || req->newptr == NULL)
2595 		return error;
2596 
2597 	HN_LOCK(sc);
2598 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2599 	    lenlim > TCP_LRO_LENGTH_MAX) {
2600 		HN_UNLOCK(sc);
2601 		return EINVAL;
2602 	}
2603 	hn_set_lro_lenlim(sc, lenlim);
2604 	HN_UNLOCK(sc);
2605 
2606 	return 0;
2607 }
2608 
2609 static int
2610 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2611 {
2612 	struct hn_softc *sc = arg1;
2613 	int ackcnt, error, i;
2614 
2615 	/*
2616 	 * lro_ackcnt_lim is the append count limit;
2617 	 * +1 turns it into the aggregation limit.
2618 	 */
2619 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2620 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2621 	if (error || req->newptr == NULL)
2622 		return error;
2623 
2624 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2625 		return EINVAL;
2626 
2627 	/*
2628 	 * Convert aggregation limit back to append
2629 	 * count limit.
2630 	 */
2631 	--ackcnt;
2632 	HN_LOCK(sc);
2633 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2634 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2635 	HN_UNLOCK(sc);
2636 	return 0;
2637 }
2638 
2639 #endif
2640 
2641 static int
2642 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2643 {
2644 	struct hn_softc *sc = arg1;
2645 	int hcsum = arg2;
2646 	int on, error, i;
2647 
2648 	on = 0;
2649 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2650 		on = 1;
2651 
2652 	error = sysctl_handle_int(oidp, &on, 0, req);
2653 	if (error || req->newptr == NULL)
2654 		return error;
2655 
2656 	HN_LOCK(sc);
2657 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2658 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2659 
2660 		if (on)
2661 			rxr->hn_trust_hcsum |= hcsum;
2662 		else
2663 			rxr->hn_trust_hcsum &= ~hcsum;
2664 	}
2665 	HN_UNLOCK(sc);
2666 	return 0;
2667 }
2668 
2669 static int
2670 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2671 {
2672 	struct hn_softc *sc = arg1;
2673 	int chim_size, error;
2674 
2675 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2676 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2677 	if (error || req->newptr == NULL)
2678 		return error;
2679 
2680 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2681 		return EINVAL;
2682 
2683 	HN_LOCK(sc);
2684 	hn_set_chim_size(sc, chim_size);
2685 	HN_UNLOCK(sc);
2686 	return 0;
2687 }
2688 
2689 #if __FreeBSD_version < 1100095
2690 static int
2691 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2692 {
2693 	struct hn_softc *sc = arg1;
2694 	int ofs = arg2, i, error;
2695 	struct hn_rx_ring *rxr;
2696 	uint64_t stat;
2697 
2698 	stat = 0;
2699 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2700 		rxr = &sc->hn_rx_ring[i];
2701 		stat += *((int *)((uint8_t *)rxr + ofs));
2702 	}
2703 
2704 	error = sysctl_handle_64(oidp, &stat, 0, req);
2705 	if (error || req->newptr == NULL)
2706 		return error;
2707 
2708 	/* Zero out this stat. */
2709 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2710 		rxr = &sc->hn_rx_ring[i];
2711 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2712 	}
2713 	return 0;
2714 }
2715 #else
2716 static int
2717 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2718 {
2719 	struct hn_softc *sc = arg1;
2720 	int ofs = arg2, i, error;
2721 	struct hn_rx_ring *rxr;
2722 	uint64_t stat;
2723 
2724 	stat = 0;
2725 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2726 		rxr = &sc->hn_rx_ring[i];
2727 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2728 	}
2729 
2730 	error = sysctl_handle_64(oidp, &stat, 0, req);
2731 	if (error || req->newptr == NULL)
2732 		return error;
2733 
2734 	/* Zero out this stat. */
2735 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2736 		rxr = &sc->hn_rx_ring[i];
2737 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2738 	}
2739 	return 0;
2740 }
2741 
2742 #endif
2743 
2744 static int
2745 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2746 {
2747 	struct hn_softc *sc = arg1;
2748 	int ofs = arg2, i, error;
2749 	struct hn_rx_ring *rxr;
2750 	u_long stat;
2751 
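	/* Sum this stat across all RX rings. */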
2752 	stat = 0;
2753 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2754 		rxr = &sc->hn_rx_ring[i];
2755 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2756 	}
2757 
2758 	error = sysctl_handle_long(oidp, &stat, 0, req);
2759 	if (error || req->newptr == NULL)
2760 		return error;
2761 
2762 	/* Zero out this stat. */
2763 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2764 		rxr = &sc->hn_rx_ring[i];
2765 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2766 	}
2767 	return 0;
2768 }
2769 
2770 static int
2771 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2772 {
2773 	struct hn_softc *sc = arg1;
2774 	int ofs = arg2, i, error;
2775 	struct hn_tx_ring *txr;
2776 	u_long stat;
2777 
2778 	stat = 0;
2779 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2780 		txr = &sc->hn_tx_ring[i];
2781 		stat += *((u_long *)((uint8_t *)txr + ofs));
2782 	}
2783 
2784 	error = sysctl_handle_long(oidp, &stat, 0, req);
2785 	if (error || req->newptr == NULL)
2786 		return error;
2787 
2788 	/* Zero out this stat. */
2789 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2790 		txr = &sc->hn_tx_ring[i];
2791 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2792 	}
2793 	return 0;
2794 }
2795 
2796 static int
2797 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2798 {
2799 	struct hn_softc *sc = arg1;
2800 	int ofs = arg2, i, error, conf;
2801 	struct hn_tx_ring *txr;
2802 
2803 	txr = &sc->hn_tx_ring[0];
2804 	conf = *((int *)((uint8_t *)txr + ofs));
2805 
2806 	error = sysctl_handle_int(oidp, &conf, 0, req);
2807 	if (error || req->newptr == NULL)
2808 		return error;
2809 
2810 	HN_LOCK(sc);
2811 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2812 		txr = &sc->hn_tx_ring[i];
2813 		*((int *)((uint8_t *)txr + ofs)) = conf;
2814 	}
2815 	HN_UNLOCK(sc);
2816 
2817 	return 0;
2818 }
2819 
2820 static int
2821 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2822 {
2823 	struct hn_softc *sc = arg1;
2824 	int error, size;
2825 
2826 	size = sc->hn_agg_size;
2827 	error = sysctl_handle_int(oidp, &size, 0, req);
2828 	if (error || req->newptr == NULL)
2829 		return (error);
2830 
2831 	HN_LOCK(sc);
2832 	sc->hn_agg_size = size;
2833 	hn_set_txagg(sc);
2834 	HN_UNLOCK(sc);
2835 
2836 	return (0);
2837 }
2838 
2839 static int
2840 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2841 {
2842 	struct hn_softc *sc = arg1;
2843 	int error, pkts;
2844 
2845 	pkts = sc->hn_agg_pkts;
2846 	error = sysctl_handle_int(oidp, &pkts, 0, req);
2847 	if (error || req->newptr == NULL)
2848 		return (error);
2849 
2850 	HN_LOCK(sc);
2851 	sc->hn_agg_pkts = pkts;
2852 	hn_set_txagg(sc);
2853 	HN_UNLOCK(sc);
2854 
2855 	return (0);
2856 }
2857 
2858 static int
2859 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2860 {
2861 	struct hn_softc *sc = arg1;
2862 	int pkts;
2863 
2864 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2865 	return (sysctl_handle_int(oidp, &pkts, 0, req));
2866 }
2867 
2868 static int
2869 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2870 {
2871 	struct hn_softc *sc = arg1;
2872 	int align;
2873 
2874 	align = sc->hn_tx_ring[0].hn_agg_align;
2875 	return (sysctl_handle_int(oidp, &align, 0, req));
2876 }
2877 
2878 static int
2879 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2880 {
2881 	struct hn_softc *sc = arg1;
2882 	char verstr[16];
2883 
2884 	snprintf(verstr, sizeof(verstr), "%u.%u",
2885 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2886 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2887 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2888 }
2889 
2890 static int
2891 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2892 {
2893 	struct hn_softc *sc = arg1;
2894 	char caps_str[128];
2895 	uint32_t caps;
2896 
2897 	HN_LOCK(sc);
2898 	caps = sc->hn_caps;
2899 	HN_UNLOCK(sc);
2900 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2901 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2902 }
2903 
2904 static int
2905 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2906 {
2907 	struct hn_softc *sc = arg1;
2908 	char assist_str[128];
2909 	uint32_t hwassist;
2910 
2911 	HN_LOCK(sc);
2912 	hwassist = sc->hn_ifp->if_hwassist;
2913 	HN_UNLOCK(sc);
2914 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2915 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2916 }
2917 
2918 static int
2919 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2920 {
2921 	struct hn_softc *sc = arg1;
2922 	char filter_str[128];
2923 	uint32_t filter;
2924 
2925 	HN_LOCK(sc);
2926 	filter = sc->hn_rx_filter;
2927 	HN_UNLOCK(sc);
2928 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
2929 	    NDIS_PACKET_TYPES);
2930 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2931 }
2932 
2933 #ifndef RSS
2934 
2935 static int
2936 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2937 {
2938 	struct hn_softc *sc = arg1;
2939 	int error;
2940 
2941 	HN_LOCK(sc);
2942 
2943 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2944 	if (error || req->newptr == NULL)
2945 		goto back;
2946 
2947 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2948 	if (error)
2949 		goto back;
2950 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2951 
2952 	if (sc->hn_rx_ring_inuse > 1) {
2953 		error = hn_rss_reconfig(sc);
2954 	} else {
2955 		/* Not RSS capable, at least for now; just save the RSS key. */
2956 		error = 0;
2957 	}
2958 back:
2959 	HN_UNLOCK(sc);
2960 	return (error);
2961 }
2962 
2963 static int
2964 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2965 {
2966 	struct hn_softc *sc = arg1;
2967 	int error;
2968 
2969 	HN_LOCK(sc);
2970 
2971 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2972 	if (error || req->newptr == NULL)
2973 		goto back;
2974 
2975 	/*
2976 	 * Don't allow the RSS indirect table to be changed, if this
2977 	 * interface is not currently RSS capable.
2978 	 */
2979 	if (sc->hn_rx_ring_inuse == 1) {
2980 		error = EOPNOTSUPP;
2981 		goto back;
2982 	}
2983 
2984 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2985 	if (error)
2986 		goto back;
2987 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2988 
2989 	hn_rss_ind_fixup(sc);
2990 	error = hn_rss_reconfig(sc);
2991 back:
2992 	HN_UNLOCK(sc);
2993 	return (error);
2994 }
2995 
2996 #endif	/* !RSS */
2997 
2998 static int
2999 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3000 {
3001 	struct hn_softc *sc = arg1;
3002 	char hash_str[128];
3003 	uint32_t hash;
3004 
3005 	HN_LOCK(sc);
3006 	hash = sc->hn_rss_hash;
3007 	HN_UNLOCK(sc);
3008 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3009 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3010 }
3011 
3012 static int
3013 hn_check_iplen(const struct mbuf *m, int hoff)
3014 {
3015 	const struct ip *ip;
3016 	int len, iphlen, iplen;
3017 	const struct tcphdr *th;
3018 	int thoff;				/* TCP data offset */
3019 
3020 	len = hoff + sizeof(struct ip);
3021 
3022 	/* The packet must be at least the size of an IP header. */
3023 	if (m->m_pkthdr.len < len)
3024 		return IPPROTO_DONE;
3025 
3026 	/* The fixed IP header must reside completely in the first mbuf. */
3027 	if (m->m_len < len)
3028 		return IPPROTO_DONE;
3029 
3030 	ip = mtodo(m, hoff);
3031 
3032 	/* Bound check the packet's stated IP header length. */
3033 	iphlen = ip->ip_hl << 2;
3034 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3035 		return IPPROTO_DONE;
3036 
3037 	/* The full IP header must reside completely in the one mbuf. */
3038 	if (m->m_len < hoff + iphlen)
3039 		return IPPROTO_DONE;
3040 
3041 	iplen = ntohs(ip->ip_len);
3042 
3043 	/*
3044 	 * Check that the amount of data in the buffers is at
3045 	 * least as much as the IP header would have us expect.
3046 	 */
3047 	if (m->m_pkthdr.len < hoff + iplen)
3048 		return IPPROTO_DONE;
3049 
3050 	/*
3051 	 * Ignore IP fragments.
3052 	 */
3053 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3054 		return IPPROTO_DONE;
3055 
3056 	/*
3057 	 * The TCP/IP or UDP/IP header must be entirely contained within
3058 	 * the first fragment of a packet.
3059 	 */
3060 	switch (ip->ip_p) {
3061 	case IPPROTO_TCP:
3062 		if (iplen < iphlen + sizeof(struct tcphdr))
3063 			return IPPROTO_DONE;
3064 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3065 			return IPPROTO_DONE;
3066 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3067 		thoff = th->th_off << 2;
3068 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3069 			return IPPROTO_DONE;
3070 		if (m->m_len < hoff + iphlen + thoff)
3071 			return IPPROTO_DONE;
3072 		break;
3073 	case IPPROTO_UDP:
3074 		if (iplen < iphlen + sizeof(struct udphdr))
3075 			return IPPROTO_DONE;
3076 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3077 			return IPPROTO_DONE;
3078 		break;
3079 	default:
3080 		if (iplen < iphlen)
3081 			return IPPROTO_DONE;
3082 		break;
3083 	}
3084 	return ip->ip_p;
3085 }
3086 
3087 static int
3088 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3089 {
3090 	struct sysctl_oid_list *child;
3091 	struct sysctl_ctx_list *ctx;
3092 	device_t dev = sc->hn_dev;
3093 #if defined(INET) || defined(INET6)
3094 #if __FreeBSD_version >= 1100095
3095 	int lroent_cnt;
3096 #endif
3097 #endif
3098 	int i;
3099 
3100 	/*
3101 	 * Create RXBUF for reception.
3102 	 *
3103 	 * NOTE:
3104 	 * - It is shared by all channels.
3105 	 * - A large enough buffer is allocated; certain versions of the NVS
3106 	 *   may further limit the usable space.
3107 	 */
3108 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3109 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3110 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3111 	if (sc->hn_rxbuf == NULL) {
3112 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3113 		return (ENOMEM);
3114 	}
3115 
3116 	sc->hn_rx_ring_cnt = ring_cnt;
3117 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3118 
3119 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3120 	    M_DEVBUF, M_WAITOK | M_ZERO);
3121 
3122 #if defined(INET) || defined(INET6)
3123 #if __FreeBSD_version >= 1100095
3124 	lroent_cnt = hn_lro_entry_count;
3125 	if (lroent_cnt < TCP_LRO_ENTRIES)
3126 		lroent_cnt = TCP_LRO_ENTRIES;
3127 	if (bootverbose)
3128 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3129 #endif
3130 #endif	/* INET || INET6 */
3131 
3132 	ctx = device_get_sysctl_ctx(dev);
3133 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3134 
3135 	/* Create dev.hn.UNIT.rx sysctl tree */
3136 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3137 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3138 
3139 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3140 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3141 
3142 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3143 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3144 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3145 		if (rxr->hn_br == NULL) {
3146 			device_printf(dev, "allocate bufring failed\n");
3147 			return (ENOMEM);
3148 		}
3149 
3150 		if (hn_trust_hosttcp)
3151 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3152 		if (hn_trust_hostudp)
3153 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3154 		if (hn_trust_hostip)
3155 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3156 		rxr->hn_ifp = sc->hn_ifp;
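		/* Pair this RX ring with the TX ring of the same index, if any. */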
3157 		if (i < sc->hn_tx_ring_cnt)
3158 			rxr->hn_txr = &sc->hn_tx_ring[i];
3159 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3160 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3161 		rxr->hn_rx_idx = i;
3162 		rxr->hn_rxbuf = sc->hn_rxbuf;
3163 
3164 		/*
3165 		 * Initialize LRO.
3166 		 */
3167 #if defined(INET) || defined(INET6)
3168 #if __FreeBSD_version >= 1100095
3169 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3170 		    hn_lro_mbufq_depth);
3171 #else
3172 		tcp_lro_init(&rxr->hn_lro);
3173 		rxr->hn_lro.ifp = sc->hn_ifp;
3174 #endif
3175 #if __FreeBSD_version >= 1100099
3176 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3177 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3178 #endif
3179 #endif	/* INET || INET6 */
3180 
3181 		if (sc->hn_rx_sysctl_tree != NULL) {
3182 			char name[16];
3183 
3184 			/*
3185 			 * Create per RX ring sysctl tree:
3186 			 * dev.hn.UNIT.rx.RINGID
3187 			 */
3188 			snprintf(name, sizeof(name), "%d", i);
3189 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3190 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3191 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3192 
3193 			if (rxr->hn_rx_sysctl_tree != NULL) {
3194 				SYSCTL_ADD_ULONG(ctx,
3195 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3196 				    OID_AUTO, "packets", CTLFLAG_RW,
3197 				    &rxr->hn_pkts, "# of packets received");
3198 				SYSCTL_ADD_ULONG(ctx,
3199 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3200 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3201 				    &rxr->hn_rss_pkts,
3202 				    "# of packets w/ RSS info received");
3203 				SYSCTL_ADD_INT(ctx,
3204 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3205 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3206 				    &rxr->hn_pktbuf_len, 0,
3207 				    "Temporary channel packet buffer length");
3208 			}
3209 		}
3210 	}
3211 
3212 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3213 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3214 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3215 #if __FreeBSD_version < 1100095
3216 	    hn_rx_stat_int_sysctl,
3217 #else
3218 	    hn_rx_stat_u64_sysctl,
3219 #endif
3220 	    "LU", "LRO queued");
3221 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3222 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3223 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3224 #if __FreeBSD_version < 1100095
3225 	    hn_rx_stat_int_sysctl,
3226 #else
3227 	    hn_rx_stat_u64_sysctl,
3228 #endif
3229 	    "LU", "LRO flushed");
3230 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3231 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3232 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3233 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3234 #if __FreeBSD_version >= 1100099
3235 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3236 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3237 	    hn_lro_lenlim_sysctl, "IU",
3238 	    "Max # of data bytes to be aggregated by LRO");
3239 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3240 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3241 	    hn_lro_ackcnt_sysctl, "I",
3242 	    "Max # of ACKs to be aggregated by LRO");
3243 #endif
3244 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3245 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3246 	    hn_trust_hcsum_sysctl, "I",
3247 	    "Trust tcp segment verification on host side, "
3248 	    "when csum info is missing");
3249 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3250 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3251 	    hn_trust_hcsum_sysctl, "I",
3252 	    "Trust udp datagram verification on host side, "
3253 	    "when csum info is missing");
3254 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3255 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3256 	    hn_trust_hcsum_sysctl, "I",
3257 	    "Trust ip packet verification on host side, "
3258 	    "when csum info is missing");
3259 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3260 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3261 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3262 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3263 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3264 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3265 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3266 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3267 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3268 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3269 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3270 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3271 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3272 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3273 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3274 	    hn_rx_stat_ulong_sysctl, "LU",
3275 	    "# of packets that we trust host's csum verification");
3276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3277 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3278 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3279 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3280 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3281 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3282 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3283 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3284 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3285 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3286 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3287 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3288 
3289 	return (0);
3290 }
3291 
3292 static void
3293 hn_destroy_rx_data(struct hn_softc *sc)
3294 {
3295 	int i;
3296 
3297 	if (sc->hn_rxbuf != NULL) {
3298 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3299 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3300 		else
3301 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3302 		sc->hn_rxbuf = NULL;
3303 	}
3304 
3305 	if (sc->hn_rx_ring_cnt == 0)
3306 		return;
3307 
3308 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3309 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3310 
3311 		if (rxr->hn_br == NULL)
3312 			continue;
3313 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3314 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3315 		} else {
3316 			device_printf(sc->hn_dev,
3317 			    "%dth channel bufring is referenced\n", i);
3318 		}
3319 		rxr->hn_br = NULL;
3320 
3321 #if defined(INET) || defined(INET6)
3322 		tcp_lro_free(&rxr->hn_lro);
3323 #endif
3324 		free(rxr->hn_pktbuf, M_DEVBUF);
3325 	}
3326 	free(sc->hn_rx_ring, M_DEVBUF);
3327 	sc->hn_rx_ring = NULL;
3328 
3329 	sc->hn_rx_ring_cnt = 0;
3330 	sc->hn_rx_ring_inuse = 0;
3331 }
3332 
3333 static int
3334 hn_tx_ring_create(struct hn_softc *sc, int id)
3335 {
3336 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3337 	device_t dev = sc->hn_dev;
3338 	bus_dma_tag_t parent_dtag;
3339 	int error, i;
3340 
3341 	txr->hn_sc = sc;
3342 	txr->hn_tx_idx = id;
3343 
3344 #ifndef HN_USE_TXDESC_BUFRING
3345 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3346 #endif
3347 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3348 
3349 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3350 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3351 	    M_DEVBUF, M_WAITOK | M_ZERO);
3352 #ifndef HN_USE_TXDESC_BUFRING
3353 	SLIST_INIT(&txr->hn_txlist);
3354 #else
3355 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3356 	    M_WAITOK, &txr->hn_tx_lock);
3357 #endif
3358 
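	/*
	 * Select the taskqueue used for TX processing: either the VMBus
	 * event taskqueue for this ring's CPU, or one of the driver's
	 * own TX taskqueues.
	 */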
3359 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3360 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3361 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3362 	} else {
3363 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3364 	}
3365 
3366 #ifdef HN_IFSTART_SUPPORT
3367 	if (hn_use_if_start) {
3368 		txr->hn_txeof = hn_start_txeof;
3369 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3370 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3371 	} else
3372 #endif
3373 	{
3374 		int br_depth;
3375 
3376 		txr->hn_txeof = hn_xmit_txeof;
3377 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3378 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3379 
3380 		br_depth = hn_get_txswq_depth(txr);
3381 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3382 		    M_WAITOK, &txr->hn_tx_lock);
3383 	}
3384 
3385 	txr->hn_direct_tx_size = hn_direct_tx_size;
3386 
3387 	/*
3388 	 * Always schedule transmission instead of trying to do direct
3389 	 * transmission.  This one gives the best performance so far.
3390 	 */
3391 	txr->hn_sched_tx = 1;
3392 
3393 	parent_dtag = bus_get_dma_tag(dev);
3394 
3395 	/* DMA tag for RNDIS packet messages. */
3396 	error = bus_dma_tag_create(parent_dtag, /* parent */
3397 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3398 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3399 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3400 	    BUS_SPACE_MAXADDR,		/* highaddr */
3401 	    NULL, NULL,			/* filter, filterarg */
3402 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3403 	    1,				/* nsegments */
3404 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3405 	    0,				/* flags */
3406 	    NULL,			/* lockfunc */
3407 	    NULL,			/* lockfuncarg */
3408 	    &txr->hn_tx_rndis_dtag);
3409 	if (error) {
3410 		device_printf(dev, "failed to create rndis dmatag\n");
3411 		return error;
3412 	}
3413 
3414 	/* DMA tag for data. */
3415 	error = bus_dma_tag_create(parent_dtag, /* parent */
3416 	    1,				/* alignment */
3417 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3418 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3419 	    BUS_SPACE_MAXADDR,		/* highaddr */
3420 	    NULL, NULL,			/* filter, filterarg */
3421 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3422 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3423 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3424 	    0,				/* flags */
3425 	    NULL,			/* lockfunc */
3426 	    NULL,			/* lockfuncarg */
3427 	    &txr->hn_tx_data_dtag);
3428 	if (error) {
3429 		device_printf(dev, "failed to create data dmatag\n");
3430 		return error;
3431 	}
3432 
3433 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3434 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3435 
3436 		txd->txr = txr;
3437 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3438 		STAILQ_INIT(&txd->agg_list);
3439 
3440 		/*
3441 		 * Allocate and load RNDIS packet message.
3442 		 */
3443 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3444 		    (void **)&txd->rndis_pkt,
3445 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3446 		    &txd->rndis_pkt_dmap);
3447 		if (error) {
3448 			device_printf(dev,
3449 			    "failed to allocate rndis_packet_msg, %d\n", i);
3450 			return error;
3451 		}
3452 
3453 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3454 		    txd->rndis_pkt_dmap,
3455 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3456 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3457 		    BUS_DMA_NOWAIT);
3458 		if (error) {
3459 			device_printf(dev,
3460 			    "failed to load rndis_packet_msg, %d\n", i);
3461 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3462 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3463 			return error;
3464 		}
3465 
3466 		/* DMA map for TX data. */
3467 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3468 		    &txd->data_dmap);
3469 		if (error) {
3470 			device_printf(dev,
3471 			    "failed to allocate tx data dmamap\n");
3472 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3473 			    txd->rndis_pkt_dmap);
3474 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3475 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3476 			return error;
3477 		}
3478 
3479 		/* All set, put it to list */
3480 		txd->flags |= HN_TXD_FLAG_ONLIST;
3481 #ifndef HN_USE_TXDESC_BUFRING
3482 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3483 #else
3484 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3485 #endif
3486 	}
3487 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3488 
3489 	if (sc->hn_tx_sysctl_tree != NULL) {
3490 		struct sysctl_oid_list *child;
3491 		struct sysctl_ctx_list *ctx;
3492 		char name[16];
3493 
3494 		/*
3495 		 * Create per TX ring sysctl tree:
3496 		 * dev.hn.UNIT.tx.RINGID
3497 		 */
3498 		ctx = device_get_sysctl_ctx(dev);
3499 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3500 
3501 		snprintf(name, sizeof(name), "%d", id);
3502 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3503 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3504 
3505 		if (txr->hn_tx_sysctl_tree != NULL) {
3506 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3507 
3508 #ifdef HN_DEBUG
3509 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3510 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3511 			    "# of available TX descs");
3512 #endif
3513 #ifdef HN_IFSTART_SUPPORT
3514 			if (!hn_use_if_start)
3515 #endif
3516 			{
3517 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3518 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3519 				    "over active");
3520 			}
3521 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3522 			    CTLFLAG_RW, &txr->hn_pkts,
3523 			    "# of packets transmitted");
3524 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3525 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3526 		}
3527 	}
3528 
3529 	return 0;
3530 }
3531 
3532 static void
3533 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3534 {
3535 	struct hn_tx_ring *txr = txd->txr;
3536 
3537 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3538 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3539 
3540 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3541 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3542 	    txd->rndis_pkt_dmap);
3543 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3544 }
3545 
3546 static void
3547 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3548 {
3549 
3550 	KASSERT(txd->refs == 0 || txd->refs == 1,
3551 	    ("invalid txd refs %d", txd->refs));
3552 
3553 	/* Aggregated txds will be freed by their aggregating txd. */
3554 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3555 		int freed;
3556 
3557 		freed = hn_txdesc_put(txr, txd);
3558 		KASSERT(freed, ("can't free txdesc"));
3559 	}
3560 }
3561 
3562 static void
3563 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3564 {
3565 	int i;
3566 
3567 	if (txr->hn_txdesc == NULL)
3568 		return;
3569 
3570 	/*
3571 	 * NOTE:
3572 	 * Because the freeing of aggregated txds will be deferred
3573 	 * to the aggregating txd, two passes are used here:
3574 	 * - The first pass GCes any pending txds.  This GC is necessary,
3575 	 *   since if the channels are revoked, the hypervisor will not
3576 	 *   deliver send-done for all pending txds.
3577 	 * - The second pass frees the busdma resources, i.e. after all txds
3578 	 *   were freed.
3579 	 */
3580 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3581 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3582 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3583 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3584 
3585 	if (txr->hn_tx_data_dtag != NULL)
3586 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3587 	if (txr->hn_tx_rndis_dtag != NULL)
3588 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3589 
3590 #ifdef HN_USE_TXDESC_BUFRING
3591 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3592 #endif
3593 
3594 	free(txr->hn_txdesc, M_DEVBUF);
3595 	txr->hn_txdesc = NULL;
3596 
3597 	if (txr->hn_mbuf_br != NULL)
3598 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3599 
3600 #ifndef HN_USE_TXDESC_BUFRING
3601 	mtx_destroy(&txr->hn_txlist_spin);
3602 #endif
3603 	mtx_destroy(&txr->hn_tx_lock);
3604 }
3605 
3606 static int
3607 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3608 {
3609 	struct sysctl_oid_list *child;
3610 	struct sysctl_ctx_list *ctx;
3611 	int i;
3612 
3613 	/*
3614 	 * Create TXBUF for chimney sending.
3615 	 *
3616 	 * NOTE: It is shared by all channels.
3617 	 */
3618 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3619 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3620 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3621 	if (sc->hn_chim == NULL) {
3622 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3623 		return (ENOMEM);
3624 	}
3625 
3626 	sc->hn_tx_ring_cnt = ring_cnt;
3627 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3628 
3629 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3630 	    M_DEVBUF, M_WAITOK | M_ZERO);
3631 
3632 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3633 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3634 
3635 	/* Create dev.hn.UNIT.tx sysctl tree */
3636 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3637 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3638 
3639 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3640 		int error;
3641 
3642 		error = hn_tx_ring_create(sc, i);
3643 		if (error)
3644 			return error;
3645 	}
3646 
3647 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3648 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3649 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3650 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3651 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3652 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3653 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3654 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3655 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3656 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3657 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3658 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3659 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3660 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3661 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3662 	    hn_tx_stat_ulong_sysctl, "LU",
3663 	    "# of packet transmission aggregation flush failures");
3664 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3665 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3666 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3667 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3668 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3669 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3670 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3671 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3672 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3673 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3674 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3675 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3676 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3677 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3678 	    "# of total TX descs");
3679 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3680 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3681 	    "Chimney send packet size upper boundary");
3682 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3683 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3684 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3685 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3686 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3687 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3688 	    hn_tx_conf_int_sysctl, "I",
3689 	    "Size of the packet for direct transmission");
3690 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3691 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3692 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3693 	    hn_tx_conf_int_sysctl, "I",
3694 	    "Always schedule transmission "
3695 	    "instead of doing direct transmission");
3696 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3697 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3698 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3699 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3700 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3701 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3702 	    "Applied packet transmission aggregation size");
3703 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3704 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3705 	    hn_txagg_pktmax_sysctl, "I",
3706 	    "Applied packet transmission aggregation packets");
3707 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3708 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3709 	    hn_txagg_align_sysctl, "I",
3710 	    "Applied packet transmission aggregation alignment");
3711 
3712 	return 0;
3713 }
3714 
3715 static void
3716 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3717 {
3718 	int i;
3719 
3720 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3721 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3722 }
3723 
3724 static void
3725 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3726 {
3727 	struct ifnet *ifp = sc->hn_ifp;
3728 	int tso_minlen;
3729 
3730 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3731 		return;
3732 
3733 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3734 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3735 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3736 
3737 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3738 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3739 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3740 
3741 	if (tso_maxlen < tso_minlen)
3742 		tso_maxlen = tso_minlen;
3743 	else if (tso_maxlen > IP_MAXPACKET)
3744 		tso_maxlen = IP_MAXPACKET;
3745 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3746 		tso_maxlen = sc->hn_ndis_tso_szmax;
3747 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3748 	if (bootverbose)
3749 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3750 }
3751 
3752 static void
3753 hn_fixup_tx_data(struct hn_softc *sc)
3754 {
3755 	uint64_t csum_assist;
3756 	int i;
3757 
3758 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3759 	if (hn_tx_chimney_size > 0 &&
3760 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3761 		hn_set_chim_size(sc, hn_tx_chimney_size);
3762 
3763 	csum_assist = 0;
3764 	if (sc->hn_caps & HN_CAP_IPCS)
3765 		csum_assist |= CSUM_IP;
3766 	if (sc->hn_caps & HN_CAP_TCP4CS)
3767 		csum_assist |= CSUM_IP_TCP;
3768 	if (sc->hn_caps & HN_CAP_UDP4CS)
3769 		csum_assist |= CSUM_IP_UDP;
3770 	if (sc->hn_caps & HN_CAP_TCP6CS)
3771 		csum_assist |= CSUM_IP6_TCP;
3772 	if (sc->hn_caps & HN_CAP_UDP6CS)
3773 		csum_assist |= CSUM_IP6_UDP;
3774 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3775 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3776 
3777 	if (sc->hn_caps & HN_CAP_HASHVAL) {
3778 		/*
3779 		 * Support HASHVAL pktinfo on TX path.
3780 		 */
3781 		if (bootverbose)
3782 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3783 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3784 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3785 	}
3786 }
3787 
3788 static void
3789 hn_destroy_tx_data(struct hn_softc *sc)
3790 {
3791 	int i;
3792 
3793 	if (sc->hn_chim != NULL) {
3794 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3795 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3796 		} else {
3797 			device_printf(sc->hn_dev,
3798 			    "chimney sending buffer is referenced\n");
3799 		}
3800 		sc->hn_chim = NULL;
3801 	}
3802 
3803 	if (sc->hn_tx_ring_cnt == 0)
3804 		return;
3805 
3806 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3807 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3808 
3809 	free(sc->hn_tx_ring, M_DEVBUF);
3810 	sc->hn_tx_ring = NULL;
3811 
3812 	sc->hn_tx_ring_cnt = 0;
3813 	sc->hn_tx_ring_inuse = 0;
3814 }
3815 
3816 #ifdef HN_IFSTART_SUPPORT
3817 
3818 static void
3819 hn_start_taskfunc(void *xtxr, int pending __unused)
3820 {
3821 	struct hn_tx_ring *txr = xtxr;
3822 
3823 	mtx_lock(&txr->hn_tx_lock);
3824 	hn_start_locked(txr, 0);
3825 	mtx_unlock(&txr->hn_tx_lock);
3826 }
3827 
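/*
 * Transmit packets dequeued from the interface's if_snd queue.
 *
 * If 'len' > 0 and a dequeued packet is longer than 'len', the packet
 * is prepended back to if_snd and non-zero is returned, so that the
 * caller can dispatch the remaining transmission to the TX taskqueue.
 */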
3828 static int
3829 hn_start_locked(struct hn_tx_ring *txr, int len)
3830 {
3831 	struct hn_softc *sc = txr->hn_sc;
3832 	struct ifnet *ifp = sc->hn_ifp;
3833 	int sched = 0;
3834 
3835 	KASSERT(hn_use_if_start,
3836 	    ("hn_start_locked is called when if_start is disabled"));
3837 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3838 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3839 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3840 
3841 	if (__predict_false(txr->hn_suspended))
3842 		return (0);
3843 
3844 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3845 	    IFF_DRV_RUNNING)
3846 		return (0);
3847 
3848 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3849 		struct hn_txdesc *txd;
3850 		struct mbuf *m_head;
3851 		int error;
3852 
3853 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3854 		if (m_head == NULL)
3855 			break;
3856 
3857 		if (len > 0 && m_head->m_pkthdr.len > len) {
3858 			/*
3859 			 * Sending this packet could be time consuming; let
3860 			 * callers dispatch this packet (and any follow-up
3861 			 * packets) to the TX taskqueue.
3862 			 */
3863 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3864 			sched = 1;
3865 			break;
3866 		}
3867 
3868 #if defined(INET6) || defined(INET)
3869 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3870 			m_head = hn_tso_fixup(m_head);
3871 			if (__predict_false(m_head == NULL)) {
3872 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3873 				continue;
3874 			}
3875 		}
3876 #endif
3877 
3878 		txd = hn_txdesc_get(txr);
3879 		if (txd == NULL) {
3880 			txr->hn_no_txdescs++;
3881 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3882 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3883 			break;
3884 		}
3885 
3886 		error = hn_encap(ifp, txr, txd, &m_head);
3887 		if (error) {
3888 			/* Both txd and m_head are freed */
3889 			KASSERT(txr->hn_agg_txd == NULL,
3890 			    ("encap failed w/ pending aggregating txdesc"));
3891 			continue;
3892 		}
3893 
3894 		if (txr->hn_agg_pktleft == 0) {
3895 			if (txr->hn_agg_txd != NULL) {
3896 				KASSERT(m_head == NULL,
3897 				    ("pending mbuf for aggregating txdesc"));
3898 				error = hn_flush_txagg(ifp, txr);
3899 				if (__predict_false(error)) {
3900 					atomic_set_int(&ifp->if_drv_flags,
3901 					    IFF_DRV_OACTIVE);
3902 					break;
3903 				}
3904 			} else {
3905 				KASSERT(m_head != NULL, ("mbuf was freed"));
3906 				error = hn_txpkt(ifp, txr, txd);
3907 				if (__predict_false(error)) {
3908 					/* txd is freed, but m_head is not */
3909 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3910 					atomic_set_int(&ifp->if_drv_flags,
3911 					    IFF_DRV_OACTIVE);
3912 					break;
3913 				}
3914 			}
3915 		}
3916 #ifdef INVARIANTS
3917 		else {
3918 			KASSERT(txr->hn_agg_txd != NULL,
3919 			    ("no aggregating txdesc"));
3920 			KASSERT(m_head == NULL,
3921 			    ("pending mbuf for aggregating txdesc"));
3922 		}
3923 #endif
3924 	}
3925 
3926 	/* Flush pending aggregated transmission. */
3927 	if (txr->hn_agg_txd != NULL)
3928 		hn_flush_txagg(ifp, txr);
3929 	return (sched);
3930 }
3931 
3932 static void
3933 hn_start(struct ifnet *ifp)
3934 {
3935 	struct hn_softc *sc = ifp->if_softc;
3936 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3937 
3938 	if (txr->hn_sched_tx)
3939 		goto do_sched;
3940 
3941 	if (mtx_trylock(&txr->hn_tx_lock)) {
3942 		int sched;
3943 
3944 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3945 		mtx_unlock(&txr->hn_tx_lock);
3946 		if (!sched)
3947 			return;
3948 	}
3949 do_sched:
3950 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3951 }
3952 
3953 static void
3954 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3955 {
3956 	struct hn_tx_ring *txr = xtxr;
3957 
3958 	mtx_lock(&txr->hn_tx_lock);
3959 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3960 	hn_start_locked(txr, 0);
3961 	mtx_unlock(&txr->hn_tx_lock);
3962 }
3963 
3964 static void
3965 hn_start_txeof(struct hn_tx_ring *txr)
3966 {
3967 	struct hn_softc *sc = txr->hn_sc;
3968 	struct ifnet *ifp = sc->hn_ifp;
3969 
3970 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3971 
3972 	if (txr->hn_sched_tx)
3973 		goto do_sched;
3974 
3975 	if (mtx_trylock(&txr->hn_tx_lock)) {
3976 		int sched;
3977 
3978 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3979 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3980 		mtx_unlock(&txr->hn_tx_lock);
3981 		if (sched) {
3982 			taskqueue_enqueue(txr->hn_tx_taskq,
3983 			    &txr->hn_tx_task);
3984 		}
3985 	} else {
3986 do_sched:
3987 		/*
3988 		 * Release OACTIVE earlier, in the hope that others
3989 		 * can catch up.  The task will clear the flag again,
3990 		 * with the hn_tx_lock held, to avoid possible
3991 		 * races.
3992 		 */
3993 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3994 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3995 	}
3996 }
3997 
3998 #endif	/* HN_IFSTART_SUPPORT */
3999 
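/*
 * Transmit packets dequeued from the TX ring's mbuf buf_ring.
 *
 * If 'len' > 0 and a peeked packet is longer than 'len', the packet is
 * put back onto the buf_ring and non-zero is returned, so that the
 * caller can dispatch the remaining transmission to the TX taskqueue.
 */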
4000 static int
4001 hn_xmit(struct hn_tx_ring *txr, int len)
4002 {
4003 	struct hn_softc *sc = txr->hn_sc;
4004 	struct ifnet *ifp = sc->hn_ifp;
4005 	struct mbuf *m_head;
4006 	int sched = 0;
4007 
4008 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4009 #ifdef HN_IFSTART_SUPPORT
4010 	KASSERT(hn_use_if_start == 0,
4011 	    ("hn_xmit is called when if_start is enabled"));
4012 #endif
4013 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4014 
4015 	if (__predict_false(txr->hn_suspended))
4016 		return (0);
4017 
4018 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4019 		return (0);
4020 
4021 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4022 		struct hn_txdesc *txd;
4023 		int error;
4024 
4025 		if (len > 0 && m_head->m_pkthdr.len > len) {
4026 			/*
4027 			 * Sending this packet could be time consuming; let
4028 			 * callers dispatch this packet (and any follow-up
4029 			 * packets) to the TX taskqueue.
4030 			 */
4031 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4032 			sched = 1;
4033 			break;
4034 		}
4035 
4036 		txd = hn_txdesc_get(txr);
4037 		if (txd == NULL) {
4038 			txr->hn_no_txdescs++;
4039 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4040 			txr->hn_oactive = 1;
4041 			break;
4042 		}
4043 
4044 		error = hn_encap(ifp, txr, txd, &m_head);
4045 		if (error) {
4046 			/* Both txd and m_head are freed; discard */
4047 			KASSERT(txr->hn_agg_txd == NULL,
4048 			    ("encap failed w/ pending aggregating txdesc"));
4049 			drbr_advance(ifp, txr->hn_mbuf_br);
4050 			continue;
4051 		}
4052 
4053 		if (txr->hn_agg_pktleft == 0) {
4054 			if (txr->hn_agg_txd != NULL) {
4055 				KASSERT(m_head == NULL,
4056 				    ("pending mbuf for aggregating txdesc"));
4057 				error = hn_flush_txagg(ifp, txr);
4058 				if (__predict_false(error)) {
4059 					txr->hn_oactive = 1;
4060 					break;
4061 				}
4062 			} else {
4063 				KASSERT(m_head != NULL, ("mbuf was freed"));
4064 				error = hn_txpkt(ifp, txr, txd);
4065 				if (__predict_false(error)) {
4066 					/* txd is freed, but m_head is not */
4067 					drbr_putback(ifp, txr->hn_mbuf_br,
4068 					    m_head);
4069 					txr->hn_oactive = 1;
4070 					break;
4071 				}
4072 			}
4073 		}
4074 #ifdef INVARIANTS
4075 		else {
4076 			KASSERT(txr->hn_agg_txd != NULL,
4077 			    ("no aggregating txdesc"));
4078 			KASSERT(m_head == NULL,
4079 			    ("pending mbuf for aggregating txdesc"));
4080 		}
4081 #endif
4082 
4083 		/* Sent */
4084 		drbr_advance(ifp, txr->hn_mbuf_br);
4085 	}
4086 
4087 	/* Flush pending aggregated transmission. */
4088 	if (txr->hn_agg_txd != NULL)
4089 		hn_flush_txagg(ifp, txr);
4090 	return (sched);
4091 }
4092 
4093 static int
4094 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4095 {
4096 	struct hn_softc *sc = ifp->if_softc;
4097 	struct hn_tx_ring *txr;
4098 	int error, idx = 0;
4099 
4100 #if defined(INET6) || defined(INET)
4101 	/*
4102 	 * Perform TSO packet header fixup now, since the TSO
4103 	 * packet header should be cache-hot.
4104 	 */
4105 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4106 		m = hn_tso_fixup(m);
4107 		if (__predict_false(m == NULL)) {
4108 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4109 			return EIO;
4110 		}
4111 	}
4112 #endif
4113 
4114 	/*
4115 	 * Select the TX ring based on flowid
4116 	 */
4117 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4118 #ifdef RSS
4119 		uint32_t bid;
4120 
4121 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4122 		    &bid) == 0)
4123 			idx = bid % sc->hn_tx_ring_inuse;
4124 		else
4125 #endif
4126 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4127 	}
4128 	txr = &sc->hn_tx_ring[idx];
4129 
4130 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4131 	if (error) {
4132 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4133 		return error;
4134 	}
4135 
4136 	if (txr->hn_oactive)
4137 		return 0;
4138 
4139 	if (txr->hn_sched_tx)
4140 		goto do_sched;
4141 
4142 	if (mtx_trylock(&txr->hn_tx_lock)) {
4143 		int sched;
4144 
4145 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4146 		mtx_unlock(&txr->hn_tx_lock);
4147 		if (!sched)
4148 			return 0;
4149 	}
4150 do_sched:
4151 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4152 	return 0;
4153 }
4154 
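/* Free all mbufs queued on this TX ring's mbuf buf_ring. */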
4155 static void
4156 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4157 {
4158 	struct mbuf *m;
4159 
4160 	mtx_lock(&txr->hn_tx_lock);
4161 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4162 		m_freem(m);
4163 	mtx_unlock(&txr->hn_tx_lock);
4164 }
4165 
4166 static void
4167 hn_xmit_qflush(struct ifnet *ifp)
4168 {
4169 	struct hn_softc *sc = ifp->if_softc;
4170 	int i;
4171 
4172 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4173 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4174 	if_qflush(ifp);
4175 }
4176 
4177 static void
4178 hn_xmit_txeof(struct hn_tx_ring *txr)
4179 {
4180 
4181 	if (txr->hn_sched_tx)
4182 		goto do_sched;
4183 
4184 	if (mtx_trylock(&txr->hn_tx_lock)) {
4185 		int sched;
4186 
4187 		txr->hn_oactive = 0;
4188 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4189 		mtx_unlock(&txr->hn_tx_lock);
4190 		if (sched) {
4191 			taskqueue_enqueue(txr->hn_tx_taskq,
4192 			    &txr->hn_tx_task);
4193 		}
4194 	} else {
4195 do_sched:
4196 		/*
4197 		 * Release oactive earlier, in the hope that others
4198 		 * can catch up.  The task will clear oactive again,
4199 		 * with the hn_tx_lock held, to avoid possible
4200 		 * races.
4201 		 */
4202 		txr->hn_oactive = 0;
4203 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4204 	}
4205 }
4206 
4207 static void
4208 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4209 {
4210 	struct hn_tx_ring *txr = xtxr;
4211 
4212 	mtx_lock(&txr->hn_tx_lock);
4213 	hn_xmit(txr, 0);
4214 	mtx_unlock(&txr->hn_tx_lock);
4215 }
4216 
4217 static void
4218 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4219 {
4220 	struct hn_tx_ring *txr = xtxr;
4221 
4222 	mtx_lock(&txr->hn_tx_lock);
4223 	txr->hn_oactive = 0;
4224 	hn_xmit(txr, 0);
4225 	mtx_unlock(&txr->hn_tx_lock);
4226 }
4227 
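/*
 * Link the channel to its RX ring (and TX ring, if one exists for the
 * channel's sub-index), bind the channel to a CPU, and open it with
 * the ring's bufring.
 */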
4228 static int
4229 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4230 {
4231 	struct vmbus_chan_br cbr;
4232 	struct hn_rx_ring *rxr;
4233 	struct hn_tx_ring *txr = NULL;
4234 	int idx, error;
4235 
4236 	idx = vmbus_chan_subidx(chan);
4237 
4238 	/*
4239 	 * Link this channel to RX/TX ring.
4240 	 */
4241 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4242 	    ("invalid channel index %d, should be >= 0 && < %d",
4243 	     idx, sc->hn_rx_ring_inuse));
4244 	rxr = &sc->hn_rx_ring[idx];
4245 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4246 	    ("RX ring %d already attached", idx));
4247 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4248 
4249 	if (bootverbose) {
4250 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4251 		    idx, vmbus_chan_id(chan));
4252 	}
4253 
4254 	if (idx < sc->hn_tx_ring_inuse) {
4255 		txr = &sc->hn_tx_ring[idx];
4256 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4257 		    ("TX ring %d already attached", idx));
4258 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4259 
4260 		txr->hn_chan = chan;
4261 		if (bootverbose) {
4262 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4263 			    idx, vmbus_chan_id(chan));
4264 		}
4265 	}
4266 
4267 	/* Bind this channel to a proper CPU. */
4268 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4269 
4270 	/*
4271 	 * Open this channel
4272 	 */
4273 	cbr.cbr = rxr->hn_br;
4274 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4275 	cbr.cbr_txsz = HN_TXBR_SIZE;
4276 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4277 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4278 	if (error) {
4279 		if (error == EISCONN) {
4280 			if_printf(sc->hn_ifp, "bufring is connected after "
4281 			    "chan%u open failure\n", vmbus_chan_id(chan));
4282 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4283 		} else {
4284 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4285 			    vmbus_chan_id(chan), error);
4286 		}
4287 	}
4288 	return (error);
4289 }
4290 
4291 static void
4292 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4293 {
4294 	struct hn_rx_ring *rxr;
4295 	int idx, error;
4296 
4297 	idx = vmbus_chan_subidx(chan);
4298 
4299 	/*
4300 	 * Link this channel to RX/TX ring.
4301 	 */
4302 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4303 	    ("invalid channel index %d, should be >= 0 && < %d",
4304 	     idx, sc->hn_rx_ring_inuse));
4305 	rxr = &sc->hn_rx_ring[idx];
4306 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4307 	    ("RX ring %d is not attached", idx));
4308 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4309 
4310 	if (idx < sc->hn_tx_ring_inuse) {
4311 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4312 
4313 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4314 		    ("TX ring %d is not attached", idx));
4315 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4316 	}
4317 
4318 	/*
4319 	 * Close this channel.
4320 	 *
4321 	 * NOTE:
4322 	 * Channel closing does _not_ destroy the target channel.
4323 	 */
4324 	error = vmbus_chan_close_direct(chan);
4325 	if (error == EISCONN) {
4326 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4327 		    "after being closed\n", vmbus_chan_id(chan));
4328 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4329 	} else if (error) {
4330 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4331 		    vmbus_chan_id(chan), error);
4332 	}
4333 }
4334 
4335 static int
4336 hn_attach_subchans(struct hn_softc *sc)
4337 {
4338 	struct vmbus_channel **subchans;
4339 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4340 	int i, error = 0;
4341 
4342 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4343 
4344 	/* Attach the sub-channels. */
4345 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4346 	for (i = 0; i < subchan_cnt; ++i) {
4347 		int error1;
4348 
4349 		error1 = hn_chan_attach(sc, subchans[i]);
4350 		if (error1) {
4351 			error = error1;
4352 			/* Move on; all channels will be detached later. */
4353 		}
4354 	}
4355 	vmbus_subchan_rel(subchans, subchan_cnt);
4356 
4357 	if (error) {
4358 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4359 	} else {
4360 		if (bootverbose) {
4361 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4362 			    subchan_cnt);
4363 		}
4364 	}
4365 	return (error);
4366 }
4367 
4368 static void
4369 hn_detach_allchans(struct hn_softc *sc)
4370 {
4371 	struct vmbus_channel **subchans;
4372 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4373 	int i;
4374 
4375 	if (subchan_cnt == 0)
4376 		goto back;
4377 
4378 	/* Detach the sub-channels. */
4379 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4380 	for (i = 0; i < subchan_cnt; ++i)
4381 		hn_chan_detach(sc, subchans[i]);
4382 	vmbus_subchan_rel(subchans, subchan_cnt);
4383 
4384 back:
4385 	/*
4386 	 * Detach the primary channel, _after_ all sub-channels
4387 	 * are detached.
4388 	 */
4389 	hn_chan_detach(sc, sc->hn_prichan);
4390 
4391 	/* Wait for sub-channels to be destroyed, if any. */
4392 	vmbus_subchan_drain(sc->hn_prichan);
4393 
4394 #ifdef INVARIANTS
4395 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4396 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4397 		    HN_RX_FLAG_ATTACHED) == 0,
4398 		    ("%dth RX ring is still attached", i));
4399 	}
4400 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4401 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4402 		    HN_TX_FLAG_ATTACHED) == 0,
4403 		    ("%dth TX ring is still attached", i));
4404 	}
4405 #endif
4406 }
4407 
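/*
 * Allocate sub-channels from NVS for multi-TX/RX ring operation.
 *
 * On input, *nsubch is the # of sub-channels requested; on return, it
 * is the # of sub-channels actually allocated (0 if multiple channels
 * cannot be used).
 */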
4408 static int
4409 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4410 {
4411 	struct vmbus_channel **subchans;
4412 	int nchan, rxr_cnt, error;
4413 
4414 	nchan = *nsubch + 1;
4415 	if (nchan == 1) {
4416 		/*
4417 		 * Multiple RX/TX rings are not requested.
4418 		 */
4419 		*nsubch = 0;
4420 		return (0);
4421 	}
4422 
4423 	/*
4424 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4425 	 * table entries.
4426 	 */
4427 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4428 	if (error) {
4429 		/* No RSS; this is benign. */
4430 		*nsubch = 0;
4431 		return (0);
4432 	}
4433 	if (bootverbose) {
4434 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4435 		    rxr_cnt, nchan);
4436 	}
4437 
4438 	if (nchan > rxr_cnt)
4439 		nchan = rxr_cnt;
4440 	if (nchan == 1) {
4441 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4442 		*nsubch = 0;
4443 		return (0);
4444 	}
4445 
4446 	/*
4447 	 * Allocate sub-channels from NVS.
4448 	 */
4449 	*nsubch = nchan - 1;
4450 	error = hn_nvs_alloc_subchans(sc, nsubch);
4451 	if (error || *nsubch == 0) {
4452 		/* Failed to allocate sub-channels. */
4453 		*nsubch = 0;
4454 		return (0);
4455 	}
4456 
4457 	/*
4458 	 * Wait for all sub-channels to become ready before moving on.
4459 	 */
4460 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4461 	vmbus_subchan_rel(subchans, *nsubch);
4462 	return (0);
4463 }
4464 
4465 static bool
4466 hn_synth_attachable(const struct hn_softc *sc)
4467 {
4468 	int i;
4469 
4470 	if (sc->hn_flags & HN_FLAG_ERRORS)
4471 		return (false);
4472 
4473 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4474 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4475 
4476 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4477 			return (false);
4478 	}
4479 	return (true);
4480 }
4481 
4482 static int
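/*
 * Attach the synthetic parts of the device: the primary channel first,
 * then NVS, then RNDIS; finally allocate and attach sub-channels and
 * configure RSS for multi-ring operation.
 */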
4483 hn_synth_attach(struct hn_softc *sc, int mtu)
4484 {
4485 #define ATTACHED_NVS		0x0002
4486 #define ATTACHED_RNDIS		0x0004
4487 
4488 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4489 	int error, nsubch, nchan, i;
4490 	uint32_t old_caps, attached = 0;
4491 
4492 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4493 	    ("synthetic parts were attached"));
4494 
4495 	if (!hn_synth_attachable(sc))
4496 		return (ENXIO);
4497 
4498 	/* Save capabilities for later verification. */
4499 	old_caps = sc->hn_caps;
4500 	sc->hn_caps = 0;
4501 
4502 	/* Clear RSS stuffs. */
4503 	sc->hn_rss_ind_size = 0;
4504 	sc->hn_rss_hash = 0;
4505 
4506 	/*
4507 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4508 	 */
4509 	error = hn_chan_attach(sc, sc->hn_prichan);
4510 	if (error)
4511 		goto failed;
4512 
4513 	/*
4514 	 * Attach NVS.
4515 	 */
4516 	error = hn_nvs_attach(sc, mtu);
4517 	if (error)
4518 		goto failed;
4519 	attached |= ATTACHED_NVS;
4520 
4521 	/*
4522 	 * Attach RNDIS _after_ NVS is attached.
4523 	 */
4524 	error = hn_rndis_attach(sc, mtu);
4525 	if (error)
4526 		goto failed;
4527 	attached |= ATTACHED_RNDIS;
4528 
4529 	/*
4530 	 * Make sure capabilities are not changed.
4531 	 */
4532 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4533 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4534 		    old_caps, sc->hn_caps);
4535 		error = ENXIO;
4536 		goto failed;
4537 	}
4538 
4539 	/*
4540 	 * Allocate sub-channels for multi-TX/RX rings.
4541 	 *
4542 	 * NOTE:
4543 	 * The # of RX rings that can be used is equivalent to the # of
4544 	 * channels to be requested.
4545 	 */
4546 	nsubch = sc->hn_rx_ring_cnt - 1;
4547 	error = hn_synth_alloc_subchans(sc, &nsubch);
4548 	if (error)
4549 		goto failed;
4550 	/* NOTE: _Full_ synthetic parts detach is required now. */
4551 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4552 
4553 	/*
4554 	 * Set the # of TX/RX rings that could be used according to
4555 	 * the # of channels that NVS offered.
4556 	 */
4557 	nchan = nsubch + 1;
4558 	hn_set_ring_inuse(sc, nchan);
4559 	if (nchan == 1) {
4560 		/* Only the primary channel can be used; done */
4561 		goto back;
4562 	}
4563 
4564 	/*
4565 	 * Attach the sub-channels.
4566 	 *
4567 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4568 	 */
4569 	error = hn_attach_subchans(sc);
4570 	if (error)
4571 		goto failed;
4572 
4573 	/*
4574 	 * Configure RSS key and indirect table _after_ all sub-channels
4575 	 * are attached.
4576 	 */
4577 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4578 		/*
4579 		 * RSS key is not set yet; set it to the default RSS key.
4580 		 */
4581 		if (bootverbose)
4582 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4583 #ifdef RSS
4584 		rss_getkey(rss->rss_key);
4585 #else
4586 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4587 #endif
4588 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4589 	}
4590 
4591 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4592 		/*
4593 		 * RSS indirect table is not set yet; set it up in round-
4594 		 * robin fashion.
4595 		 */
4596 		if (bootverbose) {
4597 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4598 			    "table\n");
4599 		}
4600 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4601 			uint32_t subidx;
4602 
4603 #ifdef RSS
4604 			subidx = rss_get_indirection_to_bucket(i);
4605 #else
4606 			subidx = i;
4607 #endif
4608 			rss->rss_ind[i] = subidx % nchan;
4609 		}
4610 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4611 	} else {
4612 		/*
4613 		 * # of usable channels may be changed, so we have to
4614 		 * make sure that all entries in RSS indirect table
4615 		 * are valid.
4616 		 *
4617 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4618 		 */
4619 		hn_rss_ind_fixup(sc);
4620 	}
4621 
4622 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4623 	if (error)
4624 		goto failed;
4625 back:
4626 	/*
4627 	 * Fixup transmission aggregation setup.
4628 	 */
4629 	hn_set_txagg(sc);
4630 	return (0);
4631 
4632 failed:
4633 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4634 		hn_synth_detach(sc);
4635 	} else {
4636 		if (attached & ATTACHED_RNDIS)
4637 			hn_rndis_detach(sc);
4638 		if (attached & ATTACHED_NVS)
4639 			hn_nvs_detach(sc);
4640 		hn_chan_detach(sc, sc->hn_prichan);
4641 		/* Restore old capabilities. */
4642 		sc->hn_caps = old_caps;
4643 	}
4644 	return (error);
4645 
4646 #undef ATTACHED_RNDIS
4647 #undef ATTACHED_NVS
4648 }
4649 
4650 /*
4651  * NOTE:
4652  * The interface must have been suspended through hn_suspend() before
4653  * this function gets called.
4654  */
4655 static void
4656 hn_synth_detach(struct hn_softc *sc)
4657 {
4658 
4659 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4660 	    ("synthetic parts were not attached"));
4661 
4662 	/* Detach the RNDIS first. */
4663 	hn_rndis_detach(sc);
4664 
4665 	/* Detach NVS. */
4666 	hn_nvs_detach(sc);
4667 
4668 	/* Detach all of the channels. */
4669 	hn_detach_allchans(sc);
4670 
4671 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4672 }
4673 
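/*
 * Set the # of RX rings to use to 'ring_cnt', and cap the # of TX
 * rings to use at the # of TX rings actually created.
 */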
4674 static void
4675 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4676 {
4677 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4678 	    ("invalid ring count %d", ring_cnt));
4679 
4680 	if (sc->hn_tx_ring_cnt > ring_cnt)
4681 		sc->hn_tx_ring_inuse = ring_cnt;
4682 	else
4683 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4684 	sc->hn_rx_ring_inuse = ring_cnt;
4685 
4686 #ifdef RSS
4687 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4688 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4689 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4690 		    rss_getnumbuckets());
4691 	}
4692 #endif
4693 
4694 	if (bootverbose) {
4695 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4696 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4697 	}
4698 }
4699 
4700 static void
4701 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4702 {
4703 
4704 	/*
4705 	 * NOTE:
4706 	 * The TX bufring will not be drained by the hypervisor
4707 	 * if the primary channel is revoked.
4708 	 */
4709 	while (!vmbus_chan_rx_empty(chan) ||
4710 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4711 	     !vmbus_chan_tx_empty(chan)))
4712 		pause("waitch", 1);
4713 	vmbus_chan_intr_drain(chan);
4714 }
4715 
4716 static void
4717 hn_suspend_data(struct hn_softc *sc)
4718 {
4719 	struct vmbus_channel **subch = NULL;
4720 	struct hn_tx_ring *txr;
4721 	int i, nsubch;
4722 
4723 	HN_LOCK_ASSERT(sc);
4724 
4725 	/*
4726 	 * Suspend TX.
4727 	 */
4728 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4729 		txr = &sc->hn_tx_ring[i];
4730 
4731 		mtx_lock(&txr->hn_tx_lock);
4732 		txr->hn_suspended = 1;
4733 		mtx_unlock(&txr->hn_tx_lock);
4734 		/* No one is able to send more packets now. */
4735 
4736 		/*
4737 		 * Wait for all pending sends to finish.
4738 		 *
4739 		 * NOTE:
4740 		 * We will _not_ receive all pending send-dones if the
4741 		 * primary channel is revoked.
4742 		 */
4743 		while (hn_tx_ring_pending(txr) &&
4744 		    !vmbus_chan_is_revoked(sc->hn_prichan))
4745 			pause("hnwtx", 1 /* 1 tick */);
4746 	}
4747 
4748 	/*
4749 	 * Disable RX by clearing RX filter.
4750 	 */
4751 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4752 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4753 
4754 	/*
4755 	 * Give RNDIS enough time to flush all pending data packets.
4756 	 */
4757 	pause("waitrx", (200 * hz) / 1000);
4758 
4759 	/*
4760 	 * Drain RX/TX bufrings and interrupts.
4761 	 */
4762 	nsubch = sc->hn_rx_ring_inuse - 1;
4763 	if (nsubch > 0)
4764 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4765 
4766 	if (subch != NULL) {
4767 		for (i = 0; i < nsubch; ++i)
4768 			hn_chan_drain(sc, subch[i]);
4769 	}
4770 	hn_chan_drain(sc, sc->hn_prichan);
4771 
4772 	if (subch != NULL)
4773 		vmbus_subchan_rel(subch, nsubch);
4774 
4775 	/*
4776 	 * Drain any pending TX tasks.
4777 	 *
4778 	 * NOTE:
4779 	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4780 	 * tasks will have to be drained _after_ the above hn_chan_drain()
4781 	 * calls.
4782 	 */
4783 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4784 		txr = &sc->hn_tx_ring[i];
4785 
4786 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4787 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4788 	}
4789 }
4790 
4791 static void
4792 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4793 {
4794 
4795 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4796 }
4797 
4798 static void
4799 hn_suspend_mgmt(struct hn_softc *sc)
4800 {
4801 	struct task task;
4802 
4803 	HN_LOCK_ASSERT(sc);
4804 
4805 	/*
4806 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4807 	 * through hn_mgmt_taskq.
4808 	 */
4809 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4810 	vmbus_chan_run_task(sc->hn_prichan, &task);
4811 
4812 	/*
4813 	 * Make sure that all pending management tasks are completed.
4814 	 */
4815 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4816 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4817 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4818 }
4819 
4820 static void
4821 hn_suspend(struct hn_softc *sc)
4822 {
4823 
4824 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4825 		hn_suspend_data(sc);
4826 	hn_suspend_mgmt(sc);
4827 }
4828 
4829 static void
4830 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4831 {
4832 	int i;
4833 
4834 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4835 	    ("invalid TX ring count %d", tx_ring_cnt));
4836 
4837 	for (i = 0; i < tx_ring_cnt; ++i) {
4838 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4839 
4840 		mtx_lock(&txr->hn_tx_lock);
4841 		txr->hn_suspended = 0;
4842 		mtx_unlock(&txr->hn_tx_lock);
4843 	}
4844 }
4845 
4846 static void
4847 hn_resume_data(struct hn_softc *sc)
4848 {
4849 	int i;
4850 
4851 	HN_LOCK_ASSERT(sc);
4852 
4853 	/*
4854 	 * Re-enable RX.
4855 	 */
4856 	hn_set_rxfilter(sc);
4857 
4858 	/*
4859 	 * Make sure to clear suspend status on "all" TX rings,
4860 	 * since hn_tx_ring_inuse can be changed after
4861 	 * hn_suspend_data().
4862 	 */
4863 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4864 
4865 #ifdef HN_IFSTART_SUPPORT
4866 	if (!hn_use_if_start)
4867 #endif
4868 	{
4869 		/*
4870 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4871 		 * reduced.
4872 		 */
4873 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4874 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4875 	}
4876 
4877 	/*
4878 	 * Kick start TX.
4879 	 */
4880 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4881 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4882 
4883 		/*
4884 		 * Use txeof task, so that any pending oactive can be
4885 		 * cleared properly.
4886 		 */
4887 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4888 	}
4889 }
4890 
4891 static void
4892 hn_resume_mgmt(struct hn_softc *sc)
4893 {
4894 
4895 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4896 
4897 	/*
4898 	 * Kick off network change detection, if it was pending.
4899 	 * If no network change was pending, start link status
4900 	 * checks, which is more lightweight than network change
4901 	 * detection.
4902 	 */
4903 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4904 		hn_change_network(sc);
4905 	else
4906 		hn_update_link_status(sc);
4907 }
4908 
4909 static void
4910 hn_resume(struct hn_softc *sc)
4911 {
4912 
4913 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4914 		hn_resume_data(sc);
4915 	hn_resume_mgmt(sc);
4916 }
4917 
4918 static void
4919 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4920 {
4921 	const struct rndis_status_msg *msg;
4922 	int ofs;
4923 
4924 	if (dlen < sizeof(*msg)) {
4925 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4926 		return;
4927 	}
4928 	msg = data;
4929 
4930 	switch (msg->rm_status) {
4931 	case RNDIS_STATUS_MEDIA_CONNECT:
4932 	case RNDIS_STATUS_MEDIA_DISCONNECT:
4933 		hn_update_link_status(sc);
4934 		break;
4935 
4936 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4937 		/* Not really useful; ignore. */
4938 		break;
4939 
4940 	case RNDIS_STATUS_NETWORK_CHANGE:
4941 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4942 		if (dlen < ofs + msg->rm_stbuflen ||
4943 		    msg->rm_stbuflen < sizeof(uint32_t)) {
4944 			if_printf(sc->hn_ifp, "network changed\n");
4945 		} else {
4946 			uint32_t change;
4947 
4948 			memcpy(&change, ((const uint8_t *)msg) + ofs,
4949 			    sizeof(change));
4950 			if_printf(sc->hn_ifp, "network changed, change %u\n",
4951 			    change);
4952 		}
4953 		hn_change_network(sc);
4954 		break;
4955 
4956 	default:
4957 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
4958 		    msg->rm_status);
4959 		break;
4960 	}
4961 }
4962 
4963 static int
4964 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4965 {
4966 	const struct rndis_pktinfo *pi = info_data;
4967 	uint32_t mask = 0;
4968 
4969 	while (info_dlen != 0) {
4970 		const void *data;
4971 		uint32_t dlen;
4972 
4973 		if (__predict_false(info_dlen < sizeof(*pi)))
4974 			return (EINVAL);
4975 		if (__predict_false(info_dlen < pi->rm_size))
4976 			return (EINVAL);
4977 		info_dlen -= pi->rm_size;
4978 
4979 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4980 			return (EINVAL);
4981 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4982 			return (EINVAL);
4983 		dlen = pi->rm_size - pi->rm_pktinfooffset;
4984 		data = pi->rm_data;
4985 
4986 		switch (pi->rm_type) {
4987 		case NDIS_PKTINFO_TYPE_VLAN:
4988 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4989 				return (EINVAL);
4990 			info->vlan_info = *((const uint32_t *)data);
4991 			mask |= HN_RXINFO_VLAN;
4992 			break;
4993 
4994 		case NDIS_PKTINFO_TYPE_CSUM:
4995 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4996 				return (EINVAL);
4997 			info->csum_info = *((const uint32_t *)data);
4998 			mask |= HN_RXINFO_CSUM;
4999 			break;
5000 
5001 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5002 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5003 				return (EINVAL);
5004 			info->hash_value = *((const uint32_t *)data);
5005 			mask |= HN_RXINFO_HASHVAL;
5006 			break;
5007 
5008 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5009 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5010 				return (EINVAL);
5011 			info->hash_info = *((const uint32_t *)data);
5012 			mask |= HN_RXINFO_HASHINF;
5013 			break;
5014 
5015 		default:
5016 			goto next;
5017 		}
5018 
5019 		if (mask == HN_RXINFO_ALL) {
5020 			/* All found; done */
5021 			break;
5022 		}
5023 next:
5024 		pi = (const struct rndis_pktinfo *)
5025 		    ((const uint8_t *)pi + pi->rm_size);
5026 	}
5027 
5028 	/*
5029 	 * Final fixup.
5030 	 * - If there is no hash value, invalidate the hash info.
5031 	 */
5032 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5033 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5034 	return (0);
5035 }
5036 
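/*
 * Return true if the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len).
 */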
5037 static __inline bool
5038 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5039 {
5040 
5041 	if (off < check_off) {
5042 		if (__predict_true(off + len <= check_off))
5043 			return (false);
5044 	} else if (off > check_off) {
5045 		if (__predict_true(check_off + check_len <= off))
5046 			return (false);
5047 	}
5048 	return (true);
5049 }
5050 
5051 static void
5052 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5053 {
5054 	const struct rndis_packet_msg *pkt;
5055 	struct hn_rxinfo info;
5056 	int data_off, pktinfo_off, data_len, pktinfo_len;
5057 
5058 	/*
5059 	 * Check length.
5060 	 */
5061 	if (__predict_false(dlen < sizeof(*pkt))) {
5062 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5063 		return;
5064 	}
5065 	pkt = data;
5066 
5067 	if (__predict_false(dlen < pkt->rm_len)) {
5068 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5069 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5070 		return;
5071 	}
5072 	if (__predict_false(pkt->rm_len <
5073 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5074 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5075 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5076 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5077 		    pkt->rm_pktinfolen);
5078 		return;
5079 	}
5080 	if (__predict_false(pkt->rm_datalen == 0)) {
5081 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5082 		return;
5083 	}
5084 
5085 	/*
5086 	 * Check offsets.
5087 	 */
5088 #define IS_OFFSET_INVALID(ofs)			\
5089 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5090 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5091 
5092 	/* XXX Hyper-V does not meet data offset alignment requirement */
5093 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5094 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5095 		    "data offset %u\n", pkt->rm_dataoffset);
5096 		return;
5097 	}
5098 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5099 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5100 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5101 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5102 		return;
5103 	}
5104 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5105 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5106 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5107 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5108 		return;
5109 	}
5110 
5111 #undef IS_OFFSET_INVALID
5112 
5113 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5114 	data_len = pkt->rm_datalen;
5115 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5116 	pktinfo_len = pkt->rm_pktinfolen;
5117 
5118 	/*
5119 	 * Check OOB coverage.
5120 	 */
5121 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5122 		int oob_off, oob_len;
5123 
5124 		if_printf(rxr->hn_ifp, "got oobdata\n");
5125 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5126 		oob_len = pkt->rm_oobdatalen;
5127 
5128 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5129 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5130 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5131 			    pkt->rm_len, oob_off, oob_len);
5132 			return;
5133 		}
5134 
5135 		/*
5136 		 * Check against data.
5137 		 */
5138 		if (hn_rndis_check_overlap(oob_off, oob_len,
5139 		    data_off, data_len)) {
5140 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5141 			    "oob overlaps data, oob abs %d len %d, "
5142 			    "data abs %d len %d\n",
5143 			    oob_off, oob_len, data_off, data_len);
5144 			return;
5145 		}
5146 
5147 		/*
5148 		 * Check against pktinfo.
5149 		 */
5150 		if (pktinfo_len != 0 &&
5151 		    hn_rndis_check_overlap(oob_off, oob_len,
5152 		    pktinfo_off, pktinfo_len)) {
5153 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5154 			    "oob overlaps pktinfo, oob abs %d len %d, "
5155 			    "pktinfo abs %d len %d\n",
5156 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5157 			return;
5158 		}
5159 	}
5160 
5161 	/*
5162 	 * Check per-packet-info coverage and find useful per-packet-info.
5163 	 */
5164 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5165 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5166 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5167 	if (__predict_true(pktinfo_len != 0)) {
5168 		bool overlap;
5169 		int error;
5170 
5171 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5172 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5173 			    "pktinfo overflow, msglen %u, "
5174 			    "pktinfo abs %d len %d\n",
5175 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5176 			return;
5177 		}
5178 
5179 		/*
5180 		 * Check packet info coverage.
5181 		 */
5182 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5183 		    data_off, data_len);
5184 		if (__predict_false(overlap)) {
5185 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5186 			    "pktinfo overlaps data, pktinfo abs %d len %d, "
5187 			    "data abs %d len %d\n",
5188 			    pktinfo_off, pktinfo_len, data_off, data_len);
5189 			return;
5190 		}
5191 
5192 		/*
5193 		 * Find useful per-packet-info.
5194 		 */
5195 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5196 		    pktinfo_len, &info);
5197 		if (__predict_false(error)) {
5198 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5199 			    "pktinfo\n");
5200 			return;
5201 		}
5202 	}
5203 
5204 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5205 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5206 		    "data overflow, msglen %u, data abs %d len %d\n",
5207 		    pkt->rm_len, data_off, data_len);
5208 		return;
5209 	}
5210 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5211 }
5212 
5213 static __inline void
5214 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5215 {
5216 	const struct rndis_msghdr *hdr;
5217 
5218 	if (__predict_false(dlen < sizeof(*hdr))) {
5219 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5220 		return;
5221 	}
5222 	hdr = data;
5223 
5224 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5225 		/* Hot data path. */
5226 		hn_rndis_rx_data(rxr, data, dlen);
5227 		/* Done! */
5228 		return;
5229 	}
5230 
5231 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5232 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5233 	else
5234 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5235 }
5236 
5237 static void
5238 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5239 {
5240 	const struct hn_nvs_hdr *hdr;
5241 
5242 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5243 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5244 		return;
5245 	}
5246 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5247 
5248 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5249 		/* Useless; ignore */
5250 		return;
5251 	}
5252 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5253 }
5254 
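/*
 * An NVS completion carries the originating send context as the
 * channel packet's transaction id; dispatch it to that context's
 * completion callback.
 */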
5255 static void
5256 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5257     const struct vmbus_chanpkt_hdr *pkt)
5258 {
5259 	struct hn_nvs_sendctx *sndc;
5260 
5261 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5262 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5263 	    VMBUS_CHANPKT_DATALEN(pkt));
5264 	/*
5265 	 * NOTE:
5266 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5267 	 * its callback.
5268 	 */
5269 }
5270 
5271 static void
5272 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5273     const struct vmbus_chanpkt_hdr *pkthdr)
5274 {
5275 	const struct vmbus_chanpkt_rxbuf *pkt;
5276 	const struct hn_nvs_hdr *nvs_hdr;
5277 	int count, i, hlen;
5278 
5279 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5280 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5281 		return;
5282 	}
5283 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5284 
5285 	/* Make sure that this is a RNDIS message. */
5286 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5287 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5288 		    nvs_hdr->nvs_type);
5289 		return;
5290 	}
5291 
5292 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5293 	if (__predict_false(hlen < sizeof(*pkt))) {
5294 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5295 		return;
5296 	}
5297 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5298 
5299 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5300 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5301 		    pkt->cp_rxbuf_id);
5302 		return;
5303 	}
5304 
5305 	count = pkt->cp_rxbuf_cnt;
5306 	if (__predict_false(hlen <
5307 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5308 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5309 		return;
5310 	}
5311 
5312 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5313 	for (i = 0; i < count; ++i) {
5314 		int ofs, len;
5315 
5316 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5317 		len = pkt->cp_rxbuf[i].rb_len;
5318 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5319 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
5320 			    "ofs %d, len %d\n", i, ofs, len);
5321 			continue;
5322 		}
5323 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5324 	}
5325 
5326 	/*
5327 	 * Ack the consumed RXBUF associated w/ this channel packet,
5328 	 * so that this RXBUF can be recycled by the hypervisor.
5329 	 */
5330 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5331 }
5332 
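/*
 * Ack the given RXBUF back to the hypervisor so that it can be
 * recycled; retry a limited number of times if the channel's TX
 * bufring is temporarily full.
 */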
5333 static void
5334 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5335     uint64_t tid)
5336 {
5337 	struct hn_nvs_rndis_ack ack;
5338 	int retries, error;
5339 
5340 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5341 	ack.nvs_status = HN_NVS_STATUS_OK;
5342 
5343 	retries = 0;
5344 again:
5345 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5346 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5347 	if (__predict_false(error == EAGAIN)) {
5348 		/*
5349 		 * NOTE:
5350 		 * This should _not_ happen in real world, since the
5351 		 * consumption of the TX bufring from the TX path is
5352 		 * controlled.
5353 		 */
5354 		if (rxr->hn_ack_failed == 0)
5355 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5356 		rxr->hn_ack_failed++;
5357 		retries++;
5358 		if (retries < 10) {
5359 			DELAY(100);
5360 			goto again;
5361 		}
5362 		/* RXBUF leaks! */
5363 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5364 	}
5365 }
5366 
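/*
 * Per-channel callback: drain all pending channel packets and dispatch
 * them by type (completion, RXBUF data, or inband notification),
 * growing the channel packet buffer as needed.
 */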
5367 static void
5368 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5369 {
5370 	struct hn_rx_ring *rxr = xrxr;
5371 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5372 
5373 	for (;;) {
5374 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5375 		int error, pktlen;
5376 
5377 		pktlen = rxr->hn_pktbuf_len;
5378 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5379 		if (__predict_false(error == ENOBUFS)) {
5380 			void *nbuf;
5381 			int nlen;
5382 
5383 			/*
5384 			 * Expand channel packet buffer.
5385 			 *
5386 			 * XXX
5387 			 * Use M_WAITOK here, since allocation failure
5388 			 * is fatal.
5389 			 */
5390 			nlen = rxr->hn_pktbuf_len * 2;
5391 			while (nlen < pktlen)
5392 				nlen *= 2;
5393 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5394 
5395 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5396 			    rxr->hn_pktbuf_len, nlen);
5397 
5398 			free(rxr->hn_pktbuf, M_DEVBUF);
5399 			rxr->hn_pktbuf = nbuf;
5400 			rxr->hn_pktbuf_len = nlen;
5401 			/* Retry! */
5402 			continue;
5403 		} else if (__predict_false(error == EAGAIN)) {
5404 			/* No more channel packets; done! */
5405 			break;
5406 		}
5407 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5408 
5409 		switch (pkt->cph_type) {
5410 		case VMBUS_CHANPKT_TYPE_COMP:
5411 			hn_nvs_handle_comp(sc, chan, pkt);
5412 			break;
5413 
5414 		case VMBUS_CHANPKT_TYPE_RXBUF:
5415 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5416 			break;
5417 
5418 		case VMBUS_CHANPKT_TYPE_INBAND:
5419 			hn_nvs_handle_notify(sc, pkt);
5420 			break;
5421 
5422 		default:
5423 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5424 			    pkt->cph_type);
5425 			break;
5426 		}
5427 	}
5428 	hn_chan_rollup(rxr, rxr->hn_txr);
5429 }
5430 
5431 static void
5432 hn_tx_taskq_create(void *arg __unused)
5433 {
5434 	int i;
5435 
5436 	/*
5437 	 * Fix the # of TX taskqueues.
5438 	 */
5439 	if (hn_tx_taskq_cnt <= 0)
5440 		hn_tx_taskq_cnt = 1;
5441 	else if (hn_tx_taskq_cnt > mp_ncpus)
5442 		hn_tx_taskq_cnt = mp_ncpus;
5443 
5444 	/*
5445 	 * Fix the TX taskqueue mode.
5446 	 */
5447 	switch (hn_tx_taskq_mode) {
5448 	case HN_TX_TASKQ_M_INDEP:
5449 	case HN_TX_TASKQ_M_GLOBAL:
5450 	case HN_TX_TASKQ_M_EVTTQ:
5451 		break;
5452 	default:
5453 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5454 		break;
5455 	}
5456 
5457 	if (vm_guest != VM_GUEST_HV)
5458 		return;
5459 
5460 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5461 		return;
5462 
5463 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5464 	    M_DEVBUF, M_WAITOK);
5465 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5466 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5467 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5468 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5469 		    "hn tx%d", i);
5470 	}
5471 }
5472 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5473     hn_tx_taskq_create, NULL);
5474 
5475 static void
5476 hn_tx_taskq_destroy(void *arg __unused)
5477 {
5478 
5479 	if (hn_tx_taskque != NULL) {
5480 		int i;
5481 
5482 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5483 			taskqueue_free(hn_tx_taskque[i]);
5484 		free(hn_tx_taskque, M_DEVBUF);
5485 	}
5486 }
5487 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5488     hn_tx_taskq_destroy, NULL);
5489