xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 8657387683946d0c03e09fe77029edfe309eeb20)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/rmlock.h>
73 #include <sys/sbuf.h>
74 #include <sys/smp.h>
75 #include <sys/socket.h>
76 #include <sys/sockio.h>
77 #include <sys/sx.h>
78 #include <sys/sysctl.h>
79 #include <sys/systm.h>
80 #include <sys/taskqueue.h>
81 #include <sys/buf_ring.h>
82 #include <sys/eventhandler.h>
83 
84 #include <machine/atomic.h>
85 #include <machine/in_cksum.h>
86 
87 #include <net/bpf.h>
88 #include <net/ethernet.h>
89 #include <net/if.h>
90 #include <net/if_dl.h>
91 #include <net/if_media.h>
92 #include <net/if_types.h>
93 #include <net/if_var.h>
94 #include <net/rndis.h>
95 #ifdef RSS
96 #include <net/rss_config.h>
97 #endif
98 
99 #include <netinet/in_systm.h>
100 #include <netinet/in.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_lro.h>
105 #include <netinet/udp.h>
106 
107 #include <dev/hyperv/include/hyperv.h>
108 #include <dev/hyperv/include/hyperv_busdma.h>
109 #include <dev/hyperv/include/vmbus.h>
110 #include <dev/hyperv/include/vmbus_xact.h>
111 
112 #include <dev/hyperv/netvsc/ndis.h>
113 #include <dev/hyperv/netvsc/if_hnreg.h>
114 #include <dev/hyperv/netvsc/if_hnvar.h>
115 #include <dev/hyperv/netvsc/hn_nvs.h>
116 #include <dev/hyperv/netvsc/hn_rndis.h>
117 
118 #include "vmbus_if.h"
119 
120 #define HN_IFSTART_SUPPORT
121 
122 #define HN_RING_CNT_DEF_MAX		8
123 
124 #define HN_VFMAP_SIZE_DEF		8
125 
126 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
127 
128 /* YYY should get it from the underlying channel */
129 #define HN_TX_DESC_CNT			512
130 
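/*
 * Worst-case size of the RNDIS packet message prepended to each
 * transmitted frame: the fixed rndis_packet_msg header plus
 * per-packet-info records for the hash value, VLAN tag, LSOv2 and
 * TX checksum offload.
 */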
131 #define HN_RNDIS_PKT_LEN					\
132 	(sizeof(struct rndis_packet_msg) +			\
133 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
134 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
137 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
138 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
139 
140 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
141 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
142 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
143 /* -1 for RNDIS packet message */
144 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
145 
146 #define HN_DIRECT_TX_SIZE_DEF		128
147 
148 #define HN_EARLY_TXEOF_THRESH		8
149 
150 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
151 
152 #define HN_LROENT_CNT_DEF		128
153 
154 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
155 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
156 /* YYY 2*MTU is a bit rough, but should be good enough. */
157 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
158 
159 #define HN_LRO_ACKCNT_DEF		1
160 
161 #define HN_LOCK_INIT(sc)		\
162 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
163 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
164 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
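/*
 * NOTE: HN_LOCK spins on sx_try_xlock() with a 1ms DELAY between
 * attempts instead of blocking in sx_xlock().
 */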
165 #define HN_LOCK(sc)					\
166 do {							\
167 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
168 		DELAY(1000);				\
169 } while (0)
170 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
171 
172 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
173 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
174 #define HN_CSUM_IP_HWASSIST(sc)		\
175 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
176 #define HN_CSUM_IP6_HWASSIST(sc)	\
177 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
178 
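/*
 * Size of a packet as submitted to the host, i.e. payload plus the
 * RNDIS packet message, rounded up to the given alignment; used by
 * the transmission aggregation logic.
 */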
179 #define HN_PKTSIZE_MIN(align)		\
180 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
181 	    HN_RNDIS_PKT_LEN, (align))
182 #define HN_PKTSIZE(m, align)		\
183 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
184 
185 #ifdef RSS
186 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
187 #else
188 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
189 #endif
190 
191 struct hn_txdesc {
192 #ifndef HN_USE_TXDESC_BUFRING
193 	SLIST_ENTRY(hn_txdesc)		link;
194 #endif
195 	STAILQ_ENTRY(hn_txdesc)		agg_link;
196 
197 	/* Aggregated txdescs, in sending order. */
198 	STAILQ_HEAD(, hn_txdesc)	agg_list;
199 
200 	/* The oldest packet, if transmission aggregation happens. */
201 	struct mbuf			*m;
202 	struct hn_tx_ring		*txr;
203 	int				refs;
204 	uint32_t			flags;	/* HN_TXD_FLAG_ */
205 	struct hn_nvs_sendctx		send_ctx;
206 	uint32_t			chim_index;
207 	int				chim_size;
208 
209 	bus_dmamap_t			data_dmap;
210 
211 	bus_addr_t			rndis_pkt_paddr;
212 	struct rndis_packet_msg		*rndis_pkt;
213 	bus_dmamap_t			rndis_pkt_dmap;
214 };
215 
216 #define HN_TXD_FLAG_ONLIST		0x0001
217 #define HN_TXD_FLAG_DMAMAP		0x0002
218 #define HN_TXD_FLAG_ONAGG		0x0004
219 
220 struct hn_rxinfo {
221 	uint32_t			vlan_info;
222 	uint32_t			csum_info;
223 	uint32_t			hash_info;
224 	uint32_t			hash_value;
225 };
226 
227 struct hn_rxvf_setarg {
228 	struct hn_rx_ring	*rxr;
229 	struct ifnet		*vf_ifp;
230 };
231 
232 #define HN_RXINFO_VLAN			0x0001
233 #define HN_RXINFO_CSUM			0x0002
234 #define HN_RXINFO_HASHINF		0x0004
235 #define HN_RXINFO_HASHVAL		0x0008
236 #define HN_RXINFO_ALL			\
237 	(HN_RXINFO_VLAN |		\
238 	 HN_RXINFO_CSUM |		\
239 	 HN_RXINFO_HASHINF |		\
240 	 HN_RXINFO_HASHVAL)
241 
242 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
243 #define HN_NDIS_RXCSUM_INFO_INVALID	0
244 #define HN_NDIS_HASH_INFO_INVALID	0
245 
246 static int			hn_probe(device_t);
247 static int			hn_attach(device_t);
248 static int			hn_detach(device_t);
249 static int			hn_shutdown(device_t);
250 static void			hn_chan_callback(struct vmbus_channel *,
251 				    void *);
252 
253 static void			hn_init(void *);
254 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
255 #ifdef HN_IFSTART_SUPPORT
256 static void			hn_start(struct ifnet *);
257 #endif
258 static int			hn_transmit(struct ifnet *, struct mbuf *);
259 static void			hn_xmit_qflush(struct ifnet *);
260 static int			hn_ifmedia_upd(struct ifnet *);
261 static void			hn_ifmedia_sts(struct ifnet *,
262 				    struct ifmediareq *);
263 
264 static void			hn_ifnet_event(void *, struct ifnet *, int);
265 static void			hn_ifaddr_event(void *, struct ifnet *);
266 static void			hn_ifnet_attevent(void *, struct ifnet *);
267 static void			hn_ifnet_detevent(void *, struct ifnet *);
268 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
269 
270 static bool			hn_ismyvf(const struct hn_softc *,
271 				    const struct ifnet *);
272 static void			hn_rxvf_change(struct hn_softc *,
273 				    struct ifnet *, bool);
274 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
275 static void			hn_rxvf_set_task(void *, int);
276 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
277 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
278 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
279 				    struct ifreq *);
280 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
281 static bool			hn_xpnt_vf_isready(struct hn_softc *);
282 static void			hn_xpnt_vf_setready(struct hn_softc *);
283 static void			hn_xpnt_vf_init_taskfunc(void *, int);
284 static void			hn_xpnt_vf_init(struct hn_softc *);
285 static void			hn_xpnt_vf_setenable(struct hn_softc *);
286 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
287 
288 static int			hn_rndis_rxinfo(const void *, int,
289 				    struct hn_rxinfo *);
290 static void			hn_rndis_rx_data(struct hn_rx_ring *,
291 				    const void *, int);
292 static void			hn_rndis_rx_status(struct hn_softc *,
293 				    const void *, int);
294 static void			hn_rndis_init_fixat(struct hn_softc *, int);
295 
296 static void			hn_nvs_handle_notify(struct hn_softc *,
297 				    const struct vmbus_chanpkt_hdr *);
298 static void			hn_nvs_handle_comp(struct hn_softc *,
299 				    struct vmbus_channel *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *, uint64_t);
306 
307 #if __FreeBSD_version >= 1100099
308 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
309 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
310 #endif
311 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
313 #if __FreeBSD_version < 1100095
314 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
315 #else
316 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
317 #endif
318 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
319 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
320 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
321 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
325 #ifndef RSS
326 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
328 #endif
329 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
341 
342 static void			hn_stop(struct hn_softc *, bool);
343 static void			hn_init_locked(struct hn_softc *);
344 static int			hn_chan_attach(struct hn_softc *,
345 				    struct vmbus_channel *);
346 static void			hn_chan_detach(struct hn_softc *,
347 				    struct vmbus_channel *);
348 static int			hn_attach_subchans(struct hn_softc *);
349 static void			hn_detach_allchans(struct hn_softc *);
350 static void			hn_chan_rollup(struct hn_rx_ring *,
351 				    struct hn_tx_ring *);
352 static void			hn_set_ring_inuse(struct hn_softc *, int);
353 static int			hn_synth_attach(struct hn_softc *, int);
354 static void			hn_synth_detach(struct hn_softc *);
355 static int			hn_synth_alloc_subchans(struct hn_softc *,
356 				    int *);
357 static bool			hn_synth_attachable(const struct hn_softc *);
358 static void			hn_suspend(struct hn_softc *);
359 static void			hn_suspend_data(struct hn_softc *);
360 static void			hn_suspend_mgmt(struct hn_softc *);
361 static void			hn_resume(struct hn_softc *);
362 static void			hn_resume_data(struct hn_softc *);
363 static void			hn_resume_mgmt(struct hn_softc *);
364 static void			hn_suspend_mgmt_taskfunc(void *, int);
365 static void			hn_chan_drain(struct hn_softc *,
366 				    struct vmbus_channel *);
367 static void			hn_disable_rx(struct hn_softc *);
368 static void			hn_drain_rxtx(struct hn_softc *, int);
369 static void			hn_polling(struct hn_softc *, u_int);
370 static void			hn_chan_polling(struct vmbus_channel *, u_int);
371 static void			hn_mtu_change_fixup(struct hn_softc *);
372 
373 static void			hn_update_link_status(struct hn_softc *);
374 static void			hn_change_network(struct hn_softc *);
375 static void			hn_link_taskfunc(void *, int);
376 static void			hn_netchg_init_taskfunc(void *, int);
377 static void			hn_netchg_status_taskfunc(void *, int);
378 static void			hn_link_status(struct hn_softc *);
379 
380 static int			hn_create_rx_data(struct hn_softc *, int);
381 static void			hn_destroy_rx_data(struct hn_softc *);
382 static int			hn_check_iplen(const struct mbuf *, int);
383 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
384 static int			hn_rxfilter_config(struct hn_softc *);
385 #ifndef RSS
386 static int			hn_rss_reconfig(struct hn_softc *);
387 #endif
388 static void			hn_rss_ind_fixup(struct hn_softc *);
389 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
390 				    int, const struct hn_rxinfo *);
391 
392 static int			hn_tx_ring_create(struct hn_softc *, int);
393 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
394 static int			hn_create_tx_data(struct hn_softc *, int);
395 static void			hn_fixup_tx_data(struct hn_softc *);
396 static void			hn_destroy_tx_data(struct hn_softc *);
397 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
398 static void			hn_txdesc_gc(struct hn_tx_ring *,
399 				    struct hn_txdesc *);
400 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
401 				    struct hn_txdesc *, struct mbuf **);
402 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
403 				    struct hn_txdesc *);
404 static void			hn_set_chim_size(struct hn_softc *, int);
405 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
406 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
407 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
408 static void			hn_resume_tx(struct hn_softc *, int);
409 static void			hn_set_txagg(struct hn_softc *);
410 static void			*hn_try_txagg(struct ifnet *,
411 				    struct hn_tx_ring *, struct hn_txdesc *,
412 				    int);
413 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
414 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
415 				    struct hn_softc *, struct vmbus_channel *,
416 				    const void *, int);
417 static int			hn_txpkt_sglist(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_txpkt_chim(struct hn_tx_ring *,
420 				    struct hn_txdesc *);
421 static int			hn_xmit(struct hn_tx_ring *, int);
422 static void			hn_xmit_taskfunc(void *, int);
423 static void			hn_xmit_txeof(struct hn_tx_ring *);
424 static void			hn_xmit_txeof_taskfunc(void *, int);
425 #ifdef HN_IFSTART_SUPPORT
426 static int			hn_start_locked(struct hn_tx_ring *, int);
427 static void			hn_start_taskfunc(void *, int);
428 static void			hn_start_txeof(struct hn_tx_ring *);
429 static void			hn_start_txeof_taskfunc(void *, int);
430 #endif
431 
432 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
433     "Hyper-V network interface");
434 
435 /* Trust tcp segment verification on host side. */
436 static int			hn_trust_hosttcp = 1;
437 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
438     &hn_trust_hosttcp, 0,
439     "Trust tcp segment verification on host side, "
440     "when csum info is missing (global setting)");
441 
442 /* Trust udp datagram verification on host side. */
443 static int			hn_trust_hostudp = 1;
444 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
445     &hn_trust_hostudp, 0,
446     "Trust udp datagram verification on host side, "
447     "when csum info is missing (global setting)");
448 
449 /* Trust ip packet verification on host side. */
450 static int			hn_trust_hostip = 1;
451 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
452     &hn_trust_hostip, 0,
453     "Trust ip packet verification on host side, "
454     "when csum info is missing (global setting)");
455 
456 /* Limit TSO burst size */
457 static int			hn_tso_maxlen = IP_MAXPACKET;
458 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
459     &hn_tso_maxlen, 0, "TSO burst limit");
460 
461 /* Limit chimney send size */
462 static int			hn_tx_chimney_size = 0;
463 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
464     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
465 
466 /* Limit the size of packet for direct transmission */
467 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
468 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
469     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
470 
471 /* # of LRO entries per RX ring */
472 #if defined(INET) || defined(INET6)
473 #if __FreeBSD_version >= 1100095
474 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
475 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
476     &hn_lro_entry_count, 0, "LRO entry count");
477 #endif
478 #endif
479 
480 static int			hn_tx_taskq_cnt = 1;
481 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
482     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
483 
484 #define HN_TX_TASKQ_M_INDEP	0
485 #define HN_TX_TASKQ_M_GLOBAL	1
486 #define HN_TX_TASKQ_M_EVTTQ	2
487 
488 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
489 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
490     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
491     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
492 
493 #ifndef HN_USE_TXDESC_BUFRING
494 static int			hn_use_txdesc_bufring = 0;
495 #else
496 static int			hn_use_txdesc_bufring = 1;
497 #endif
498 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
499     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
500 
501 #ifdef HN_IFSTART_SUPPORT
502 /* Use ifnet.if_start instead of ifnet.if_transmit */
503 static int			hn_use_if_start = 0;
504 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
505     &hn_use_if_start, 0, "Use if_start TX method");
506 #endif
507 
508 /* # of channels to use */
509 static int			hn_chan_cnt = 0;
510 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
511     &hn_chan_cnt, 0,
512     "# of channels to use; each channel has one RX ring and one TX ring");
513 
514 /* # of transmit rings to use */
515 static int			hn_tx_ring_cnt = 0;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
517     &hn_tx_ring_cnt, 0, "# of TX rings to use");
518 
519 /* Software TX ring depth */
520 static int			hn_tx_swq_depth = 0;
521 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
522     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
523 
524 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
525 #if __FreeBSD_version >= 1100095
526 static u_int			hn_lro_mbufq_depth = 0;
527 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
528     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
529 #endif
530 
531 /* Packet transmission aggregation size limit */
532 static int			hn_tx_agg_size = -1;
533 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
534     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
535 
536 /* Packet transmission aggregation count limit */
537 static int			hn_tx_agg_pkts = -1;
538 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
539     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
540 
541 /* VF list */
542 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
543     0, 0, hn_vflist_sysctl, "A", "VF list");
544 
545 /* VF mapping */
546 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
547     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
548 
549 /* Transparent VF */
550 static int			hn_xpnt_vf = 0;
551 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
552     &hn_xpnt_vf, 0, "Transparent VF mode");
553 
554 /* Accurate BPF support for Transparent VF */
555 static int			hn_xpnt_vf_accbpf = 0;
556 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
557     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
558 
559 /* Extra wait for transparent VF attach routine; unit: seconds. */
560 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
561 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
562     &hn_xpnt_vf_attwait, 0,
563     "Extra wait for transparent VF attach routine; unit: seconds");
564 
565 static u_int			hn_cpu_index;	/* next CPU for channel */
566 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
567 
568 static struct rmlock		hn_vfmap_lock;
569 static int			hn_vfmap_size;
570 static struct ifnet		**hn_vfmap;
571 
572 #ifndef RSS
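/* Default Toeplitz RSS hash key. */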
573 static const uint8_t
574 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
575 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
576 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
577 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
578 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
579 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
580 };
581 #endif	/* !RSS */
582 
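/*
 * Hyper-V synthetic network device class ID:
 * f8615163-df3e-46c5-913f-f2d2f965ed0e.
 */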
583 static const struct hyperv_guid	hn_guid = {
584 	.hv_guid = {
585 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
586 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
587 };
588 
589 static device_method_t hn_methods[] = {
590 	/* Device interface */
591 	DEVMETHOD(device_probe,		hn_probe),
592 	DEVMETHOD(device_attach,	hn_attach),
593 	DEVMETHOD(device_detach,	hn_detach),
594 	DEVMETHOD(device_shutdown,	hn_shutdown),
595 	DEVMETHOD_END
596 };
597 
598 static driver_t hn_driver = {
599 	"hn",
600 	hn_methods,
601 	sizeof(struct hn_softc)
602 };
603 
604 static devclass_t hn_devclass;
605 
606 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
607 MODULE_VERSION(hn, 1);
608 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
609 
610 #if __FreeBSD_version >= 1100099
611 static void
612 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
613 {
614 	int i;
615 
616 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
617 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
618 }
619 #endif
620 
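/*
 * Transmit a packet descriptor by handing the TX ring's
 * scatter/gather (GPA) list to the NVS layer; only valid for
 * descriptors that do not use a chimney sending buffer.
 */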
621 static int
622 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
623 {
624 
625 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
626 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
627 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
628 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
629 }
630 
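/*
 * Transmit a packet descriptor whose data has been copied into a
 * chimney sending buffer; only the buffer index and size are
 * passed to the host.
 */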
631 static int
632 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
633 {
634 	struct hn_nvs_rndis rndis;
635 
636 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
637 	    txd->chim_size > 0, ("invalid rndis chim txd"));
638 
639 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
640 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
641 	rndis.nvs_chim_idx = txd->chim_index;
642 	rndis.nvs_chim_sz = txd->chim_size;
643 
644 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
645 	    &rndis, sizeof(rndis), &txd->send_ctx));
646 }
647 
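/*
 * Allocate a chimney sending buffer slot by scanning the shared
 * bitmap lock-free; atomic_testandset_long() both claims the bit
 * and detects races with other CPUs.  Returns
 * HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 */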
648 static __inline uint32_t
649 hn_chim_alloc(struct hn_softc *sc)
650 {
651 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
652 	u_long *bmap = sc->hn_chim_bmap;
653 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
654 
655 	for (i = 0; i < bmap_cnt; ++i) {
656 		int idx;
657 
658 		idx = ffsl(~bmap[i]);
659 		if (idx == 0)
660 			continue;
661 
662 		--idx; /* ffsl is 1-based */
663 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
664 		    ("invalid i %d and idx %d", i, idx));
665 
666 		if (atomic_testandset_long(&bmap[i], idx))
667 			continue;
668 
669 		ret = i * LONG_BIT + idx;
670 		break;
671 	}
672 	return (ret);
673 }
674 
675 static __inline void
676 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
677 {
678 	u_long mask;
679 	uint32_t idx;
680 
681 	idx = chim_idx / LONG_BIT;
682 	KASSERT(idx < sc->hn_chim_bmap_cnt,
683 	    ("invalid chimney index 0x%x", chim_idx));
684 
685 	mask = 1UL << (chim_idx % LONG_BIT);
686 	KASSERT(sc->hn_chim_bmap[idx] & mask,
687 	    ("index bitmap 0x%lx, chimney index %u, "
688 	     "bitmap idx %d, bitmask 0x%lx",
689 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
690 
691 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
692 }
693 
694 #if defined(INET6) || defined(INET)
695 
696 #define PULLUP_HDR(m, len)				\
697 do {							\
698 	if (__predict_false((m)->m_len < (len))) {	\
699 		(m) = m_pullup((m), (len));		\
700 		if ((m) == NULL)			\
701 			return (NULL);			\
702 	}						\
703 } while (0)
704 
705 /*
706  * NOTE: If this function fails, m_head will be freed.
707  */
708 static __inline struct mbuf *
709 hn_tso_fixup(struct mbuf *m_head)
710 {
711 	struct ether_vlan_header *evl;
712 	struct tcphdr *th;
713 	int ehlen;
714 
715 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
716 
717 	PULLUP_HDR(m_head, sizeof(*evl));
718 	evl = mtod(m_head, struct ether_vlan_header *);
719 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
720 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
721 	else
722 		ehlen = ETHER_HDR_LEN;
723 
724 #ifdef INET
725 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
726 		struct ip *ip;
727 		int iphlen;
728 
729 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
730 		ip = mtodo(m_head, ehlen);
731 		iphlen = ip->ip_hl << 2;
732 
733 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
734 		th = mtodo(m_head, ehlen + iphlen);
735 
736 		ip->ip_len = 0;
737 		ip->ip_sum = 0;
738 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
739 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
740 	}
741 #endif
742 #if defined(INET6) && defined(INET)
743 	else
744 #endif
745 #ifdef INET6
746 	{
747 		struct ip6_hdr *ip6;
748 
749 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
750 		ip6 = mtodo(m_head, ehlen);
751 		if (ip6->ip6_nxt != IPPROTO_TCP) {
752 			m_freem(m_head);
753 			return (NULL);
754 		}
755 
756 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
757 		th = mtodo(m_head, ehlen + sizeof(*ip6));
758 
759 		ip6->ip6_plen = 0;
760 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
761 	}
762 #endif
763 	return (m_head);
764 
765 }
766 
767 /*
768  * NOTE: If this function fails, m_head will be freed.
769  */
770 static __inline struct mbuf *
771 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
772 {
773 	const struct ether_vlan_header *evl;
774 	const struct tcphdr *th;
775 	int ehlen;
776 
777 	*tcpsyn = 0;
778 
779 	PULLUP_HDR(m_head, sizeof(*evl));
780 	evl = mtod(m_head, const struct ether_vlan_header *);
781 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
782 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
783 	else
784 		ehlen = ETHER_HDR_LEN;
785 
786 #ifdef INET
787 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
788 		const struct ip *ip;
789 		int iphlen;
790 
791 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
792 		ip = mtodo(m_head, ehlen);
793 		iphlen = ip->ip_hl << 2;
794 
795 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
796 		th = mtodo(m_head, ehlen + iphlen);
797 		if (th->th_flags & TH_SYN)
798 			*tcpsyn = 1;
799 	}
800 #endif
801 #if defined(INET6) && defined(INET)
802 	else
803 #endif
804 #ifdef INET6
805 	{
806 		const struct ip6_hdr *ip6;
807 
808 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
809 		ip6 = mtodo(m_head, ehlen);
810 		if (ip6->ip6_nxt != IPPROTO_TCP)
811 			return (m_head);
812 
813 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
814 		th = mtodo(m_head, ehlen + sizeof(*ip6));
815 		if (th->th_flags & TH_SYN)
816 			*tcpsyn = 1;
817 	}
818 #endif
819 	return (m_head);
820 }
821 
822 #undef PULLUP_HDR
823 
824 #endif	/* INET6 || INET */
825 
826 static int
827 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
828 {
829 	int error = 0;
830 
831 	HN_LOCK_ASSERT(sc);
832 
833 	if (sc->hn_rx_filter != filter) {
834 		error = hn_rndis_set_rxfilter(sc, filter);
835 		if (!error)
836 			sc->hn_rx_filter = filter;
837 	}
838 	return (error);
839 }
840 
841 static int
842 hn_rxfilter_config(struct hn_softc *sc)
843 {
844 	struct ifnet *ifp = sc->hn_ifp;
845 	uint32_t filter;
846 
847 	HN_LOCK_ASSERT(sc);
848 
849 	/*
850 	 * If the non-transparent mode VF is activated, we don't know how
851 	 * its RX filter is configured, so stick the synthetic device in
852 	 * the promiscuous mode.
853 	 */
854 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
855 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
856 	} else {
857 		filter = NDIS_PACKET_TYPE_DIRECTED;
858 		if (ifp->if_flags & IFF_BROADCAST)
859 			filter |= NDIS_PACKET_TYPE_BROADCAST;
860 		/* TODO: support multicast list */
861 		if ((ifp->if_flags & IFF_ALLMULTI) ||
862 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
863 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
864 	}
865 	return (hn_set_rxfilter(sc, filter));
866 }
867 
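/*
 * Propagate the negotiated RNDIS aggregation limits (size, packet
 * count and alignment), clipped by the chimney buffer size and the
 * driver tunables, to every TX ring.  Aggregation is disabled when
 * the resulting limits are too small to be useful.
 */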
868 static void
869 hn_set_txagg(struct hn_softc *sc)
870 {
871 	uint32_t size, pkts;
872 	int i;
873 
874 	/*
875 	 * Setup aggregation size.
876 	 */
877 	if (sc->hn_agg_size < 0)
878 		size = UINT32_MAX;
879 	else
880 		size = sc->hn_agg_size;
881 
882 	if (sc->hn_rndis_agg_size < size)
883 		size = sc->hn_rndis_agg_size;
884 
885 	/* NOTE: We only aggregate packets using chimney sending buffers. */
886 	if (size > (uint32_t)sc->hn_chim_szmax)
887 		size = sc->hn_chim_szmax;
888 
889 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
890 		/* Disable */
891 		size = 0;
892 		pkts = 0;
893 		goto done;
894 	}
895 
896 	/* NOTE: Type of the per TX ring setting is 'int'. */
897 	if (size > INT_MAX)
898 		size = INT_MAX;
899 
900 	/*
901 	 * Setup aggregation packet count.
902 	 */
903 	if (sc->hn_agg_pkts < 0)
904 		pkts = UINT32_MAX;
905 	else
906 		pkts = sc->hn_agg_pkts;
907 
908 	if (sc->hn_rndis_agg_pkts < pkts)
909 		pkts = sc->hn_rndis_agg_pkts;
910 
911 	if (pkts <= 1) {
912 		/* Disable */
913 		size = 0;
914 		pkts = 0;
915 		goto done;
916 	}
917 
918 	/* NOTE: Type of the per TX ring setting is 'short'. */
919 	if (pkts > SHRT_MAX)
920 		pkts = SHRT_MAX;
921 
922 done:
923 	/* NOTE: Type of the per TX ring setting is 'short'. */
924 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
925 		/* Disable */
926 		size = 0;
927 		pkts = 0;
928 	}
929 
930 	if (bootverbose) {
931 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
932 		    size, pkts, sc->hn_rndis_agg_align);
933 	}
934 
935 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
936 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
937 
938 		mtx_lock(&txr->hn_tx_lock);
939 		txr->hn_agg_szmax = size;
940 		txr->hn_agg_pktmax = pkts;
941 		txr->hn_agg_align = sc->hn_rndis_agg_align;
942 		mtx_unlock(&txr->hn_tx_lock);
943 	}
944 }
945 
946 static int
947 hn_get_txswq_depth(const struct hn_tx_ring *txr)
948 {
949 
950 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
951 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
952 		return txr->hn_txdesc_cnt;
953 	return hn_tx_swq_depth;
954 }
955 
956 #ifndef RSS
957 static int
958 hn_rss_reconfig(struct hn_softc *sc)
959 {
960 	int error;
961 
962 	HN_LOCK_ASSERT(sc);
963 
964 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
965 		return (ENXIO);
966 
967 	/*
968 	 * Disable RSS first.
969 	 *
970 	 * NOTE:
971 	 * Direct reconfiguration by setting the UNCHG flags does
972 	 * _not_ work properly.
973 	 */
974 	if (bootverbose)
975 		if_printf(sc->hn_ifp, "disable RSS\n");
976 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
977 	if (error) {
978 		if_printf(sc->hn_ifp, "RSS disable failed\n");
979 		return (error);
980 	}
981 
982 	/*
983 	 * Reenable the RSS w/ the updated RSS key or indirect
984 	 * table.
985 	 */
986 	if (bootverbose)
987 		if_printf(sc->hn_ifp, "reconfig RSS\n");
988 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
989 	if (error) {
990 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
991 		return (error);
992 	}
993 	return (0);
994 }
995 #endif	/* !RSS */
996 
997 static void
998 hn_rss_ind_fixup(struct hn_softc *sc)
999 {
1000 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1001 	int i, nchan;
1002 
1003 	nchan = sc->hn_rx_ring_inuse;
1004 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1005 
1006 	/*
1007 	 * Check indirect table to make sure that all channels in it
1008 	 * can be used.
1009 	 */
1010 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1011 		if (rss->rss_ind[i] >= nchan) {
1012 			if_printf(sc->hn_ifp,
1013 			    "RSS indirect table %d fixup: %u -> %d\n",
1014 			    i, rss->rss_ind[i], nchan - 1);
1015 			rss->rss_ind[i] = nchan - 1;
1016 		}
1017 	}
1018 }
1019 
1020 static int
1021 hn_ifmedia_upd(struct ifnet *ifp __unused)
1022 {
1023 
1024 	return EOPNOTSUPP;
1025 }
1026 
1027 static void
1028 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1029 {
1030 	struct hn_softc *sc = ifp->if_softc;
1031 
1032 	ifmr->ifm_status = IFM_AVALID;
1033 	ifmr->ifm_active = IFM_ETHER;
1034 
1035 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1036 		ifmr->ifm_active |= IFM_NONE;
1037 		return;
1038 	}
1039 	ifmr->ifm_status |= IFM_ACTIVE;
1040 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1041 }
1042 
1043 static void
1044 hn_rxvf_set_task(void *xarg, int pending __unused)
1045 {
1046 	struct hn_rxvf_setarg *arg = xarg;
1047 
1048 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1049 }
1050 
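/*
 * Update the RX VF ifnet pointer on every RX ring.  For rings that
 * are in use, the update is run as a task on the ring's channel so
 * that it is serialized with the RX path.
 */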
1051 static void
1052 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1053 {
1054 	struct hn_rx_ring *rxr;
1055 	struct hn_rxvf_setarg arg;
1056 	struct task task;
1057 	int i;
1058 
1059 	HN_LOCK_ASSERT(sc);
1060 
1061 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1062 
1063 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1064 		rxr = &sc->hn_rx_ring[i];
1065 
1066 		if (i < sc->hn_rx_ring_inuse) {
1067 			arg.rxr = rxr;
1068 			arg.vf_ifp = vf_ifp;
1069 			vmbus_chan_run_task(rxr->hn_chan, &task);
1070 		} else {
1071 			rxr->hn_rxvf_ifp = vf_ifp;
1072 		}
1073 	}
1074 }
1075 
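/*
 * Check whether the given ifnet is the VF companion of this
 * synthetic device: an Ethernet interface other than hn(4) itself
 * (lagg/vlan excluded) with a matching MAC address.
 */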
1076 static bool
1077 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1078 {
1079 	const struct ifnet *hn_ifp;
1080 
1081 	hn_ifp = sc->hn_ifp;
1082 
1083 	if (ifp == hn_ifp)
1084 		return (false);
1085 
1086 	if (ifp->if_alloctype != IFT_ETHER)
1087 		return (false);
1088 
1089 	/* Ignore lagg/vlan interfaces */
1090 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1091 	    strcmp(ifp->if_dname, "vlan") == 0)
1092 		return (false);
1093 
1094 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1095 		return (false);
1096 
1097 	return (true);
1098 }
1099 
1100 static void
1101 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1102 {
1103 	struct ifnet *hn_ifp;
1104 
1105 	HN_LOCK(sc);
1106 
1107 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1108 		goto out;
1109 
1110 	if (!hn_ismyvf(sc, ifp))
1111 		goto out;
1112 	hn_ifp = sc->hn_ifp;
1113 
1114 	if (rxvf) {
1115 		if (sc->hn_flags & HN_FLAG_RXVF)
1116 			goto out;
1117 
1118 		sc->hn_flags |= HN_FLAG_RXVF;
1119 		hn_rxfilter_config(sc);
1120 	} else {
1121 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1122 			goto out;
1123 
1124 		sc->hn_flags &= ~HN_FLAG_RXVF;
1125 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1126 			hn_rxfilter_config(sc);
1127 		else
1128 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1129 	}
1130 
1131 	hn_nvs_set_datapath(sc,
1132 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1133 
1134 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1135 
1136 	if (rxvf) {
1137 		hn_suspend_mgmt(sc);
1138 		sc->hn_link_flags &=
1139 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1140 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1141 	} else {
1142 		hn_resume_mgmt(sc);
1143 	}
1144 
1145 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1146 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1147 
1148 	if (bootverbose) {
1149 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1150 		    rxvf ? "to" : "from", ifp->if_xname);
1151 	}
1152 out:
1153 	HN_UNLOCK(sc);
1154 }
1155 
1156 static void
1157 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1158 {
1159 
1160 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1161 		return;
1162 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1163 }
1164 
1165 static void
1166 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1167 {
1168 
1169 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1170 }
1171 
1172 static int
1173 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1174 {
1175 	struct ifnet *ifp, *vf_ifp;
1176 	uint64_t tmp;
1177 	int error;
1178 
1179 	HN_LOCK_ASSERT(sc);
1180 	ifp = sc->hn_ifp;
1181 	vf_ifp = sc->hn_vf_ifp;
1182 
1183 	/*
1184 	 * Fix up requested capabilities w/ supported capabilities,
1185 	 * since the supported capabilities could have been changed.
1186 	 */
1187 	ifr->ifr_reqcap &= ifp->if_capabilities;
1188 	/* Pass SIOCSIFCAP to VF. */
1189 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1190 
1191 	/*
1192 	 * NOTE:
1193 	 * The error will be propagated to the callers; however, it
1194 	 * is _not_ useful here.
1195 	 */
1196 
1197 	/*
1198 	 * Merge VF's enabled capabilities.
1199 	 */
1200 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1201 
1202 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1203 	if (ifp->if_capenable & IFCAP_TXCSUM)
1204 		ifp->if_hwassist |= tmp;
1205 	else
1206 		ifp->if_hwassist &= ~tmp;
1207 
1208 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1209 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1210 		ifp->if_hwassist |= tmp;
1211 	else
1212 		ifp->if_hwassist &= ~tmp;
1213 
1214 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1215 	if (ifp->if_capenable & IFCAP_TSO4)
1216 		ifp->if_hwassist |= tmp;
1217 	else
1218 		ifp->if_hwassist &= ~tmp;
1219 
1220 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1221 	if (ifp->if_capenable & IFCAP_TSO6)
1222 		ifp->if_hwassist |= tmp;
1223 	else
1224 		ifp->if_hwassist &= ~tmp;
1225 
1226 	return (error);
1227 }
1228 
1229 static int
1230 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1231 {
1232 	struct ifnet *vf_ifp;
1233 	struct ifreq ifr;
1234 
1235 	HN_LOCK_ASSERT(sc);
1236 	vf_ifp = sc->hn_vf_ifp;
1237 
1238 	memset(&ifr, 0, sizeof(ifr));
1239 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1240 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1241 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1242 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1243 }
1244 
1245 static void
1246 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1247 {
1248 	struct ifnet *ifp = sc->hn_ifp;
1249 	int allmulti = 0;
1250 
1251 	HN_LOCK_ASSERT(sc);
1252 
1253 	/* XXX vlan(4) style mcast addr maintenance */
1254 	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1255 		allmulti = IFF_ALLMULTI;
1256 
1257 	/* Always set the VF's if_flags */
1258 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1259 }
1260 
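/*
 * if_input hook installed on the VF ifnet in transparent VF mode;
 * it taps BPF and updates counters on the VF, then re-homes the
 * received packets to the corresponding hn(4) ifnet.
 */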
1261 static void
1262 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1263 {
1264 	struct rm_priotracker pt;
1265 	struct ifnet *hn_ifp = NULL;
1266 	struct mbuf *mn;
1267 
1268 	/*
1269 	 * XXX racy, if hn(4) ever detached.
1270 	 */
1271 	rm_rlock(&hn_vfmap_lock, &pt);
1272 	if (vf_ifp->if_index < hn_vfmap_size)
1273 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1274 	rm_runlock(&hn_vfmap_lock, &pt);
1275 
1276 	if (hn_ifp != NULL) {
1277 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1278 			/*
1279 			 * Allow tapping on the VF.
1280 			 */
1281 			ETHER_BPF_MTAP(vf_ifp, mn);
1282 
1283 			/*
1284 			 * Update VF stats.
1285 			 */
1286 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1287 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1288 				    mn->m_pkthdr.len);
1289 			}
1290 			/*
1291 			 * XXX IFCOUNTER_IMCAST
1292 			 * This stat updating is kinda invasive, since it
1293 			 * requires two checks on the mbuf: the length check
1294 			 * and the ethernet header check.  As of this writing,
1295 			 * all multicast packets go directly to hn(4), which
1296 			 * makes imcast stat updating in the VF a wasted effort.
1297 			 */
1298 
1299 			/*
1300 			 * Fix up rcvif and increase hn(4)'s ipackets.
1301 			 */
1302 			mn->m_pkthdr.rcvif = hn_ifp;
1303 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1304 		}
1305 		/*
1306 		 * Go through hn(4)'s if_input.
1307 		 */
1308 		hn_ifp->if_input(hn_ifp, m);
1309 	} else {
1310 		/*
1311 		 * In the middle of the transition; free this
1312 		 * mbuf chain.
1313 		 */
1314 		while (m != NULL) {
1315 			mn = m->m_nextpkt;
1316 			m->m_nextpkt = NULL;
1317 			m_freem(m);
1318 			m = mn;
1319 		}
1320 	}
1321 }
1322 
1323 static void
1324 hn_mtu_change_fixup(struct hn_softc *sc)
1325 {
1326 	struct ifnet *ifp;
1327 
1328 	HN_LOCK_ASSERT(sc);
1329 	ifp = sc->hn_ifp;
1330 
1331 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1332 #if __FreeBSD_version >= 1100099
1333 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1334 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1335 #endif
1336 }
1337 
1338 static void
1339 hn_xpnt_vf_setready(struct hn_softc *sc)
1340 {
1341 	struct ifnet *ifp, *vf_ifp;
1342 	struct ifreq ifr;
1343 
1344 	HN_LOCK_ASSERT(sc);
1345 	ifp = sc->hn_ifp;
1346 	vf_ifp = sc->hn_vf_ifp;
1347 
1348 	/*
1349 	 * Mark the VF ready.
1350 	 */
1351 	sc->hn_vf_rdytick = 0;
1352 
1353 	/*
1354 	 * Save information for restoration.
1355 	 */
1356 	sc->hn_saved_caps = ifp->if_capabilities;
1357 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1358 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1359 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1360 
1361 	/*
1362 	 * Intersect supported/enabled capabilities.
1363 	 *
1364 	 * NOTE:
1365 	 * if_hwassist is not changed here.
1366 	 */
1367 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1368 	ifp->if_capenable &= ifp->if_capabilities;
1369 
1370 	/*
1371 	 * Fix TSO settings.
1372 	 */
1373 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1374 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1375 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1376 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1377 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1378 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1379 
1380 	/*
1381 	 * Change VF's enabled capabilities.
1382 	 */
1383 	memset(&ifr, 0, sizeof(ifr));
1384 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1385 	ifr.ifr_reqcap = ifp->if_capenable;
1386 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1387 
1388 	if (ifp->if_mtu != ETHERMTU) {
1389 		int error;
1390 
1391 		/*
1392 		 * Change VF's MTU.
1393 		 */
1394 		memset(&ifr, 0, sizeof(ifr));
1395 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1396 		ifr.ifr_mtu = ifp->if_mtu;
1397 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1398 		if (error) {
1399 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1400 			    vf_ifp->if_xname, ifp->if_mtu);
1401 			if (ifp->if_mtu > ETHERMTU) {
1402 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1403 
1404 				/*
1405 				 * XXX
1406 				 * No need to adjust the synthetic parts' MTU;
1407 				 * failure of the adjustment would cause us
1408 				 * endless headaches.
1409 				 */
1410 				ifp->if_mtu = ETHERMTU;
1411 				hn_mtu_change_fixup(sc);
1412 			}
1413 		}
1414 	}
1415 }
1416 
1417 static bool
1418 hn_xpnt_vf_isready(struct hn_softc *sc)
1419 {
1420 
1421 	HN_LOCK_ASSERT(sc);
1422 
1423 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1424 		return (false);
1425 
1426 	if (sc->hn_vf_rdytick == 0)
1427 		return (true);
1428 
1429 	if (sc->hn_vf_rdytick > ticks)
1430 		return (false);
1431 
1432 	/* Mark VF as ready. */
1433 	hn_xpnt_vf_setready(sc);
1434 	return (true);
1435 }
1436 
1437 static void
1438 hn_xpnt_vf_setenable(struct hn_softc *sc)
1439 {
1440 	int i;
1441 
1442 	HN_LOCK_ASSERT(sc);
1443 
1444 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1445 	rm_wlock(&sc->hn_vf_lock);
1446 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1447 	rm_wunlock(&sc->hn_vf_lock);
1448 
1449 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1450 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1451 }
1452 
1453 static void
1454 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1455 {
1456 	int i;
1457 
1458 	HN_LOCK_ASSERT(sc);
1459 
1460 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1461 	rm_wlock(&sc->hn_vf_lock);
1462 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1463 	if (clear_vf)
1464 		sc->hn_vf_ifp = NULL;
1465 	rm_wunlock(&sc->hn_vf_lock);
1466 
1467 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1468 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1469 }
1470 
1471 static void
1472 hn_xpnt_vf_init(struct hn_softc *sc)
1473 {
1474 	int error;
1475 
1476 	HN_LOCK_ASSERT(sc);
1477 
1478 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1479 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1480 
1481 	if (bootverbose) {
1482 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1483 		    sc->hn_vf_ifp->if_xname);
1484 	}
1485 
1486 	/*
1487 	 * Bring the VF up.
1488 	 */
1489 	hn_xpnt_vf_saveifflags(sc);
1490 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1491 	error = hn_xpnt_vf_iocsetflags(sc);
1492 	if (error) {
1493 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1494 		    sc->hn_vf_ifp->if_xname, error);
1495 		return;
1496 	}
1497 
1498 	/*
1499 	 * NOTE:
1500 	 * Datapath setting must happen _after_ bringing the VF up.
1501 	 */
1502 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1503 
1504 	/* Mark transparent mode VF as enabled. */
1505 	hn_xpnt_vf_setenable(sc);
1506 }
1507 
1508 static void
1509 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1510 {
1511 	struct hn_softc *sc = xsc;
1512 
1513 	HN_LOCK(sc);
1514 
1515 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1516 		goto done;
1517 	if (sc->hn_vf_ifp == NULL)
1518 		goto done;
1519 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1520 		goto done;
1521 
1522 	if (sc->hn_vf_rdytick != 0) {
1523 		/* Mark VF as ready. */
1524 		hn_xpnt_vf_setready(sc);
1525 	}
1526 
1527 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1528 		/*
1529 		 * Delayed VF initialization.
1530 		 */
1531 		if (bootverbose) {
1532 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1533 			    sc->hn_vf_ifp->if_xname);
1534 		}
1535 		hn_xpnt_vf_init(sc);
1536 	}
1537 done:
1538 	HN_UNLOCK(sc);
1539 }
1540 
1541 static void
1542 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1543 {
1544 	struct hn_softc *sc = xsc;
1545 
1546 	HN_LOCK(sc);
1547 
1548 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1549 		goto done;
1550 
1551 	if (!hn_ismyvf(sc, ifp))
1552 		goto done;
1553 
1554 	if (sc->hn_vf_ifp != NULL) {
1555 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1556 		    sc->hn_vf_ifp->if_xname);
1557 		goto done;
1558 	}
1559 
1560 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1561 		/*
1562 		 * ifnet.if_start is _not_ supported by transparent
1563 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1564 		 */
1565 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1566 		    "in transparent VF mode.\n", ifp->if_xname);
1567 		goto done;
1568 	}
1569 
1570 	rm_wlock(&hn_vfmap_lock);
1571 
1572 	if (ifp->if_index >= hn_vfmap_size) {
1573 		struct ifnet **newmap;
1574 		int newsize;
1575 
1576 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1577 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1578 		    M_WAITOK | M_ZERO);
1579 
1580 		memcpy(newmap, hn_vfmap,
1581 		    sizeof(struct ifnet *) * hn_vfmap_size);
1582 		free(hn_vfmap, M_DEVBUF);
1583 		hn_vfmap = newmap;
1584 		hn_vfmap_size = newsize;
1585 	}
1586 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1587 	    ("%s: ifindex %d was mapped to %s",
1588 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1589 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1590 
1591 	rm_wunlock(&hn_vfmap_lock);
1592 
1593 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1594 	rm_wlock(&sc->hn_vf_lock);
1595 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1596 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1597 	sc->hn_vf_ifp = ifp;
1598 	rm_wunlock(&sc->hn_vf_lock);
1599 
1600 	if (hn_xpnt_vf) {
1601 		int wait_ticks;
1602 
1603 		/*
1604 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1605 		 * Save vf_ifp's current if_input for later restoration.
1606 		 */
1607 		sc->hn_vf_input = ifp->if_input;
1608 		ifp->if_input = hn_xpnt_vf_input;
1609 
1610 		/*
1611 		 * Stop link status management; use the VF's.
1612 		 */
1613 		hn_suspend_mgmt(sc);
1614 
1615 		/*
1616 		 * Give the VF some time to complete its attach routine.
1617 		 */
1618 		wait_ticks = hn_xpnt_vf_attwait * hz;
1619 		sc->hn_vf_rdytick = ticks + wait_ticks;
1620 
1621 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1622 		    wait_ticks);
1623 	}
1624 done:
1625 	HN_UNLOCK(sc);
1626 }
1627 
1628 static void
1629 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1630 {
1631 	struct hn_softc *sc = xsc;
1632 
1633 	HN_LOCK(sc);
1634 
1635 	if (sc->hn_vf_ifp == NULL)
1636 		goto done;
1637 
1638 	if (!hn_ismyvf(sc, ifp))
1639 		goto done;
1640 
1641 	if (hn_xpnt_vf) {
1642 		/*
1643 		 * Make sure that the delayed initialization is not running.
1644 		 *
1645 		 * NOTE:
1646 		 * - This lock _must_ be released, since the hn_vf_init task
1647 		 *   will try holding this lock.
1648 		 * - It is safe to release this lock here, since the
1649 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1650 		 *
1651 		 * XXX racy, if hn(4) ever detached.
1652 		 */
1653 		HN_UNLOCK(sc);
1654 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1655 		HN_LOCK(sc);
1656 
1657 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1658 		    sc->hn_ifp->if_xname));
1659 		ifp->if_input = sc->hn_vf_input;
1660 		sc->hn_vf_input = NULL;
1661 
1662 		if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1663 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1664 
1665 		if (sc->hn_vf_rdytick == 0) {
1666 			/*
1667 			 * The VF was ready; restore some settings.
1668 			 */
1669 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
1670 			/*
1671 			 * NOTE:
1672 			 * There is _no_ need to fixup if_capenable and
1673 			 * if_hwassist, since the if_capabilities before
1674 			 * restoration was an intersection of the VF's
1675 			 * if_capabilities and the synthetic device's
1676 			 * if_capabilities.
1677 			 */
1678 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
1679 			sc->hn_ifp->if_hw_tsomaxsegcount =
1680 			    sc->hn_saved_tsosegcnt;
1681 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
1682 		}
1683 
1684 		/*
1685 		 * Resume link status management, which was suspended
1686 		 * by hn_ifnet_attevent().
1687 		 */
1688 		hn_resume_mgmt(sc);
1689 	}
1690 
1691 	/* Mark transparent mode VF as disabled. */
1692 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
1693 
1694 	rm_wlock(&hn_vfmap_lock);
1695 
1696 	KASSERT(ifp->if_index < hn_vfmap_size,
1697 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
1698 	if (hn_vfmap[ifp->if_index] != NULL) {
1699 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
1700 		    ("%s: ifindex %d was mapped to %s",
1701 		     ifp->if_xname, ifp->if_index,
1702 		     hn_vfmap[ifp->if_index]->if_xname));
1703 		hn_vfmap[ifp->if_index] = NULL;
1704 	}
1705 
1706 	rm_wunlock(&hn_vfmap_lock);
1707 done:
1708 	HN_UNLOCK(sc);
1709 }
1710 
1711 static void
1712 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
1713 {
1714 	struct hn_softc *sc = xsc;
1715 
1716 	if (sc->hn_vf_ifp == ifp)
1717 		if_link_state_change(sc->hn_ifp, link_state);
1718 }
1719 
1720 static int
1721 hn_probe(device_t dev)
1722 {
1723 
1724 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
1725 		device_set_desc(dev, "Hyper-V Network Interface");
1726 		return BUS_PROBE_DEFAULT;
1727 	}
1728 	return ENXIO;
1729 }
1730 
1731 static int
1732 hn_attach(device_t dev)
1733 {
1734 	struct hn_softc *sc = device_get_softc(dev);
1735 	struct sysctl_oid_list *child;
1736 	struct sysctl_ctx_list *ctx;
1737 	uint8_t eaddr[ETHER_ADDR_LEN];
1738 	struct ifnet *ifp = NULL;
1739 	int error, ring_cnt, tx_ring_cnt;
1740 
1741 	sc->hn_dev = dev;
1742 	sc->hn_prichan = vmbus_get_channel(dev);
1743 	HN_LOCK_INIT(sc);
1744 	rm_init(&sc->hn_vf_lock, "hnvf");
1745 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
1746 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
1747 
1748 	/*
1749 	 * Initialize these tunables once.
1750 	 */
1751 	sc->hn_agg_size = hn_tx_agg_size;
1752 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1753 
1754 	/*
1755 	 * Setup taskqueue for transmission.
1756 	 */
1757 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1758 		int i;
1759 
1760 		sc->hn_tx_taskqs =
1761 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1762 		    M_DEVBUF, M_WAITOK);
1763 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1764 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1765 			    M_WAITOK, taskqueue_thread_enqueue,
1766 			    &sc->hn_tx_taskqs[i]);
1767 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1768 			    "%s tx%d", device_get_nameunit(dev), i);
1769 		}
1770 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1771 		sc->hn_tx_taskqs = hn_tx_taskque;
1772 	}
1773 
1774 	/*
1775 	 * Setup taskqueue for management tasks, e.g. link status.
1776 	 */
1777 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1778 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1779 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1780 	    device_get_nameunit(dev));
1781 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1782 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1783 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1784 	    hn_netchg_status_taskfunc, sc);
1785 
1786 	if (hn_xpnt_vf) {
1787 		/*
1788 		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
1789 		 */
1790 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
1791 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
1792 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
1793 		    device_get_nameunit(dev));
1794 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
1795 		    hn_xpnt_vf_init_taskfunc, sc);
1796 	}
1797 
1798 	/*
1799 	 * Allocate ifnet and setup its name earlier, so that if_printf
1800 	 * can be used by functions that will be called after
1801 	 * ether_ifattach().
1802 	 */
1803 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1804 	ifp->if_softc = sc;
1805 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1806 
1807 	/*
1808 	 * Initialize ifmedia early so that it can be unconditionally
1809 	 * destroyed, if an error happens later on.
1810 	 */
1811 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1812 
1813 	/*
1814 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1815 	 * to use (tx_ring_cnt).
1816 	 *
1817 	 * NOTE:
1818 	 * The # of RX rings to use is the same as the # of channels to use.
1819 	 */
1820 	ring_cnt = hn_chan_cnt;
1821 	if (ring_cnt <= 0) {
1822 		/* Default */
1823 		ring_cnt = mp_ncpus;
1824 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1825 			ring_cnt = HN_RING_CNT_DEF_MAX;
1826 	} else if (ring_cnt > mp_ncpus) {
1827 		ring_cnt = mp_ncpus;
1828 	}
1829 #ifdef RSS
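	/* Do not create more RX rings than there are RSS buckets. */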
1830 	if (ring_cnt > rss_getnumbuckets())
1831 		ring_cnt = rss_getnumbuckets();
1832 #endif
1833 
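	/* Cap the # of TX rings at the # of RX rings (channels). */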
1834 	tx_ring_cnt = hn_tx_ring_cnt;
1835 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1836 		tx_ring_cnt = ring_cnt;
1837 #ifdef HN_IFSTART_SUPPORT
1838 	if (hn_use_if_start) {
1839 		/* ifnet.if_start only needs one TX ring. */
1840 		tx_ring_cnt = 1;
1841 	}
1842 #endif
1843 
1844 	/*
1845 	 * Set the leader CPU for channels.
1846 	 */
1847 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1848 
1849 	/*
1850 	 * Create enough TX/RX rings, even if only a limited number of
1851 	 * channels can be allocated.
1852 	 */
1853 	error = hn_create_tx_data(sc, tx_ring_cnt);
1854 	if (error)
1855 		goto failed;
1856 	error = hn_create_rx_data(sc, ring_cnt);
1857 	if (error)
1858 		goto failed;
1859 
1860 	/*
1861 	 * Create transaction context for NVS and RNDIS transactions.
1862 	 */
1863 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1864 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1865 	if (sc->hn_xact == NULL) {
1866 		error = ENXIO;
1867 		goto failed;
1868 	}
1869 
1870 	/*
1871 	 * Install orphan handler for the revocation of this device's
1872 	 * primary channel.
1873 	 *
1874 	 * NOTE:
1875 	 * The processing order is critical here:
1876 	 * Install the orphan handler, _before_ testing whether this
1877 	 * device's primary channel has been revoked or not.
1878 	 */
1879 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1880 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1881 		error = ENXIO;
1882 		goto failed;
1883 	}
1884 
1885 	/*
1886 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1887 	 */
1888 	error = hn_synth_attach(sc, ETHERMTU);
1889 	if (error)
1890 		goto failed;
1891 
1892 	error = hn_rndis_get_eaddr(sc, eaddr);
1893 	if (error)
1894 		goto failed;
1895 
1896 #if __FreeBSD_version >= 1100099
1897 	if (sc->hn_rx_ring_inuse > 1) {
1898 		/*
1899 		 * Reduce TCP segment aggregation limit for multiple
1900 		 * RX rings to increase ACK timeliness.
1901 		 */
1902 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1903 	}
1904 #endif
1905 
1906 	/*
1907 	 * Fix up TX settings after the synthetic parts are attached.
1908 	 */
1909 	hn_fixup_tx_data(sc);
1910 
1911 	ctx = device_get_sysctl_ctx(dev);
1912 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1913 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1914 	    &sc->hn_nvs_ver, 0, "NVS version");
1915 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1916 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1917 	    hn_ndis_version_sysctl, "A", "NDIS version");
1918 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1919 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1920 	    hn_caps_sysctl, "A", "capabilities");
1921 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1922 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1923 	    hn_hwassist_sysctl, "A", "hwassist");
1924 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
1925 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
1926 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
1927 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
1928 	    "max # of TSO segments");
1929 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
1930 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
1931 	    "max size of TSO segment");
1932 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1933 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1934 	    hn_rxfilter_sysctl, "A", "rxfilter");
1935 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1936 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1937 	    hn_rss_hash_sysctl, "A", "RSS hash");
1938 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1939 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1940 #ifndef RSS
1941 	/*
1942 	 * Don't allow RSS key/indirect table changes when 'options RSS' is defined.
1943 	 */
1944 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1945 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1946 	    hn_rss_key_sysctl, "IU", "RSS key");
1947 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1948 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1949 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1950 #endif
1951 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1952 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1953 	    "RNDIS offered packet transmission aggregation size limit");
1954 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1955 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1956 	    "RNDIS offered packet transmission aggregation count limit");
1957 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1958 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1959 	    "RNDIS packet transmission aggregation alignment");
1960 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1961 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1962 	    hn_txagg_size_sysctl, "I",
1963 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1964 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1965 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1966 	    hn_txagg_pkts_sysctl, "I",
1967 	    "Packet transmission aggregation packets, "
1968 	    "0 -- disable, -1 -- auto");
1969 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1970 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1971 	    hn_polling_sysctl, "I",
1972 	    "Polling frequency: [100,1000000], 0 disable polling");
1973 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1974 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1975 	    hn_vf_sysctl, "A", "Virtual Function's name");
1976 	if (!hn_xpnt_vf) {
1977 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
1978 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1979 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
1980 	} else {
1981 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
1982 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1983 		    hn_xpnt_vf_enabled_sysctl, "I",
1984 		    "Transparent VF enabled");
1985 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
1986 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1987 		    hn_xpnt_vf_accbpf_sysctl, "I",
1988 		    "Accurate BPF for transparent VF");
1989 	}
1990 
1991 	/*
1992 	 * Setup the ifmedia, which has been initialized earlier.
1993 	 */
1994 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1995 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1996 	/* XXX ifmedia_set really should do this for us */
1997 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1998 
1999 	/*
2000 	 * Setup the ifnet for this interface.
2001 	 */
2002 
2003 	ifp->if_baudrate = IF_Gbps(10);
2004 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2005 	ifp->if_ioctl = hn_ioctl;
2006 	ifp->if_init = hn_init;
2007 #ifdef HN_IFSTART_SUPPORT
2008 	if (hn_use_if_start) {
2009 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2010 
2011 		ifp->if_start = hn_start;
2012 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2013 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2014 		IFQ_SET_READY(&ifp->if_snd);
2015 	} else
2016 #endif
2017 	{
2018 		ifp->if_transmit = hn_transmit;
2019 		ifp->if_qflush = hn_xmit_qflush;
2020 	}
2021 
2022 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2023 #ifdef foo
2024 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2025 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2026 #endif
2027 	if (sc->hn_caps & HN_CAP_VLAN) {
2028 		/* XXX not sure about VLAN_MTU. */
2029 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2030 	}
2031 
2032 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2033 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2034 		ifp->if_capabilities |= IFCAP_TXCSUM;
2035 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2036 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2037 	if (sc->hn_caps & HN_CAP_TSO4) {
2038 		ifp->if_capabilities |= IFCAP_TSO4;
2039 		ifp->if_hwassist |= CSUM_IP_TSO;
2040 	}
2041 	if (sc->hn_caps & HN_CAP_TSO6) {
2042 		ifp->if_capabilities |= IFCAP_TSO6;
2043 		ifp->if_hwassist |= CSUM_IP6_TSO;
2044 	}
2045 
2046 	/* Enable all available capabilities by default. */
2047 	ifp->if_capenable = ifp->if_capabilities;
2048 
2049 	/*
2050 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2051 	 * be enabled through SIOCSIFCAP.
2052 	 */
2053 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2054 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2055 
2056 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2057 		/*
2058 		 * Lock hn_set_tso_maxsize() to simplify its
2059 		 * internal logic.
2060 		 */
2061 		HN_LOCK(sc);
2062 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2063 		HN_UNLOCK(sc);
2064 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2065 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2066 	}
2067 
2068 	ether_ifattach(ifp, eaddr);
2069 
2070 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2071 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2072 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2073 	}
2074 
2075 	/* Inform the upper layer about the long frame support. */
2076 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2077 
2078 	/*
2079 	 * Kick off link status check.
2080 	 */
2081 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2082 	hn_update_link_status(sc);
2083 
2084 	if (!hn_xpnt_vf) {
2085 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2086 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2087 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2088 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2089 	} else {
2090 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2091 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2092 	}
2093 
2094 	/*
2095 	 * NOTE:
2096 	 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2097 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2098 	 * available when the ifnet_arrival event is triggered.
2099 	 */
2100 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2101 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2102 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2103 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2104 
2105 	return (0);
2106 failed:
2107 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2108 		hn_synth_detach(sc);
2109 	hn_detach(dev);
2110 	return (error);
2111 }
2112 
2113 static int
2114 hn_detach(device_t dev)
2115 {
2116 	struct hn_softc *sc = device_get_softc(dev);
2117 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2118 
2119 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2120 		/*
2121 		 * In case the vmbus missed the orphan handler
2122 		 * installation.
2123 		 */
2124 		vmbus_xact_ctx_orphan(sc->hn_xact);
2125 	}
2126 
2127 	if (sc->hn_ifaddr_evthand != NULL)
2128 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2129 	if (sc->hn_ifnet_evthand != NULL)
2130 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2131 	if (sc->hn_ifnet_atthand != NULL) {
2132 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2133 		    sc->hn_ifnet_atthand);
2134 	}
2135 	if (sc->hn_ifnet_dethand != NULL) {
2136 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2137 		    sc->hn_ifnet_dethand);
2138 	}
2139 	if (sc->hn_ifnet_lnkhand != NULL)
2140 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2141 
2142 	vf_ifp = sc->hn_vf_ifp;
2143 	__compiler_membar();
2144 	if (vf_ifp != NULL)
2145 		hn_ifnet_detevent(sc, vf_ifp);
2146 
2147 	if (device_is_attached(dev)) {
2148 		HN_LOCK(sc);
2149 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2150 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2151 				hn_stop(sc, true);
2152 			/*
2153 			 * NOTE:
2154 			 * hn_stop() only suspends data, so management
2155 			 * tasks have to be suspended manually here.
2156 			 */
2157 			hn_suspend_mgmt(sc);
2158 			hn_synth_detach(sc);
2159 		}
2160 		HN_UNLOCK(sc);
2161 		ether_ifdetach(ifp);
2162 	}
2163 
2164 	ifmedia_removeall(&sc->hn_media);
2165 	hn_destroy_rx_data(sc);
2166 	hn_destroy_tx_data(sc);
2167 
2168 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2169 		int i;
2170 
2171 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2172 			taskqueue_free(sc->hn_tx_taskqs[i]);
2173 		free(sc->hn_tx_taskqs, M_DEVBUF);
2174 	}
2175 	taskqueue_free(sc->hn_mgmt_taskq0);
2176 	if (sc->hn_vf_taskq != NULL)
2177 		taskqueue_free(sc->hn_vf_taskq);
2178 
2179 	if (sc->hn_xact != NULL) {
2180 		/*
2181 		 * Uninstall the orphan handler _before_ the xact is
2182 		 * destructed.
2183 		 */
2184 		vmbus_chan_unset_orphan(sc->hn_prichan);
2185 		vmbus_xact_ctx_destroy(sc->hn_xact);
2186 	}
2187 
2188 	if_free(ifp);
2189 
2190 	HN_LOCK_DESTROY(sc);
2191 	rm_destroy(&sc->hn_vf_lock);
2192 	return (0);
2193 }
2194 
2195 static int
2196 hn_shutdown(device_t dev)
2197 {
2198 
2199 	return (0);
2200 }
2201 
2202 static void
2203 hn_link_status(struct hn_softc *sc)
2204 {
2205 	uint32_t link_status;
2206 	int error;
2207 
2208 	error = hn_rndis_get_linkstatus(sc, &link_status);
2209 	if (error) {
2210 		/* XXX what to do? */
2211 		return;
2212 	}
2213 
2214 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2215 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2216 	else
2217 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2218 	if_link_state_change(sc->hn_ifp,
2219 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2220 	    LINK_STATE_UP : LINK_STATE_DOWN);
2221 }
2222 
2223 static void
2224 hn_link_taskfunc(void *xsc, int pending __unused)
2225 {
2226 	struct hn_softc *sc = xsc;
2227 
2228 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2229 		return;
2230 	hn_link_status(sc);
2231 }
2232 
2233 static void
2234 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2235 {
2236 	struct hn_softc *sc = xsc;
2237 
2238 	/* Prevent any link status checks from running. */
2239 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2240 
2241 	/*
2242 	 * Fake up a [link down --> link up] state change; a 5 second
2243 	 * delay is used, which closely simulates the miibus reaction
2244 	 * to a link down event.
2245 	 */
2246 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2247 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2248 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2249 	    &sc->hn_netchg_status, 5 * hz);
2250 }
2251 
2252 static void
2253 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2254 {
2255 	struct hn_softc *sc = xsc;
2256 
2257 	/* Re-allow link status checks. */
2258 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2259 	hn_link_status(sc);
2260 }
2261 
2262 static void
2263 hn_update_link_status(struct hn_softc *sc)
2264 {
2265 
2266 	if (sc->hn_mgmt_taskq != NULL)
2267 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2268 }
2269 
2270 static void
2271 hn_change_network(struct hn_softc *sc)
2272 {
2273 
2274 	if (sc->hn_mgmt_taskq != NULL)
2275 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2276 }
2277 
2278 static __inline int
2279 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2280     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2281 {
2282 	struct mbuf *m = *m_head;
2283 	int error;
2284 
2285 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2286 
2287 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2288 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2289 	if (error == EFBIG) {
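		/*
		 * The mbuf chain has too many segments for a single DMA
		 * load; collapse it and retry the load once.
		 */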
2290 		struct mbuf *m_new;
2291 
2292 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2293 		if (m_new == NULL)
2294 			return ENOBUFS;
2295 		else
2296 			*m_head = m = m_new;
2297 		txr->hn_tx_collapsed++;
2298 
2299 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2300 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2301 	}
2302 	if (!error) {
2303 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2304 		    BUS_DMASYNC_PREWRITE);
2305 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2306 	}
2307 	return error;
2308 }
2309 
2310 static __inline int
2311 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2312 {
2313 
2314 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2315 	    ("put an onlist txd %#x", txd->flags));
2316 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2317 	    ("put an onagg txd %#x", txd->flags));
2318 
2319 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
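	/*
	 * Drop one reference; only the release of the last reference
	 * actually frees the txdesc and its resources below.
	 */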
2320 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2321 		return 0;
2322 
2323 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2324 		struct hn_txdesc *tmp_txd;
2325 
2326 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2327 			int freed;
2328 
2329 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2330 			    ("recursive aggregation on aggregated txdesc"));
2331 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2332 			    ("not aggregated txdesc"));
2333 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2334 			    ("aggregated txdesc uses dmamap"));
2335 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2336 			    ("aggregated txdesc consumes "
2337 			     "chimney sending buffer"));
2338 			KASSERT(tmp_txd->chim_size == 0,
2339 			    ("aggregated txdesc has non-zero "
2340 			     "chimney sending size"));
2341 
2342 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2343 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2344 			freed = hn_txdesc_put(txr, tmp_txd);
2345 			KASSERT(freed, ("failed to free aggregated txdesc"));
2346 		}
2347 	}
2348 
2349 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2350 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2351 		    ("chim txd uses dmamap"));
2352 		hn_chim_free(txr->hn_sc, txd->chim_index);
2353 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2354 		txd->chim_size = 0;
2355 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2356 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2357 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2358 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2359 		    txd->data_dmap);
2360 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2361 	}
2362 
2363 	if (txd->m != NULL) {
2364 		m_freem(txd->m);
2365 		txd->m = NULL;
2366 	}
2367 
2368 	txd->flags |= HN_TXD_FLAG_ONLIST;
2369 #ifndef HN_USE_TXDESC_BUFRING
2370 	mtx_lock_spin(&txr->hn_txlist_spin);
2371 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2372 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2373 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2374 	txr->hn_txdesc_avail++;
2375 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2376 	mtx_unlock_spin(&txr->hn_txlist_spin);
2377 #else	/* HN_USE_TXDESC_BUFRING */
2378 #ifdef HN_DEBUG
2379 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2380 #endif
2381 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2382 #endif	/* !HN_USE_TXDESC_BUFRING */
2383 
2384 	return 1;
2385 }
2386 
2387 static __inline struct hn_txdesc *
2388 hn_txdesc_get(struct hn_tx_ring *txr)
2389 {
2390 	struct hn_txdesc *txd;
2391 
2392 #ifndef HN_USE_TXDESC_BUFRING
2393 	mtx_lock_spin(&txr->hn_txlist_spin);
2394 	txd = SLIST_FIRST(&txr->hn_txlist);
2395 	if (txd != NULL) {
2396 		KASSERT(txr->hn_txdesc_avail > 0,
2397 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2398 		txr->hn_txdesc_avail--;
2399 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2400 	}
2401 	mtx_unlock_spin(&txr->hn_txlist_spin);
2402 #else
2403 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2404 #endif
2405 
2406 	if (txd != NULL) {
2407 #ifdef HN_USE_TXDESC_BUFRING
2408 #ifdef HN_DEBUG
2409 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2410 #endif
2411 #endif	/* HN_USE_TXDESC_BUFRING */
2412 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2413 		    STAILQ_EMPTY(&txd->agg_list) &&
2414 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2415 		    txd->chim_size == 0 &&
2416 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2417 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2418 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2419 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2420 		txd->refs = 1;
2421 	}
2422 	return txd;
2423 }
2424 
2425 static __inline void
2426 hn_txdesc_hold(struct hn_txdesc *txd)
2427 {
2428 
2429 	/* 0->1 transition will never work */
2430 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2431 	atomic_add_int(&txd->refs, 1);
2432 }
2433 
2434 static __inline void
2435 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2436 {
2437 
2438 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2439 	    ("recursive aggregation on aggregating txdesc"));
2440 
2441 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2442 	    ("already aggregated"));
2443 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2444 	    ("recursive aggregation on to-be-aggregated txdesc"));
2445 
2446 	txd->flags |= HN_TXD_FLAG_ONAGG;
2447 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2448 }
2449 
2450 static bool
2451 hn_tx_ring_pending(struct hn_tx_ring *txr)
2452 {
2453 	bool pending = false;
2454 
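	/*
	 * The ring still has in-flight transmissions if not every
	 * txdesc has been returned to the free list/buf_ring.
	 */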
2455 #ifndef HN_USE_TXDESC_BUFRING
2456 	mtx_lock_spin(&txr->hn_txlist_spin);
2457 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2458 		pending = true;
2459 	mtx_unlock_spin(&txr->hn_txlist_spin);
2460 #else
2461 	if (!buf_ring_full(txr->hn_txdesc_br))
2462 		pending = true;
2463 #endif
2464 	return (pending);
2465 }
2466 
2467 static __inline void
2468 hn_txeof(struct hn_tx_ring *txr)
2469 {
2470 	txr->hn_has_txeof = 0;
2471 	txr->hn_txeof(txr);
2472 }
2473 
2474 static void
2475 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2476     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2477 {
2478 	struct hn_txdesc *txd = sndc->hn_cbarg;
2479 	struct hn_tx_ring *txr;
2480 
2481 	txr = txd->txr;
2482 	KASSERT(txr->hn_chan == chan,
2483 	    ("channel mismatch, on chan%u, should be chan%u",
2484 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2485 
2486 	txr->hn_has_txeof = 1;
2487 	hn_txdesc_put(txr, txd);
2488 
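	/*
	 * Run an early TX completion pass once enough sends have
	 * completed, but only if this TX ring is marked OACTIVE.
	 */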
2489 	++txr->hn_txdone_cnt;
2490 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2491 		txr->hn_txdone_cnt = 0;
2492 		if (txr->hn_oactive)
2493 			hn_txeof(txr);
2494 	}
2495 }
2496 
2497 static void
2498 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2499 {
2500 #if defined(INET) || defined(INET6)
2501 	tcp_lro_flush_all(&rxr->hn_lro);
2502 #endif
2503 
2504 	/*
2505 	 * NOTE:
2506 	 * 'txr' could be NULL, if multiple channels and the
2507 	 * ifnet.if_start method are enabled.
2508 	 */
2509 	if (txr == NULL || !txr->hn_has_txeof)
2510 		return;
2511 
2512 	txr->hn_txdone_cnt = 0;
2513 	hn_txeof(txr);
2514 }
2515 
2516 static __inline uint32_t
2517 hn_rndis_pktmsg_offset(uint32_t ofs)
2518 {
2519 
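	/*
	 * Offsets in an RNDIS packet message are counted from the
	 * rm_dataoffset field; convert an offset counted from the
	 * beginning of the message accordingly.
	 */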
2520 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2521 	    ("invalid RNDIS packet msg offset %u", ofs));
2522 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2523 }
2524 
2525 static __inline void *
2526 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2527     size_t pi_dlen, uint32_t pi_type)
2528 {
2529 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2530 	struct rndis_pktinfo *pi;
2531 
2532 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2533 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2534 
2535 	/*
2536 	 * Per-packet-info does not move; it only grows.
2537 	 *
2538 	 * NOTE:
2539 	 * rm_pktinfooffset in this phase counts from the beginning
2540 	 * of rndis_packet_msg.
2541 	 */
2542 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2543 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2544 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2545 	    pkt->rm_pktinfolen);
2546 	pkt->rm_pktinfolen += pi_size;
2547 
2548 	pi->rm_size = pi_size;
2549 	pi->rm_type = pi_type;
2550 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2551 
2552 	return (pi->rm_data);
2553 }
2554 
2555 static __inline int
2556 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2557 {
2558 	struct hn_txdesc *txd;
2559 	struct mbuf *m;
2560 	int error, pkts;
2561 
2562 	txd = txr->hn_agg_txd;
2563 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2564 
2565 	/*
2566 	 * Since hn_txpkt() will reset this temporary stat, save
2567 	 * it now, so that oerrors can be updated properly, if
2568 	 * hn_txpkt() ever fails.
2569 	 */
2570 	pkts = txr->hn_stat_pkts;
2571 
2572 	/*
2573 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2574 	 * failure, save it for later freeing, if hn_txpkt() ever
2575 	 * fails.
2576 	 */
2577 	m = txd->m;
2578 	error = hn_txpkt(ifp, txr, txd);
2579 	if (__predict_false(error)) {
2580 		/* txd is freed, but m is not. */
2581 		m_freem(m);
2582 
2583 		txr->hn_flush_failed++;
2584 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2585 	}
2586 
2587 	/* Reset all aggregation states. */
2588 	txr->hn_agg_txd = NULL;
2589 	txr->hn_agg_szleft = 0;
2590 	txr->hn_agg_pktleft = 0;
2591 	txr->hn_agg_prevpkt = NULL;
2592 
2593 	return (error);
2594 }
2595 
2596 static void *
2597 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2598     int pktsize)
2599 {
2600 	void *chim;
2601 
2602 	if (txr->hn_agg_txd != NULL) {
2603 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2604 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2605 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2606 			int olen;
2607 
2608 			/*
2609 			 * Update the previous RNDIS packet's total length;
2610 			 * it can be increased due to the mandatory alignment
2611 			 * padding for this RNDIS packet.  And update the
2612 			 * aggregating txdesc's chimney sending buffer size
2613 			 * accordingly.
2614 			 *
2615 			 * XXX
2616 			 * Zero-out the padding, as required by the RNDIS spec.
2617 			 */
2618 			olen = pkt->rm_len;
2619 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2620 			agg_txd->chim_size += pkt->rm_len - olen;
2621 
2622 			/* Link this txdesc to the parent. */
2623 			hn_txdesc_agg(agg_txd, txd);
2624 
2625 			chim = (uint8_t *)pkt + pkt->rm_len;
2626 			/* Save the current packet for later fixup. */
2627 			txr->hn_agg_prevpkt = chim;
2628 
2629 			txr->hn_agg_pktleft--;
2630 			txr->hn_agg_szleft -= pktsize;
2631 			if (txr->hn_agg_szleft <=
2632 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2633 				/*
2634 				 * Probably can't aggregate more packets,
2635 				 * flush this aggregating txdesc proactively.
2636 				 */
2637 				txr->hn_agg_pktleft = 0;
2638 			}
2639 			/* Done! */
2640 			return (chim);
2641 		}
2642 		hn_flush_txagg(ifp, txr);
2643 	}
2644 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2645 
2646 	txr->hn_tx_chimney_tried++;
2647 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
2648 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2649 		return (NULL);
2650 	txr->hn_tx_chimney++;
2651 
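	/* Locate the allocated slot in the chimney sending buffer. */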
2652 	chim = txr->hn_sc->hn_chim +
2653 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2654 
2655 	if (txr->hn_agg_pktmax > 1 &&
2656 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2657 		txr->hn_agg_txd = txd;
2658 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2659 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2660 		txr->hn_agg_prevpkt = chim;
2661 	}
2662 	return (chim);
2663 }
2664 
2665 /*
2666  * NOTE:
2667  * If this function fails, then both txd and m_head0 will be freed.
2668  */
2669 static int
2670 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2671     struct mbuf **m_head0)
2672 {
2673 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2674 	int error, nsegs, i;
2675 	struct mbuf *m_head = *m_head0;
2676 	struct rndis_packet_msg *pkt;
2677 	uint32_t *pi_data;
2678 	void *chim = NULL;
2679 	int pkt_hlen, pkt_size;
2680 
2681 	pkt = txd->rndis_pkt;
2682 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
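	/*
	 * Try chimney (copy) sending only if the packet fits into the
	 * chimney sending buffer; otherwise flush any pending aggregation
	 * and fall through to sglist (gather) sending below.
	 */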
2683 	if (pkt_size < txr->hn_chim_size) {
2684 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2685 		if (chim != NULL)
2686 			pkt = chim;
2687 	} else {
2688 		if (txr->hn_agg_txd != NULL)
2689 			hn_flush_txagg(ifp, txr);
2690 	}
2691 
2692 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2693 	pkt->rm_len = m_head->m_pkthdr.len;
2694 	pkt->rm_dataoffset = 0;
2695 	pkt->rm_datalen = m_head->m_pkthdr.len;
2696 	pkt->rm_oobdataoffset = 0;
2697 	pkt->rm_oobdatalen = 0;
2698 	pkt->rm_oobdataelements = 0;
2699 	pkt->rm_pktinfooffset = sizeof(*pkt);
2700 	pkt->rm_pktinfolen = 0;
2701 	pkt->rm_vchandle = 0;
2702 	pkt->rm_reserved = 0;
2703 
2704 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2705 		/*
2706 		 * Set the hash value for this packet, so that the host could
2707 		 * dispatch the TX done event for this packet back to this TX
2708 		 * ring's channel.
2709 		 */
2710 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2711 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2712 		*pi_data = txr->hn_tx_idx;
2713 	}
2714 
2715 	if (m_head->m_flags & M_VLANTAG) {
2716 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2717 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2718 		*pi_data = NDIS_VLAN_INFO_MAKE(
2719 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2720 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2721 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2722 	}
2723 
2724 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2725 #if defined(INET6) || defined(INET)
2726 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2727 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2728 #ifdef INET
2729 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2730 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2731 			    m_head->m_pkthdr.tso_segsz);
2732 		}
2733 #endif
2734 #if defined(INET6) && defined(INET)
2735 		else
2736 #endif
2737 #ifdef INET6
2738 		{
2739 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2740 			    m_head->m_pkthdr.tso_segsz);
2741 		}
2742 #endif
2743 #endif	/* INET6 || INET */
2744 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2745 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2746 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2747 		if (m_head->m_pkthdr.csum_flags &
2748 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2749 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
2750 		} else {
2751 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
2752 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2753 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
2754 		}
2755 
2756 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2757 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2758 		else if (m_head->m_pkthdr.csum_flags &
2759 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
2760 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2761 	}
2762 
2763 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2764 	/* Fixup RNDIS packet message total length */
2765 	pkt->rm_len += pkt_hlen;
2766 	/* Convert RNDIS packet message offsets */
2767 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2768 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2769 
2770 	/*
2771 	 * Fast path: Chimney sending.
2772 	 */
2773 	if (chim != NULL) {
2774 		struct hn_txdesc *tgt_txd = txd;
2775 
2776 		if (txr->hn_agg_txd != NULL) {
2777 			tgt_txd = txr->hn_agg_txd;
2778 #ifdef INVARIANTS
2779 			*m_head0 = NULL;
2780 #endif
2781 		}
2782 
2783 		KASSERT(pkt == chim,
2784 		    ("RNDIS pkt not in chimney sending buffer"));
2785 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2786 		    ("chimney sending buffer is not used"));
2787 		tgt_txd->chim_size += pkt->rm_len;
2788 
2789 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2790 		    ((uint8_t *)chim) + pkt_hlen);
2791 
2792 		txr->hn_gpa_cnt = 0;
2793 		txr->hn_sendpkt = hn_txpkt_chim;
2794 		goto done;
2795 	}
2796 
2797 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2798 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2799 	    ("chimney buffer is used"));
2800 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2801 
2802 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2803 	if (__predict_false(error)) {
2804 		int freed;
2805 
2806 		/*
2807 		 * This mbuf is not linked w/ the txd yet, so free it now.
2808 		 */
2809 		m_freem(m_head);
2810 		*m_head0 = NULL;
2811 
2812 		freed = hn_txdesc_put(txr, txd);
2813 		KASSERT(freed != 0,
2814 		    ("fail to free txd upon txdma error"));
2815 
2816 		txr->hn_txdma_failed++;
2817 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2818 		return error;
2819 	}
2820 	*m_head0 = m_head;
2821 
2822 	/* +1 RNDIS packet message */
2823 	txr->hn_gpa_cnt = nsegs + 1;
2824 
2825 	/* send packet with page buffer */
2826 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2827 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2828 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2829 
2830 	/*
2831 	 * Fill the page buffers with mbuf info after the page
2832 	 * buffer for RNDIS packet message.
2833 	 */
2834 	for (i = 0; i < nsegs; ++i) {
2835 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2836 
2837 		gpa->gpa_page = atop(segs[i].ds_addr);
2838 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2839 		gpa->gpa_len = segs[i].ds_len;
2840 	}
2841 
2842 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2843 	txd->chim_size = 0;
2844 	txr->hn_sendpkt = hn_txpkt_sglist;
2845 done:
2846 	txd->m = m_head;
2847 
2848 	/* Set the completion routine */
2849 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2850 
2851 	/* Update temporary stats for later use. */
2852 	txr->hn_stat_pkts++;
2853 	txr->hn_stat_size += m_head->m_pkthdr.len;
2854 	if (m_head->m_flags & M_MCAST)
2855 		txr->hn_stat_mcasts++;
2856 
2857 	return 0;
2858 }
2859 
2860 /*
2861  * NOTE:
2862  * If this function fails, then txd will be freed, but the mbuf
2863  * associated w/ the txd will _not_ be freed.
2864  */
2865 static int
2866 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2867 {
2868 	int error, send_failed = 0, has_bpf;
2869 
2870 again:
2871 	has_bpf = bpf_peers_present(ifp->if_bpf);
2872 	if (has_bpf) {
2873 		/*
2874 		 * Make sure that this txd and any aggregated txds are not
2875 		 * freed before ETHER_BPF_MTAP.
2876 		 */
2877 		hn_txdesc_hold(txd);
2878 	}
2879 	error = txr->hn_sendpkt(txr, txd);
2880 	if (!error) {
2881 		if (has_bpf) {
2882 			const struct hn_txdesc *tmp_txd;
2883 
2884 			ETHER_BPF_MTAP(ifp, txd->m);
2885 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2886 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2887 		}
2888 
2889 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2890 #ifdef HN_IFSTART_SUPPORT
2891 		if (!hn_use_if_start)
2892 #endif
2893 		{
2894 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2895 			    txr->hn_stat_size);
2896 			if (txr->hn_stat_mcasts != 0) {
2897 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2898 				    txr->hn_stat_mcasts);
2899 			}
2900 		}
2901 		txr->hn_pkts += txr->hn_stat_pkts;
2902 		txr->hn_sends++;
2903 	}
2904 	if (has_bpf)
2905 		hn_txdesc_put(txr, txd);
2906 
2907 	if (__predict_false(error)) {
2908 		int freed;
2909 
2910 		/*
2911 		 * This should "really rarely" happen.
2912 		 *
2913 		 * XXX Too many RX to be acked or too many sideband
2914 		 * commands to run?  Ask netvsc_channel_rollup()
2915 		 * to kick start later.
2916 		 */
2917 		txr->hn_has_txeof = 1;
2918 		if (!send_failed) {
2919 			txr->hn_send_failed++;
2920 			send_failed = 1;
2921 			/*
2922 			 * Try sending again after setting hn_has_txeof,
2923 			 * in case we missed the last
2924 			 * netvsc_channel_rollup().
2925 			 */
2926 			goto again;
2927 		}
2928 		if_printf(ifp, "send failed\n");
2929 
2930 		/*
2931 		 * Caller will perform further processing on the
2932 		 * associated mbuf, so don't free it in hn_txdesc_put();
2933 		 * only unload it from the DMA map in hn_txdesc_put(),
2934 		 * if it was loaded.
2935 		 */
2936 		txd->m = NULL;
2937 		freed = hn_txdesc_put(txr, txd);
2938 		KASSERT(freed != 0,
2939 		    ("fail to free txd upon send error"));
2940 
2941 		txr->hn_send_failed++;
2942 	}
2943 
2944 	/* Reset temporary stats, after this sending is done. */
2945 	txr->hn_stat_size = 0;
2946 	txr->hn_stat_pkts = 0;
2947 	txr->hn_stat_mcasts = 0;
2948 
2949 	return (error);
2950 }
2951 
2952 /*
2953  * Append the specified data to the indicated mbuf chain.
2954  * Extend the mbuf chain if the new data does not fit in
2955  * the existing space.
2956  *
2957  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2958  * There should be an equivalent in the kernel mbuf code,
2959  * but there does not appear to be one yet.
2960  *
2961  * Differs from m_append() in that additional mbufs are
2962  * allocated with cluster size MJUMPAGESIZE, and filled
2963  * accordingly.
2964  *
2965  * Return 1 if able to complete the job; otherwise 0.
2966  */
2967 static int
2968 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2969 {
2970 	struct mbuf *m, *n;
2971 	int remainder, space;
2972 
2973 	for (m = m0; m->m_next != NULL; m = m->m_next)
2974 		;
2975 	remainder = len;
2976 	space = M_TRAILINGSPACE(m);
2977 	if (space > 0) {
2978 		/*
2979 		 * Copy into available space.
2980 		 */
2981 		if (space > remainder)
2982 			space = remainder;
2983 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2984 		m->m_len += space;
2985 		cp += space;
2986 		remainder -= space;
2987 	}
2988 	while (remainder > 0) {
2989 		/*
2990 		 * Allocate a new mbuf; could check space
2991 		 * and allocate a cluster instead.
2992 		 */
2993 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2994 		if (n == NULL)
2995 			break;
2996 		n->m_len = min(MJUMPAGESIZE, remainder);
2997 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2998 		cp += n->m_len;
2999 		remainder -= n->m_len;
3000 		m->m_next = n;
3001 		m = n;
3002 	}
3003 	if (m0->m_flags & M_PKTHDR)
3004 		m0->m_pkthdr.len += len - remainder;
3005 
3006 	return (remainder == 0);
3007 }
3008 
3009 #if defined(INET) || defined(INET6)
3010 static __inline int
3011 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3012 {
3013 #if __FreeBSD_version >= 1100095
3014 	if (hn_lro_mbufq_depth) {
3015 		tcp_lro_queue_mbuf(lc, m);
3016 		return 0;
3017 	}
3018 #endif
3019 	return tcp_lro_rx(lc, m, 0);
3020 }
3021 #endif
3022 
3023 static int
3024 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3025     const struct hn_rxinfo *info)
3026 {
3027 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3028 	struct mbuf *m_new;
3029 	int size, do_lro = 0, do_csum = 1;
3030 	int hash_type;
3031 
3032 	/*
3033 	 * If the non-transparent mode VF is active, inject this packet
3034 	 * into the VF.
3035 	 */
3036 	ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : hn_ifp;
3037 
3038 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3039 		/*
3040 		 * NOTE:
3041 		 * See the NOTE of hn_rndis_init_fixat().  This
3042 		 * function can be reached, immediately after the
3043 		 * RNDIS is initialized but before the ifnet is
3044 		 * setup on the hn_attach() path; drop the unexpected
3045 		 * packets.
3046 		 */
3047 		return (0);
3048 	}
3049 
3050 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3051 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3052 		return (0);
3053 	}
3054 
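	/* Small packets fit within a packet-header mbuf; copy them directly. */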
3055 	if (dlen <= MHLEN) {
3056 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3057 		if (m_new == NULL) {
3058 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3059 			return (0);
3060 		}
3061 		memcpy(mtod(m_new, void *), data, dlen);
3062 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3063 		rxr->hn_small_pkts++;
3064 	} else {
3065 		/*
3066 		 * Get an mbuf with a cluster.  For packets 2K or less,
3067 		 * get a standard 2K cluster.  For anything larger, get a
3068 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3069 		 * if looped around to the Hyper-V TX channel, so avoid them.
3070 		 */
3071 		size = MCLBYTES;
3072 		if (dlen > MCLBYTES) {
3073 			/* 4096 */
3074 			size = MJUMPAGESIZE;
3075 		}
3076 
3077 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3078 		if (m_new == NULL) {
3079 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3080 			return (0);
3081 		}
3082 
3083 		hv_m_append(m_new, dlen, data);
3084 	}
3085 	m_new->m_pkthdr.rcvif = ifp;
3086 
3087 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3088 		do_csum = 0;
3089 
3090 	/* receive side checksum offload */
3091 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3092 		/* IP csum offload */
3093 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3094 			m_new->m_pkthdr.csum_flags |=
3095 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3096 			rxr->hn_csum_ip++;
3097 		}
3098 
3099 		/* TCP/UDP csum offload */
3100 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3101 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3102 			m_new->m_pkthdr.csum_flags |=
3103 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3104 			m_new->m_pkthdr.csum_data = 0xffff;
3105 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3106 				rxr->hn_csum_tcp++;
3107 			else
3108 				rxr->hn_csum_udp++;
3109 		}
3110 
3111 		/*
3112 		 * XXX
3113 		 * As of this writing (Oct 28th, 2016), the host side will turn
3114 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3115 		 * the do_lro setting here is actually _not_ accurate.  We
3116 		 * depend on the RSS hash type check to reset do_lro.
3117 		 */
3118 		if ((info->csum_info &
3119 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3120 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3121 			do_lro = 1;
3122 	} else {
3123 		const struct ether_header *eh;
3124 		uint16_t etype;
3125 		int hoff;
3126 
3127 		hoff = sizeof(*eh);
3128 		/* Checked at the beginning of this function. */
3129 		KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3130 
3131 		eh = mtod(m_new, struct ether_header *);
3132 		etype = ntohs(eh->ether_type);
3133 		if (etype == ETHERTYPE_VLAN) {
3134 			const struct ether_vlan_header *evl;
3135 
3136 			hoff = sizeof(*evl);
3137 			if (m_new->m_len < hoff)
3138 				goto skip;
3139 			evl = mtod(m_new, struct ether_vlan_header *);
3140 			etype = ntohs(evl->evl_proto);
3141 		}
3142 
3143 		if (etype == ETHERTYPE_IP) {
3144 			int pr;
3145 
3146 			pr = hn_check_iplen(m_new, hoff);
3147 			if (pr == IPPROTO_TCP) {
3148 				if (do_csum &&
3149 				    (rxr->hn_trust_hcsum &
3150 				     HN_TRUST_HCSUM_TCP)) {
3151 					rxr->hn_csum_trusted++;
3152 					m_new->m_pkthdr.csum_flags |=
3153 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3154 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3155 					m_new->m_pkthdr.csum_data = 0xffff;
3156 				}
3157 				do_lro = 1;
3158 			} else if (pr == IPPROTO_UDP) {
3159 				if (do_csum &&
3160 				    (rxr->hn_trust_hcsum &
3161 				     HN_TRUST_HCSUM_UDP)) {
3162 					rxr->hn_csum_trusted++;
3163 					m_new->m_pkthdr.csum_flags |=
3164 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3165 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3166 					m_new->m_pkthdr.csum_data = 0xffff;
3167 				}
3168 			} else if (pr != IPPROTO_DONE && do_csum &&
3169 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3170 				rxr->hn_csum_trusted++;
3171 				m_new->m_pkthdr.csum_flags |=
3172 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3173 			}
3174 		}
3175 	}
3176 skip:
3177 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3178 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3179 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3180 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3181 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3182 		m_new->m_flags |= M_VLANTAG;
3183 	}
3184 
3185 	/*
3186 	 * If a VF is activated (transparent/non-transparent mode does not
3187 	 * matter here).
3188 	 *
3189 	 * - Don't setup mbuf hash, if 'options RSS' is set.
3190 	 *
3191 	 *   In Azure, when VF is activated, TCP SYN and SYN|ACK go
3192 	 *   through hn(4) while the rest of segments and ACKs belonging
3193 	 *   to the same TCP 4-tuple go through the VF.  So don't setup
3194 	 *   mbuf hash, if a VF is activated and 'options RSS' is not
3195 	 *   enabled.  hn(4) and the VF may use neither the same RSS
3196 	 *   hash key nor the same RSS hash function, so the hash value
3197 	 *   for packets belonging to the same flow could be different!
3198 	 *
3199 	 * - Disable LRO
3200 	 *
3201 	 *   hn(4) will only receive broadcast packets, multicast packets,
3202 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3203 	 *   packet types.
3204 	 *
3205 	 *   For non-transparent mode, we definitely _cannot_ enable LRO at
3206 	 *   all, since the LRO flush will use hn(4) as the receiving
3207 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3208 	 */
3209 	if (hn_ifp != ifp || (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF)) {
3210 		do_lro = 0;	/* disable LRO. */
3211 #ifndef RSS
3212 		goto skip_hash;	/* skip mbuf hash setup */
3213 #endif
3214 	}
3215 
3216 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3217 		rxr->hn_rss_pkts++;
3218 		m_new->m_pkthdr.flowid = info->hash_value;
3219 		hash_type = M_HASHTYPE_OPAQUE_HASH;
3220 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3221 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3222 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
3223 
3224 			/*
3225 			 * NOTE:
3226 			 * do_lro is reset, if the hash types are not TCP
3227 			 * related.  See the comment in the above csum_flags
3228 			 * setup section.
3229 			 */
3230 			switch (type) {
3231 			case NDIS_HASH_IPV4:
3232 				hash_type = M_HASHTYPE_RSS_IPV4;
3233 				do_lro = 0;
3234 				break;
3235 
3236 			case NDIS_HASH_TCP_IPV4:
3237 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3238 				break;
3239 
3240 			case NDIS_HASH_IPV6:
3241 				hash_type = M_HASHTYPE_RSS_IPV6;
3242 				do_lro = 0;
3243 				break;
3244 
3245 			case NDIS_HASH_IPV6_EX:
3246 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3247 				do_lro = 0;
3248 				break;
3249 
3250 			case NDIS_HASH_TCP_IPV6:
3251 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3252 				break;
3253 
3254 			case NDIS_HASH_TCP_IPV6_EX:
3255 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3256 				break;
3257 			}
3258 		}
3259 	} else {
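		/*
		 * The host did not provide a hash value; fall back to the
		 * RX ring index as an opaque flow id.
		 */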
3260 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3261 		hash_type = M_HASHTYPE_OPAQUE;
3262 	}
3263 	M_HASHTYPE_SET(m_new, hash_type);
3264 
3265 #ifndef RSS
3266 skip_hash:
3267 #endif
3268 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3269 	if (hn_ifp != ifp) {
3270 		const struct ether_header *eh;
3271 
3272 		/*
3273 		 * Non-transparent mode VF is activated.
3274 		 */
3275 
3276 		/*
3277 		 * Allow tapping on hn(4).
3278 		 */
3279 		ETHER_BPF_MTAP(hn_ifp, m_new);
3280 
3281 		/*
3282 		 * Update hn(4)'s stats.
3283 		 */
3284 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3285 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3286 		/* Checked at the beginning of this function. */
3287 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3288 		eh = mtod(m_new, struct ether_header *);
3289 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3290 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3291 	}
3292 	rxr->hn_pkts++;
3293 
3294 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3295 #if defined(INET) || defined(INET6)
3296 		struct lro_ctrl *lro = &rxr->hn_lro;
3297 
3298 		if (lro->lro_cnt) {
3299 			rxr->hn_lro_tried++;
3300 			if (hn_lro_rx(lro, m_new) == 0) {
3301 				/* DONE! */
3302 				return 0;
3303 			}
3304 		}
3305 #endif
3306 	}
3307 	ifp->if_input(ifp, m_new);
3308 
3309 	return (0);
3310 }
3311 
3312 static int
3313 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3314 {
3315 	struct hn_softc *sc = ifp->if_softc;
3316 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3317 	struct ifnet *vf_ifp;
3318 	int mask, error = 0;
3319 
3320 	switch (cmd) {
3321 	case SIOCSIFMTU:
3322 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3323 			error = EINVAL;
3324 			break;
3325 		}
3326 
3327 		HN_LOCK(sc);
3328 
3329 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3330 			HN_UNLOCK(sc);
3331 			break;
3332 		}
3333 
3334 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3335 			/* Can't change MTU */
3336 			HN_UNLOCK(sc);
3337 			error = EOPNOTSUPP;
3338 			break;
3339 		}
3340 
3341 		if (ifp->if_mtu == ifr->ifr_mtu) {
3342 			HN_UNLOCK(sc);
3343 			break;
3344 		}
3345 
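		/*
		 * Propagate the MTU change to the transparent VF first;
		 * bail out if the VF rejects the new MTU.
		 */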
3346 		if (hn_xpnt_vf_isready(sc)) {
3347 			vf_ifp = sc->hn_vf_ifp;
3348 			ifr_vf = *ifr;
3349 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3350 			    sizeof(ifr_vf.ifr_name));
3351 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3352 			    (caddr_t)&ifr_vf);
3353 			if (error) {
3354 				HN_UNLOCK(sc);
3355 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3356 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3357 				break;
3358 			}
3359 		}
3360 
3361 		/*
3362 		 * Suspend this interface before the synthetic parts
3363 		 * are ripped out.
3364 		 */
3365 		hn_suspend(sc);
3366 
3367 		/*
3368 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3369 		 */
3370 		hn_synth_detach(sc);
3371 
3372 		/*
3373 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3374 		 * with the new MTU setting.
3375 		 */
3376 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3377 		if (error) {
3378 			HN_UNLOCK(sc);
3379 			break;
3380 		}
3381 
3382 		/*
3383 		 * Commit the requested MTU, after the synthetic parts
3384 		 * have been successfully attached.
3385 		 */
3386 		ifp->if_mtu = ifr->ifr_mtu;
3387 
3388 		/*
3389 		 * Synthetic parts' reattach may change the chimney
3390 		 * sending size; update it.
3391 		 */
3392 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3393 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3394 
3395 		/*
3396 		 * Make sure that various parameters based on MTU are
3397 		 * still valid, after the MTU change.
3398 		 */
3399 		hn_mtu_change_fixup(sc);
3400 
3401 		/*
3402 		 * All done!  Resume the interface now.
3403 		 */
3404 		hn_resume(sc);
3405 
3406 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3407 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3408 			/*
3409 			 * Since we have reattached the NVS part,
3410 			 * change the datapath to the VF again, in case
3411 			 * it was lost after the NVS was detached.
3412 			 */
3413 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3414 		}
3415 
3416 		HN_UNLOCK(sc);
3417 		break;
3418 
3419 	case SIOCSIFFLAGS:
3420 		HN_LOCK(sc);
3421 
3422 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3423 			HN_UNLOCK(sc);
3424 			break;
3425 		}
3426 
3427 		if (hn_xpnt_vf_isready(sc))
3428 			hn_xpnt_vf_saveifflags(sc);
3429 
3430 		if (ifp->if_flags & IFF_UP) {
3431 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3432 				/*
3433 				 * Caller might hold a mutex, e.g.
3434 				 * bpf; use busy-wait for the RNDIS
3435 				 * reply.
3436 				 */
3437 				HN_NO_SLEEPING(sc);
3438 				hn_rxfilter_config(sc);
3439 				HN_SLEEPING_OK(sc);
3440 
3441 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3442 					error = hn_xpnt_vf_iocsetflags(sc);
3443 			} else {
3444 				hn_init_locked(sc);
3445 			}
3446 		} else {
3447 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3448 				hn_stop(sc, false);
3449 		}
3450 		sc->hn_if_flags = ifp->if_flags;
3451 
3452 		HN_UNLOCK(sc);
3453 		break;
3454 
3455 	case SIOCSIFCAP:
3456 		HN_LOCK(sc);
3457 
3458 		if (hn_xpnt_vf_isready(sc)) {
3459 			ifr_vf = *ifr;
3460 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3461 			    sizeof(ifr_vf.ifr_name));
3462 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3463 			HN_UNLOCK(sc);
3464 			break;
3465 		}
3466 
3467 		/*
3468 		 * Fix up requested capabilities w/ supported capabilities,
3469 		 * since the supported capabilities could have been changed.
3470 		 */
3471 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3472 		    ifp->if_capenable;
3473 
3474 		if (mask & IFCAP_TXCSUM) {
3475 			ifp->if_capenable ^= IFCAP_TXCSUM;
3476 			if (ifp->if_capenable & IFCAP_TXCSUM)
3477 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3478 			else
3479 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3480 		}
3481 		if (mask & IFCAP_TXCSUM_IPV6) {
3482 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3483 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3484 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3485 			else
3486 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3487 		}
3488 
3489 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3490 		if (mask & IFCAP_RXCSUM)
3491 			ifp->if_capenable ^= IFCAP_RXCSUM;
3492 #ifdef foo
3493 		/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
3494 		if (mask & IFCAP_RXCSUM_IPV6)
3495 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3496 #endif
3497 
3498 		if (mask & IFCAP_LRO)
3499 			ifp->if_capenable ^= IFCAP_LRO;
3500 
3501 		if (mask & IFCAP_TSO4) {
3502 			ifp->if_capenable ^= IFCAP_TSO4;
3503 			if (ifp->if_capenable & IFCAP_TSO4)
3504 				ifp->if_hwassist |= CSUM_IP_TSO;
3505 			else
3506 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3507 		}
3508 		if (mask & IFCAP_TSO6) {
3509 			ifp->if_capenable ^= IFCAP_TSO6;
3510 			if (ifp->if_capenable & IFCAP_TSO6)
3511 				ifp->if_hwassist |= CSUM_IP6_TSO;
3512 			else
3513 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3514 		}
3515 
3516 		HN_UNLOCK(sc);
3517 		break;
3518 
3519 	case SIOCADDMULTI:
3520 	case SIOCDELMULTI:
3521 		HN_LOCK(sc);
3522 
3523 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3524 			HN_UNLOCK(sc);
3525 			break;
3526 		}
3527 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3528 			/*
3529 			 * Multicast handling uses a mutex; use busy-wait for
3530 			 * the RNDIS reply.
3531 			 */
3532 			HN_NO_SLEEPING(sc);
3533 			hn_rxfilter_config(sc);
3534 			HN_SLEEPING_OK(sc);
3535 		}
3536 
3537 		/* XXX vlan(4) style mcast addr maintenance */
3538 		if (hn_xpnt_vf_isready(sc)) {
3539 			int old_if_flags;
3540 
3541 			old_if_flags = sc->hn_vf_ifp->if_flags;
3542 			hn_xpnt_vf_saveifflags(sc);
3543 
3544 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3545 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3546 			     IFF_ALLMULTI))
3547 				error = hn_xpnt_vf_iocsetflags(sc);
3548 		}
3549 
3550 		HN_UNLOCK(sc);
3551 		break;
3552 
3553 	case SIOCSIFMEDIA:
3554 	case SIOCGIFMEDIA:
3555 		HN_LOCK(sc);
3556 		if (hn_xpnt_vf_isready(sc)) {
3557 			/*
3558 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3559 			 * create and pass ifr_vf to the VF here; just
3560 			 * replace the ifr_name.
3561 			 */
3562 			vf_ifp = sc->hn_vf_ifp;
3563 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3564 			    sizeof(ifr->ifr_name));
3565 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3566 			/* Restore the ifr_name. */
3567 			strlcpy(ifr->ifr_name, ifp->if_xname,
3568 			    sizeof(ifr->ifr_name));
3569 			HN_UNLOCK(sc);
3570 			break;
3571 		}
3572 		HN_UNLOCK(sc);
3573 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3574 		break;
3575 
3576 	default:
3577 		error = ether_ioctl(ifp, cmd, data);
3578 		break;
3579 	}
3580 	return (error);
3581 }
3582 
3583 static void
3584 hn_stop(struct hn_softc *sc, bool detaching)
3585 {
3586 	struct ifnet *ifp = sc->hn_ifp;
3587 	int i;
3588 
3589 	HN_LOCK_ASSERT(sc);
3590 
3591 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3592 	    ("synthetic parts were not attached"));
3593 
3594 	/* Clear RUNNING bit ASAP. */
3595 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3596 
3597 	/* Disable polling. */
3598 	hn_polling(sc, 0);
3599 
3600 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3601 		KASSERT(sc->hn_vf_ifp != NULL,
3602 		    ("%s: VF is not attached", ifp->if_xname));
3603 
3604 		/* Mark transparent mode VF as disabled. */
3605 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3606 
3607 		/*
3608 		 * NOTE:
3609 		 * Datapath setting must happen _before_ bringing
3610 		 * the VF down.
3611 		 */
3612 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3613 
3614 		/*
3615 		 * Bring the VF down.
3616 		 */
3617 		hn_xpnt_vf_saveifflags(sc);
3618 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3619 		hn_xpnt_vf_iocsetflags(sc);
3620 	}
3621 
3622 	/* Suspend data transfers. */
3623 	hn_suspend_data(sc);
3624 
3625 	/* Clear OACTIVE bit. */
3626 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3627 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3628 		sc->hn_tx_ring[i].hn_oactive = 0;
3629 
3630 	/*
3631 	 * If the non-transparent mode VF is active, make sure
3632 	 * that the RX filter still allows packet reception.
3633 	 */
3634 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3635 		hn_rxfilter_config(sc);
3636 }
3637 
3638 static void
3639 hn_init_locked(struct hn_softc *sc)
3640 {
3641 	struct ifnet *ifp = sc->hn_ifp;
3642 	int i;
3643 
3644 	HN_LOCK_ASSERT(sc);
3645 
3646 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3647 		return;
3648 
3649 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3650 		return;
3651 
3652 	/* Configure RX filter */
3653 	hn_rxfilter_config(sc);
3654 
3655 	/* Clear OACTIVE bit. */
3656 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3657 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3658 		sc->hn_tx_ring[i].hn_oactive = 0;
3659 
3660 	/* Clear TX 'suspended' bit. */
3661 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3662 
3663 	if (hn_xpnt_vf_isready(sc)) {
3664 		/* Initialize transparent VF. */
3665 		hn_xpnt_vf_init(sc);
3666 	}
3667 
3668 	/* Everything is ready; unleash! */
3669 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3670 
3671 	/* Re-enable polling if requested. */
3672 	if (sc->hn_pollhz > 0)
3673 		hn_polling(sc, sc->hn_pollhz);
3674 }
3675 
3676 static void
3677 hn_init(void *xsc)
3678 {
3679 	struct hn_softc *sc = xsc;
3680 
3681 	HN_LOCK(sc);
3682 	hn_init_locked(sc);
3683 	HN_UNLOCK(sc);
3684 }
3685 
3686 #if __FreeBSD_version >= 1100099
3687 
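/*
 * LRO tuning sysctl handlers, available only on FreeBSD versions with
 * the extended LRO fields.  The length limit is applied to every RX
 * ring; the ACK count sysctl exposes the aggregation limit, i.e. the
 * per-ring append count limit plus one.
 */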
3688 static int
3689 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
3690 {
3691 	struct hn_softc *sc = arg1;
3692 	unsigned int lenlim;
3693 	int error;
3694 
3695 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
3696 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
3697 	if (error || req->newptr == NULL)
3698 		return error;
3699 
3700 	HN_LOCK(sc);
3701 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
3702 	    lenlim > TCP_LRO_LENGTH_MAX) {
3703 		HN_UNLOCK(sc);
3704 		return EINVAL;
3705 	}
3706 	hn_set_lro_lenlim(sc, lenlim);
3707 	HN_UNLOCK(sc);
3708 
3709 	return 0;
3710 }
3711 
3712 static int
3713 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
3714 {
3715 	struct hn_softc *sc = arg1;
3716 	int ackcnt, error, i;
3717 
3718 	/*
3719 	 * lro_ackcnt_lim is the append count limit;
3720 	 * +1 turns it into the aggregation limit.
3721 	 */
3722 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
3723 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
3724 	if (error || req->newptr == NULL)
3725 		return error;
3726 
3727 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
3728 		return EINVAL;
3729 
3730 	/*
3731 	 * Convert aggregation limit back to append
3732 	 * count limit.
3733 	 */
3734 	--ackcnt;
3735 	HN_LOCK(sc);
3736 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3737 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
3738 	HN_UNLOCK(sc);
3739 	return 0;
3740 }
3741 
3742 #endif
3743 
3744 static int
3745 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3746 {
3747 	struct hn_softc *sc = arg1;
3748 	int hcsum = arg2;
3749 	int on, error, i;
3750 
3751 	on = 0;
3752 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3753 		on = 1;
3754 
3755 	error = sysctl_handle_int(oidp, &on, 0, req);
3756 	if (error || req->newptr == NULL)
3757 		return error;
3758 
3759 	HN_LOCK(sc);
3760 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3761 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3762 
3763 		if (on)
3764 			rxr->hn_trust_hcsum |= hcsum;
3765 		else
3766 			rxr->hn_trust_hcsum &= ~hcsum;
3767 	}
3768 	HN_UNLOCK(sc);
3769 	return 0;
3770 }
3771 
3772 static int
3773 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3774 {
3775 	struct hn_softc *sc = arg1;
3776 	int chim_size, error;
3777 
3778 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
3779 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
3780 	if (error || req->newptr == NULL)
3781 		return error;
3782 
3783 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3784 		return EINVAL;
3785 
3786 	HN_LOCK(sc);
3787 	hn_set_chim_size(sc, chim_size);
3788 	HN_UNLOCK(sc);
3789 	return 0;
3790 }
3791 
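/*
 * The statistics handlers below use arg2 as the byte offset of a
 * counter field within struct hn_rx_ring or struct hn_tx_ring.
 * Reading such a sysctl sums the counter across all rings; writing
 * any value to it zeroes the counter on every ring.
 * hn_tx_conf_int_sysctl uses the same offset trick to read a setting
 * from the first TX ring and propagate a new value to all TX rings.
 */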
3792 #if __FreeBSD_version < 1100095
3793 static int
3794 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3795 {
3796 	struct hn_softc *sc = arg1;
3797 	int ofs = arg2, i, error;
3798 	struct hn_rx_ring *rxr;
3799 	uint64_t stat;
3800 
3801 	stat = 0;
3802 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3803 		rxr = &sc->hn_rx_ring[i];
3804 		stat += *((int *)((uint8_t *)rxr + ofs));
3805 	}
3806 
3807 	error = sysctl_handle_64(oidp, &stat, 0, req);
3808 	if (error || req->newptr == NULL)
3809 		return error;
3810 
3811 	/* Zero out this stat. */
3812 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3813 		rxr = &sc->hn_rx_ring[i];
3814 		*((int *)((uint8_t *)rxr + ofs)) = 0;
3815 	}
3816 	return 0;
3817 }
3818 #else
3819 static int
3820 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3821 {
3822 	struct hn_softc *sc = arg1;
3823 	int ofs = arg2, i, error;
3824 	struct hn_rx_ring *rxr;
3825 	uint64_t stat;
3826 
3827 	stat = 0;
3828 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3829 		rxr = &sc->hn_rx_ring[i];
3830 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3831 	}
3832 
3833 	error = sysctl_handle_64(oidp, &stat, 0, req);
3834 	if (error || req->newptr == NULL)
3835 		return error;
3836 
3837 	/* Zero out this stat. */
3838 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3839 		rxr = &sc->hn_rx_ring[i];
3840 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3841 	}
3842 	return 0;
3843 }
3844 
3845 #endif
3846 
3847 static int
3848 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3849 {
3850 	struct hn_softc *sc = arg1;
3851 	int ofs = arg2, i, error;
3852 	struct hn_rx_ring *rxr;
3853 	u_long stat;
3854 
3855 	stat = 0;
3856 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3857 		rxr = &sc->hn_rx_ring[i];
3858 		stat += *((u_long *)((uint8_t *)rxr + ofs));
3859 	}
3860 
3861 	error = sysctl_handle_long(oidp, &stat, 0, req);
3862 	if (error || req->newptr == NULL)
3863 		return error;
3864 
3865 	/* Zero out this stat. */
3866 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3867 		rxr = &sc->hn_rx_ring[i];
3868 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
3869 	}
3870 	return 0;
3871 }
3872 
3873 static int
3874 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3875 {
3876 	struct hn_softc *sc = arg1;
3877 	int ofs = arg2, i, error;
3878 	struct hn_tx_ring *txr;
3879 	u_long stat;
3880 
3881 	stat = 0;
3882 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3883 		txr = &sc->hn_tx_ring[i];
3884 		stat += *((u_long *)((uint8_t *)txr + ofs));
3885 	}
3886 
3887 	error = sysctl_handle_long(oidp, &stat, 0, req);
3888 	if (error || req->newptr == NULL)
3889 		return error;
3890 
3891 	/* Zero out this stat. */
3892 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3893 		txr = &sc->hn_tx_ring[i];
3894 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
3895 	}
3896 	return 0;
3897 }
3898 
3899 static int
3900 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3901 {
3902 	struct hn_softc *sc = arg1;
3903 	int ofs = arg2, i, error, conf;
3904 	struct hn_tx_ring *txr;
3905 
3906 	txr = &sc->hn_tx_ring[0];
3907 	conf = *((int *)((uint8_t *)txr + ofs));
3908 
3909 	error = sysctl_handle_int(oidp, &conf, 0, req);
3910 	if (error || req->newptr == NULL)
3911 		return error;
3912 
3913 	HN_LOCK(sc);
3914 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3915 		txr = &sc->hn_tx_ring[i];
3916 		*((int *)((uint8_t *)txr + ofs)) = conf;
3917 	}
3918 	HN_UNLOCK(sc);
3919 
3920 	return 0;
3921 }
3922 
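/*
 * Packet transmission aggregation knobs: hn_agg_size and hn_agg_pkts
 * record the requested limits and hn_set_txagg() recomputes the values
 * actually applied to each TX ring, while agg_pktmax and agg_align
 * merely report the applied per-ring values.
 */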
3923 static int
3924 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3925 {
3926 	struct hn_softc *sc = arg1;
3927 	int error, size;
3928 
3929 	size = sc->hn_agg_size;
3930 	error = sysctl_handle_int(oidp, &size, 0, req);
3931 	if (error || req->newptr == NULL)
3932 		return (error);
3933 
3934 	HN_LOCK(sc);
3935 	sc->hn_agg_size = size;
3936 	hn_set_txagg(sc);
3937 	HN_UNLOCK(sc);
3938 
3939 	return (0);
3940 }
3941 
3942 static int
3943 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3944 {
3945 	struct hn_softc *sc = arg1;
3946 	int error, pkts;
3947 
3948 	pkts = sc->hn_agg_pkts;
3949 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3950 	if (error || req->newptr == NULL)
3951 		return (error);
3952 
3953 	HN_LOCK(sc);
3954 	sc->hn_agg_pkts = pkts;
3955 	hn_set_txagg(sc);
3956 	HN_UNLOCK(sc);
3957 
3958 	return (0);
3959 }
3960 
3961 static int
3962 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3963 {
3964 	struct hn_softc *sc = arg1;
3965 	int pkts;
3966 
3967 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3968 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3969 }
3970 
3971 static int
3972 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3973 {
3974 	struct hn_softc *sc = arg1;
3975 	int align;
3976 
3977 	align = sc->hn_tx_ring[0].hn_agg_align;
3978 	return (sysctl_handle_int(oidp, &align, 0, req));
3979 }
3980 
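/*
 * Channel polling: a poll frequency of 0 means interrupt driven
 * operation, while a non-zero frequency switches the channel to
 * polling mode at that rate.  hn_polling() applies the setting to the
 * primary channel and to all sub-channels currently in use.
 */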
3981 static void
3982 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3983 {
3984 	if (pollhz == 0)
3985 		vmbus_chan_poll_disable(chan);
3986 	else
3987 		vmbus_chan_poll_enable(chan, pollhz);
3988 }
3989 
3990 static void
3991 hn_polling(struct hn_softc *sc, u_int pollhz)
3992 {
3993 	int nsubch = sc->hn_rx_ring_inuse - 1;
3994 
3995 	HN_LOCK_ASSERT(sc);
3996 
3997 	if (nsubch > 0) {
3998 		struct vmbus_channel **subch;
3999 		int i;
4000 
4001 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4002 		for (i = 0; i < nsubch; ++i)
4003 			hn_chan_polling(subch[i], pollhz);
4004 		vmbus_subchan_rel(subch, nsubch);
4005 	}
4006 	hn_chan_polling(sc->hn_prichan, pollhz);
4007 }
4008 
4009 static int
4010 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4011 {
4012 	struct hn_softc *sc = arg1;
4013 	int pollhz, error;
4014 
4015 	pollhz = sc->hn_pollhz;
4016 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4017 	if (error || req->newptr == NULL)
4018 		return (error);
4019 
4020 	if (pollhz != 0 &&
4021 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4022 		return (EINVAL);
4023 
4024 	HN_LOCK(sc);
4025 	if (sc->hn_pollhz != pollhz) {
4026 		sc->hn_pollhz = pollhz;
4027 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4028 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4029 			hn_polling(sc, sc->hn_pollhz);
4030 	}
4031 	HN_UNLOCK(sc);
4032 
4033 	return (0);
4034 }
4035 
4036 static int
4037 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4038 {
4039 	struct hn_softc *sc = arg1;
4040 	char verstr[16];
4041 
4042 	snprintf(verstr, sizeof(verstr), "%u.%u",
4043 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4044 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4045 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4046 }
4047 
4048 static int
4049 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4050 {
4051 	struct hn_softc *sc = arg1;
4052 	char caps_str[128];
4053 	uint32_t caps;
4054 
4055 	HN_LOCK(sc);
4056 	caps = sc->hn_caps;
4057 	HN_UNLOCK(sc);
4058 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4059 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4060 }
4061 
4062 static int
4063 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4064 {
4065 	struct hn_softc *sc = arg1;
4066 	char assist_str[128];
4067 	uint32_t hwassist;
4068 
4069 	HN_LOCK(sc);
4070 	hwassist = sc->hn_ifp->if_hwassist;
4071 	HN_UNLOCK(sc);
4072 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4073 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4074 }
4075 
4076 static int
4077 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4078 {
4079 	struct hn_softc *sc = arg1;
4080 	char filter_str[128];
4081 	uint32_t filter;
4082 
4083 	HN_LOCK(sc);
4084 	filter = sc->hn_rx_filter;
4085 	HN_UNLOCK(sc);
4086 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4087 	    NDIS_PACKET_TYPES);
4088 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4089 }
4090 
4091 #ifndef RSS
4092 
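/*
 * RSS key and indirect table sysctls, used only when the kernel RSS
 * option is not enabled.  A newly written key is always saved; RSS is
 * reconfigured only when more than one RX ring is in use.  The
 * indirect table may be changed only while the interface is actually
 * RSS capable, i.e. more than one RX ring is in use.
 */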
4093 static int
4094 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4095 {
4096 	struct hn_softc *sc = arg1;
4097 	int error;
4098 
4099 	HN_LOCK(sc);
4100 
4101 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4102 	if (error || req->newptr == NULL)
4103 		goto back;
4104 
4105 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4106 	if (error)
4107 		goto back;
4108 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4109 
4110 	if (sc->hn_rx_ring_inuse > 1) {
4111 		error = hn_rss_reconfig(sc);
4112 	} else {
4113 		/* Not RSS capable, at least for now; just save the RSS key. */
4114 		error = 0;
4115 	}
4116 back:
4117 	HN_UNLOCK(sc);
4118 	return (error);
4119 }
4120 
4121 static int
4122 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4123 {
4124 	struct hn_softc *sc = arg1;
4125 	int error;
4126 
4127 	HN_LOCK(sc);
4128 
4129 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4130 	if (error || req->newptr == NULL)
4131 		goto back;
4132 
4133 	/*
4134 	 * Don't allow RSS indirect table changes if this interface is
4135 	 * not currently RSS capable.
4136 	 */
4137 	if (sc->hn_rx_ring_inuse == 1) {
4138 		error = EOPNOTSUPP;
4139 		goto back;
4140 	}
4141 
4142 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4143 	if (error)
4144 		goto back;
4145 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4146 
4147 	hn_rss_ind_fixup(sc);
4148 	error = hn_rss_reconfig(sc);
4149 back:
4150 	HN_UNLOCK(sc);
4151 	return (error);
4152 }
4153 
4154 #endif	/* !RSS */
4155 
4156 static int
4157 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4158 {
4159 	struct hn_softc *sc = arg1;
4160 	char hash_str[128];
4161 	uint32_t hash;
4162 
4163 	HN_LOCK(sc);
4164 	hash = sc->hn_rss_hash;
4165 	HN_UNLOCK(sc);
4166 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4167 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4168 }
4169 
4170 static int
4171 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4172 {
4173 	struct hn_softc *sc = arg1;
4174 	char vf_name[IFNAMSIZ + 1];
4175 	struct ifnet *vf_ifp;
4176 
4177 	HN_LOCK(sc);
4178 	vf_name[0] = '\0';
4179 	vf_ifp = sc->hn_vf_ifp;
4180 	if (vf_ifp != NULL)
4181 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4182 	HN_UNLOCK(sc);
4183 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4184 }
4185 
4186 static int
4187 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4188 {
4189 	struct hn_softc *sc = arg1;
4190 	char vf_name[IFNAMSIZ + 1];
4191 	struct ifnet *vf_ifp;
4192 
4193 	HN_LOCK(sc);
4194 	vf_name[0] = '\0';
4195 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4196 	if (vf_ifp != NULL)
4197 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4198 	HN_UNLOCK(sc);
4199 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4200 }
4201 
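/*
 * hn_vflist_sysctl and hn_vfmap_sysctl walk the global hn_vfmap, which
 * is indexed by ifnet index, under hn_vfmap_lock.  The former lists
 * the VF interface names; the latter prints "VF:hn" name pairs showing
 * which hn(4) interface each VF is mapped to.
 */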
4202 static int
4203 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4204 {
4205 	struct rm_priotracker pt;
4206 	struct sbuf *sb;
4207 	int error, i;
4208 	bool first;
4209 
4210 	error = sysctl_wire_old_buffer(req, 0);
4211 	if (error != 0)
4212 		return (error);
4213 
4214 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4215 	if (sb == NULL)
4216 		return (ENOMEM);
4217 
4218 	rm_rlock(&hn_vfmap_lock, &pt);
4219 
4220 	first = true;
4221 	for (i = 0; i < hn_vfmap_size; ++i) {
4222 		struct ifnet *ifp;
4223 
4224 		if (hn_vfmap[i] == NULL)
4225 			continue;
4226 
4227 		ifp = ifnet_byindex(i);
4228 		if (ifp != NULL) {
4229 			if (first)
4230 				sbuf_printf(sb, "%s", ifp->if_xname);
4231 			else
4232 				sbuf_printf(sb, " %s", ifp->if_xname);
4233 			first = false;
4234 		}
4235 	}
4236 
4237 	rm_runlock(&hn_vfmap_lock, &pt);
4238 
4239 	error = sbuf_finish(sb);
4240 	sbuf_delete(sb);
4241 	return (error);
4242 }
4243 
4244 static int
4245 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4246 {
4247 	struct rm_priotracker pt;
4248 	struct sbuf *sb;
4249 	int error, i;
4250 	bool first;
4251 
4252 	error = sysctl_wire_old_buffer(req, 0);
4253 	if (error != 0)
4254 		return (error);
4255 
4256 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4257 	if (sb == NULL)
4258 		return (ENOMEM);
4259 
4260 	rm_rlock(&hn_vfmap_lock, &pt);
4261 
4262 	first = true;
4263 	for (i = 0; i < hn_vfmap_size; ++i) {
4264 		struct ifnet *ifp, *hn_ifp;
4265 
4266 		hn_ifp = hn_vfmap[i];
4267 		if (hn_ifp == NULL)
4268 			continue;
4269 
4270 		ifp = ifnet_byindex(i);
4271 		if (ifp != NULL) {
4272 			if (first) {
4273 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4274 				    hn_ifp->if_xname);
4275 			} else {
4276 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4277 				    hn_ifp->if_xname);
4278 			}
4279 			first = false;
4280 		}
4281 	}
4282 
4283 	rm_runlock(&hn_vfmap_lock, &pt);
4284 
4285 	error = sbuf_finish(sb);
4286 	sbuf_delete(sb);
4287 	return (error);
4288 }
4289 
4290 static int
4291 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4292 {
4293 	struct hn_softc *sc = arg1;
4294 	int error, onoff = 0;
4295 
4296 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4297 		onoff = 1;
4298 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4299 	if (error || req->newptr == NULL)
4300 		return (error);
4301 
4302 	HN_LOCK(sc);
4303 	/* NOTE: hn_vf_lock for hn_transmit() */
4304 	rm_wlock(&sc->hn_vf_lock);
4305 	if (onoff)
4306 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4307 	else
4308 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4309 	rm_wunlock(&sc->hn_vf_lock);
4310 	HN_UNLOCK(sc);
4311 
4312 	return (0);
4313 }
4314 
4315 static int
4316 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4317 {
4318 	struct hn_softc *sc = arg1;
4319 	int enabled = 0;
4320 
4321 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4322 		enabled = 1;
4323 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4324 }
4325 
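/*
 * Sanity check an IP packet whose IP header starts at byte offset
 * 'hoff' within the first mbuf: the full IP header, and the TCP or UDP
 * header when present, must be contiguous in that mbuf, the stated
 * lengths must be consistent, and fragments are rejected.  Returns the
 * IP protocol on success, or IPPROTO_DONE when the checks fail.
 */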
4326 static int
4327 hn_check_iplen(const struct mbuf *m, int hoff)
4328 {
4329 	const struct ip *ip;
4330 	int len, iphlen, iplen;
4331 	const struct tcphdr *th;
4332 	int thoff;				/* TCP data offset */
4333 
4334 	len = hoff + sizeof(struct ip);
4335 
4336 	/* The packet must be at least the size of an IP header. */
4337 	if (m->m_pkthdr.len < len)
4338 		return IPPROTO_DONE;
4339 
4340 	/* The fixed IP header must reside completely in the first mbuf. */
4341 	if (m->m_len < len)
4342 		return IPPROTO_DONE;
4343 
4344 	ip = mtodo(m, hoff);
4345 
4346 	/* Bound check the packet's stated IP header length. */
4347 	iphlen = ip->ip_hl << 2;
4348 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4349 		return IPPROTO_DONE;
4350 
4351 	/* The full IP header must reside completely in the one mbuf. */
4352 	if (m->m_len < hoff + iphlen)
4353 		return IPPROTO_DONE;
4354 
4355 	iplen = ntohs(ip->ip_len);
4356 
4357 	/*
4358 	 * Check that the amount of data in the buffers is at
4359 	 * least as much as the IP header would have us expect.
4360 	 */
4361 	if (m->m_pkthdr.len < hoff + iplen)
4362 		return IPPROTO_DONE;
4363 
4364 	/*
4365 	 * Ignore IP fragments.
4366 	 */
4367 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4368 		return IPPROTO_DONE;
4369 
4370 	/*
4371 	 * The TCP/IP or UDP/IP header must be entirely contained within
4372 	 * the first fragment of a packet.
4373 	 */
4374 	switch (ip->ip_p) {
4375 	case IPPROTO_TCP:
4376 		if (iplen < iphlen + sizeof(struct tcphdr))
4377 			return IPPROTO_DONE;
4378 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4379 			return IPPROTO_DONE;
4380 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4381 		thoff = th->th_off << 2;
4382 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4383 			return IPPROTO_DONE;
4384 		if (m->m_len < hoff + iphlen + thoff)
4385 			return IPPROTO_DONE;
4386 		break;
4387 	case IPPROTO_UDP:
4388 		if (iplen < iphlen + sizeof(struct udphdr))
4389 			return IPPROTO_DONE;
4390 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4391 			return IPPROTO_DONE;
4392 		break;
4393 	default:
4394 		if (iplen < iphlen)
4395 			return IPPROTO_DONE;
4396 		break;
4397 	}
4398 	return ip->ip_p;
4399 }
4400 
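/*
 * Allocate the RXBUF shared by all channels, the per-ring bufrings and
 * packet buffers, initialize per-ring LRO state, and create the
 * dev.hn.UNIT.rx sysctl tree along with the RX statistics and tuning
 * nodes.  The tuning nodes can be adjusted at run time; for example,
 * on a hypothetical unit 0:
 *
 *	sysctl dev.hn.0.trust_hosttcp=1
 */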
4401 static int
4402 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4403 {
4404 	struct sysctl_oid_list *child;
4405 	struct sysctl_ctx_list *ctx;
4406 	device_t dev = sc->hn_dev;
4407 #if defined(INET) || defined(INET6)
4408 #if __FreeBSD_version >= 1100095
4409 	int lroent_cnt;
4410 #endif
4411 #endif
4412 	int i;
4413 
4414 	/*
4415 	 * Create RXBUF for reception.
4416 	 *
4417 	 * NOTE:
4418 	 * - It is shared by all channels.
4419 	 * - A large enough buffer is allocated; certain NVS versions
4420 	 *   may further limit the usable space.
4421 	 */
4422 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4423 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4424 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4425 	if (sc->hn_rxbuf == NULL) {
4426 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4427 		return (ENOMEM);
4428 	}
4429 
4430 	sc->hn_rx_ring_cnt = ring_cnt;
4431 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4432 
4433 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4434 	    M_DEVBUF, M_WAITOK | M_ZERO);
4435 
4436 #if defined(INET) || defined(INET6)
4437 #if __FreeBSD_version >= 1100095
4438 	lroent_cnt = hn_lro_entry_count;
4439 	if (lroent_cnt < TCP_LRO_ENTRIES)
4440 		lroent_cnt = TCP_LRO_ENTRIES;
4441 	if (bootverbose)
4442 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4443 #endif
4444 #endif	/* INET || INET6 */
4445 
4446 	ctx = device_get_sysctl_ctx(dev);
4447 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4448 
4449 	/* Create dev.hn.UNIT.rx sysctl tree */
4450 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4451 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4452 
4453 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4454 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4455 
4456 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4457 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4458 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4459 		if (rxr->hn_br == NULL) {
4460 			device_printf(dev, "allocate bufring failed\n");
4461 			return (ENOMEM);
4462 		}
4463 
4464 		if (hn_trust_hosttcp)
4465 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4466 		if (hn_trust_hostudp)
4467 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4468 		if (hn_trust_hostip)
4469 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4470 		rxr->hn_ifp = sc->hn_ifp;
4471 		if (i < sc->hn_tx_ring_cnt)
4472 			rxr->hn_txr = &sc->hn_tx_ring[i];
4473 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4474 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4475 		rxr->hn_rx_idx = i;
4476 		rxr->hn_rxbuf = sc->hn_rxbuf;
4477 
4478 		/*
4479 		 * Initialize LRO.
4480 		 */
4481 #if defined(INET) || defined(INET6)
4482 #if __FreeBSD_version >= 1100095
4483 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4484 		    hn_lro_mbufq_depth);
4485 #else
4486 		tcp_lro_init(&rxr->hn_lro);
4487 		rxr->hn_lro.ifp = sc->hn_ifp;
4488 #endif
4489 #if __FreeBSD_version >= 1100099
4490 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4491 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4492 #endif
4493 #endif	/* INET || INET6 */
4494 
4495 		if (sc->hn_rx_sysctl_tree != NULL) {
4496 			char name[16];
4497 
4498 			/*
4499 			 * Create per RX ring sysctl tree:
4500 			 * dev.hn.UNIT.rx.RINGID
4501 			 */
4502 			snprintf(name, sizeof(name), "%d", i);
4503 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4504 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4505 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4506 
4507 			if (rxr->hn_rx_sysctl_tree != NULL) {
4508 				SYSCTL_ADD_ULONG(ctx,
4509 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4510 				    OID_AUTO, "packets", CTLFLAG_RW,
4511 				    &rxr->hn_pkts, "# of packets received");
4512 				SYSCTL_ADD_ULONG(ctx,
4513 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4514 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
4515 				    &rxr->hn_rss_pkts,
4516 				    "# of packets w/ RSS info received");
4517 				SYSCTL_ADD_INT(ctx,
4518 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4519 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4520 				    &rxr->hn_pktbuf_len, 0,
4521 				    "Temporary channel packet buffer length");
4522 			}
4523 		}
4524 	}
4525 
4526 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4527 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4528 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4529 #if __FreeBSD_version < 1100095
4530 	    hn_rx_stat_int_sysctl,
4531 #else
4532 	    hn_rx_stat_u64_sysctl,
4533 #endif
4534 	    "LU", "LRO queued");
4535 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4536 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4537 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4538 #if __FreeBSD_version < 1100095
4539 	    hn_rx_stat_int_sysctl,
4540 #else
4541 	    hn_rx_stat_u64_sysctl,
4542 #endif
4543 	    "LU", "LRO flushed");
4544 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4545 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4546 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
4547 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4548 #if __FreeBSD_version >= 1100099
4549 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4550 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4551 	    hn_lro_lenlim_sysctl, "IU",
4552 	    "Max # of data bytes to be aggregated by LRO");
4553 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4554 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4555 	    hn_lro_ackcnt_sysctl, "I",
4556 	    "Max # of ACKs to be aggregated by LRO");
4557 #endif
4558 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4559 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4560 	    hn_trust_hcsum_sysctl, "I",
4561 	    "Trust tcp segment verification on host side, "
4562 	    "when csum info is missing");
4563 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4564 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4565 	    hn_trust_hcsum_sysctl, "I",
4566 	    "Trust udp datagram verification on host side, "
4567 	    "when csum info is missing");
4568 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4569 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4570 	    hn_trust_hcsum_sysctl, "I",
4571 	    "Trust ip packet verification on host side, "
4572 	    "when csum info is missing");
4573 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4574 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4575 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
4576 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4577 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4578 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4579 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
4580 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4581 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4582 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4583 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
4584 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4585 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4586 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4587 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
4588 	    hn_rx_stat_ulong_sysctl, "LU",
4589 	    "# of packets that we trust host's csum verification");
4590 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4591 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4592 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
4593 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4594 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4595 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4596 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
4597 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4598 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4599 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4600 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4601 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
4602 
4603 	return (0);
4604 }
4605 
4606 static void
4607 hn_destroy_rx_data(struct hn_softc *sc)
4608 {
4609 	int i;
4610 
4611 	if (sc->hn_rxbuf != NULL) {
4612 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4613 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4614 		else
4615 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
4616 		sc->hn_rxbuf = NULL;
4617 	}
4618 
4619 	if (sc->hn_rx_ring_cnt == 0)
4620 		return;
4621 
4622 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4623 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4624 
4625 		if (rxr->hn_br == NULL)
4626 			continue;
4627 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4628 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4629 		} else {
4630 			device_printf(sc->hn_dev,
4631 			    "%dth channel bufring is referenced\n", i);
4632 		}
4633 		rxr->hn_br = NULL;
4634 
4635 #if defined(INET) || defined(INET6)
4636 		tcp_lro_free(&rxr->hn_lro);
4637 #endif
4638 		free(rxr->hn_pktbuf, M_DEVBUF);
4639 	}
4640 	free(sc->hn_rx_ring, M_DEVBUF);
4641 	sc->hn_rx_ring = NULL;
4642 
4643 	sc->hn_rx_ring_cnt = 0;
4644 	sc->hn_rx_ring_inuse = 0;
4645 }
4646 
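/*
 * Set up one TX ring: its locks, the TX descriptor array (and
 * descriptor bufring), the taskqueue used for deferred transmission,
 * the DMA tags for RNDIS packet messages and packet data, the
 * per-descriptor DMA allocations, and the per-ring
 * dev.hn.UNIT.tx.RINGID sysctl nodes.
 */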
4647 static int
4648 hn_tx_ring_create(struct hn_softc *sc, int id)
4649 {
4650 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
4651 	device_t dev = sc->hn_dev;
4652 	bus_dma_tag_t parent_dtag;
4653 	int error, i;
4654 
4655 	txr->hn_sc = sc;
4656 	txr->hn_tx_idx = id;
4657 
4658 #ifndef HN_USE_TXDESC_BUFRING
4659 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
4660 #endif
4661 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
4662 
4663 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
4664 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
4665 	    M_DEVBUF, M_WAITOK | M_ZERO);
4666 #ifndef HN_USE_TXDESC_BUFRING
4667 	SLIST_INIT(&txr->hn_txlist);
4668 #else
4669 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
4670 	    M_WAITOK, &txr->hn_tx_lock);
4671 #endif
4672 
4673 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
4674 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
4675 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
4676 	} else {
4677 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
4678 	}
4679 
4680 #ifdef HN_IFSTART_SUPPORT
4681 	if (hn_use_if_start) {
4682 		txr->hn_txeof = hn_start_txeof;
4683 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
4684 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
4685 	} else
4686 #endif
4687 	{
4688 		int br_depth;
4689 
4690 		txr->hn_txeof = hn_xmit_txeof;
4691 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
4692 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
4693 
4694 		br_depth = hn_get_txswq_depth(txr);
4695 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
4696 		    M_WAITOK, &txr->hn_tx_lock);
4697 	}
4698 
4699 	txr->hn_direct_tx_size = hn_direct_tx_size;
4700 
4701 	/*
4702 	 * Always schedule transmission instead of trying to do direct
4703 	 * transmission.  This gives the best performance so far.
4704 	 */
4705 	txr->hn_sched_tx = 1;
4706 
4707 	parent_dtag = bus_get_dma_tag(dev);
4708 
4709 	/* DMA tag for RNDIS packet messages. */
4710 	error = bus_dma_tag_create(parent_dtag, /* parent */
4711 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
4712 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
4713 	    BUS_SPACE_MAXADDR,		/* lowaddr */
4714 	    BUS_SPACE_MAXADDR,		/* highaddr */
4715 	    NULL, NULL,			/* filter, filterarg */
4716 	    HN_RNDIS_PKT_LEN,		/* maxsize */
4717 	    1,				/* nsegments */
4718 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
4719 	    0,				/* flags */
4720 	    NULL,			/* lockfunc */
4721 	    NULL,			/* lockfuncarg */
4722 	    &txr->hn_tx_rndis_dtag);
4723 	if (error) {
4724 		device_printf(dev, "failed to create rndis dmatag\n");
4725 		return error;
4726 	}
4727 
4728 	/* DMA tag for data. */
4729 	error = bus_dma_tag_create(parent_dtag, /* parent */
4730 	    1,				/* alignment */
4731 	    HN_TX_DATA_BOUNDARY,	/* boundary */
4732 	    BUS_SPACE_MAXADDR,		/* lowaddr */
4733 	    BUS_SPACE_MAXADDR,		/* highaddr */
4734 	    NULL, NULL,			/* filter, filterarg */
4735 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
4736 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
4737 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
4738 	    0,				/* flags */
4739 	    NULL,			/* lockfunc */
4740 	    NULL,			/* lockfuncarg */
4741 	    &txr->hn_tx_data_dtag);
4742 	if (error) {
4743 		device_printf(dev, "failed to create data dmatag\n");
4744 		return error;
4745 	}
4746 
4747 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
4748 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
4749 
4750 		txd->txr = txr;
4751 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
4752 		STAILQ_INIT(&txd->agg_list);
4753 
4754 		/*
4755 		 * Allocate and load RNDIS packet message.
4756 		 */
4757 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
4758 		    (void **)&txd->rndis_pkt,
4759 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
4760 		    &txd->rndis_pkt_dmap);
4761 		if (error) {
4762 			device_printf(dev,
4763 			    "failed to allocate rndis_packet_msg, %d\n", i);
4764 			return error;
4765 		}
4766 
4767 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
4768 		    txd->rndis_pkt_dmap,
4769 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
4770 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4771 		    BUS_DMA_NOWAIT);
4772 		if (error) {
4773 			device_printf(dev,
4774 			    "failed to load rndis_packet_msg, %d\n", i);
4775 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4776 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4777 			return error;
4778 		}
4779 
4780 		/* DMA map for TX data. */
4781 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4782 		    &txd->data_dmap);
4783 		if (error) {
4784 			device_printf(dev,
4785 			    "failed to allocate tx data dmamap\n");
4786 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4787 			    txd->rndis_pkt_dmap);
4788 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4789 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4790 			return error;
4791 		}
4792 
4793 		/* All set, put it to list */
4794 		txd->flags |= HN_TXD_FLAG_ONLIST;
4795 #ifndef HN_USE_TXDESC_BUFRING
4796 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4797 #else
4798 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
4799 #endif
4800 	}
4801 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4802 
4803 	if (sc->hn_tx_sysctl_tree != NULL) {
4804 		struct sysctl_oid_list *child;
4805 		struct sysctl_ctx_list *ctx;
4806 		char name[16];
4807 
4808 		/*
4809 		 * Create per TX ring sysctl tree:
4810 		 * dev.hn.UNIT.tx.RINGID
4811 		 */
4812 		ctx = device_get_sysctl_ctx(dev);
4813 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4814 
4815 		snprintf(name, sizeof(name), "%d", id);
4816 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4817 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4818 
4819 		if (txr->hn_tx_sysctl_tree != NULL) {
4820 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4821 
4822 #ifdef HN_DEBUG
4823 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4824 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4825 			    "# of available TX descs");
4826 #endif
4827 #ifdef HN_IFSTART_SUPPORT
4828 			if (!hn_use_if_start)
4829 #endif
4830 			{
4831 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4832 				    CTLFLAG_RD, &txr->hn_oactive, 0,
4833 				    "over active");
4834 			}
4835 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4836 			    CTLFLAG_RW, &txr->hn_pkts,
4837 			    "# of packets transmitted");
4838 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4839 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
4840 		}
4841 	}
4842 
4843 	return 0;
4844 }
4845 
4846 static void
4847 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4848 {
4849 	struct hn_tx_ring *txr = txd->txr;
4850 
4851 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
4852 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4853 
4854 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4855 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4856 	    txd->rndis_pkt_dmap);
4857 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4858 }
4859 
4860 static void
4861 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4862 {
4863 
4864 	KASSERT(txd->refs == 0 || txd->refs == 1,
4865 	    ("invalid txd refs %d", txd->refs));
4866 
4867 	/* Aggregated txds will be freed by their aggregating txd. */
4868 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4869 		int freed;
4870 
4871 		freed = hn_txdesc_put(txr, txd);
4872 		KASSERT(freed, ("can't free txdesc"));
4873 	}
4874 }
4875 
4876 static void
4877 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4878 {
4879 	int i;
4880 
4881 	if (txr->hn_txdesc == NULL)
4882 		return;
4883 
4884 	/*
4885 	 * NOTE:
4886 	 * Because the freeing of aggregated txds will be deferred
4887 	 * to the aggregating txd, two passes are used here:
4888 	 * - The first pass GCes any pending txds.  This GC is necessary,
4889 	 *   since if the channels are revoked, the hypervisor will not
4890 	 *   deliver send-done for all pending txds.
4891 	 * - The second pass frees the busdma resources, i.e. it runs
4892 	 *   after all txds have been freed.
4893 	 */
4894 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4895 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4896 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4897 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4898 
4899 	if (txr->hn_tx_data_dtag != NULL)
4900 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4901 	if (txr->hn_tx_rndis_dtag != NULL)
4902 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4903 
4904 #ifdef HN_USE_TXDESC_BUFRING
4905 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4906 #endif
4907 
4908 	free(txr->hn_txdesc, M_DEVBUF);
4909 	txr->hn_txdesc = NULL;
4910 
4911 	if (txr->hn_mbuf_br != NULL)
4912 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4913 
4914 #ifndef HN_USE_TXDESC_BUFRING
4915 	mtx_destroy(&txr->hn_txlist_spin);
4916 #endif
4917 	mtx_destroy(&txr->hn_tx_lock);
4918 }
4919 
4920 static int
4921 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4922 {
4923 	struct sysctl_oid_list *child;
4924 	struct sysctl_ctx_list *ctx;
4925 	int i;
4926 
4927 	/*
4928 	 * Create TXBUF for chimney sending.
4929 	 *
4930 	 * NOTE: It is shared by all channels.
4931 	 */
4932 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4933 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4934 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4935 	if (sc->hn_chim == NULL) {
4936 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
4937 		return (ENOMEM);
4938 	}
4939 
4940 	sc->hn_tx_ring_cnt = ring_cnt;
4941 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4942 
4943 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4944 	    M_DEVBUF, M_WAITOK | M_ZERO);
4945 
4946 	ctx = device_get_sysctl_ctx(sc->hn_dev);
4947 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4948 
4949 	/* Create dev.hn.UNIT.tx sysctl tree */
4950 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4951 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4952 
4953 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4954 		int error;
4955 
4956 		error = hn_tx_ring_create(sc, i);
4957 		if (error)
4958 			return error;
4959 	}
4960 
4961 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4962 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4963 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
4964 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
4965 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
4966 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4967 	    __offsetof(struct hn_tx_ring, hn_send_failed),
4968 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
4969 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
4970 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4971 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
4972 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
4973 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
4974 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4975 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
4976 	    hn_tx_stat_ulong_sysctl, "LU",
4977 	    "# of packet transmission aggregation flush failure");
4978 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
4979 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4980 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
4981 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
4982 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
4983 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4984 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
4985 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
4986 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
4987 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4988 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
4989 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
4990 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
4991 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
4992 	    "# of total TX descs");
4993 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
4994 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
4995 	    "Chimney send packet size upper boundary");
4996 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
4997 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4998 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
4999 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5000 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5001 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5002 	    hn_tx_conf_int_sysctl, "I",
5003 	    "Size of the packet for direct transmission");
5004 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5005 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5006 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5007 	    hn_tx_conf_int_sysctl, "I",
5008 	    "Always schedule transmission "
5009 	    "instead of doing direct transmission");
5010 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5011 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5012 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5013 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5014 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5015 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5016 	    "Applied packet transmission aggregation size");
5017 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5018 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5019 	    hn_txagg_pktmax_sysctl, "I",
5020 	    "Applied packet transmission aggregation packets");
5021 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5022 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5023 	    hn_txagg_align_sysctl, "I",
5024 	    "Applied packet transmission aggregation alignment");
5025 
5026 	return 0;
5027 }
5028 
5029 static void
5030 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5031 {
5032 	int i;
5033 
5034 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5035 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5036 }
5037 
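/*
 * Compute the advertised TSO size limit: clamp tso_maxlen between the
 * NDIS minimum (hn_ndis_tso_sgmin * MTU) and min(IP_MAXPACKET,
 * hn_ndis_tso_szmax), subtract the Ethernet and VLAN header lengths,
 * and, when a transparent mode VF is ready, further cap the result at
 * the VF's if_hw_tsomax.
 */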
5038 static void
5039 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5040 {
5041 	struct ifnet *ifp = sc->hn_ifp;
5042 	u_int hw_tsomax;
5043 	int tso_minlen;
5044 
5045 	HN_LOCK_ASSERT(sc);
5046 
5047 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5048 		return;
5049 
5050 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5051 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5052 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5053 
5054 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5055 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5056 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5057 
5058 	if (tso_maxlen < tso_minlen)
5059 		tso_maxlen = tso_minlen;
5060 	else if (tso_maxlen > IP_MAXPACKET)
5061 		tso_maxlen = IP_MAXPACKET;
5062 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5063 		tso_maxlen = sc->hn_ndis_tso_szmax;
5064 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5065 
5066 	if (hn_xpnt_vf_isready(sc)) {
5067 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5068 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5069 	}
5070 	ifp->if_hw_tsomax = hw_tsomax;
5071 	if (bootverbose)
5072 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5073 }
5074 
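/*
 * Apply host derived TX settings to all TX rings: clamp the chimney
 * send size to the host maximum (and to the hn_tx_chimney_size tunable
 * when set), derive the checksum offload assist flags from the
 * negotiated capabilities, and enable the HASHVAL pktinfo on TX when
 * the host supports it.
 */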
5075 static void
5076 hn_fixup_tx_data(struct hn_softc *sc)
5077 {
5078 	uint64_t csum_assist;
5079 	int i;
5080 
5081 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5082 	if (hn_tx_chimney_size > 0 &&
5083 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5084 		hn_set_chim_size(sc, hn_tx_chimney_size);
5085 
5086 	csum_assist = 0;
5087 	if (sc->hn_caps & HN_CAP_IPCS)
5088 		csum_assist |= CSUM_IP;
5089 	if (sc->hn_caps & HN_CAP_TCP4CS)
5090 		csum_assist |= CSUM_IP_TCP;
5091 	if (sc->hn_caps & HN_CAP_UDP4CS)
5092 		csum_assist |= CSUM_IP_UDP;
5093 	if (sc->hn_caps & HN_CAP_TCP6CS)
5094 		csum_assist |= CSUM_IP6_TCP;
5095 	if (sc->hn_caps & HN_CAP_UDP6CS)
5096 		csum_assist |= CSUM_IP6_UDP;
5097 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5098 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5099 
5100 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5101 		/*
5102 		 * Support HASHVAL pktinfo on TX path.
5103 		 */
5104 		if (bootverbose)
5105 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5106 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5107 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5108 	}
5109 }
5110 
5111 static void
5112 hn_destroy_tx_data(struct hn_softc *sc)
5113 {
5114 	int i;
5115 
5116 	if (sc->hn_chim != NULL) {
5117 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5118 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5119 		} else {
5120 			device_printf(sc->hn_dev,
5121 			    "chimney sending buffer is referenced\n");
5122 		}
5123 		sc->hn_chim = NULL;
5124 	}
5125 
5126 	if (sc->hn_tx_ring_cnt == 0)
5127 		return;
5128 
5129 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5130 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5131 
5132 	free(sc->hn_tx_ring, M_DEVBUF);
5133 	sc->hn_tx_ring = NULL;
5134 
5135 	sc->hn_tx_ring_cnt = 0;
5136 	sc->hn_tx_ring_inuse = 0;
5137 }
5138 
5139 #ifdef HN_IFSTART_SUPPORT
5140 
5141 static void
5142 hn_start_taskfunc(void *xtxr, int pending __unused)
5143 {
5144 	struct hn_tx_ring *txr = xtxr;
5145 
5146 	mtx_lock(&txr->hn_tx_lock);
5147 	hn_start_locked(txr, 0);
5148 	mtx_unlock(&txr->hn_tx_lock);
5149 }
5150 
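/*
 * Drain ifp->if_snd for the legacy if_start path.  A non-zero 'len'
 * acts as a direct transmission size threshold: larger packets are put
 * back on the queue and 1 is returned so that the caller reschedules
 * the remaining work onto the TX taskqueue.
 */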
5151 static int
5152 hn_start_locked(struct hn_tx_ring *txr, int len)
5153 {
5154 	struct hn_softc *sc = txr->hn_sc;
5155 	struct ifnet *ifp = sc->hn_ifp;
5156 	int sched = 0;
5157 
5158 	KASSERT(hn_use_if_start,
5159 	    ("hn_start_locked is called when if_start is disabled"));
5160 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5161 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5162 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5163 
5164 	if (__predict_false(txr->hn_suspended))
5165 		return (0);
5166 
5167 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5168 	    IFF_DRV_RUNNING)
5169 		return (0);
5170 
5171 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5172 		struct hn_txdesc *txd;
5173 		struct mbuf *m_head;
5174 		int error;
5175 
5176 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5177 		if (m_head == NULL)
5178 			break;
5179 
5180 		if (len > 0 && m_head->m_pkthdr.len > len) {
5181 			/*
5182 			 * This send could be time consuming; let the caller
5183 			 * dispatch this packet (and any follow-up packets)
5184 			 * to the TX taskqueue.
5185 			 */
5186 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5187 			sched = 1;
5188 			break;
5189 		}
5190 
5191 #if defined(INET6) || defined(INET)
5192 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5193 			m_head = hn_tso_fixup(m_head);
5194 			if (__predict_false(m_head == NULL)) {
5195 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5196 				continue;
5197 			}
5198 		}
5199 #endif
5200 
5201 		txd = hn_txdesc_get(txr);
5202 		if (txd == NULL) {
5203 			txr->hn_no_txdescs++;
5204 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5205 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5206 			break;
5207 		}
5208 
5209 		error = hn_encap(ifp, txr, txd, &m_head);
5210 		if (error) {
5211 			/* Both txd and m_head are freed */
5212 			KASSERT(txr->hn_agg_txd == NULL,
5213 			    ("encap failed w/ pending aggregating txdesc"));
5214 			continue;
5215 		}
5216 
5217 		if (txr->hn_agg_pktleft == 0) {
5218 			if (txr->hn_agg_txd != NULL) {
5219 				KASSERT(m_head == NULL,
5220 				    ("pending mbuf for aggregating txdesc"));
5221 				error = hn_flush_txagg(ifp, txr);
5222 				if (__predict_false(error)) {
5223 					atomic_set_int(&ifp->if_drv_flags,
5224 					    IFF_DRV_OACTIVE);
5225 					break;
5226 				}
5227 			} else {
5228 				KASSERT(m_head != NULL, ("mbuf was freed"));
5229 				error = hn_txpkt(ifp, txr, txd);
5230 				if (__predict_false(error)) {
5231 					/* txd is freed, but m_head is not */
5232 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5233 					atomic_set_int(&ifp->if_drv_flags,
5234 					    IFF_DRV_OACTIVE);
5235 					break;
5236 				}
5237 			}
5238 		}
5239 #ifdef INVARIANTS
5240 		else {
5241 			KASSERT(txr->hn_agg_txd != NULL,
5242 			    ("no aggregating txdesc"));
5243 			KASSERT(m_head == NULL,
5244 			    ("pending mbuf for aggregating txdesc"));
5245 		}
5246 #endif
5247 	}
5248 
5249 	/* Flush pending aggregated transmission. */
5250 	if (txr->hn_agg_txd != NULL)
5251 		hn_flush_txagg(ifp, txr);
5252 	return (sched);
5253 }
5254 
5255 static void
5256 hn_start(struct ifnet *ifp)
5257 {
5258 	struct hn_softc *sc = ifp->if_softc;
5259 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5260 
5261 	if (txr->hn_sched_tx)
5262 		goto do_sched;
5263 
5264 	if (mtx_trylock(&txr->hn_tx_lock)) {
5265 		int sched;
5266 
5267 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5268 		mtx_unlock(&txr->hn_tx_lock);
5269 		if (!sched)
5270 			return;
5271 	}
5272 do_sched:
5273 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5274 }
5275 
5276 static void
5277 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5278 {
5279 	struct hn_tx_ring *txr = xtxr;
5280 
5281 	mtx_lock(&txr->hn_tx_lock);
5282 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5283 	hn_start_locked(txr, 0);
5284 	mtx_unlock(&txr->hn_tx_lock);
5285 }
5286 
5287 static void
5288 hn_start_txeof(struct hn_tx_ring *txr)
5289 {
5290 	struct hn_softc *sc = txr->hn_sc;
5291 	struct ifnet *ifp = sc->hn_ifp;
5292 
5293 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5294 
5295 	if (txr->hn_sched_tx)
5296 		goto do_sched;
5297 
5298 	if (mtx_trylock(&txr->hn_tx_lock)) {
5299 		int sched;
5300 
5301 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5302 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5303 		mtx_unlock(&txr->hn_tx_lock);
5304 		if (sched) {
5305 			taskqueue_enqueue(txr->hn_tx_taskq,
5306 			    &txr->hn_tx_task);
5307 		}
5308 	} else {
5309 do_sched:
5310 		/*
5311 		 * Release OACTIVE early, in the hope that others can
5312 		 * catch up.  The task will clear the flag again while
5313 		 * holding hn_tx_lock to avoid possible
5314 		 * races.
5315 		 */
5316 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5317 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5318 	}
5319 }
5320 
5321 #endif	/* HN_IFSTART_SUPPORT */
5322 
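/*
 * Drain the TX ring's mbuf bufring.  A non-zero 'len' acts as a direct
 * transmission size threshold: packets larger than 'len' are put back
 * and 1 is returned so that the caller reschedules the remaining work
 * onto the TX taskqueue.
 */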
5323 static int
5324 hn_xmit(struct hn_tx_ring *txr, int len)
5325 {
5326 	struct hn_softc *sc = txr->hn_sc;
5327 	struct ifnet *ifp = sc->hn_ifp;
5328 	struct mbuf *m_head;
5329 	int sched = 0;
5330 
5331 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5332 #ifdef HN_IFSTART_SUPPORT
5333 	KASSERT(hn_use_if_start == 0,
5334 	    ("hn_xmit is called when if_start is enabled"));
5335 #endif
5336 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5337 
5338 	if (__predict_false(txr->hn_suspended))
5339 		return (0);
5340 
5341 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5342 		return (0);
5343 
5344 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5345 		struct hn_txdesc *txd;
5346 		int error;
5347 
5348 		if (len > 0 && m_head->m_pkthdr.len > len) {
5349 			/*
5350 			 * This send could be time consuming; let the caller
5351 			 * dispatch this packet (and any follow-up packets)
5352 			 * to the TX taskqueue.
5353 			 */
5354 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5355 			sched = 1;
5356 			break;
5357 		}
5358 
5359 		txd = hn_txdesc_get(txr);
5360 		if (txd == NULL) {
5361 			txr->hn_no_txdescs++;
5362 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5363 			txr->hn_oactive = 1;
5364 			break;
5365 		}
5366 
5367 		error = hn_encap(ifp, txr, txd, &m_head);
5368 		if (error) {
5369 			/* Both txd and m_head are freed; discard */
5370 			KASSERT(txr->hn_agg_txd == NULL,
5371 			    ("encap failed w/ pending aggregating txdesc"));
5372 			drbr_advance(ifp, txr->hn_mbuf_br);
5373 			continue;
5374 		}
5375 
5376 		if (txr->hn_agg_pktleft == 0) {
5377 			if (txr->hn_agg_txd != NULL) {
5378 				KASSERT(m_head == NULL,
5379 				    ("pending mbuf for aggregating txdesc"));
5380 				error = hn_flush_txagg(ifp, txr);
5381 				if (__predict_false(error)) {
5382 					txr->hn_oactive = 1;
5383 					break;
5384 				}
5385 			} else {
5386 				KASSERT(m_head != NULL, ("mbuf was freed"));
5387 				error = hn_txpkt(ifp, txr, txd);
5388 				if (__predict_false(error)) {
5389 					/* txd is freed, but m_head is not */
5390 					drbr_putback(ifp, txr->hn_mbuf_br,
5391 					    m_head);
5392 					txr->hn_oactive = 1;
5393 					break;
5394 				}
5395 			}
5396 		}
5397 #ifdef INVARIANTS
5398 		else {
5399 			KASSERT(txr->hn_agg_txd != NULL,
5400 			    ("no aggregating txdesc"));
5401 			KASSERT(m_head == NULL,
5402 			    ("pending mbuf for aggregating txdesc"));
5403 		}
5404 #endif
5405 
5406 		/* Sent */
5407 		drbr_advance(ifp, txr->hn_mbuf_br);
5408 	}
5409 
5410 	/* Flush pending aggregated transmission. */
5411 	if (txr->hn_agg_txd != NULL)
5412 		hn_flush_txagg(ifp, txr);
5413 	return (sched);
5414 }
5415 
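/*
 * if_transmit method.  When the transparent mode VF is enabled, the
 * packet is (optionally BPF-tapped and) handed directly to the VF's
 * if_transmit, and the interface counters are updated here.  Otherwise
 * the TSO header is fixed up, a TX ring is selected from the packet's
 * flowid (TCP SYN segments are steered to ring 0), and the mbuf is
 * enqueued to that ring's bufring for immediate or deferred
 * transmission.
 */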
5416 static int
5417 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5418 {
5419 	struct hn_softc *sc = ifp->if_softc;
5420 	struct hn_tx_ring *txr;
5421 	int error, idx = 0;
5422 
5423 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5424 		struct rm_priotracker pt;
5425 
5426 		rm_rlock(&sc->hn_vf_lock, &pt);
5427 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5428 			struct mbuf *m_bpf = NULL;
5429 			int obytes, omcast = 0;
5430 
5431 			obytes = m->m_pkthdr.len;
5432 			if (m->m_flags & M_MCAST)
5433 				omcast = 1;
5434 
5435 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5436 				if (bpf_peers_present(ifp->if_bpf)) {
5437 					m_bpf = m_copypacket(m, M_NOWAIT);
5438 					if (m_bpf == NULL) {
5439 						/*
5440 						 * Failed to grab a shallow
5441 						 * copy; tap now.
5442 						 */
5443 						ETHER_BPF_MTAP(ifp, m);
5444 					}
5445 				}
5446 			} else {
5447 				ETHER_BPF_MTAP(ifp, m);
5448 			}
5449 
5450 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5451 			rm_runlock(&sc->hn_vf_lock, &pt);
5452 
5453 			if (m_bpf != NULL) {
5454 				if (!error)
5455 					ETHER_BPF_MTAP(ifp, m_bpf);
5456 				m_freem(m_bpf);
5457 			}
5458 
5459 			if (error == ENOBUFS) {
5460 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5461 			} else if (error) {
5462 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5463 			} else {
5464 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5465 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5466 				if (omcast) {
5467 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5468 					    omcast);
5469 				}
5470 			}
5471 			return (error);
5472 		}
5473 		rm_runlock(&sc->hn_vf_lock, &pt);
5474 	}
5475 
5476 #if defined(INET6) || defined(INET)
5477 	/*
5478 	 * Perform TSO packet header fixup now, since the TSO
5479 	 * packet header should be cache-hot.
5480 	 */
5481 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5482 		m = hn_tso_fixup(m);
5483 		if (__predict_false(m == NULL)) {
5484 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5485 			return EIO;
5486 		}
5487 	}
5488 #endif
5489 
5490 	/*
5491 	 * Select the TX ring based on flowid
5492 	 */
5493 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5494 #ifdef RSS
5495 		uint32_t bid;
5496 
5497 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
5498 		    &bid) == 0)
5499 			idx = bid % sc->hn_tx_ring_inuse;
5500 		else
5501 #endif
5502 		{
5503 #if defined(INET6) || defined(INET)
5504 			int tcpsyn = 0;
5505 
5506 			if (m->m_pkthdr.len < 128 &&
5507 			    (m->m_pkthdr.csum_flags &
5508 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5509 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5510 				m = hn_check_tcpsyn(m, &tcpsyn);
5511 				if (__predict_false(m == NULL)) {
5512 					if_inc_counter(ifp,
5513 					    IFCOUNTER_OERRORS, 1);
5514 					return (EIO);
5515 				}
5516 			}
5517 #else
5518 			const int tcpsyn = 0;
5519 #endif
5520 			if (tcpsyn)
5521 				idx = 0;
5522 			else
5523 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5524 		}
5525 	}
5526 	txr = &sc->hn_tx_ring[idx];
5527 
5528 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5529 	if (error) {
5530 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5531 		return (error);
5532 	}
5533 
5534 	if (txr->hn_oactive)
5535 		return (0);
5536 
5537 	if (txr->hn_sched_tx)
5538 		goto do_sched;
5539 
5540 	if (mtx_trylock(&txr->hn_tx_lock)) {
5541 		int sched;
5542 
5543 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5544 		mtx_unlock(&txr->hn_tx_lock);
5545 		if (!sched)
5546 			return (0);
5547 	}
5548 do_sched:
5549 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5550 	return (0);
5551 }
5552 
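/*
 * Free all mbufs still queued on the TX ring's buf_ring.
 */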
5553 static void
5554 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5555 {
5556 	struct mbuf *m;
5557 
5558 	mtx_lock(&txr->hn_tx_lock);
5559 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5560 		m_freem(m);
5561 	mtx_unlock(&txr->hn_tx_lock);
5562 }
5563 
5564 static void
5565 hn_xmit_qflush(struct ifnet *ifp)
5566 {
5567 	struct hn_softc *sc = ifp->if_softc;
5568 	struct rm_priotracker pt;
5569 	int i;
5570 
5571 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5572 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5573 	if_qflush(ifp);
5574 
5575 	rm_rlock(&sc->hn_vf_lock, &pt);
5576 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5577 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5578 	rm_runlock(&sc->hn_vf_lock, &pt);
5579 }
5580 
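/*
 * TX completion for the if_transmit path: clear oactive and restart
 * transmission, either directly or through the txeof task.
 */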
5581 static void
5582 hn_xmit_txeof(struct hn_tx_ring *txr)
5583 {
5584 
5585 	if (txr->hn_sched_tx)
5586 		goto do_sched;
5587 
5588 	if (mtx_trylock(&txr->hn_tx_lock)) {
5589 		int sched;
5590 
5591 		txr->hn_oactive = 0;
5592 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5593 		mtx_unlock(&txr->hn_tx_lock);
5594 		if (sched) {
5595 			taskqueue_enqueue(txr->hn_tx_taskq,
5596 			    &txr->hn_tx_task);
5597 		}
5598 	} else {
5599 do_sched:
5600 		/*
5601 		 * Release oactive earlier, in the hope that others
5602 		 * can catch up.  The task will clear oactive again,
5603 		 * while holding hn_tx_lock, to avoid possible
5604 		 * races.
5605 		 */
5606 		txr->hn_oactive = 0;
5607 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5608 	}
5609 }
5610 
5611 static void
5612 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5613 {
5614 	struct hn_tx_ring *txr = xtxr;
5615 
5616 	mtx_lock(&txr->hn_tx_lock);
5617 	hn_xmit(txr, 0);
5618 	mtx_unlock(&txr->hn_tx_lock);
5619 }
5620 
5621 static void
5622 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5623 {
5624 	struct hn_tx_ring *txr = xtxr;
5625 
5626 	mtx_lock(&txr->hn_tx_lock);
5627 	txr->hn_oactive = 0;
5628 	hn_xmit(txr, 0);
5629 	mtx_unlock(&txr->hn_tx_lock);
5630 }
5631 
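/*
 * Link the channel to its RX (and, if available, TX) ring, bind the
 * channel to a CPU, and open it on top of the ring's bufring.
 */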
5632 static int
5633 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5634 {
5635 	struct vmbus_chan_br cbr;
5636 	struct hn_rx_ring *rxr;
5637 	struct hn_tx_ring *txr = NULL;
5638 	int idx, error;
5639 
5640 	idx = vmbus_chan_subidx(chan);
5641 
5642 	/*
5643 	 * Link this channel to RX/TX ring.
5644 	 */
5645 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5646 	    ("invalid channel index %d, should > 0 && < %d",
5647 	     idx, sc->hn_rx_ring_inuse));
5648 	rxr = &sc->hn_rx_ring[idx];
5649 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5650 	    ("RX ring %d already attached", idx));
5651 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5652 	rxr->hn_chan = chan;
5653 
5654 	if (bootverbose) {
5655 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
5656 		    idx, vmbus_chan_id(chan));
5657 	}
5658 
5659 	if (idx < sc->hn_tx_ring_inuse) {
5660 		txr = &sc->hn_tx_ring[idx];
5661 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
5662 		    ("TX ring %d already attached", idx));
5663 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
5664 
5665 		txr->hn_chan = chan;
5666 		if (bootverbose) {
5667 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
5668 			    idx, vmbus_chan_id(chan));
5669 		}
5670 	}
5671 
5672 	/* Bind this channel to a proper CPU. */
5673 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
5674 
5675 	/*
5676 	 * Open this channel
5677 	 */
5678 	cbr.cbr = rxr->hn_br;
5679 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
5680 	cbr.cbr_txsz = HN_TXBR_SIZE;
5681 	cbr.cbr_rxsz = HN_RXBR_SIZE;
5682 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
5683 	if (error) {
5684 		if (error == EISCONN) {
5685 			if_printf(sc->hn_ifp, "bufring is connected after "
5686 			    "chan%u open failure\n", vmbus_chan_id(chan));
5687 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5688 		} else {
5689 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
5690 			    vmbus_chan_id(chan), error);
5691 		}
5692 	}
5693 	return (error);
5694 }
5695 
5696 static void
5697 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
5698 {
5699 	struct hn_rx_ring *rxr;
5700 	int idx, error;
5701 
5702 	idx = vmbus_chan_subidx(chan);
5703 
5704 	/*
5705 	 * Unlink this channel from the RX/TX ring.
5706 	 */
5707 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5708 	    ("invalid channel index %d, should > 0 && < %d",
5709 	     idx, sc->hn_rx_ring_inuse));
5710 	rxr = &sc->hn_rx_ring[idx];
5711 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
5712 	    ("RX ring %d is not attached", idx));
5713 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
5714 
5715 	if (idx < sc->hn_tx_ring_inuse) {
5716 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
5717 
5718 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
5719 		    ("TX ring %d is not attached attached", idx));
5720 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
5721 	}
5722 
5723 	/*
5724 	 * Close this channel.
5725 	 *
5726 	 * NOTE:
5727 	 * Channel closing does _not_ destroy the target channel.
5728 	 */
5729 	error = vmbus_chan_close_direct(chan);
5730 	if (error == EISCONN) {
5731 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
5732 		    "after being closed\n", vmbus_chan_id(chan));
5733 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5734 	} else if (error) {
5735 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
5736 		    vmbus_chan_id(chan), error);
5737 	}
5738 }
5739 
5740 static int
5741 hn_attach_subchans(struct hn_softc *sc)
5742 {
5743 	struct vmbus_channel **subchans;
5744 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5745 	int i, error = 0;
5746 
5747 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
5748 
5749 	/* Attach the sub-channels. */
5750 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5751 	for (i = 0; i < subchan_cnt; ++i) {
5752 		int error1;
5753 
5754 		error1 = hn_chan_attach(sc, subchans[i]);
5755 		if (error1) {
5756 			error = error1;
5757 			/* Move on; all channels will be detached later. */
5758 		}
5759 	}
5760 	vmbus_subchan_rel(subchans, subchan_cnt);
5761 
5762 	if (error) {
5763 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
5764 	} else {
5765 		if (bootverbose) {
5766 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
5767 			    subchan_cnt);
5768 		}
5769 	}
5770 	return (error);
5771 }
5772 
5773 static void
5774 hn_detach_allchans(struct hn_softc *sc)
5775 {
5776 	struct vmbus_channel **subchans;
5777 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5778 	int i;
5779 
5780 	if (subchan_cnt == 0)
5781 		goto back;
5782 
5783 	/* Detach the sub-channels. */
5784 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5785 	for (i = 0; i < subchan_cnt; ++i)
5786 		hn_chan_detach(sc, subchans[i]);
5787 	vmbus_subchan_rel(subchans, subchan_cnt);
5788 
5789 back:
5790 	/*
5791 	 * Detach the primary channel, _after_ all sub-channels
5792 	 * are detached.
5793 	 */
5794 	hn_chan_detach(sc, sc->hn_prichan);
5795 
5796 	/* Wait for sub-channels to be destroyed, if any. */
5797 	vmbus_subchan_drain(sc->hn_prichan);
5798 
5799 #ifdef INVARIANTS
5800 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5801 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
5802 		    HN_RX_FLAG_ATTACHED) == 0,
5803 		    ("%dth RX ring is still attached", i));
5804 	}
5805 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5806 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
5807 		    HN_TX_FLAG_ATTACHED) == 0,
5808 		    ("%dth TX ring is still attached", i));
5809 	}
5810 #endif
5811 }
5812 
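/*
 * Figure out how many sub-channels can be used, bounded by the number
 * of RX rings offered through the RNDIS RSS capabilities, and allocate
 * them from NVS.  On return *nsubch holds the number of sub-channels
 * actually allocated (0 means only the primary channel is usable).
 */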
5813 static int
5814 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
5815 {
5816 	struct vmbus_channel **subchans;
5817 	int nchan, rxr_cnt, error;
5818 
5819 	nchan = *nsubch + 1;
5820 	if (nchan == 1) {
5821 		/*
5822 		 * Multiple RX/TX rings are not requested.
5823 		 */
5824 		*nsubch = 0;
5825 		return (0);
5826 	}
5827 
5828 	/*
5829 	 * Query RSS capabilities, i.e. the # of RX rings and # of indirect
5830 	 * table entries.
5831 	 */
5832 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
5833 	if (error) {
5834 		/* No RSS; this is benign. */
5835 		*nsubch = 0;
5836 		return (0);
5837 	}
5838 	if (bootverbose) {
5839 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
5840 		    rxr_cnt, nchan);
5841 	}
5842 
5843 	if (nchan > rxr_cnt)
5844 		nchan = rxr_cnt;
5845 	if (nchan == 1) {
5846 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
5847 		*nsubch = 0;
5848 		return (0);
5849 	}
5850 
5851 	/*
5852 	 * Allocate sub-channels from NVS.
5853 	 */
5854 	*nsubch = nchan - 1;
5855 	error = hn_nvs_alloc_subchans(sc, nsubch);
5856 	if (error || *nsubch == 0) {
5857 		/* Failed to allocate sub-channels. */
5858 		*nsubch = 0;
5859 		return (0);
5860 	}
5861 
5862 	/*
5863 	 * Wait for all sub-channels to become ready before moving on.
5864 	 */
5865 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5866 	vmbus_subchan_rel(subchans, *nsubch);
5867 	return (0);
5868 }
5869 
5870 static bool
5871 hn_synth_attachable(const struct hn_softc *sc)
5872 {
5873 	int i;
5874 
5875 	if (sc->hn_flags & HN_FLAG_ERRORS)
5876 		return (false);
5877 
5878 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5879 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5880 
5881 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5882 			return (false);
5883 	}
5884 	return (true);
5885 }
5886 
5887 /*
5888  * Make sure that the RX filter is zero after the successful
5889  * RNDIS initialization.
5890  *
5891  * NOTE:
5892  * Under certain conditions on certain versions of Hyper-V,
5893  * the RNDIS rxfilter is _not_ zero on the hypervisor side
5894  * after the successful RNDIS initialization, which breaks
5895  * the assumption of any following code (well, it breaks the
5896  * RNDIS API contract actually).  Clear the RNDIS rxfilter
5897  * explicitly, drain packets sneaking through, and drain the
5898  * interrupt taskqueues scheduled due to the stealth packets.
5899  */
5900 static void
5901 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5902 {
5903 
5904 	hn_disable_rx(sc);
5905 	hn_drain_rxtx(sc, nchan);
5906 }
5907 
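/*
 * Attach the synthetic parts in order: primary channel, NVS, RNDIS,
 * sub-channels, and finally the RSS key and indirect table.  On failure
 * the parts that were attached are detached again.
 */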
5908 static int
5909 hn_synth_attach(struct hn_softc *sc, int mtu)
5910 {
5911 #define ATTACHED_NVS		0x0002
5912 #define ATTACHED_RNDIS		0x0004
5913 
5914 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5915 	int error, nsubch, nchan = 1, i, rndis_inited;
5916 	uint32_t old_caps, attached = 0;
5917 
5918 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5919 	    ("synthetic parts were attached"));
5920 
5921 	if (!hn_synth_attachable(sc))
5922 		return (ENXIO);
5923 
5924 	/* Save capabilities for later verification. */
5925 	old_caps = sc->hn_caps;
5926 	sc->hn_caps = 0;
5927 
5928 	/* Clear RSS stuffs. */
5929 	sc->hn_rss_ind_size = 0;
5930 	sc->hn_rss_hash = 0;
5931 
5932 	/*
5933 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
5934 	 */
5935 	error = hn_chan_attach(sc, sc->hn_prichan);
5936 	if (error)
5937 		goto failed;
5938 
5939 	/*
5940 	 * Attach NVS.
5941 	 */
5942 	error = hn_nvs_attach(sc, mtu);
5943 	if (error)
5944 		goto failed;
5945 	attached |= ATTACHED_NVS;
5946 
5947 	/*
5948 	 * Attach RNDIS _after_ NVS is attached.
5949 	 */
5950 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
5951 	if (rndis_inited)
5952 		attached |= ATTACHED_RNDIS;
5953 	if (error)
5954 		goto failed;
5955 
5956 	/*
5957 	 * Make sure capabilities are not changed.
5958 	 */
5959 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5960 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5961 		    old_caps, sc->hn_caps);
5962 		error = ENXIO;
5963 		goto failed;
5964 	}
5965 
5966 	/*
5967 	 * Allocate sub-channels for multi-TX/RX rings.
5968 	 *
5969 	 * NOTE:
5970 	 * The # of RX rings that can be used is equivalent to the # of
5971 	 * channels to be requested.
5972 	 */
5973 	nsubch = sc->hn_rx_ring_cnt - 1;
5974 	error = hn_synth_alloc_subchans(sc, &nsubch);
5975 	if (error)
5976 		goto failed;
5977 	/* NOTE: _Full_ synthetic parts detach is required now. */
5978 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
5979 
5980 	/*
5981 	 * Set the # of TX/RX rings that could be used according to
5982 	 * the # of channels that NVS offered.
5983 	 */
5984 	nchan = nsubch + 1;
5985 	hn_set_ring_inuse(sc, nchan);
5986 	if (nchan == 1) {
5987 		/* Only the primary channel can be used; done */
5988 		goto back;
5989 	}
5990 
5991 	/*
5992 	 * Attach the sub-channels.
5993 	 *
5994 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
5995 	 */
5996 	error = hn_attach_subchans(sc);
5997 	if (error)
5998 		goto failed;
5999 
6000 	/*
6001 	 * Configure RSS key and indirect table _after_ all sub-channels
6002 	 * are attached.
6003 	 */
6004 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6005 		/*
6006 		 * RSS key is not set yet; set it to the default RSS key.
6007 		 */
6008 		if (bootverbose)
6009 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6010 #ifdef RSS
6011 		rss_getkey(rss->rss_key);
6012 #else
6013 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6014 #endif
6015 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6016 	}
6017 
6018 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6019 		/*
6020 		 * RSS indirect table is not set yet; set it up in round-
6021 		 * robin fashion.
6022 		 */
6023 		if (bootverbose) {
6024 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6025 			    "table\n");
6026 		}
6027 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6028 			uint32_t subidx;
6029 
6030 #ifdef RSS
6031 			subidx = rss_get_indirection_to_bucket(i);
6032 #else
6033 			subidx = i;
6034 #endif
6035 			rss->rss_ind[i] = subidx % nchan;
6036 		}
6037 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6038 	} else {
6039 		/*
6040 		 * The # of usable channels may have changed, so we have
6041 		 * to make sure that all entries in the RSS indirect table
6042 		 * are valid.
6043 		 *
6044 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6045 		 */
6046 		hn_rss_ind_fixup(sc);
6047 	}
6048 
6049 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6050 	if (error)
6051 		goto failed;
6052 back:
6053 	/*
6054 	 * Fixup transmission aggregation setup.
6055 	 */
6056 	hn_set_txagg(sc);
6057 	hn_rndis_init_fixat(sc, nchan);
6058 	return (0);
6059 
6060 failed:
6061 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6062 		hn_rndis_init_fixat(sc, nchan);
6063 		hn_synth_detach(sc);
6064 	} else {
6065 		if (attached & ATTACHED_RNDIS) {
6066 			hn_rndis_init_fixat(sc, nchan);
6067 			hn_rndis_detach(sc);
6068 		}
6069 		if (attached & ATTACHED_NVS)
6070 			hn_nvs_detach(sc);
6071 		hn_chan_detach(sc, sc->hn_prichan);
6072 		/* Restore old capabilities. */
6073 		sc->hn_caps = old_caps;
6074 	}
6075 	return (error);
6076 
6077 #undef ATTACHED_RNDIS
6078 #undef ATTACHED_NVS
6079 }
6080 
6081 /*
6082  * NOTE:
6083  * The interface must have been suspended through hn_suspend() before
6084  * this function gets called.
6085  */
6086 static void
6087 hn_synth_detach(struct hn_softc *sc)
6088 {
6089 
6090 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6091 	    ("synthetic parts were not attached"));
6092 
6093 	/* Detach the RNDIS first. */
6094 	hn_rndis_detach(sc);
6095 
6096 	/* Detach NVS. */
6097 	hn_nvs_detach(sc);
6098 
6099 	/* Detach all of the channels. */
6100 	hn_detach_allchans(sc);
6101 
6102 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6103 }
6104 
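/*
 * Record the number of RX/TX rings that will actually be used for the
 * given channel count; the TX ring count is capped by the number of TX
 * rings that were created.
 */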
6105 static void
6106 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6107 {
6108 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6109 	    ("invalid ring count %d", ring_cnt));
6110 
6111 	if (sc->hn_tx_ring_cnt > ring_cnt)
6112 		sc->hn_tx_ring_inuse = ring_cnt;
6113 	else
6114 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6115 	sc->hn_rx_ring_inuse = ring_cnt;
6116 
6117 #ifdef RSS
6118 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6119 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6120 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6121 		    rss_getnumbuckets());
6122 	}
6123 #endif
6124 
6125 	if (bootverbose) {
6126 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6127 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6128 	}
6129 }
6130 
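/*
 * Wait until the channel's RX bufring is empty (and its TX bufring too,
 * unless the primary channel has been revoked), then drain the channel's
 * interrupt taskqueue.
 */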
6131 static void
6132 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6133 {
6134 
6135 	/*
6136 	 * NOTE:
6137 	 * The TX bufring will not be drained by the hypervisor
6138 	 * if the primary channel is revoked.
6139 	 */
6140 	while (!vmbus_chan_rx_empty(chan) ||
6141 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6142 	     !vmbus_chan_tx_empty(chan)))
6143 		pause("waitch", 1);
6144 	vmbus_chan_intr_drain(chan);
6145 }
6146 
6147 static void
6148 hn_disable_rx(struct hn_softc *sc)
6149 {
6150 
6151 	/*
6152 	 * Disable RX by clearing RX filter forcefully.
6153 	 */
6154 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6155 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6156 
6157 	/*
6158 	 * Give RNDIS enough time to flush all pending data packets.
6159 	 */
6160 	pause("waitrx", (200 * hz) / 1000);
6161 }
6162 
6163 /*
6164  * NOTE:
6165  * RX/TX _must_ have been suspended/disabled, before this function
6166  * is called.
6167  */
6168 static void
6169 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6170 {
6171 	struct vmbus_channel **subch = NULL;
6172 	int nsubch;
6173 
6174 	/*
6175 	 * Drain RX/TX bufrings and interrupts.
6176 	 */
6177 	nsubch = nchan - 1;
6178 	if (nsubch > 0)
6179 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6180 
6181 	if (subch != NULL) {
6182 		int i;
6183 
6184 		for (i = 0; i < nsubch; ++i)
6185 			hn_chan_drain(sc, subch[i]);
6186 	}
6187 	hn_chan_drain(sc, sc->hn_prichan);
6188 
6189 	if (subch != NULL)
6190 		vmbus_subchan_rel(subch, nsubch);
6191 }
6192 
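/*
 * Suspend the data path: mark all TX rings suspended, wait for pending
 * sends to complete, disable RX, drain the bufrings, and finally drain
 * the TX tasks.
 */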
6193 static void
6194 hn_suspend_data(struct hn_softc *sc)
6195 {
6196 	struct hn_tx_ring *txr;
6197 	int i;
6198 
6199 	HN_LOCK_ASSERT(sc);
6200 
6201 	/*
6202 	 * Suspend TX.
6203 	 */
6204 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6205 		txr = &sc->hn_tx_ring[i];
6206 
6207 		mtx_lock(&txr->hn_tx_lock);
6208 		txr->hn_suspended = 1;
6209 		mtx_unlock(&txr->hn_tx_lock);
6210 		/* No one is able to send more packets now. */
6211 
6212 		/*
6213 		 * Wait for all pending sends to finish.
6214 		 *
6215 		 * NOTE:
6216 		 * We will _not_ receive all pending send-done, if the
6217 		 * primary channel is revoked.
6218 		 */
6219 		while (hn_tx_ring_pending(txr) &&
6220 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6221 			pause("hnwtx", 1 /* 1 tick */);
6222 	}
6223 
6224 	/*
6225 	 * Disable RX.
6226 	 */
6227 	hn_disable_rx(sc);
6228 
6229 	/*
6230 	 * Drain RX/TX.
6231 	 */
6232 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6233 
6234 	/*
6235 	 * Drain any pending TX tasks.
6236 	 *
6237 	 * NOTE:
6238 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6239 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6240 	 */
6241 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6242 		txr = &sc->hn_tx_ring[i];
6243 
6244 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6245 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6246 	}
6247 }
6248 
6249 static void
6250 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6251 {
6252 
6253 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6254 }
6255 
6256 static void
6257 hn_suspend_mgmt(struct hn_softc *sc)
6258 {
6259 	struct task task;
6260 
6261 	HN_LOCK_ASSERT(sc);
6262 
6263 	/*
6264 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6265 	 * through hn_mgmt_taskq.
6266 	 */
6267 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6268 	vmbus_chan_run_task(sc->hn_prichan, &task);
6269 
6270 	/*
6271 	 * Make sure that all pending management tasks are completed.
6272 	 */
6273 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6274 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6275 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6276 }
6277 
6278 static void
6279 hn_suspend(struct hn_softc *sc)
6280 {
6281 
6282 	/* Disable polling. */
6283 	hn_polling(sc, 0);
6284 
6285 	/*
6286 	 * If the non-transparent mode VF is activated, the synthetic
6287 	 * device is receiving packets, so the data path of the
6288 	 * synthetic device must be suspended.
6289 	 */
6290 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6291 	    (sc->hn_flags & HN_FLAG_RXVF))
6292 		hn_suspend_data(sc);
6293 	hn_suspend_mgmt(sc);
6294 }
6295 
6296 static void
6297 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6298 {
6299 	int i;
6300 
6301 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6302 	    ("invalid TX ring count %d", tx_ring_cnt));
6303 
6304 	for (i = 0; i < tx_ring_cnt; ++i) {
6305 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6306 
6307 		mtx_lock(&txr->hn_tx_lock);
6308 		txr->hn_suspended = 0;
6309 		mtx_unlock(&txr->hn_tx_lock);
6310 	}
6311 }
6312 
6313 static void
6314 hn_resume_data(struct hn_softc *sc)
6315 {
6316 	int i;
6317 
6318 	HN_LOCK_ASSERT(sc);
6319 
6320 	/*
6321 	 * Re-enable RX.
6322 	 */
6323 	hn_rxfilter_config(sc);
6324 
6325 	/*
6326 	 * Make sure to clear suspend status on "all" TX rings,
6327 	 * since hn_tx_ring_inuse can be changed after
6328 	 * hn_suspend_data().
6329 	 */
6330 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6331 
6332 #ifdef HN_IFSTART_SUPPORT
6333 	if (!hn_use_if_start)
6334 #endif
6335 	{
6336 		/*
6337 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6338 		 * reduced.
6339 		 */
6340 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6341 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6342 	}
6343 
6344 	/*
6345 	 * Kick start TX.
6346 	 */
6347 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6348 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6349 
6350 		/*
6351 		 * Use txeof task, so that any pending oactive can be
6352 		 * cleared properly.
6353 		 */
6354 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6355 	}
6356 }
6357 
6358 static void
6359 hn_resume_mgmt(struct hn_softc *sc)
6360 {
6361 
6362 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6363 
6364 	/*
6365 	 * Kick off network change detection, if it was pending.
6366 	 * If no network change was pending, start link status
6367 	 * checks, which is more lightweight than network change
6368 	 * detection.
6369 	 */
6370 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6371 		hn_change_network(sc);
6372 	else
6373 		hn_update_link_status(sc);
6374 }
6375 
6376 static void
6377 hn_resume(struct hn_softc *sc)
6378 {
6379 
6380 	/*
6381 	 * If the non-transparent mode VF is activated, the synthetic
6382 	 * device has to receive packets, so the data path of the
6383 	 * synthetic device must be resumed.
6384 	 */
6385 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6386 	    (sc->hn_flags & HN_FLAG_RXVF))
6387 		hn_resume_data(sc);
6388 
6389 	/*
6390 	 * Don't resume link status change if VF is attached/activated.
6391 	 * - In the non-transparent VF mode, the synthetic device marks
6392 	 *   link down until the VF is deactivated; i.e. VF is down.
6393 	 * - In transparent VF mode, VF's media status is used until
6394 	 *   the VF is detached.
6395 	 */
6396 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6397 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6398 		hn_resume_mgmt(sc);
6399 
6400 	/*
6401 	 * Re-enable polling if this interface is running and
6402 	 * the polling is requested.
6403 	 */
6404 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6405 		hn_polling(sc, sc->hn_pollhz);
6406 }
6407 
6408 static void
6409 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6410 {
6411 	const struct rndis_status_msg *msg;
6412 	int ofs;
6413 
6414 	if (dlen < sizeof(*msg)) {
6415 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6416 		return;
6417 	}
6418 	msg = data;
6419 
6420 	switch (msg->rm_status) {
6421 	case RNDIS_STATUS_MEDIA_CONNECT:
6422 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6423 		hn_update_link_status(sc);
6424 		break;
6425 
6426 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6427 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6428 		/* Not really useful; ignore. */
6429 		break;
6430 
6431 	case RNDIS_STATUS_NETWORK_CHANGE:
6432 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6433 		if (dlen < ofs + msg->rm_stbuflen ||
6434 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6435 			if_printf(sc->hn_ifp, "network changed\n");
6436 		} else {
6437 			uint32_t change;
6438 
6439 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6440 			    sizeof(change));
6441 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6442 			    change);
6443 		}
6444 		hn_change_network(sc);
6445 		break;
6446 
6447 	default:
6448 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6449 		    msg->rm_status);
6450 		break;
6451 	}
6452 }
6453 
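/*
 * Walk the RNDIS per-packet-info elements and extract the VLAN,
 * checksum, hash value and hash info fields, validating element sizes
 * along the way.  Returns EINVAL on a malformed element.
 */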
6454 static int
6455 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6456 {
6457 	const struct rndis_pktinfo *pi = info_data;
6458 	uint32_t mask = 0;
6459 
6460 	while (info_dlen != 0) {
6461 		const void *data;
6462 		uint32_t dlen;
6463 
6464 		if (__predict_false(info_dlen < sizeof(*pi)))
6465 			return (EINVAL);
6466 		if (__predict_false(info_dlen < pi->rm_size))
6467 			return (EINVAL);
6468 		info_dlen -= pi->rm_size;
6469 
6470 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6471 			return (EINVAL);
6472 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6473 			return (EINVAL);
6474 		dlen = pi->rm_size - pi->rm_pktinfooffset;
6475 		data = pi->rm_data;
6476 
6477 		switch (pi->rm_type) {
6478 		case NDIS_PKTINFO_TYPE_VLAN:
6479 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6480 				return (EINVAL);
6481 			info->vlan_info = *((const uint32_t *)data);
6482 			mask |= HN_RXINFO_VLAN;
6483 			break;
6484 
6485 		case NDIS_PKTINFO_TYPE_CSUM:
6486 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6487 				return (EINVAL);
6488 			info->csum_info = *((const uint32_t *)data);
6489 			mask |= HN_RXINFO_CSUM;
6490 			break;
6491 
6492 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6493 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6494 				return (EINVAL);
6495 			info->hash_value = *((const uint32_t *)data);
6496 			mask |= HN_RXINFO_HASHVAL;
6497 			break;
6498 
6499 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
6500 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6501 				return (EINVAL);
6502 			info->hash_info = *((const uint32_t *)data);
6503 			mask |= HN_RXINFO_HASHINF;
6504 			break;
6505 
6506 		default:
6507 			goto next;
6508 		}
6509 
6510 		if (mask == HN_RXINFO_ALL) {
6511 			/* All found; done */
6512 			break;
6513 		}
6514 next:
6515 		pi = (const struct rndis_pktinfo *)
6516 		    ((const uint8_t *)pi + pi->rm_size);
6517 	}
6518 
6519 	/*
6520 	 * Final fixup.
6521 	 * - If there is no hash value, invalidate the hash info.
6522 	 */
6523 	if ((mask & HN_RXINFO_HASHVAL) == 0)
6524 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6525 	return (0);
6526 }
6527 
6528 static __inline bool
6529 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6530 {
6531 
6532 	if (off < check_off) {
6533 		if (__predict_true(off + len <= check_off))
6534 			return (false);
6535 	} else if (off > check_off) {
6536 		if (__predict_true(check_off + check_len <= off))
6537 			return (false);
6538 	}
6539 	return (true);
6540 }
6541 
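/*
 * Validate an RNDIS data message: check the overall length, the data,
 * OOB and per-packet-info offsets, and their overlap, then pass the
 * payload and the extracted RX info to hn_rxpkt().
 */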
6542 static void
6543 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6544 {
6545 	const struct rndis_packet_msg *pkt;
6546 	struct hn_rxinfo info;
6547 	int data_off, pktinfo_off, data_len, pktinfo_len;
6548 
6549 	/*
6550 	 * Check length.
6551 	 */
6552 	if (__predict_false(dlen < sizeof(*pkt))) {
6553 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6554 		return;
6555 	}
6556 	pkt = data;
6557 
6558 	if (__predict_false(dlen < pkt->rm_len)) {
6559 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6560 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6561 		return;
6562 	}
6563 	if (__predict_false(pkt->rm_len <
6564 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6565 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6566 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
6567 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6568 		    pkt->rm_pktinfolen);
6569 		return;
6570 	}
6571 	if (__predict_false(pkt->rm_datalen == 0)) {
6572 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6573 		return;
6574 	}
6575 
6576 	/*
6577 	 * Check offsets.
6578 	 */
6579 #define IS_OFFSET_INVALID(ofs)			\
6580 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
6581 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6582 
6583 	/* XXX Hyper-V does not meet data offset alignment requirement */
6584 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6585 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6586 		    "data offset %u\n", pkt->rm_dataoffset);
6587 		return;
6588 	}
6589 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6590 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6591 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6592 		    "oob offset %u\n", pkt->rm_oobdataoffset);
6593 		return;
6594 	}
6595 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6596 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6597 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6598 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6599 		return;
6600 	}
6601 
6602 #undef IS_OFFSET_INVALID
6603 
6604 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6605 	data_len = pkt->rm_datalen;
6606 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6607 	pktinfo_len = pkt->rm_pktinfolen;
6608 
6609 	/*
6610 	 * Check OOB coverage.
6611 	 */
6612 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
6613 		int oob_off, oob_len;
6614 
6615 		if_printf(rxr->hn_ifp, "got oobdata\n");
6616 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6617 		oob_len = pkt->rm_oobdatalen;
6618 
6619 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6620 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6621 			    "oob overflow, msglen %u, oob abs %d len %d\n",
6622 			    pkt->rm_len, oob_off, oob_len);
6623 			return;
6624 		}
6625 
6626 		/*
6627 		 * Check against data.
6628 		 */
6629 		if (hn_rndis_check_overlap(oob_off, oob_len,
6630 		    data_off, data_len)) {
6631 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6632 			    "oob overlaps data, oob abs %d len %d, "
6633 			    "data abs %d len %d\n",
6634 			    oob_off, oob_len, data_off, data_len);
6635 			return;
6636 		}
6637 
6638 		/*
6639 		 * Check against pktinfo.
6640 		 */
6641 		if (pktinfo_len != 0 &&
6642 		    hn_rndis_check_overlap(oob_off, oob_len,
6643 		    pktinfo_off, pktinfo_len)) {
6644 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6645 			    "oob overlaps pktinfo, oob abs %d len %d, "
6646 			    "pktinfo abs %d len %d\n",
6647 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
6648 			return;
6649 		}
6650 	}
6651 
6652 	/*
6653 	 * Check per-packet-info coverage and find useful per-packet-info.
6654 	 */
6655 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
6656 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
6657 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
6658 	if (__predict_true(pktinfo_len != 0)) {
6659 		bool overlap;
6660 		int error;
6661 
6662 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
6663 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6664 			    "pktinfo overflow, msglen %u, "
6665 			    "pktinfo abs %d len %d\n",
6666 			    pkt->rm_len, pktinfo_off, pktinfo_len);
6667 			return;
6668 		}
6669 
6670 		/*
6671 		 * Check packet info coverage.
6672 		 */
6673 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
6674 		    data_off, data_len);
6675 		if (__predict_false(overlap)) {
6676 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6677 			    "pktinfo overlap data, pktinfo abs %d len %d, "
6678 			    "data abs %d len %d\n",
6679 			    pktinfo_off, pktinfo_len, data_off, data_len);
6680 			return;
6681 		}
6682 
6683 		/*
6684 		 * Find useful per-packet-info.
6685 		 */
6686 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
6687 		    pktinfo_len, &info);
6688 		if (__predict_false(error)) {
6689 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
6690 			    "pktinfo\n");
6691 			return;
6692 		}
6693 	}
6694 
6695 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
6696 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6697 		    "data overflow, msglen %u, data abs %d len %d\n",
6698 		    pkt->rm_len, data_off, data_len);
6699 		return;
6700 	}
6701 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
6702 }
6703 
6704 static __inline void
6705 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
6706 {
6707 	const struct rndis_msghdr *hdr;
6708 
6709 	if (__predict_false(dlen < sizeof(*hdr))) {
6710 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
6711 		return;
6712 	}
6713 	hdr = data;
6714 
6715 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
6716 		/* Hot data path. */
6717 		hn_rndis_rx_data(rxr, data, dlen);
6718 		/* Done! */
6719 		return;
6720 	}
6721 
6722 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
6723 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
6724 	else
6725 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
6726 }
6727 
6728 static void
6729 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
6730 {
6731 	const struct hn_nvs_hdr *hdr;
6732 
6733 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
6734 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
6735 		return;
6736 	}
6737 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
6738 
6739 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
6740 		/* Useless; ignore */
6741 		return;
6742 	}
6743 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
6744 }
6745 
6746 static void
6747 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
6748     const struct vmbus_chanpkt_hdr *pkt)
6749 {
6750 	struct hn_nvs_sendctx *sndc;
6751 
6752 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
6753 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
6754 	    VMBUS_CHANPKT_DATALEN(pkt));
6755 	/*
6756 	 * NOTE:
6757 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
6758 	 * its callback.
6759 	 */
6760 }
6761 
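/*
 * An NVS RXBUF channel packet describes one or more ranges in the RX
 * buffer, each holding a single RNDIS message; process every range and
 * then ack the RXBUF so that the host can recycle it.
 */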
6762 static void
6763 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6764     const struct vmbus_chanpkt_hdr *pkthdr)
6765 {
6766 	const struct vmbus_chanpkt_rxbuf *pkt;
6767 	const struct hn_nvs_hdr *nvs_hdr;
6768 	int count, i, hlen;
6769 
6770 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
6771 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
6772 		return;
6773 	}
6774 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
6775 
6776 	/* Make sure that this is a RNDIS message. */
6777 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
6778 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
6779 		    nvs_hdr->nvs_type);
6780 		return;
6781 	}
6782 
6783 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
6784 	if (__predict_false(hlen < sizeof(*pkt))) {
6785 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
6786 		return;
6787 	}
6788 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
6789 
6790 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
6791 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
6792 		    pkt->cp_rxbuf_id);
6793 		return;
6794 	}
6795 
6796 	count = pkt->cp_rxbuf_cnt;
6797 	if (__predict_false(hlen <
6798 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
6799 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
6800 		return;
6801 	}
6802 
6803 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
6804 	for (i = 0; i < count; ++i) {
6805 		int ofs, len;
6806 
6807 		ofs = pkt->cp_rxbuf[i].rb_ofs;
6808 		len = pkt->cp_rxbuf[i].rb_len;
6809 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
6810 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
6811 			    "ofs %d, len %d\n", i, ofs, len);
6812 			continue;
6813 		}
6814 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
6815 	}
6816 
6817 	/*
6818 	 * Ack the consumed RXBUF associated w/ this channel packet,
6819 	 * so that this RXBUF can be recycled by the hypervisor.
6820 	 */
6821 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
6822 }
6823 
6824 static void
6825 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6826     uint64_t tid)
6827 {
6828 	struct hn_nvs_rndis_ack ack;
6829 	int retries, error;
6830 
6831 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
6832 	ack.nvs_status = HN_NVS_STATUS_OK;
6833 
6834 	retries = 0;
6835 again:
6836 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
6837 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
6838 	if (__predict_false(error == EAGAIN)) {
6839 		/*
6840 		 * NOTE:
6841 		 * This should _not_ happen in the real world, since the
6842 		 * consumption of the TX bufring from the TX path is
6843 		 * controlled.
6844 		 */
6845 		if (rxr->hn_ack_failed == 0)
6846 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
6847 		rxr->hn_ack_failed++;
6848 		retries++;
6849 		if (retries < 10) {
6850 			DELAY(100);
6851 			goto again;
6852 		}
6853 		/* RXBUF leaks! */
6854 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
6855 	}
6856 }
6857 
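/*
 * Per-channel interrupt callback: pull channel packets until the
 * bufring is empty, growing the packet buffer on ENOBUFS, and dispatch
 * each packet by its type (completion, RXBUF or inband notify).
 */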
6858 static void
6859 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
6860 {
6861 	struct hn_rx_ring *rxr = xrxr;
6862 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
6863 
6864 	for (;;) {
6865 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
6866 		int error, pktlen;
6867 
6868 		pktlen = rxr->hn_pktbuf_len;
6869 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
6870 		if (__predict_false(error == ENOBUFS)) {
6871 			void *nbuf;
6872 			int nlen;
6873 
6874 			/*
6875 			 * Expand channel packet buffer.
6876 			 *
6877 			 * XXX
6878 			 * Use M_WAITOK here, since allocation failure
6879 			 * is fatal.
6880 			 */
6881 			nlen = rxr->hn_pktbuf_len * 2;
6882 			while (nlen < pktlen)
6883 				nlen *= 2;
6884 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6885 
6886 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6887 			    rxr->hn_pktbuf_len, nlen);
6888 
6889 			free(rxr->hn_pktbuf, M_DEVBUF);
6890 			rxr->hn_pktbuf = nbuf;
6891 			rxr->hn_pktbuf_len = nlen;
6892 			/* Retry! */
6893 			continue;
6894 		} else if (__predict_false(error == EAGAIN)) {
6895 			/* No more channel packets; done! */
6896 			break;
6897 		}
6898 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6899 
6900 		switch (pkt->cph_type) {
6901 		case VMBUS_CHANPKT_TYPE_COMP:
6902 			hn_nvs_handle_comp(sc, chan, pkt);
6903 			break;
6904 
6905 		case VMBUS_CHANPKT_TYPE_RXBUF:
6906 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
6907 			break;
6908 
6909 		case VMBUS_CHANPKT_TYPE_INBAND:
6910 			hn_nvs_handle_notify(sc, pkt);
6911 			break;
6912 
6913 		default:
6914 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6915 			    pkt->cph_type);
6916 			break;
6917 		}
6918 	}
6919 	hn_chan_rollup(rxr, rxr->hn_txr);
6920 }
6921 
6922 static void
6923 hn_sysinit(void *arg __unused)
6924 {
6925 	int i;
6926 
6927 #ifdef HN_IFSTART_SUPPORT
6928 	/*
6929 	 * Don't use ifnet.if_start if transparent VF mode is requested;
6930 	 * mainly due to the IFF_DRV_OACTIVE flag.
6931 	 */
6932 	if (hn_xpnt_vf && hn_use_if_start) {
6933 		hn_use_if_start = 0;
6934 		printf("hn: tranparent VF mode, if_transmit will be used, "
6935 		    "instead of if_start\n");
6936 	}
6937 #endif
6938 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
6939 		printf("hn: invalid transparent VF attach routing "
6940 		    "wait timeout %d, reset to %d\n",
6941 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
6942 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
6943 	}
6944 
6945 	/*
6946 	 * Initialize VF map.
6947 	 */
6948 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6949 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6950 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
6951 	    M_WAITOK | M_ZERO);
6952 
6953 	/*
6954 	 * Fix the # of TX taskqueues.
6955 	 */
6956 	if (hn_tx_taskq_cnt <= 0)
6957 		hn_tx_taskq_cnt = 1;
6958 	else if (hn_tx_taskq_cnt > mp_ncpus)
6959 		hn_tx_taskq_cnt = mp_ncpus;
6960 
6961 	/*
6962 	 * Fix the TX taskqueue mode.
6963 	 */
6964 	switch (hn_tx_taskq_mode) {
6965 	case HN_TX_TASKQ_M_INDEP:
6966 	case HN_TX_TASKQ_M_GLOBAL:
6967 	case HN_TX_TASKQ_M_EVTTQ:
6968 		break;
6969 	default:
6970 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6971 		break;
6972 	}
6973 
6974 	if (vm_guest != VM_GUEST_HV)
6975 		return;
6976 
6977 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6978 		return;
6979 
6980 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6981 	    M_DEVBUF, M_WAITOK);
6982 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6983 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6984 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6985 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6986 		    "hn tx%d", i);
6987 	}
6988 }
6989 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
6990 
6991 static void
6992 hn_sysuninit(void *arg __unused)
6993 {
6994 
6995 	if (hn_tx_taskque != NULL) {
6996 		int i;
6997 
6998 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
6999 			taskqueue_free(hn_tx_taskque[i]);
7000 		free(hn_tx_taskque, M_DEVBUF);
7001 	}
7002 
7003 	if (hn_vfmap != NULL)
7004 		free(hn_vfmap, M_DEVBUF);
7005 	rm_destroy(&hn_vfmap_lock);
7006 }
7007 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7008