xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision d2bc7754a226c031b76184277e32c4d65a763f67)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90 
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
/*
 * Build-time switch: when defined, the if_start transmit path
 * (hn_start() and its helpers, plus the hw.hn.use_if_start tunable)
 * is compiled in, in addition to the if_transmit path.
 */
#define HN_IFSTART_SUPPORT

/* NOTE(review): presumably the default cap on the # of rings; user is outside this chunk. */
#define HN_RING_CNT_DEF_MAX		8

/* NOTE(review): presumably the initial size of hn_vfmap; user is outside this chunk. */
#define HN_VFMAP_SIZE_DEF		8

/* Initial value of the hw.hn.vf_xpnt_attwait tunable. */
#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

/*
 * Size of the RNDIS packet message plus one per-packet-info slot for
 * each of: hash value, VLAN, LSO2 and TX checksum.
 */
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

/* Initial value of the hw.hn.direct_tx_size tunable. */
#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

/* Initial value of the hw.hn.lro_entry_count tunable. */
#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
164 
/*
 * Softc lock wrappers around the sx(9) lock sc->hn_lock, which is
 * named after the device's nameunit.
 *
 * HN_LOCK() never blocks inside sx_xlock(): it polls with
 * sx_try_xlock() and, on each failed attempt, yields the CPU and then
 * busy-waits DELAY(1000) before retrying.  HN_LOCK_ASSERT() asserts
 * that the lock is held exclusively.
 */
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
		/* Relinquish cpu to avoid deadlock */	\
		sched_relinquish(curthread);		\
		DELAY(1000);				\
	}						\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
/* Checksum offload flag groups, per address family. */
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
/* Both HWASSIST macros read the csum assist bits of TX ring 0 only. */
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

/*
 * HN_PKTSIZE_MIN: minimum Ethernet frame length (VLAN encapsulation
 * included, CRC excluded) plus HN_RNDIS_PKT_LEN, rounded up to 'align'.
 * HN_PKTSIZE: the mbuf chain's packet length plus HN_RNDIS_PKT_LEN,
 * rounded up to 'align'.
 * Both use roundup2(), so 'align' is expected to be a power of 2.
 */
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

/*
 * Map a ring index to a CPU.  With the RSS option the index is folded
 * onto the RSS buckets and rss_getcpu() is used ('sc' is unused);
 * otherwise CPUs are taken round-robin starting at sc->hn_cpu.
 */
#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
197 
/*
 * TX descriptor: per-packet transmit state.
 */
struct hn_txdesc {
	/* Free-list linkage; only present when buf_ring is not used. */
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	/* Linkage on another txdesc's agg_list. */
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;	/* owning TX ring */
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	/* Send context handed to the hn_nvs_send*() routines. */
	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index/size.  The sglist path requires
	 * HN_NVS_CHIM_IDX_INVALID/0; the chimney path requires a valid
	 * index and a size > 0 (see the KASSERTs in hn_txpkt_sglist()
	 * and hn_txpkt_chim()).
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	/* RNDIS packet message: bus address, kernel pointer, DMA map. */
	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};
222 
/* Values for hn_txdesc.flags. */
#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

/* NOTE(review): presumably values for packet_info_id.flag; confirm against users. */
#define	HN_NDIS_PKTINFO_SUBALLOC	0x01
#define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
#define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
/*
 * Packet-info identifier record; a pointer to it is carried in
 * struct hn_rxinfo (pktinfo_id).
 */
struct packet_info_id {
	uint8_t				ver;
	uint8_t				flag;
	uint16_t			pkt_id;
};

/* Size in bytes of struct packet_info_id. */
#define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
/*
 * Read-only pointers to the per-packet-info items of a received RNDIS
 * packet; this is the output argument of hn_rndis_rxinfo().
 */
struct hn_rxinfo {
	const uint32_t			*vlan_info;
	const uint32_t			*csum_info;
	const uint32_t			*hash_info;
	const uint32_t			*hash_value;
	const struct packet_info_id	*pktinfo_id;
};
247 
/*
 * Pairs one RX ring with a VF ifnet.
 * NOTE(review): presumably the argument of hn_rxvf_set_task(); confirm.
 */
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};
252 
/*
 * One bit per struct hn_rxinfo member; HN_RXINFO_ALL is the union of
 * all of them.
 */
#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_PKTINFO_ID		0x0010
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL |		\
	 HN_RXINFO_PKTINFO_ID)
264 
/*
 * Forward declarations.
 *
 * Device methods (hn_probe/hn_attach/hn_detach/hn_shutdown are wired
 * into hn_methods[]) and the VMBus channel callback.
 */
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

/* ifnet entry points; hn_start() exists only with HN_IFSTART_SUPPORT. */
static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

/* hn_ifnet_*() / hn_ifaddr_*() event callbacks. */
static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
288 
/* VF helpers: hn_ismyvf(), hn_rxvf_*(), hn_xpnt_vf_*() and hn_vf_rss_*(). */
static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

/* hn_rndis_*() helpers local to this file. */
static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

/* hn_nvs_*() helpers; all take a VMBus channel packet header or channel. */
static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);
327 
/*
 * sysctl handlers (all share the SYSCTL_HANDLER_ARGS signature).
 * Some are only declared for particular __FreeBSD_version ranges or
 * when the RSS kernel option is not configured.
 */
#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
/* hn_vflist_sysctl/hn_vfmap_sysctl back the global hw.hn.vflist/vfmap nodes. */
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
364 
/* Start/stop, channel and synthetic-device attach/detach, suspend/resume, polling. */
static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

/* Link status and network change handling; the *_taskfunc() ones use the (void *, int) task signature. */
static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

/* RX data, RX filter and RSS helpers. */
static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);
415 
/* TX ring/descriptor management and the transmit paths. */
static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
/* The two low-level send variants: scatter/gather list vs. chimney buffer. */
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
/* if_start flavour of the transmit path; see HN_IFSTART_SUPPORT. */
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
456 
/* Root of the driver-global sysctl tree: hw.hn. */
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification done on the host side. */
static int			hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segement verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification done on the host side. */
static int			hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification done on the host side. */
static int			hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int			hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int			hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats: incremented by hn_set_hlen() on each software UDP checksum fixup. */
static counter_u64_t		hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int			hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
509 
/* Limit TSO burst size */
static int			hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int			hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* # of TX taskqueues */
static int			hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

/* Values for hn_tx_taskq_mode; meanings are spelled out in the sysctl description below. */
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

/* Read-only reflection of the HN_USE_TXDESC_BUFRING build option. */
#ifndef HN_USE_TXDESC_BUFRING
static int			hn_use_txdesc_bufring = 0;
#else
static int			hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int			hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int			hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int			hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int			hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int			hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/*
 * Packet transmission aggregation size limit
 * NOTE(review): the -1 default presumably means "choose automatically"; confirm in hn_set_txagg().
 */
static int			hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int			hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
594 
/* VF list (read-only string, produced by hn_vflist_sysctl()) */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping (read-only string, produced by hn_vfmap_sysctl()) */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF mode; enabled by default. */
static int			hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mod");

/* Accurate BPF support for Transparent VF */
static int			hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routing; unit: seconds. */
static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routing; unit: seconds");
622 
/* Driver-global state, shared by all hn(4) instances. */
static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */

/*
 * VF map: an array of ifnet pointers with hn_vfmap_size entries.
 * NOTE(review): presumably protected by hn_vfmap_lock; confirm against the users.
 */
static struct rmlock		hn_vfmap_lock;
static int			hn_vfmap_size;
static struct ifnet		**hn_vfmap;
629 
#ifndef RSS
/*
 * Built-in Toeplitz hash key (NDIS_HASH_KEYSIZE_TOEPLITZ bytes); only
 * compiled when the kernel RSS option is not configured.
 */
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */
640 
/*
 * GUID of this driver's device, as raw bytes.
 * NOTE(review): presumably matched against the VMBus device type in hn_probe(); confirm.
 */
static const struct hyperv_guid	hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
646 
/* Device method table: probe, attach, detach and shutdown only. */
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};
655 
/* Driver description: name "hn", the method table above, softc size. */
static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};
661 
static devclass_t hn_devclass;

/* Register hn as a child driver of vmbus, version 1, depending on vmbus version 1. */
DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
667 
668 #if __FreeBSD_version >= 1100099
669 static void
670 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
671 {
672 	int i;
673 
674 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
675 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
676 }
677 #endif
678 
679 static int
680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 
683 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
684 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
685 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
686 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
687 }
688 
689 static int
690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
691 {
692 	struct hn_nvs_rndis rndis;
693 
694 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
695 	    txd->chim_size > 0, ("invalid rndis chim txd"));
696 
697 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
698 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
699 	rndis.nvs_chim_idx = txd->chim_index;
700 	rndis.nvs_chim_sz = txd->chim_size;
701 
702 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
703 	    &rndis, sizeof(rndis), &txd->send_ctx));
704 }
705 
706 static __inline uint32_t
707 hn_chim_alloc(struct hn_softc *sc)
708 {
709 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
710 	u_long *bmap = sc->hn_chim_bmap;
711 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
712 
713 	for (i = 0; i < bmap_cnt; ++i) {
714 		int idx;
715 
716 		idx = ffsl(~bmap[i]);
717 		if (idx == 0)
718 			continue;
719 
720 		--idx; /* ffsl is 1-based */
721 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
722 		    ("invalid i %d and idx %d", i, idx));
723 
724 		if (atomic_testandset_long(&bmap[i], idx))
725 			continue;
726 
727 		ret = i * LONG_BIT + idx;
728 		break;
729 	}
730 	return (ret);
731 }
732 
733 static __inline void
734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
735 {
736 	u_long mask;
737 	uint32_t idx;
738 
739 	idx = chim_idx / LONG_BIT;
740 	KASSERT(idx < sc->hn_chim_bmap_cnt,
741 	    ("invalid chimney index 0x%x", chim_idx));
742 
743 	mask = 1UL << (chim_idx % LONG_BIT);
744 	KASSERT(sc->hn_chim_bmap[idx] & mask,
745 	    ("index bitmap 0x%lx, chimney index %u, "
746 	     "bitmap idx %d, bitmask 0x%lx",
747 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
748 
749 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
750 }
751 
752 #if defined(INET6) || defined(INET)
753 
/*
 * Make sure the first `len' bytes of the chain `m' are contiguous in
 * its first mbuf, calling m_pullup() only when they are not.
 *
 * `m' is reassigned to whatever m_pullup() returns, so any pointer
 * previously derived from the old `m' must be re-derived afterwards.
 * If m_pullup() fails, the macro returns NULL from the *enclosing*
 * function; it may therefore only be used in functions returning a
 * pointer.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
762 
763 /*
764  * NOTE: If this function failed, the m_head would be freed.
765  */
766 static __inline struct mbuf *
767 hn_tso_fixup(struct mbuf *m_head)
768 {
769 	struct ether_vlan_header *evl;
770 	struct tcphdr *th;
771 	int ehlen;
772 
773 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
774 
775 	PULLUP_HDR(m_head, sizeof(*evl));
776 	evl = mtod(m_head, struct ether_vlan_header *);
777 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
778 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
779 	else
780 		ehlen = ETHER_HDR_LEN;
781 	m_head->m_pkthdr.l2hlen = ehlen;
782 
783 #ifdef INET
784 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
785 		struct ip *ip;
786 		int iphlen;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
789 		ip = mtodo(m_head, ehlen);
790 		iphlen = ip->ip_hl << 2;
791 		m_head->m_pkthdr.l3hlen = iphlen;
792 
793 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
794 		th = mtodo(m_head, ehlen + iphlen);
795 
796 		ip->ip_len = 0;
797 		ip->ip_sum = 0;
798 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
799 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
800 	}
801 #endif
802 #if defined(INET6) && defined(INET)
803 	else
804 #endif
805 #ifdef INET6
806 	{
807 		struct ip6_hdr *ip6;
808 
809 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
810 		ip6 = mtodo(m_head, ehlen);
811 		if (ip6->ip6_nxt != IPPROTO_TCP) {
812 			m_freem(m_head);
813 			return (NULL);
814 		}
815 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
816 
817 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
818 		th = mtodo(m_head, ehlen + sizeof(*ip6));
819 
820 		ip6->ip6_plen = 0;
821 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
822 	}
823 #endif
824 	return (m_head);
825 }
826 
/*
 * Record the link-layer and network-layer header lengths of m_head in
 * m_pkthdr.l2hlen and m_pkthdr.l3hlen.  For IPv4 UDP packets hitting the
 * Azure checksum offload limitation described below, the UDP checksum is
 * computed in software and CSUM_IP_UDP is cleared from csum_flags.
 *
 * Returns the mbuf to keep using (NOTE(review): PULLUP_HDR is defined
 * outside this function; presumably it may replace m_head and returns NULL
 * from this function on failure -- confirm against the macro).
 *
 * NOTE: If this function fails, m_head is freed and NULL is returned.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	/* A VLAN tagged frame carries an extra encapsulation header. */
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	/* IPv4 is identified by the requested IPv4 TCP/UDP checksum offload. */
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		/* ip_hl counts 32-bit words. */
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions meet:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fallback to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			/* Offset of the UDP header from the start of the frame. */
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			/*
			 * Store the software checksum at offset csum_data
			 * past the UDP header, then drop the offload request.
			 */
			*(uint16_t *)(m_head->m_data + off +
                            m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		/*
		 * Only TCP or UDP directly following the fixed IPv6 header
		 * is accepted; anything else is dropped.
		 */
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}
895 
896 /*
897  * NOTE: If this function failed, the m_head would be freed.
898  */
899 static __inline struct mbuf *
900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
901 {
902 	const struct tcphdr *th;
903 	int ehlen, iphlen;
904 
905 	*tcpsyn = 0;
906 	ehlen = m_head->m_pkthdr.l2hlen;
907 	iphlen = m_head->m_pkthdr.l3hlen;
908 
909 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
910 	th = mtodo(m_head, ehlen + iphlen);
911 	if (th->th_flags & TH_SYN)
912 		*tcpsyn = 1;
913 	return (m_head);
914 }
915 
916 #undef PULLUP_HDR
917 
918 #endif	/* INET6 || INET */
919 
920 static int
921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
922 {
923 	int error = 0;
924 
925 	HN_LOCK_ASSERT(sc);
926 
927 	if (sc->hn_rx_filter != filter) {
928 		error = hn_rndis_set_rxfilter(sc, filter);
929 		if (!error)
930 			sc->hn_rx_filter = filter;
931 	}
932 	return (error);
933 }
934 
935 static int
936 hn_rxfilter_config(struct hn_softc *sc)
937 {
938 	struct ifnet *ifp = sc->hn_ifp;
939 	uint32_t filter;
940 
941 	HN_LOCK_ASSERT(sc);
942 
943 	/*
944 	 * If the non-transparent mode VF is activated, we don't know how
945 	 * its RX filter is configured, so stick the synthetic device in
946 	 * the promiscous mode.
947 	 */
948 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
949 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
950 	} else {
951 		filter = NDIS_PACKET_TYPE_DIRECTED;
952 		if (ifp->if_flags & IFF_BROADCAST)
953 			filter |= NDIS_PACKET_TYPE_BROADCAST;
954 		/* TODO: support multicast list */
955 		if ((ifp->if_flags & IFF_ALLMULTI) ||
956 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
957 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
958 	}
959 	return (hn_set_rxfilter(sc, filter));
960 }
961 
962 static void
963 hn_set_txagg(struct hn_softc *sc)
964 {
965 	uint32_t size, pkts;
966 	int i;
967 
968 	/*
969 	 * Setup aggregation size.
970 	 */
971 	if (sc->hn_agg_size < 0)
972 		size = UINT32_MAX;
973 	else
974 		size = sc->hn_agg_size;
975 
976 	if (sc->hn_rndis_agg_size < size)
977 		size = sc->hn_rndis_agg_size;
978 
979 	/* NOTE: We only aggregate packets using chimney sending buffers. */
980 	if (size > (uint32_t)sc->hn_chim_szmax)
981 		size = sc->hn_chim_szmax;
982 
983 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
984 		/* Disable */
985 		size = 0;
986 		pkts = 0;
987 		goto done;
988 	}
989 
990 	/* NOTE: Type of the per TX ring setting is 'int'. */
991 	if (size > INT_MAX)
992 		size = INT_MAX;
993 
994 	/*
995 	 * Setup aggregation packet count.
996 	 */
997 	if (sc->hn_agg_pkts < 0)
998 		pkts = UINT32_MAX;
999 	else
1000 		pkts = sc->hn_agg_pkts;
1001 
1002 	if (sc->hn_rndis_agg_pkts < pkts)
1003 		pkts = sc->hn_rndis_agg_pkts;
1004 
1005 	if (pkts <= 1) {
1006 		/* Disable */
1007 		size = 0;
1008 		pkts = 0;
1009 		goto done;
1010 	}
1011 
1012 	/* NOTE: Type of the per TX ring setting is 'short'. */
1013 	if (pkts > SHRT_MAX)
1014 		pkts = SHRT_MAX;
1015 
1016 done:
1017 	/* NOTE: Type of the per TX ring setting is 'short'. */
1018 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1019 		/* Disable */
1020 		size = 0;
1021 		pkts = 0;
1022 	}
1023 
1024 	if (bootverbose) {
1025 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1026 		    size, pkts, sc->hn_rndis_agg_align);
1027 	}
1028 
1029 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1030 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1031 
1032 		mtx_lock(&txr->hn_tx_lock);
1033 		txr->hn_agg_szmax = size;
1034 		txr->hn_agg_pktmax = pkts;
1035 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1036 		mtx_unlock(&txr->hn_tx_lock);
1037 	}
1038 }
1039 
1040 static int
1041 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1042 {
1043 
1044 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1045 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1046 		return txr->hn_txdesc_cnt;
1047 	return hn_tx_swq_depth;
1048 }
1049 
1050 static int
1051 hn_rss_reconfig(struct hn_softc *sc)
1052 {
1053 	int error;
1054 
1055 	HN_LOCK_ASSERT(sc);
1056 
1057 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1058 		return (ENXIO);
1059 
1060 	/*
1061 	 * Disable RSS first.
1062 	 *
1063 	 * NOTE:
1064 	 * Direct reconfiguration by setting the UNCHG flags does
1065 	 * _not_ work properly.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "disable RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1072 		return (error);
1073 	}
1074 
1075 	/*
1076 	 * Reenable the RSS w/ the updated RSS key or indirect
1077 	 * table.
1078 	 */
1079 	if (bootverbose)
1080 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1081 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1082 	if (error) {
1083 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1084 		return (error);
1085 	}
1086 	return (0);
1087 }
1088 
1089 static void
1090 hn_rss_ind_fixup(struct hn_softc *sc)
1091 {
1092 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1093 	int i, nchan;
1094 
1095 	nchan = sc->hn_rx_ring_inuse;
1096 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1097 
1098 	/*
1099 	 * Check indirect table to make sure that all channels in it
1100 	 * can be used.
1101 	 */
1102 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1103 		if (rss->rss_ind[i] >= nchan) {
1104 			if_printf(sc->hn_ifp,
1105 			    "RSS indirect table %d fixup: %u -> %d\n",
1106 			    i, rss->rss_ind[i], nchan - 1);
1107 			rss->rss_ind[i] = nchan - 1;
1108 		}
1109 	}
1110 }
1111 
1112 static int
1113 hn_ifmedia_upd(struct ifnet *ifp __unused)
1114 {
1115 
1116 	return EOPNOTSUPP;
1117 }
1118 
1119 static void
1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1121 {
1122 	struct hn_softc *sc = ifp->if_softc;
1123 
1124 	ifmr->ifm_status = IFM_AVALID;
1125 	ifmr->ifm_active = IFM_ETHER;
1126 
1127 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1128 		ifmr->ifm_active |= IFM_NONE;
1129 		return;
1130 	}
1131 	ifmr->ifm_status |= IFM_ACTIVE;
1132 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1133 }
1134 
1135 static void
1136 hn_rxvf_set_task(void *xarg, int pending __unused)
1137 {
1138 	struct hn_rxvf_setarg *arg = xarg;
1139 
1140 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1141 }
1142 
1143 static void
1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1145 {
1146 	struct hn_rx_ring *rxr;
1147 	struct hn_rxvf_setarg arg;
1148 	struct task task;
1149 	int i;
1150 
1151 	HN_LOCK_ASSERT(sc);
1152 
1153 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1154 
1155 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1156 		rxr = &sc->hn_rx_ring[i];
1157 
1158 		if (i < sc->hn_rx_ring_inuse) {
1159 			arg.rxr = rxr;
1160 			arg.vf_ifp = vf_ifp;
1161 			vmbus_chan_run_task(rxr->hn_chan, &task);
1162 		} else {
1163 			rxr->hn_rxvf_ifp = vf_ifp;
1164 		}
1165 	}
1166 }
1167 
1168 static bool
1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1170 {
1171 	const struct ifnet *hn_ifp;
1172 
1173 	hn_ifp = sc->hn_ifp;
1174 
1175 	if (ifp == hn_ifp)
1176 		return (false);
1177 
1178 	if (ifp->if_alloctype != IFT_ETHER)
1179 		return (false);
1180 
1181 	/* Ignore lagg/vlan interfaces */
1182 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1183 	    strcmp(ifp->if_dname, "vlan") == 0)
1184 		return (false);
1185 
1186 	/*
1187 	 * During detach events ifp->if_addr might be NULL.
1188 	 * Make sure the bcmp() below doesn't panic on that:
1189 	 */
1190 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1191 		return (false);
1192 
1193 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1194 		return (false);
1195 
1196 	return (true);
1197 }
1198 
/*
 * Switch the datapath to (rxvf == true) or away from (rxvf == false) the
 * VF 'ifp'.
 *
 * Nothing is done if the synthetic parts are not attached, if 'ifp' is
 * not this device's VF (see hn_ismyvf()), or if HN_FLAG_RXVF already
 * reflects the requested state.  Takes and releases the softc lock.
 */
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		/* Already switched to the VF. */
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		/* HN_FLAG_RXVF affects the filter selected here. */
		hn_rxfilter_config(sc);
	} else {
		/* Not switched to the VF. */
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		/* Re-derive the filter if running, else filter everything. */
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	/* Publish (or clear) the VF ifnet on all RX rings. */
	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		/* Report the synthetic link as down while the VF is used. */
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	/* Notify userland through devctl about the VF state change. */
	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
1256 
1257 static void
1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1259 {
1260 
1261 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1262 		return;
1263 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1264 }
1265 
1266 static void
1267 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1268 {
1269 
1270 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1271 }
1272 
1273 static int
1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1275 {
1276 	struct ifnet *ifp, *vf_ifp;
1277 	uint64_t tmp;
1278 	int error;
1279 
1280 	HN_LOCK_ASSERT(sc);
1281 	ifp = sc->hn_ifp;
1282 	vf_ifp = sc->hn_vf_ifp;
1283 
1284 	/*
1285 	 * Fix up requested capabilities w/ supported capabilities,
1286 	 * since the supported capabilities could have been changed.
1287 	 */
1288 	ifr->ifr_reqcap &= ifp->if_capabilities;
1289 	/* Pass SIOCSIFCAP to VF. */
1290 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1291 
1292 	/*
1293 	 * NOTE:
1294 	 * The error will be propagated to the callers, however, it
1295 	 * is _not_ useful here.
1296 	 */
1297 
1298 	/*
1299 	 * Merge VF's enabled capabilities.
1300 	 */
1301 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1302 
1303 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1304 	if (ifp->if_capenable & IFCAP_TXCSUM)
1305 		ifp->if_hwassist |= tmp;
1306 	else
1307 		ifp->if_hwassist &= ~tmp;
1308 
1309 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1310 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1311 		ifp->if_hwassist |= tmp;
1312 	else
1313 		ifp->if_hwassist &= ~tmp;
1314 
1315 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1316 	if (ifp->if_capenable & IFCAP_TSO4)
1317 		ifp->if_hwassist |= tmp;
1318 	else
1319 		ifp->if_hwassist &= ~tmp;
1320 
1321 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1322 	if (ifp->if_capenable & IFCAP_TSO6)
1323 		ifp->if_hwassist |= tmp;
1324 	else
1325 		ifp->if_hwassist &= ~tmp;
1326 
1327 	return (error);
1328 }
1329 
1330 static int
1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1332 {
1333 	struct ifnet *vf_ifp;
1334 	struct ifreq ifr;
1335 
1336 	HN_LOCK_ASSERT(sc);
1337 	vf_ifp = sc->hn_vf_ifp;
1338 
1339 	memset(&ifr, 0, sizeof(ifr));
1340 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1341 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1342 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1343 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1344 }
1345 
1346 static void
1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1348 {
1349 	struct ifnet *ifp = sc->hn_ifp;
1350 	int allmulti = 0;
1351 
1352 	HN_LOCK_ASSERT(sc);
1353 
1354 	/* XXX vlan(4) style mcast addr maintenance */
1355 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1356 		allmulti = IFF_ALLMULTI;
1357 
1358 	/* Always set the VF's if_flags */
1359 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1360 }
1361 
1362 static void
1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1364 {
1365 	struct rm_priotracker pt;
1366 	struct ifnet *hn_ifp = NULL;
1367 	struct mbuf *mn;
1368 
1369 	/*
1370 	 * XXX racy, if hn(4) ever detached.
1371 	 */
1372 	rm_rlock(&hn_vfmap_lock, &pt);
1373 	if (vf_ifp->if_index < hn_vfmap_size)
1374 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1375 	rm_runlock(&hn_vfmap_lock, &pt);
1376 
1377 	if (hn_ifp != NULL) {
1378 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1379 			/*
1380 			 * Allow tapping on the VF.
1381 			 */
1382 			ETHER_BPF_MTAP(vf_ifp, mn);
1383 
1384 			/*
1385 			 * Update VF stats.
1386 			 */
1387 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1388 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1389 				    mn->m_pkthdr.len);
1390 			}
1391 			/*
1392 			 * XXX IFCOUNTER_IMCAST
1393 			 * This stat updating is kinda invasive, since it
1394 			 * requires two checks on the mbuf: the length check
1395 			 * and the ethernet header check.  As of this write,
1396 			 * all multicast packets go directly to hn(4), which
1397 			 * makes imcast stat updating in the VF a try in vian.
1398 			 */
1399 
1400 			/*
1401 			 * Fix up rcvif and increase hn(4)'s ipackets.
1402 			 */
1403 			mn->m_pkthdr.rcvif = hn_ifp;
1404 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1405 		}
1406 		/*
1407 		 * Go through hn(4)'s if_input.
1408 		 */
1409 		hn_ifp->if_input(hn_ifp, m);
1410 	} else {
1411 		/*
1412 		 * In the middle of the transition; free this
1413 		 * mbuf chain.
1414 		 */
1415 		while (m != NULL) {
1416 			mn = m->m_nextpkt;
1417 			m->m_nextpkt = NULL;
1418 			m_freem(m);
1419 			m = mn;
1420 		}
1421 	}
1422 }
1423 
1424 static void
1425 hn_mtu_change_fixup(struct hn_softc *sc)
1426 {
1427 	struct ifnet *ifp;
1428 
1429 	HN_LOCK_ASSERT(sc);
1430 	ifp = sc->hn_ifp;
1431 
1432 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1433 #if __FreeBSD_version >= 1100099
1434 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1435 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1436 #endif
1437 }
1438 
1439 static uint32_t
1440 hn_rss_type_fromndis(uint32_t rss_hash)
1441 {
1442 	uint32_t types = 0;
1443 
1444 	if (rss_hash & NDIS_HASH_IPV4)
1445 		types |= RSS_TYPE_IPV4;
1446 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1447 		types |= RSS_TYPE_TCP_IPV4;
1448 	if (rss_hash & NDIS_HASH_IPV6)
1449 		types |= RSS_TYPE_IPV6;
1450 	if (rss_hash & NDIS_HASH_IPV6_EX)
1451 		types |= RSS_TYPE_IPV6_EX;
1452 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1453 		types |= RSS_TYPE_TCP_IPV6;
1454 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1455 		types |= RSS_TYPE_TCP_IPV6_EX;
1456 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1457 		types |= RSS_TYPE_UDP_IPV4;
1458 	return (types);
1459 }
1460 
1461 static uint32_t
1462 hn_rss_type_tondis(uint32_t types)
1463 {
1464 	uint32_t rss_hash = 0;
1465 
1466 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1467 	    ("UDP6 and UDP6EX are not supported"));
1468 
1469 	if (types & RSS_TYPE_IPV4)
1470 		rss_hash |= NDIS_HASH_IPV4;
1471 	if (types & RSS_TYPE_TCP_IPV4)
1472 		rss_hash |= NDIS_HASH_TCP_IPV4;
1473 	if (types & RSS_TYPE_IPV6)
1474 		rss_hash |= NDIS_HASH_IPV6;
1475 	if (types & RSS_TYPE_IPV6_EX)
1476 		rss_hash |= NDIS_HASH_IPV6_EX;
1477 	if (types & RSS_TYPE_TCP_IPV6)
1478 		rss_hash |= NDIS_HASH_TCP_IPV6;
1479 	if (types & RSS_TYPE_TCP_IPV6_EX)
1480 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1481 	if (types & RSS_TYPE_UDP_IPV4)
1482 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1483 	return (rss_hash);
1484 }
1485 
1486 static void
1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1488 {
1489 	int i;
1490 
1491 	HN_LOCK_ASSERT(sc);
1492 
1493 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1494 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1495 }
1496 
1497 static void
1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1499 {
1500 	struct ifnet *ifp, *vf_ifp;
1501 	struct ifrsshash ifrh;
1502 	struct ifrsskey ifrk;
1503 	int error;
1504 	uint32_t my_types, diff_types, mbuf_types = 0;
1505 
1506 	HN_LOCK_ASSERT(sc);
1507 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1508 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1509 
1510 	if (sc->hn_rx_ring_inuse == 1) {
1511 		/* No RSS on synthetic parts; done. */
1512 		return;
1513 	}
1514 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1515 		/* Synthetic parts do not support Toeplitz; done. */
1516 		return;
1517 	}
1518 
1519 	ifp = sc->hn_ifp;
1520 	vf_ifp = sc->hn_vf_ifp;
1521 
1522 	/*
1523 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1524 	 * supported.
1525 	 */
1526 	memset(&ifrk, 0, sizeof(ifrk));
1527 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1528 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1529 	if (error) {
1530 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1531 		    vf_ifp->if_xname, error);
1532 		goto done;
1533 	}
1534 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1535 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1536 		    vf_ifp->if_xname, ifrk.ifrk_func);
1537 		goto done;
1538 	}
1539 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1540 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1541 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1542 		goto done;
1543 	}
1544 
1545 	/*
1546 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1547 	 */
1548 	memset(&ifrh, 0, sizeof(ifrh));
1549 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1550 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1551 	if (error) {
1552 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1553 		    vf_ifp->if_xname, error);
1554 		goto done;
1555 	}
1556 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1557 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1558 		    vf_ifp->if_xname, ifrh.ifrh_func);
1559 		goto done;
1560 	}
1561 
1562 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1563 	if ((ifrh.ifrh_types & my_types) == 0) {
1564 		/* This disables RSS; ignore it then */
1565 		if_printf(ifp, "%s intersection of RSS types failed.  "
1566 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1567 		    ifrh.ifrh_types, my_types);
1568 		goto done;
1569 	}
1570 
1571 	diff_types = my_types ^ ifrh.ifrh_types;
1572 	my_types &= ifrh.ifrh_types;
1573 	mbuf_types = my_types;
1574 
1575 	/*
1576 	 * Detect RSS hash value/type confliction.
1577 	 *
1578 	 * NOTE:
1579 	 * We don't disable the hash type, but stop delivery the hash
1580 	 * value/type through mbufs on RX path.
1581 	 *
1582 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1583 	 * hash is delivered with type of TCP_IPV4.  This means if
1584 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1585 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1586 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1587 	 * here.
1588 	 */
1589 	if ((my_types & RSS_TYPE_IPV4) &&
1590 	    (diff_types & ifrh.ifrh_types &
1591 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1592 		/* Conflict; disable IPV4 hash type/value delivery. */
1593 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1594 		mbuf_types &= ~RSS_TYPE_IPV4;
1595 	}
1596 	if ((my_types & RSS_TYPE_IPV6) &&
1597 	    (diff_types & ifrh.ifrh_types &
1598 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1599 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1600 	      RSS_TYPE_IPV6_EX))) {
1601 		/* Conflict; disable IPV6 hash type/value delivery. */
1602 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1603 		mbuf_types &= ~RSS_TYPE_IPV6;
1604 	}
1605 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1606 	    (diff_types & ifrh.ifrh_types &
1607 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1608 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1609 	      RSS_TYPE_IPV6))) {
1610 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1611 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1612 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1613 	}
1614 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1615 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1616 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1617 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1618 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1619 	}
1620 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1621 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1622 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1623 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1624 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1625 	}
1626 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1627 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1628 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1629 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1630 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1631 	}
1632 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1633 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1634 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1635 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1636 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1637 	}
1638 
1639 	/*
1640 	 * Indirect table does not matter.
1641 	 */
1642 
1643 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1644 	    hn_rss_type_tondis(my_types);
1645 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1646 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1647 
1648 	if (reconf) {
1649 		error = hn_rss_reconfig(sc);
1650 		if (error) {
1651 			/* XXX roll-back? */
1652 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1653 			/* XXX keep going. */
1654 		}
1655 	}
1656 done:
1657 	/* Hash deliverability for mbufs. */
1658 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1659 }
1660 
1661 static void
1662 hn_vf_rss_restore(struct hn_softc *sc)
1663 {
1664 
1665 	HN_LOCK_ASSERT(sc);
1666 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1667 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1668 
1669 	if (sc->hn_rx_ring_inuse == 1)
1670 		goto done;
1671 
1672 	/*
1673 	 * Restore hash types.  Key does _not_ matter.
1674 	 */
1675 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1676 		int error;
1677 
1678 		sc->hn_rss_hash = sc->hn_rss_hcap;
1679 		error = hn_rss_reconfig(sc);
1680 		if (error) {
1681 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1682 			    error);
1683 			/* XXX keep going. */
1684 		}
1685 	}
1686 done:
1687 	/* Hash deliverability for mbufs. */
1688 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1689 }
1690 
/*
 * Mark the transparent mode VF as ready (hn_vf_rdytick = 0) and align
 * hn(4) with it: hn(4)'s capabilities and TSO limits are saved for later
 * restoration and then reduced to what the VF also supports, the VF's
 * enabled capabilities are set accordingly, and a non-default MTU is
 * propagated to the VF.
 */
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	/* Each limit becomes the smaller of hn(4)'s and the VF's. */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	/* The result is not checked here. */
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			/* Fall back to the default MTU if ours was larger. */
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
1769 
1770 static bool
1771 hn_xpnt_vf_isready(struct hn_softc *sc)
1772 {
1773 
1774 	HN_LOCK_ASSERT(sc);
1775 
1776 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1777 		return (false);
1778 
1779 	if (sc->hn_vf_rdytick == 0)
1780 		return (true);
1781 
1782 	if (sc->hn_vf_rdytick > ticks)
1783 		return (false);
1784 
1785 	/* Mark VF as ready. */
1786 	hn_xpnt_vf_setready(sc);
1787 	return (true);
1788 }
1789 
1790 static void
1791 hn_xpnt_vf_setenable(struct hn_softc *sc)
1792 {
1793 	int i;
1794 
1795 	HN_LOCK_ASSERT(sc);
1796 
1797 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1798 	rm_wlock(&sc->hn_vf_lock);
1799 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1800 	rm_wunlock(&sc->hn_vf_lock);
1801 
1802 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1803 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1804 }
1805 
1806 static void
1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1808 {
1809 	int i;
1810 
1811 	HN_LOCK_ASSERT(sc);
1812 
1813 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1814 	rm_wlock(&sc->hn_vf_lock);
1815 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1816 	if (clear_vf)
1817 		sc->hn_vf_ifp = NULL;
1818 	rm_wunlock(&sc->hn_vf_lock);
1819 
1820 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1821 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1822 }
1823 
1824 static void
1825 hn_xpnt_vf_init(struct hn_softc *sc)
1826 {
1827 	int error;
1828 
1829 	HN_LOCK_ASSERT(sc);
1830 
1831 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1832 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1833 
1834 	if (bootverbose) {
1835 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1836 		    sc->hn_vf_ifp->if_xname);
1837 	}
1838 
1839 	/*
1840 	 * Bring the VF up.
1841 	 */
1842 	hn_xpnt_vf_saveifflags(sc);
1843 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1844 	error = hn_xpnt_vf_iocsetflags(sc);
1845 	if (error) {
1846 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1847 		    sc->hn_vf_ifp->if_xname, error);
1848 		return;
1849 	}
1850 
1851 	/*
1852 	 * NOTE:
1853 	 * Datapath setting must happen _after_ bringing the VF up.
1854 	 */
1855 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1856 
1857 	/*
1858 	 * NOTE:
1859 	 * Fixup RSS related bits _after_ the VF is brought up, since
1860 	 * many VFs generate RSS key during it's initialization.
1861 	 */
1862 	hn_vf_rss_fixup(sc, true);
1863 
1864 	/* Mark transparent mode VF as enabled. */
1865 	hn_xpnt_vf_setenable(sc);
1866 }
1867 
1868 static void
1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1870 {
1871 	struct hn_softc *sc = xsc;
1872 
1873 	HN_LOCK(sc);
1874 
1875 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1876 		goto done;
1877 	if (sc->hn_vf_ifp == NULL)
1878 		goto done;
1879 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1880 		goto done;
1881 
1882 	if (sc->hn_vf_rdytick != 0) {
1883 		/* Mark VF as ready. */
1884 		hn_xpnt_vf_setready(sc);
1885 	}
1886 
1887 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1888 		/*
1889 		 * Delayed VF initialization.
1890 		 */
1891 		if (bootverbose) {
1892 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1893 			    sc->hn_vf_ifp->if_xname);
1894 		}
1895 		hn_xpnt_vf_init(sc);
1896 	}
1897 done:
1898 	HN_UNLOCK(sc);
1899 }
1900 
/*
 * ifnet attach event handler: if the newly attached 'ifp' is this
 * device's VF, record it in the global hn_vfmap (VF if_index -> hn(4)
 * ifnet) and in sc->hn_vf_ifp.  In transparent VF mode, additionally
 * hook the VF's if_input, suspend hn(4)'s link status management and
 * schedule the delayed VF initialization task.
 *
 * Nothing is done if the synthetic parts are not attached, 'ifp' is not
 * our VF, a VF is already attached, or (transparent mode only) the VF
 * uses if_start.  Takes and releases the softc lock.
 */
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	/* Grow the map if the VF's if_index does not fit into it. */
	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		/* Carry over the existing entries; new ones are zeroed. */
		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		/* Tick at which the VF is considered ready. */
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
1987 
/*
 * Handle the departure of an ifnet that may be this device's VF.
 *
 * Registered by hn_attach() for ifnet_departure_event and also called
 * directly by hn_detach() when a VF is still recorded in sc->hn_vf_ifp.
 *
 * xsc is the hn_softc; ifp is the departing interface.  Nothing is done
 * unless a VF is recorded and hn_ismyvf() accepts ifp.  Otherwise, in
 * transparent VF mode, the pending delayed VF initialization is drained,
 * the VF's saved if_input is put back, and the settings which were changed
 * for the VF are restored.  Finally the VF is marked disabled and its
 * hn_vfmap slot is cleared.
 */
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	/* No VF is recorded for this device; nothing to undo. */
	if (sc->hn_vf_ifp == NULL)
		goto done;

	/* The departing interface is not this device's VF. */
	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		/* Hand the original if_input back to the VF. */
		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		/*
		 * Switch the datapath back to the synthetic device, but only
		 * if the synthetic parts are attached and the transparent VF
		 * was actually enabled.
		 */
		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	/* Drop the ifindex -> hn(4) ifnet mapping for the departing VF. */
	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
2078 
2079 static void
2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2081 {
2082 	struct hn_softc *sc = xsc;
2083 
2084 	if (sc->hn_vf_ifp == ifp)
2085 		if_link_state_change(sc->hn_ifp, link_state);
2086 }
2087 
2088 static int
2089 hn_probe(device_t dev)
2090 {
2091 
2092 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2093 		device_set_desc(dev, "Hyper-V Network Interface");
2094 		return BUS_PROBE_DEFAULT;
2095 	}
2096 	return ENXIO;
2097 }
2098 
/*
 * Device attach method.
 *
 * Brings up one hn(4) instance in this order: locks and tunables,
 * TX/management/VF taskqueues, ifnet allocation, TX/RX ring data, the
 * transaction context and orphan handler, the synthetic parts (NVS and
 * RNDIS), sysctl nodes, ifmedia, ifnet capabilities, ether_ifattach(),
 * and finally the event handlers.
 *
 * Returns 0 on success.  On any failure the synthetic parts are detached
 * if they were attached, hn_detach() is called to release everything that
 * was set up so far, and the error is returned.
 */
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;
	uint32_t mtu;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		/* Per-device TX taskqueues; they are freed by hn_detach(). */
		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		/* Shared TX taskqueues; hn_detach() does not free these. */
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

	/* A failed MTU query is not fatal; fall back to ETHERMTU. */
	error = hn_rndis_get_mtu(sc, &mtu);
	if (error)
		mtu = ETHERMTU;
	else if (bootverbose)
		device_printf(dev, "RNDIS mtu %u\n", mtu);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX/RX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	hn_fixup_rx_data(sc);

	/* Per-device sysctl nodes. */
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	if (!hn_xpnt_vf) {
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	} else {
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
		    hn_xpnt_vf_enabled_sysctl, "I",
		    "Transparent VF enabled");
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
		    hn_xpnt_vf_accbpf_sysctl, "I",
		    "Accurate BPF for transparent VF");
	}

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		/*
		 * Lock hn_set_tso_maxsize() to simplify its
		 * internal logic.
		 */
		HN_LOCK(sc);
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		HN_UNLOCK(sc);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}
	/* Only a reported MTU below ETHERMTU overrides the current if_mtu. */
	if (mtu < ETHERMTU) {
		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
		ifp->if_mtu = mtu;
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	/*
	 * Non-transparent VF mode listens for ifnet and ifaddr events;
	 * transparent VF mode listens for link events instead.
	 */
	if (!hn_xpnt_vf) {
		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
	} else {
		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
	}

	/*
	 * NOTE:
	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
	 * since interface's LLADDR is needed; interface LLADDR is not
	 * available when ifnet_arrival event is triggered.
	 */
	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	/*
	 * The synthetic parts are detached here explicitly, before
	 * hn_detach() is called to release the rest.
	 */
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
2498 
/*
 * Device detach method; also used by hn_attach() to unwind a failed
 * attach, so most steps are guarded by NULL or state checks.
 *
 * Order of teardown: orphan the transaction context if the primary
 * channel was revoked, deregister the event handlers, detach any VF
 * that is still recorded, stop and detach the synthetic parts and the
 * ifnet (only if the device is attached), destroy ifmedia and the RX/TX
 * data, free the taskqueues, destroy the transaction context, free the
 * ifnet and destroy the locks.
 *
 * Always returns 0.
 */
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	/* Deregister only the handlers that were actually registered. */
	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
	if (sc->hn_ifnet_atthand != NULL) {
		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
		    sc->hn_ifnet_atthand);
	}
	if (sc->hn_ifnet_dethand != NULL) {
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    sc->hn_ifnet_dethand);
	}
	if (sc->hn_ifnet_lnkhand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);

	/*
	 * Snapshot the VF pointer once, then run the departure handler by
	 * hand for a VF that is still recorded.
	 */
	vf_ifp = sc->hn_vf_ifp;
	__compiler_membar();
	if (vf_ifp != NULL)
		hn_ifnet_detevent(sc, vf_ifp);

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuffs have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	/* The shared hn_tx_taskque array is not owned by this device. */
	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);
	if (sc->hn_vf_taskq != NULL)
		taskqueue_free(sc->hn_vf_taskq);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	rm_destroy(&sc->hn_vf_lock);
	return (0);
}
2580 
/*
 * Device shutdown method.  Performs no work and always returns 0.
 */
static int
hn_shutdown(device_t dev)
{

	return (0);
}
2587 
2588 static void
2589 hn_link_status(struct hn_softc *sc)
2590 {
2591 	uint32_t link_status;
2592 	int error;
2593 
2594 	error = hn_rndis_get_linkstatus(sc, &link_status);
2595 	if (error) {
2596 		/* XXX what to do? */
2597 		return;
2598 	}
2599 
2600 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2601 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2602 	else
2603 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2604 	if_link_state_change(sc->hn_ifp,
2605 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2606 	    LINK_STATE_UP : LINK_STATE_DOWN);
2607 }
2608 
2609 static void
2610 hn_link_taskfunc(void *xsc, int pending __unused)
2611 {
2612 	struct hn_softc *sc = xsc;
2613 
2614 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2615 		return;
2616 	hn_link_status(sc);
2617 }
2618 
2619 static void
2620 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2621 {
2622 	struct hn_softc *sc = xsc;
2623 
2624 	/* Prevent any link status checks from running. */
2625 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2626 
2627 	/*
2628 	 * Fake up a [link down --> link up] state change; 5 seconds
2629 	 * delay is used, which closely simulates miibus reaction
2630 	 * upon link down event.
2631 	 */
2632 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2633 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2634 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2635 	    &sc->hn_netchg_status, 5 * hz);
2636 }
2637 
2638 static void
2639 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2640 {
2641 	struct hn_softc *sc = xsc;
2642 
2643 	/* Re-allow link status checks. */
2644 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2645 	hn_link_status(sc);
2646 }
2647 
2648 static void
2649 hn_update_link_status(struct hn_softc *sc)
2650 {
2651 
2652 	if (sc->hn_mgmt_taskq != NULL)
2653 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2654 }
2655 
2656 static void
2657 hn_change_network(struct hn_softc *sc)
2658 {
2659 
2660 	if (sc->hn_mgmt_taskq != NULL)
2661 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2662 }
2663 
2664 static __inline int
2665 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2666     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2667 {
2668 	struct mbuf *m = *m_head;
2669 	int error;
2670 
2671 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2672 
2673 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2674 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2675 	if (error == EFBIG) {
2676 		struct mbuf *m_new;
2677 
2678 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2679 		if (m_new == NULL)
2680 			return ENOBUFS;
2681 		else
2682 			*m_head = m = m_new;
2683 		txr->hn_tx_collapsed++;
2684 
2685 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2686 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2687 	}
2688 	if (!error) {
2689 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2690 		    BUS_DMASYNC_PREWRITE);
2691 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2692 	}
2693 	return error;
2694 }
2695 
/*
 * Drop one reference on txd.
 *
 * Returns 0 if other references remain.  When the last reference is
 * dropped, every txdesc aggregated onto txd is released first, then
 * txd's chimney sending buffer or DMA mapping and its mbuf are
 * released, txd is put back on the ring's free list (or buf_ring), and
 * 1 is returned.
 */
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	/* atomic_fetchadd_int() returns the old value; 1 means last ref. */
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		/* Release every txdesc that was aggregated onto this one. */
		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("resursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			/*
			 * ONAGG must be cleared before the recursive put,
			 * which asserts that the flag is not set.
			 */
			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	/* A txd uses either a chimney sending buffer or a DMA map. */
	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	/* Return the descriptor to the free pool. */
	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}
2772 
2773 static __inline struct hn_txdesc *
2774 hn_txdesc_get(struct hn_tx_ring *txr)
2775 {
2776 	struct hn_txdesc *txd;
2777 
2778 #ifndef HN_USE_TXDESC_BUFRING
2779 	mtx_lock_spin(&txr->hn_txlist_spin);
2780 	txd = SLIST_FIRST(&txr->hn_txlist);
2781 	if (txd != NULL) {
2782 		KASSERT(txr->hn_txdesc_avail > 0,
2783 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2784 		txr->hn_txdesc_avail--;
2785 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2786 	}
2787 	mtx_unlock_spin(&txr->hn_txlist_spin);
2788 #else
2789 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2790 #endif
2791 
2792 	if (txd != NULL) {
2793 #ifdef HN_USE_TXDESC_BUFRING
2794 #ifdef HN_DEBUG
2795 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2796 #endif
2797 #endif	/* HN_USE_TXDESC_BUFRING */
2798 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2799 		    STAILQ_EMPTY(&txd->agg_list) &&
2800 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2801 		    txd->chim_size == 0 &&
2802 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2803 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2804 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2805 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2806 		txd->refs = 1;
2807 	}
2808 	return txd;
2809 }
2810 
/*
 * Take an additional reference on txd.  The caller must already hold a
 * reference; this is asserted.
 */
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}
2819 
2820 static __inline void
2821 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2822 {
2823 
2824 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2825 	    ("recursive aggregation on aggregating txdesc"));
2826 
2827 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2828 	    ("already aggregated"));
2829 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2830 	    ("recursive aggregation on to-be-aggregated txdesc"));
2831 
2832 	txd->flags |= HN_TXD_FLAG_ONAGG;
2833 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2834 }
2835 
2836 static bool
2837 hn_tx_ring_pending(struct hn_tx_ring *txr)
2838 {
2839 	bool pending = false;
2840 
2841 #ifndef HN_USE_TXDESC_BUFRING
2842 	mtx_lock_spin(&txr->hn_txlist_spin);
2843 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2844 		pending = true;
2845 	mtx_unlock_spin(&txr->hn_txlist_spin);
2846 #else
2847 	if (!buf_ring_full(txr->hn_txdesc_br))
2848 		pending = true;
2849 #endif
2850 	return (pending);
2851 }
2852 
/*
 * Run the ring's TX completion callback.  The hn_has_txeof flag is
 * cleared before the callback is invoked.
 */
static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
2859 
2860 static void
2861 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2862     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2863 {
2864 	struct hn_txdesc *txd = sndc->hn_cbarg;
2865 	struct hn_tx_ring *txr;
2866 
2867 	txr = txd->txr;
2868 	KASSERT(txr->hn_chan == chan,
2869 	    ("channel mismatch, on chan%u, should be chan%u",
2870 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2871 
2872 	txr->hn_has_txeof = 1;
2873 	hn_txdesc_put(txr, txd);
2874 
2875 	++txr->hn_txdone_cnt;
2876 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2877 		txr->hn_txdone_cnt = 0;
2878 		if (txr->hn_oactive)
2879 			hn_txeof(txr);
2880 	}
2881 }
2882 
2883 static void
2884 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2885 {
2886 #if defined(INET) || defined(INET6)
2887 	struct epoch_tracker et;
2888 
2889 	NET_EPOCH_ENTER(et);
2890 	tcp_lro_flush_all(&rxr->hn_lro);
2891 	NET_EPOCH_EXIT(et);
2892 #endif
2893 
2894 	/*
2895 	 * NOTE:
2896 	 * 'txr' could be NULL, if multiple channels and
2897 	 * ifnet.if_start method are enabled.
2898 	 */
2899 	if (txr == NULL || !txr->hn_has_txeof)
2900 		return;
2901 
2902 	txr->hn_txdone_cnt = 0;
2903 	hn_txeof(txr);
2904 }
2905 
2906 static __inline uint32_t
2907 hn_rndis_pktmsg_offset(uint32_t ofs)
2908 {
2909 
2910 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2911 	    ("invalid RNDIS packet msg offset %u", ofs));
2912 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2913 }
2914 
2915 static __inline void *
2916 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2917     size_t pi_dlen, uint32_t pi_type)
2918 {
2919 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2920 	struct rndis_pktinfo *pi;
2921 
2922 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2923 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2924 
2925 	/*
2926 	 * Per-packet-info does not move; it only grows.
2927 	 *
2928 	 * NOTE:
2929 	 * rm_pktinfooffset in this phase counts from the beginning
2930 	 * of rndis_packet_msg.
2931 	 */
2932 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2933 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2934 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2935 	    pkt->rm_pktinfolen);
2936 	pkt->rm_pktinfolen += pi_size;
2937 
2938 	pi->rm_size = pi_size;
2939 	pi->rm_type = pi_type;
2940 	pi->rm_internal = 0;
2941 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2942 
2943 	return (pi->rm_data);
2944 }
2945 
2946 static __inline int
2947 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2948 {
2949 	struct hn_txdesc *txd;
2950 	struct mbuf *m;
2951 	int error, pkts;
2952 
2953 	txd = txr->hn_agg_txd;
2954 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2955 
2956 	/*
2957 	 * Since hn_txpkt() will reset this temporary stat, save
2958 	 * it now, so that oerrors can be updated properly, if
2959 	 * hn_txpkt() ever fails.
2960 	 */
2961 	pkts = txr->hn_stat_pkts;
2962 
2963 	/*
2964 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2965 	 * failure, save it for later freeing, if hn_txpkt() ever
2966 	 * fails.
2967 	 */
2968 	m = txd->m;
2969 	error = hn_txpkt(ifp, txr, txd);
2970 	if (__predict_false(error)) {
2971 		/* txd is freed, but m is not. */
2972 		m_freem(m);
2973 
2974 		txr->hn_flush_failed++;
2975 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2976 	}
2977 
2978 	/* Reset all aggregation states. */
2979 	txr->hn_agg_txd = NULL;
2980 	txr->hn_agg_szleft = 0;
2981 	txr->hn_agg_pktleft = 0;
2982 	txr->hn_agg_prevpkt = NULL;
2983 
2984 	return (error);
2985 }
2986 
2987 static void *
2988 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2989     int pktsize)
2990 {
2991 	void *chim;
2992 
2993 	if (txr->hn_agg_txd != NULL) {
2994 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2995 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2996 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2997 			int olen;
2998 
2999 			/*
3000 			 * Update the previous RNDIS packet's total length,
3001 			 * it can be increased due to the mandatory alignment
3002 			 * padding for this RNDIS packet.  And update the
3003 			 * aggregating txdesc's chimney sending buffer size
3004 			 * accordingly.
3005 			 *
3006 			 * XXX
3007 			 * Zero-out the padding, as required by the RNDIS spec.
3008 			 */
3009 			olen = pkt->rm_len;
3010 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3011 			agg_txd->chim_size += pkt->rm_len - olen;
3012 
3013 			/* Link this txdesc to the parent. */
3014 			hn_txdesc_agg(agg_txd, txd);
3015 
3016 			chim = (uint8_t *)pkt + pkt->rm_len;
3017 			/* Save the current packet for later fixup. */
3018 			txr->hn_agg_prevpkt = chim;
3019 
3020 			txr->hn_agg_pktleft--;
3021 			txr->hn_agg_szleft -= pktsize;
3022 			if (txr->hn_agg_szleft <=
3023 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3024 				/*
3025 				 * Probably can't aggregate more packets,
3026 				 * flush this aggregating txdesc proactively.
3027 				 */
3028 				txr->hn_agg_pktleft = 0;
3029 			}
3030 			/* Done! */
3031 			return (chim);
3032 		}
3033 		hn_flush_txagg(ifp, txr);
3034 	}
3035 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3036 
3037 	txr->hn_tx_chimney_tried++;
3038 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3039 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3040 		return (NULL);
3041 	txr->hn_tx_chimney++;
3042 
3043 	chim = txr->hn_sc->hn_chim +
3044 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3045 
3046 	if (txr->hn_agg_pktmax > 1 &&
3047 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3048 		txr->hn_agg_txd = txd;
3049 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3050 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3051 		txr->hn_agg_prevpkt = chim;
3052 	}
3053 	return (chim);
3054 }
3055 
3056 /*
3057  * NOTE:
3058  * If this function fails, then both txd and m_head0 will be freed.
3059  */
3060 static int
3061 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3062     struct mbuf **m_head0)
3063 {
3064 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3065 	int error, nsegs, i;
3066 	struct mbuf *m_head = *m_head0;
3067 	struct rndis_packet_msg *pkt;
3068 	uint32_t *pi_data;
3069 	void *chim = NULL;
3070 	int pkt_hlen, pkt_size;
3071 
3072 	pkt = txd->rndis_pkt;
3073 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3074 	if (pkt_size < txr->hn_chim_size) {
3075 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3076 		if (chim != NULL)
3077 			pkt = chim;
3078 	} else {
3079 		if (txr->hn_agg_txd != NULL)
3080 			hn_flush_txagg(ifp, txr);
3081 	}
3082 
3083 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3084 	pkt->rm_len = m_head->m_pkthdr.len;
3085 	pkt->rm_dataoffset = 0;
3086 	pkt->rm_datalen = m_head->m_pkthdr.len;
3087 	pkt->rm_oobdataoffset = 0;
3088 	pkt->rm_oobdatalen = 0;
3089 	pkt->rm_oobdataelements = 0;
3090 	pkt->rm_pktinfooffset = sizeof(*pkt);
3091 	pkt->rm_pktinfolen = 0;
3092 	pkt->rm_vchandle = 0;
3093 	pkt->rm_reserved = 0;
3094 
3095 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3096 		/*
3097 		 * Set the hash value for this packet.
3098 		 */
3099 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3100 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3101 
3102 		if (M_HASHTYPE_ISHASH(m_head))
3103 			/*
3104 			 * The flowid field contains the hash value host
3105 			 * set in the rx queue if it is a ip forwarding pkt.
3106 			 * Set the same hash value so host can send on the
3107 			 * cpu it was received.
3108 			 */
3109 			*pi_data = m_head->m_pkthdr.flowid;
3110 		else
3111 			/*
3112 			 * Otherwise just put the tx queue index.
3113 			 */
3114 			*pi_data = txr->hn_tx_idx;
3115 	}
3116 
3117 	if (m_head->m_flags & M_VLANTAG) {
3118 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3119 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3120 		*pi_data = NDIS_VLAN_INFO_MAKE(
3121 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3122 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3123 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3124 	}
3125 
3126 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3127 #if defined(INET6) || defined(INET)
3128 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3129 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3130 #ifdef INET
3131 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3132 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3133 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3134 			    m_head->m_pkthdr.tso_segsz);
3135 		}
3136 #endif
3137 #if defined(INET6) && defined(INET)
3138 		else
3139 #endif
3140 #ifdef INET6
3141 		{
3142 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3143 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3144 			    m_head->m_pkthdr.tso_segsz);
3145 		}
3146 #endif
3147 #endif	/* INET6 || INET */
3148 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3149 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3150 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3151 		if (m_head->m_pkthdr.csum_flags &
3152 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3153 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3154 		} else {
3155 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3156 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3157 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3158 		}
3159 
3160 		if (m_head->m_pkthdr.csum_flags &
3161 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3162 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3163 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3164 		} else if (m_head->m_pkthdr.csum_flags &
3165 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3166 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3167 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3168 		}
3169 	}
3170 
3171 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3172 	/* Fixup RNDIS packet message total length */
3173 	pkt->rm_len += pkt_hlen;
3174 	/* Convert RNDIS packet message offsets */
3175 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3176 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3177 
3178 	/*
3179 	 * Fast path: Chimney sending.
3180 	 */
3181 	if (chim != NULL) {
3182 		struct hn_txdesc *tgt_txd = txd;
3183 
3184 		if (txr->hn_agg_txd != NULL) {
3185 			tgt_txd = txr->hn_agg_txd;
3186 #ifdef INVARIANTS
3187 			*m_head0 = NULL;
3188 #endif
3189 		}
3190 
3191 		KASSERT(pkt == chim,
3192 		    ("RNDIS pkt not in chimney sending buffer"));
3193 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3194 		    ("chimney sending buffer is not used"));
3195 		tgt_txd->chim_size += pkt->rm_len;
3196 
3197 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3198 		    ((uint8_t *)chim) + pkt_hlen);
3199 
3200 		txr->hn_gpa_cnt = 0;
3201 		txr->hn_sendpkt = hn_txpkt_chim;
3202 		goto done;
3203 	}
3204 
3205 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3206 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3207 	    ("chimney buffer is used"));
3208 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3209 
3210 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3211 	if (__predict_false(error)) {
3212 		int freed;
3213 
3214 		/*
3215 		 * This mbuf is not linked w/ the txd yet, so free it now.
3216 		 */
3217 		m_freem(m_head);
3218 		*m_head0 = NULL;
3219 
3220 		freed = hn_txdesc_put(txr, txd);
3221 		KASSERT(freed != 0,
3222 		    ("fail to free txd upon txdma error"));
3223 
3224 		txr->hn_txdma_failed++;
3225 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3226 		return error;
3227 	}
3228 	*m_head0 = m_head;
3229 
3230 	/* +1 RNDIS packet message */
3231 	txr->hn_gpa_cnt = nsegs + 1;
3232 
3233 	/* send packet with page buffer */
3234 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3235 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3236 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3237 
3238 	/*
3239 	 * Fill the page buffers with mbuf info after the page
3240 	 * buffer for RNDIS packet message.
3241 	 */
3242 	for (i = 0; i < nsegs; ++i) {
3243 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3244 
3245 		gpa->gpa_page = atop(segs[i].ds_addr);
3246 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3247 		gpa->gpa_len = segs[i].ds_len;
3248 	}
3249 
3250 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3251 	txd->chim_size = 0;
3252 	txr->hn_sendpkt = hn_txpkt_sglist;
3253 done:
3254 	txd->m = m_head;
3255 
3256 	/* Set the completion routine */
3257 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3258 
3259 	/* Update temporary stats for later use. */
3260 	txr->hn_stat_pkts++;
3261 	txr->hn_stat_size += m_head->m_pkthdr.len;
3262 	if (m_head->m_flags & M_MCAST)
3263 		txr->hn_stat_mcasts++;
3264 
3265 	return 0;
3266 }
3267 
3268 /*
3269  * NOTE:
3270  * If this function fails, then txd will be freed, but the mbuf
3271  * associated w/ the txd will _not_ be freed.
3272  */
3273 static int
3274 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3275 {
3276 	int error, send_failed = 0, has_bpf;
3277 
3278 again:
3279 	has_bpf = bpf_peers_present(ifp->if_bpf);
3280 	if (has_bpf) {
3281 		/*
3282 		 * Make sure that this txd and any aggregated txds are not
3283 		 * freed before ETHER_BPF_MTAP.
3284 		 */
3285 		hn_txdesc_hold(txd);
3286 	}
3287 	error = txr->hn_sendpkt(txr, txd);
3288 	if (!error) {
3289 		if (has_bpf) {
3290 			const struct hn_txdesc *tmp_txd;
3291 
3292 			ETHER_BPF_MTAP(ifp, txd->m);
3293 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3294 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3295 		}
3296 
3297 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3298 #ifdef HN_IFSTART_SUPPORT
3299 		if (!hn_use_if_start)
3300 #endif
3301 		{
3302 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3303 			    txr->hn_stat_size);
3304 			if (txr->hn_stat_mcasts != 0) {
3305 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3306 				    txr->hn_stat_mcasts);
3307 			}
3308 		}
3309 		txr->hn_pkts += txr->hn_stat_pkts;
3310 		txr->hn_sends++;
3311 	}
3312 	if (has_bpf)
3313 		hn_txdesc_put(txr, txd);
3314 
3315 	if (__predict_false(error)) {
3316 		int freed;
3317 
3318 		/*
3319 		 * This should "really rarely" happen.
3320 		 *
3321 		 * XXX Too many RX to be acked or too many sideband
3322 		 * commands to run?  Ask netvsc_channel_rollup()
3323 		 * to kick start later.
3324 		 */
3325 		txr->hn_has_txeof = 1;
3326 		if (!send_failed) {
3327 			txr->hn_send_failed++;
3328 			send_failed = 1;
3329 			/*
3330 			 * Try sending again after set hn_has_txeof;
3331 			 * in case that we missed the last
3332 			 * netvsc_channel_rollup().
3333 			 */
3334 			goto again;
3335 		}
3336 		if_printf(ifp, "send failed\n");
3337 
3338 		/*
3339 		 * Caller will perform further processing on the
3340 		 * associated mbuf, so don't free it in hn_txdesc_put();
3341 		 * only unload it from the DMA map in hn_txdesc_put(),
3342 		 * if it was loaded.
3343 		 */
3344 		txd->m = NULL;
3345 		freed = hn_txdesc_put(txr, txd);
3346 		KASSERT(freed != 0,
3347 		    ("fail to free txd upon send error"));
3348 
3349 		txr->hn_send_failed++;
3350 	}
3351 
3352 	/* Reset temporary stats, after this sending is done. */
3353 	txr->hn_stat_size = 0;
3354 	txr->hn_stat_pkts = 0;
3355 	txr->hn_stat_mcasts = 0;
3356 
3357 	return (error);
3358 }
3359 
3360 /*
3361  * Append the specified data to the indicated mbuf chain,
3362  * Extend the mbuf chain if the new data does not fit in
3363  * existing space.
3364  *
3365  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3366  * There should be an equivalent in the kernel mbuf code,
3367  * but there does not appear to be one yet.
3368  *
3369  * Differs from m_append() in that additional mbufs are
3370  * allocated with cluster size MJUMPAGESIZE, and filled
3371  * accordingly.
3372  *
3373  * Return the last mbuf in the chain or NULL if failed to
3374  * allocate new mbuf.
3375  */
3376 static struct mbuf *
3377 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3378 {
3379 	struct mbuf *m, *n;
3380 	int remainder, space;
3381 
3382 	for (m = m0; m->m_next != NULL; m = m->m_next)
3383 		;
3384 	remainder = len;
3385 	space = M_TRAILINGSPACE(m);
3386 	if (space > 0) {
3387 		/*
3388 		 * Copy into available space.
3389 		 */
3390 		if (space > remainder)
3391 			space = remainder;
3392 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3393 		m->m_len += space;
3394 		cp += space;
3395 		remainder -= space;
3396 	}
3397 	while (remainder > 0) {
3398 		/*
3399 		 * Allocate a new mbuf; could check space
3400 		 * and allocate a cluster instead.
3401 		 */
3402 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3403 		if (n == NULL)
3404 			return NULL;
3405 		n->m_len = min(MJUMPAGESIZE, remainder);
3406 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3407 		cp += n->m_len;
3408 		remainder -= n->m_len;
3409 		m->m_next = n;
3410 		m = n;
3411 	}
3412 
3413 	return m;
3414 }
3415 
3416 #if defined(INET) || defined(INET6)
/*
 * Pass a received mbuf to LRO.  Where the mbuf queue interface is
 * compiled in and hn_lro_mbufq_depth is non-zero, the mbuf is queued
 * with tcp_lro_queue_mbuf() and 0 is returned; otherwise the
 * tcp_lro_rx() result is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth == 0)
		return (tcp_lro_rx(lc, m, 0));

	tcp_lro_queue_mbuf(lc, m);
	return (0);
#else
	return (tcp_lro_rx(lc, m, 0));
#endif
}
3428 #endif
3429 
3430 static int
3431 hn_rxpkt(struct hn_rx_ring *rxr)
3432 {
3433 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3434 	struct mbuf *m_new, *n;
3435 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3436 	int hash_type = M_HASHTYPE_NONE;
3437 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3438 	int i;
3439 
3440 	ifp = hn_ifp;
3441 	if (rxr->hn_rxvf_ifp != NULL) {
3442 		/*
3443 		 * Non-transparent mode VF; pretend this packet is from
3444 		 * the VF.
3445 		 */
3446 		ifp = rxr->hn_rxvf_ifp;
3447 		is_vf = 1;
3448 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3449 		/* Transparent mode VF. */
3450 		is_vf = 1;
3451 	}
3452 
3453 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3454 		/*
3455 		 * NOTE:
3456 		 * See the NOTE of hn_rndis_init_fixat().  This
3457 		 * function can be reached, immediately after the
3458 		 * RNDIS is initialized but before the ifnet is
3459 		 * setup on the hn_attach() path; drop the unexpected
3460 		 * packets.
3461 		 */
3462 		return (0);
3463 	}
3464 
3465 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3466 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3467 		return (0);
3468 	}
3469 
3470 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3471 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3472 		if (m_new == NULL) {
3473 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3474 			return (0);
3475 		}
3476 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3477 		    rxr->rsc.frag_len[0]);
3478 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3479 	} else {
3480 		/*
3481 		 * Get an mbuf with a cluster.  For packets 2K or less,
3482 		 * get a standard 2K cluster.  For anything larger, get a
3483 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3484 		 * if looped around to the Hyper-V TX channel, so avoid them.
3485 		 */
3486 		size = MCLBYTES;
3487 		if (rxr->rsc.pktlen > MCLBYTES) {
3488 			/* 4096 */
3489 			size = MJUMPAGESIZE;
3490 		}
3491 
3492 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3493 		if (m_new == NULL) {
3494 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3495 			return (0);
3496 		}
3497 
3498 		n = m_new;
3499 		for (i = 0; i < rxr->rsc.cnt; i++) {
3500 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3501 			    rxr->rsc.frag_data[i]);
3502 			if (n == NULL) {
3503 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3504 				return (0);
3505 			} else {
3506 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3507 			}
3508 		}
3509 	}
3510 	if (rxr->rsc.pktlen <= MHLEN)
3511 		rxr->hn_small_pkts++;
3512 
3513 	m_new->m_pkthdr.rcvif = ifp;
3514 
3515 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3516 		do_csum = 0;
3517 
3518 	/* receive side checksum offload */
3519 	if (rxr->rsc.csum_info != NULL) {
3520 		/* IP csum offload */
3521 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3522 			m_new->m_pkthdr.csum_flags |=
3523 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3524 			rxr->hn_csum_ip++;
3525 		}
3526 
3527 		/* TCP/UDP csum offload */
3528 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3529 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3530 			m_new->m_pkthdr.csum_flags |=
3531 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3532 			m_new->m_pkthdr.csum_data = 0xffff;
3533 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3534 				rxr->hn_csum_tcp++;
3535 			else
3536 				rxr->hn_csum_udp++;
3537 		}
3538 
3539 		/*
3540 		 * XXX
3541 		 * As of this write (Oct 28th, 2016), host side will turn
3542 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3543 		 * the do_lro setting here is actually _not_ accurate.  We
3544 		 * depend on the RSS hash type check to reset do_lro.
3545 		 */
3546 		if ((*(rxr->rsc.csum_info) &
3547 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3548 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3549 			do_lro = 1;
3550 	} else {
3551 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3552 		if (l3proto == ETHERTYPE_IP) {
3553 			if (l4proto == IPPROTO_TCP) {
3554 				if (do_csum &&
3555 				    (rxr->hn_trust_hcsum &
3556 				     HN_TRUST_HCSUM_TCP)) {
3557 					rxr->hn_csum_trusted++;
3558 					m_new->m_pkthdr.csum_flags |=
3559 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3560 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3561 					m_new->m_pkthdr.csum_data = 0xffff;
3562 				}
3563 				do_lro = 1;
3564 			} else if (l4proto == IPPROTO_UDP) {
3565 				if (do_csum &&
3566 				    (rxr->hn_trust_hcsum &
3567 				     HN_TRUST_HCSUM_UDP)) {
3568 					rxr->hn_csum_trusted++;
3569 					m_new->m_pkthdr.csum_flags |=
3570 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3571 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3572 					m_new->m_pkthdr.csum_data = 0xffff;
3573 				}
3574 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3575 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3576 				rxr->hn_csum_trusted++;
3577 				m_new->m_pkthdr.csum_flags |=
3578 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3579 			}
3580 		}
3581 	}
3582 
3583 	if (rxr->rsc.vlan_info != NULL) {
3584 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3585 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3586 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3587 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3588 		m_new->m_flags |= M_VLANTAG;
3589 	}
3590 
3591 	/*
3592 	 * If VF is activated (tranparent/non-transparent mode does not
3593 	 * matter here).
3594 	 *
3595 	 * - Disable LRO
3596 	 *
3597 	 *   hn(4) will only receive broadcast packets, multicast packets,
3598 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3599 	 *   packet types.
3600 	 *
3601 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3602 	 *   all, since the LRO flush will use hn(4) as the receiving
3603 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3604 	 */
3605 	if (is_vf)
3606 		do_lro = 0;
3607 
3608 	/*
3609 	 * If VF is activated (tranparent/non-transparent mode does not
3610 	 * matter here), do _not_ mess with unsupported hash types or
3611 	 * functions.
3612 	 */
3613 	if (rxr->rsc.hash_info != NULL) {
3614 		rxr->hn_rss_pkts++;
3615 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3616 		if (!is_vf)
3617 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3618 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3619 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3620 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3621 			    rxr->hn_mbuf_hash);
3622 
3623 			/*
3624 			 * NOTE:
3625 			 * do_lro is resetted, if the hash types are not TCP
3626 			 * related.  See the comment in the above csum_flags
3627 			 * setup section.
3628 			 */
3629 			switch (type) {
3630 			case NDIS_HASH_IPV4:
3631 				hash_type = M_HASHTYPE_RSS_IPV4;
3632 				do_lro = 0;
3633 				break;
3634 
3635 			case NDIS_HASH_TCP_IPV4:
3636 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3637 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3638 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3639 
3640 					if (is_vf)
3641 						def_htype = M_HASHTYPE_NONE;
3642 
3643 					/*
3644 					 * UDP 4-tuple hash is delivered as
3645 					 * TCP 4-tuple hash.
3646 					 */
3647 					if (l3proto == ETHERTYPE_MAX) {
3648 						hn_rxpkt_proto(m_new,
3649 						    &l3proto, &l4proto);
3650 					}
3651 					if (l3proto == ETHERTYPE_IP) {
3652 						if (l4proto == IPPROTO_UDP &&
3653 						    (rxr->hn_mbuf_hash &
3654 						     NDIS_HASH_UDP_IPV4_X)) {
3655 							hash_type =
3656 							M_HASHTYPE_RSS_UDP_IPV4;
3657 							do_lro = 0;
3658 						} else if (l4proto !=
3659 						    IPPROTO_TCP) {
3660 							hash_type = def_htype;
3661 							do_lro = 0;
3662 						}
3663 					} else {
3664 						hash_type = def_htype;
3665 						do_lro = 0;
3666 					}
3667 				}
3668 				break;
3669 
3670 			case NDIS_HASH_IPV6:
3671 				hash_type = M_HASHTYPE_RSS_IPV6;
3672 				do_lro = 0;
3673 				break;
3674 
3675 			case NDIS_HASH_IPV6_EX:
3676 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3677 				do_lro = 0;
3678 				break;
3679 
3680 			case NDIS_HASH_TCP_IPV6:
3681 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3682 				break;
3683 
3684 			case NDIS_HASH_TCP_IPV6_EX:
3685 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3686 				break;
3687 			}
3688 		}
3689 	} else if (!is_vf) {
3690 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3691 		hash_type = M_HASHTYPE_OPAQUE;
3692 	}
3693 	M_HASHTYPE_SET(m_new, hash_type);
3694 
3695 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3696 	if (hn_ifp != ifp) {
3697 		const struct ether_header *eh;
3698 
3699 		/*
3700 		 * Non-transparent mode VF is activated.
3701 		 */
3702 
3703 		/*
3704 		 * Allow tapping on hn(4).
3705 		 */
3706 		ETHER_BPF_MTAP(hn_ifp, m_new);
3707 
3708 		/*
3709 		 * Update hn(4)'s stats.
3710 		 */
3711 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3712 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3713 		/* Checked at the beginning of this function. */
3714 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3715 		eh = mtod(m_new, struct ether_header *);
3716 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3717 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3718 	}
3719 	rxr->hn_pkts++;
3720 
3721 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3722 #if defined(INET) || defined(INET6)
3723 		struct lro_ctrl *lro = &rxr->hn_lro;
3724 
3725 		if (lro->lro_cnt) {
3726 			rxr->hn_lro_tried++;
3727 			if (hn_lro_rx(lro, m_new) == 0) {
3728 				/* DONE! */
3729 				return 0;
3730 			}
3731 		}
3732 #endif
3733 	}
3734 	ifp->if_input(ifp, m_new);
3735 
3736 	return (0);
3737 }
3738 
3739 static int
3740 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3741 {
3742 	struct hn_softc *sc = ifp->if_softc;
3743 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3744 	struct ifnet *vf_ifp;
3745 	int mask, error = 0;
3746 	struct ifrsskey *ifrk;
3747 	struct ifrsshash *ifrh;
3748 	uint32_t mtu;
3749 
3750 	switch (cmd) {
3751 	case SIOCSIFMTU:
3752 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3753 			error = EINVAL;
3754 			break;
3755 		}
3756 
3757 		HN_LOCK(sc);
3758 
3759 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3760 			HN_UNLOCK(sc);
3761 			break;
3762 		}
3763 
3764 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3765 			/* Can't change MTU */
3766 			HN_UNLOCK(sc);
3767 			error = EOPNOTSUPP;
3768 			break;
3769 		}
3770 
3771 		if (ifp->if_mtu == ifr->ifr_mtu) {
3772 			HN_UNLOCK(sc);
3773 			break;
3774 		}
3775 
3776 		if (hn_xpnt_vf_isready(sc)) {
3777 			vf_ifp = sc->hn_vf_ifp;
3778 			ifr_vf = *ifr;
3779 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3780 			    sizeof(ifr_vf.ifr_name));
3781 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3782 			    (caddr_t)&ifr_vf);
3783 			if (error) {
3784 				HN_UNLOCK(sc);
3785 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3786 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3787 				break;
3788 			}
3789 		}
3790 
3791 		/*
3792 		 * Suspend this interface before the synthetic parts
3793 		 * are ripped.
3794 		 */
3795 		hn_suspend(sc);
3796 
3797 		/*
3798 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3799 		 */
3800 		hn_synth_detach(sc);
3801 
3802 		/*
3803 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3804 		 * with the new MTU setting.
3805 		 */
3806 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3807 		if (error) {
3808 			HN_UNLOCK(sc);
3809 			break;
3810 		}
3811 
3812 		error = hn_rndis_get_mtu(sc, &mtu);
3813 		if (error)
3814 			mtu = ifr->ifr_mtu;
3815 		else if (bootverbose)
3816 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3817 
3818 		/*
3819 		 * Commit the requested MTU, after the synthetic parts
3820 		 * have been successfully attached.
3821 		 */
3822 		if (mtu >= ifr->ifr_mtu) {
3823 			mtu = ifr->ifr_mtu;
3824 		} else {
3825 			if_printf(ifp, "fixup mtu %d -> %u\n",
3826 			    ifr->ifr_mtu, mtu);
3827 		}
3828 		ifp->if_mtu = mtu;
3829 
3830 		/*
3831 		 * Synthetic parts' reattach may change the chimney
3832 		 * sending size; update it.
3833 		 */
3834 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3835 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3836 
3837 		/*
3838 		 * Make sure that various parameters based on MTU are
3839 		 * still valid, after the MTU change.
3840 		 */
3841 		hn_mtu_change_fixup(sc);
3842 
3843 		/*
3844 		 * All done!  Resume the interface now.
3845 		 */
3846 		hn_resume(sc);
3847 
3848 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3849 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3850 			/*
3851 			 * Since we have reattached the NVS part,
3852 			 * change the datapath to VF again; in case
3853 			 * that it is lost, after the NVS was detached.
3854 			 */
3855 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3856 		}
3857 
3858 		HN_UNLOCK(sc);
3859 		break;
3860 
3861 	case SIOCSIFFLAGS:
3862 		HN_LOCK(sc);
3863 
3864 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3865 			HN_UNLOCK(sc);
3866 			break;
3867 		}
3868 
3869 		if (hn_xpnt_vf_isready(sc))
3870 			hn_xpnt_vf_saveifflags(sc);
3871 
3872 		if (ifp->if_flags & IFF_UP) {
3873 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3874 				/*
3875 				 * Caller meight hold mutex, e.g.
3876 				 * bpf; use busy-wait for the RNDIS
3877 				 * reply.
3878 				 */
3879 				HN_NO_SLEEPING(sc);
3880 				hn_rxfilter_config(sc);
3881 				HN_SLEEPING_OK(sc);
3882 
3883 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3884 					error = hn_xpnt_vf_iocsetflags(sc);
3885 			} else {
3886 				hn_init_locked(sc);
3887 			}
3888 		} else {
3889 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3890 				hn_stop(sc, false);
3891 		}
3892 		sc->hn_if_flags = ifp->if_flags;
3893 
3894 		HN_UNLOCK(sc);
3895 		break;
3896 
3897 	case SIOCSIFCAP:
3898 		HN_LOCK(sc);
3899 
3900 		if (hn_xpnt_vf_isready(sc)) {
3901 			ifr_vf = *ifr;
3902 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3903 			    sizeof(ifr_vf.ifr_name));
3904 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3905 			HN_UNLOCK(sc);
3906 			break;
3907 		}
3908 
3909 		/*
3910 		 * Fix up requested capabilities w/ supported capabilities,
3911 		 * since the supported capabilities could have been changed.
3912 		 */
3913 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3914 		    ifp->if_capenable;
3915 
3916 		if (mask & IFCAP_TXCSUM) {
3917 			ifp->if_capenable ^= IFCAP_TXCSUM;
3918 			if (ifp->if_capenable & IFCAP_TXCSUM)
3919 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3920 			else
3921 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3922 		}
3923 		if (mask & IFCAP_TXCSUM_IPV6) {
3924 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3925 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3926 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3927 			else
3928 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3929 		}
3930 
3931 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3932 		if (mask & IFCAP_RXCSUM)
3933 			ifp->if_capenable ^= IFCAP_RXCSUM;
3934 #ifdef foo
3935 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3936 		if (mask & IFCAP_RXCSUM_IPV6)
3937 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3938 #endif
3939 
3940 		if (mask & IFCAP_LRO)
3941 			ifp->if_capenable ^= IFCAP_LRO;
3942 
3943 		if (mask & IFCAP_TSO4) {
3944 			ifp->if_capenable ^= IFCAP_TSO4;
3945 			if (ifp->if_capenable & IFCAP_TSO4)
3946 				ifp->if_hwassist |= CSUM_IP_TSO;
3947 			else
3948 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3949 		}
3950 		if (mask & IFCAP_TSO6) {
3951 			ifp->if_capenable ^= IFCAP_TSO6;
3952 			if (ifp->if_capenable & IFCAP_TSO6)
3953 				ifp->if_hwassist |= CSUM_IP6_TSO;
3954 			else
3955 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3956 		}
3957 
3958 		HN_UNLOCK(sc);
3959 		break;
3960 
3961 	case SIOCADDMULTI:
3962 	case SIOCDELMULTI:
3963 		HN_LOCK(sc);
3964 
3965 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3966 			HN_UNLOCK(sc);
3967 			break;
3968 		}
3969 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3970 			/*
3971 			 * Multicast uses mutex; use busy-wait for
3972 			 * the RNDIS reply.
3973 			 */
3974 			HN_NO_SLEEPING(sc);
3975 			hn_rxfilter_config(sc);
3976 			HN_SLEEPING_OK(sc);
3977 		}
3978 
3979 		/* XXX vlan(4) style mcast addr maintenance */
3980 		if (hn_xpnt_vf_isready(sc)) {
3981 			int old_if_flags;
3982 
3983 			old_if_flags = sc->hn_vf_ifp->if_flags;
3984 			hn_xpnt_vf_saveifflags(sc);
3985 
3986 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3987 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3988 			     IFF_ALLMULTI))
3989 				error = hn_xpnt_vf_iocsetflags(sc);
3990 		}
3991 
3992 		HN_UNLOCK(sc);
3993 		break;
3994 
3995 	case SIOCSIFMEDIA:
3996 	case SIOCGIFMEDIA:
3997 		HN_LOCK(sc);
3998 		if (hn_xpnt_vf_isready(sc)) {
3999 			/*
4000 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4001 			 * create and pass ifr_vf to the VF here; just
4002 			 * replace the ifr_name.
4003 			 */
4004 			vf_ifp = sc->hn_vf_ifp;
4005 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4006 			    sizeof(ifr->ifr_name));
4007 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4008 			/* Restore the ifr_name. */
4009 			strlcpy(ifr->ifr_name, ifp->if_xname,
4010 			    sizeof(ifr->ifr_name));
4011 			HN_UNLOCK(sc);
4012 			break;
4013 		}
4014 		HN_UNLOCK(sc);
4015 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4016 		break;
4017 
4018 	case SIOCGIFRSSHASH:
4019 		ifrh = (struct ifrsshash *)data;
4020 		HN_LOCK(sc);
4021 		if (sc->hn_rx_ring_inuse == 1) {
4022 			HN_UNLOCK(sc);
4023 			ifrh->ifrh_func = RSS_FUNC_NONE;
4024 			ifrh->ifrh_types = 0;
4025 			break;
4026 		}
4027 
4028 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4029 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4030 		else
4031 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4032 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4033 		HN_UNLOCK(sc);
4034 		break;
4035 
4036 	case SIOCGIFRSSKEY:
4037 		ifrk = (struct ifrsskey *)data;
4038 		HN_LOCK(sc);
4039 		if (sc->hn_rx_ring_inuse == 1) {
4040 			HN_UNLOCK(sc);
4041 			ifrk->ifrk_func = RSS_FUNC_NONE;
4042 			ifrk->ifrk_keylen = 0;
4043 			break;
4044 		}
4045 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4046 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4047 		else
4048 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4049 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4050 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4051 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4052 		HN_UNLOCK(sc);
4053 		break;
4054 
4055 	default:
4056 		error = ether_ioctl(ifp, cmd, data);
4057 		break;
4058 	}
4059 	return (error);
4060 }
4061 
4062 static void
4063 hn_stop(struct hn_softc *sc, bool detaching)
4064 {
4065 	struct ifnet *ifp = sc->hn_ifp;
4066 	int i;
4067 
4068 	HN_LOCK_ASSERT(sc);
4069 
4070 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4071 	    ("synthetic parts were not attached"));
4072 
4073 	/* Clear RUNNING bit ASAP. */
4074 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4075 
4076 	/* Disable polling. */
4077 	hn_polling(sc, 0);
4078 
4079 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4080 		KASSERT(sc->hn_vf_ifp != NULL,
4081 		    ("%s: VF is not attached", ifp->if_xname));
4082 
4083 		/* Mark transparent mode VF as disabled. */
4084 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4085 
4086 		/*
4087 		 * NOTE:
4088 		 * Datapath setting must happen _before_ bringing
4089 		 * the VF down.
4090 		 */
4091 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4092 
4093 		/*
4094 		 * Bring the VF down.
4095 		 */
4096 		hn_xpnt_vf_saveifflags(sc);
4097 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4098 		hn_xpnt_vf_iocsetflags(sc);
4099 	}
4100 
4101 	/* Suspend data transfers. */
4102 	hn_suspend_data(sc);
4103 
4104 	/* Clear OACTIVE bit. */
4105 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4106 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4107 		sc->hn_tx_ring[i].hn_oactive = 0;
4108 
4109 	/*
4110 	 * If the non-transparent mode VF is active, make sure
4111 	 * that the RX filter still allows packet reception.
4112 	 */
4113 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4114 		hn_rxfilter_config(sc);
4115 }
4116 
4117 static void
4118 hn_init_locked(struct hn_softc *sc)
4119 {
4120 	struct ifnet *ifp = sc->hn_ifp;
4121 	int i;
4122 
4123 	HN_LOCK_ASSERT(sc);
4124 
4125 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4126 		return;
4127 
4128 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4129 		return;
4130 
4131 	/* Configure RX filter */
4132 	hn_rxfilter_config(sc);
4133 
4134 	/* Clear OACTIVE bit. */
4135 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4136 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4137 		sc->hn_tx_ring[i].hn_oactive = 0;
4138 
4139 	/* Clear TX 'suspended' bit. */
4140 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4141 
4142 	if (hn_xpnt_vf_isready(sc)) {
4143 		/* Initialize transparent VF. */
4144 		hn_xpnt_vf_init(sc);
4145 	}
4146 
4147 	/* Everything is ready; unleash! */
4148 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4149 
4150 	/* Re-enable polling if requested. */
4151 	if (sc->hn_pollhz > 0)
4152 		hn_polling(sc, sc->hn_pollhz);
4153 }
4154 
/*
 * Locked wrapper around hn_init_locked(); 'xsc' is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *sc;

	sc = xsc;
	HN_LOCK(sc);
	hn_init_locked(sc);
	HN_UNLOCK(sc);
}
4164 
4165 #if __FreeBSD_version >= 1100099
4166 
4167 static int
4168 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4169 {
4170 	struct hn_softc *sc = arg1;
4171 	unsigned int lenlim;
4172 	int error;
4173 
4174 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4175 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4176 	if (error || req->newptr == NULL)
4177 		return error;
4178 
4179 	HN_LOCK(sc);
4180 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4181 	    lenlim > TCP_LRO_LENGTH_MAX) {
4182 		HN_UNLOCK(sc);
4183 		return EINVAL;
4184 	}
4185 	hn_set_lro_lenlim(sc, lenlim);
4186 	HN_UNLOCK(sc);
4187 
4188 	return 0;
4189 }
4190 
4191 static int
4192 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4193 {
4194 	struct hn_softc *sc = arg1;
4195 	int ackcnt, error, i;
4196 
4197 	/*
4198 	 * lro_ackcnt_lim is append count limit,
4199 	 * +1 to turn it into aggregation limit.
4200 	 */
4201 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4202 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4203 	if (error || req->newptr == NULL)
4204 		return error;
4205 
4206 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4207 		return EINVAL;
4208 
4209 	/*
4210 	 * Convert aggregation limit back to append
4211 	 * count limit.
4212 	 */
4213 	--ackcnt;
4214 	HN_LOCK(sc);
4215 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4216 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4217 	HN_UNLOCK(sc);
4218 	return 0;
4219 }
4220 
4221 #endif
4222 
4223 static int
4224 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4225 {
4226 	struct hn_softc *sc = arg1;
4227 	int hcsum = arg2;
4228 	int on, error, i;
4229 
4230 	on = 0;
4231 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4232 		on = 1;
4233 
4234 	error = sysctl_handle_int(oidp, &on, 0, req);
4235 	if (error || req->newptr == NULL)
4236 		return error;
4237 
4238 	HN_LOCK(sc);
4239 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4240 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4241 
4242 		if (on)
4243 			rxr->hn_trust_hcsum |= hcsum;
4244 		else
4245 			rxr->hn_trust_hcsum &= ~hcsum;
4246 	}
4247 	HN_UNLOCK(sc);
4248 	return 0;
4249 }
4250 
4251 static int
4252 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4253 {
4254 	struct hn_softc *sc = arg1;
4255 	int chim_size, error;
4256 
4257 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4258 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4259 	if (error || req->newptr == NULL)
4260 		return error;
4261 
4262 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4263 		return EINVAL;
4264 
4265 	HN_LOCK(sc);
4266 	hn_set_chim_size(sc, chim_size);
4267 	HN_UNLOCK(sc);
4268 	return 0;
4269 }
4270 
4271 #if __FreeBSD_version < 1100095
4272 static int
4273 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4274 {
4275 	struct hn_softc *sc = arg1;
4276 	int ofs = arg2, i, error;
4277 	struct hn_rx_ring *rxr;
4278 	uint64_t stat;
4279 
4280 	stat = 0;
4281 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4282 		rxr = &sc->hn_rx_ring[i];
4283 		stat += *((int *)((uint8_t *)rxr + ofs));
4284 	}
4285 
4286 	error = sysctl_handle_64(oidp, &stat, 0, req);
4287 	if (error || req->newptr == NULL)
4288 		return error;
4289 
4290 	/* Zero out this stat. */
4291 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4292 		rxr = &sc->hn_rx_ring[i];
4293 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4294 	}
4295 	return 0;
4296 }
4297 #else
4298 static int
4299 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4300 {
4301 	struct hn_softc *sc = arg1;
4302 	int ofs = arg2, i, error;
4303 	struct hn_rx_ring *rxr;
4304 	uint64_t stat;
4305 
4306 	stat = 0;
4307 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4308 		rxr = &sc->hn_rx_ring[i];
4309 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4310 	}
4311 
4312 	error = sysctl_handle_64(oidp, &stat, 0, req);
4313 	if (error || req->newptr == NULL)
4314 		return error;
4315 
4316 	/* Zero out this stat. */
4317 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4318 		rxr = &sc->hn_rx_ring[i];
4319 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4320 	}
4321 	return 0;
4322 }
4323 
4324 #endif
4325 
4326 static int
4327 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4328 {
4329 	struct hn_softc *sc = arg1;
4330 	int ofs = arg2, i, error;
4331 	struct hn_rx_ring *rxr;
4332 	u_long stat;
4333 
4334 	stat = 0;
4335 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4336 		rxr = &sc->hn_rx_ring[i];
4337 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4338 	}
4339 
4340 	error = sysctl_handle_long(oidp, &stat, 0, req);
4341 	if (error || req->newptr == NULL)
4342 		return error;
4343 
4344 	/* Zero out this stat. */
4345 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4346 		rxr = &sc->hn_rx_ring[i];
4347 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4348 	}
4349 	return 0;
4350 }
4351 
4352 static int
4353 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4354 {
4355 	struct hn_softc *sc = arg1;
4356 	int ofs = arg2, i, error;
4357 	struct hn_tx_ring *txr;
4358 	u_long stat;
4359 
4360 	stat = 0;
4361 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4362 		txr = &sc->hn_tx_ring[i];
4363 		stat += *((u_long *)((uint8_t *)txr + ofs));
4364 	}
4365 
4366 	error = sysctl_handle_long(oidp, &stat, 0, req);
4367 	if (error || req->newptr == NULL)
4368 		return error;
4369 
4370 	/* Zero out this stat. */
4371 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4372 		txr = &sc->hn_tx_ring[i];
4373 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4374 	}
4375 	return 0;
4376 }
4377 
4378 static int
4379 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4380 {
4381 	struct hn_softc *sc = arg1;
4382 	int ofs = arg2, i, error, conf;
4383 	struct hn_tx_ring *txr;
4384 
4385 	txr = &sc->hn_tx_ring[0];
4386 	conf = *((int *)((uint8_t *)txr + ofs));
4387 
4388 	error = sysctl_handle_int(oidp, &conf, 0, req);
4389 	if (error || req->newptr == NULL)
4390 		return error;
4391 
4392 	HN_LOCK(sc);
4393 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4394 		txr = &sc->hn_tx_ring[i];
4395 		*((int *)((uint8_t *)txr + ofs)) = conf;
4396 	}
4397 	HN_UNLOCK(sc);
4398 
4399 	return 0;
4400 }
4401 
4402 static int
4403 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4404 {
4405 	struct hn_softc *sc = arg1;
4406 	int error, size;
4407 
4408 	size = sc->hn_agg_size;
4409 	error = sysctl_handle_int(oidp, &size, 0, req);
4410 	if (error || req->newptr == NULL)
4411 		return (error);
4412 
4413 	HN_LOCK(sc);
4414 	sc->hn_agg_size = size;
4415 	hn_set_txagg(sc);
4416 	HN_UNLOCK(sc);
4417 
4418 	return (0);
4419 }
4420 
4421 static int
4422 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4423 {
4424 	struct hn_softc *sc = arg1;
4425 	int error, pkts;
4426 
4427 	pkts = sc->hn_agg_pkts;
4428 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4429 	if (error || req->newptr == NULL)
4430 		return (error);
4431 
4432 	HN_LOCK(sc);
4433 	sc->hn_agg_pkts = pkts;
4434 	hn_set_txagg(sc);
4435 	HN_UNLOCK(sc);
4436 
4437 	return (0);
4438 }
4439 
4440 static int
4441 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4442 {
4443 	struct hn_softc *sc = arg1;
4444 	int pkts;
4445 
4446 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4447 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4448 }
4449 
4450 static int
4451 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4452 {
4453 	struct hn_softc *sc = arg1;
4454 	int align;
4455 
4456 	align = sc->hn_tx_ring[0].hn_agg_align;
4457 	return (sysctl_handle_int(oidp, &align, 0, req));
4458 }
4459 
4460 static void
4461 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4462 {
4463 	if (pollhz == 0)
4464 		vmbus_chan_poll_disable(chan);
4465 	else
4466 		vmbus_chan_poll_enable(chan, pollhz);
4467 }
4468 
4469 static void
4470 hn_polling(struct hn_softc *sc, u_int pollhz)
4471 {
4472 	int nsubch = sc->hn_rx_ring_inuse - 1;
4473 
4474 	HN_LOCK_ASSERT(sc);
4475 
4476 	if (nsubch > 0) {
4477 		struct vmbus_channel **subch;
4478 		int i;
4479 
4480 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4481 		for (i = 0; i < nsubch; ++i)
4482 			hn_chan_polling(subch[i], pollhz);
4483 		vmbus_subchan_rel(subch, nsubch);
4484 	}
4485 	hn_chan_polling(sc->hn_prichan, pollhz);
4486 }
4487 
4488 static int
4489 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4490 {
4491 	struct hn_softc *sc = arg1;
4492 	int pollhz, error;
4493 
4494 	pollhz = sc->hn_pollhz;
4495 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4496 	if (error || req->newptr == NULL)
4497 		return (error);
4498 
4499 	if (pollhz != 0 &&
4500 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4501 		return (EINVAL);
4502 
4503 	HN_LOCK(sc);
4504 	if (sc->hn_pollhz != pollhz) {
4505 		sc->hn_pollhz = pollhz;
4506 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4507 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4508 			hn_polling(sc, sc->hn_pollhz);
4509 	}
4510 	HN_UNLOCK(sc);
4511 
4512 	return (0);
4513 }
4514 
4515 static int
4516 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4517 {
4518 	struct hn_softc *sc = arg1;
4519 	char verstr[16];
4520 
4521 	snprintf(verstr, sizeof(verstr), "%u.%u",
4522 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4523 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4524 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4525 }
4526 
4527 static int
4528 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4529 {
4530 	struct hn_softc *sc = arg1;
4531 	char caps_str[128];
4532 	uint32_t caps;
4533 
4534 	HN_LOCK(sc);
4535 	caps = sc->hn_caps;
4536 	HN_UNLOCK(sc);
4537 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4538 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4539 }
4540 
4541 static int
4542 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4543 {
4544 	struct hn_softc *sc = arg1;
4545 	char assist_str[128];
4546 	uint32_t hwassist;
4547 
4548 	HN_LOCK(sc);
4549 	hwassist = sc->hn_ifp->if_hwassist;
4550 	HN_UNLOCK(sc);
4551 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4552 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4553 }
4554 
4555 static int
4556 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4557 {
4558 	struct hn_softc *sc = arg1;
4559 	char filter_str[128];
4560 	uint32_t filter;
4561 
4562 	HN_LOCK(sc);
4563 	filter = sc->hn_rx_filter;
4564 	HN_UNLOCK(sc);
4565 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4566 	    NDIS_PACKET_TYPES);
4567 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4568 }
4569 
4570 #ifndef RSS
4571 
4572 static int
4573 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4574 {
4575 	struct hn_softc *sc = arg1;
4576 	int error;
4577 
4578 	HN_LOCK(sc);
4579 
4580 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4581 	if (error || req->newptr == NULL)
4582 		goto back;
4583 
4584 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4585 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4586 		/*
4587 		 * RSS key is synchronized w/ VF's, don't allow users
4588 		 * to change it.
4589 		 */
4590 		error = EBUSY;
4591 		goto back;
4592 	}
4593 
4594 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4595 	if (error)
4596 		goto back;
4597 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4598 
4599 	if (sc->hn_rx_ring_inuse > 1) {
4600 		error = hn_rss_reconfig(sc);
4601 	} else {
4602 		/* Not RSS capable, at least for now; just save the RSS key. */
4603 		error = 0;
4604 	}
4605 back:
4606 	HN_UNLOCK(sc);
4607 	return (error);
4608 }
4609 
4610 static int
4611 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4612 {
4613 	struct hn_softc *sc = arg1;
4614 	int error;
4615 
4616 	HN_LOCK(sc);
4617 
4618 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4619 	if (error || req->newptr == NULL)
4620 		goto back;
4621 
4622 	/*
4623 	 * Don't allow RSS indirect table change, if this interface is not
4624 	 * RSS capable currently.
4625 	 */
4626 	if (sc->hn_rx_ring_inuse == 1) {
4627 		error = EOPNOTSUPP;
4628 		goto back;
4629 	}
4630 
4631 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4632 	if (error)
4633 		goto back;
4634 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4635 
4636 	hn_rss_ind_fixup(sc);
4637 	error = hn_rss_reconfig(sc);
4638 back:
4639 	HN_UNLOCK(sc);
4640 	return (error);
4641 }
4642 
4643 #endif	/* !RSS */
4644 
4645 static int
4646 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4647 {
4648 	struct hn_softc *sc = arg1;
4649 	char hash_str[128];
4650 	uint32_t hash;
4651 
4652 	HN_LOCK(sc);
4653 	hash = sc->hn_rss_hash;
4654 	HN_UNLOCK(sc);
4655 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4656 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4657 }
4658 
4659 static int
4660 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4661 {
4662 	struct hn_softc *sc = arg1;
4663 	char hash_str[128];
4664 	uint32_t hash;
4665 
4666 	HN_LOCK(sc);
4667 	hash = sc->hn_rss_hcap;
4668 	HN_UNLOCK(sc);
4669 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4670 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4671 }
4672 
4673 static int
4674 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4675 {
4676 	struct hn_softc *sc = arg1;
4677 	char hash_str[128];
4678 	uint32_t hash;
4679 
4680 	HN_LOCK(sc);
4681 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4682 	HN_UNLOCK(sc);
4683 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4684 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4685 }
4686 
4687 static int
4688 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4689 {
4690 	struct hn_softc *sc = arg1;
4691 	char vf_name[IFNAMSIZ + 1];
4692 	struct ifnet *vf_ifp;
4693 
4694 	HN_LOCK(sc);
4695 	vf_name[0] = '\0';
4696 	vf_ifp = sc->hn_vf_ifp;
4697 	if (vf_ifp != NULL)
4698 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4699 	HN_UNLOCK(sc);
4700 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4701 }
4702 
4703 static int
4704 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4705 {
4706 	struct hn_softc *sc = arg1;
4707 	char vf_name[IFNAMSIZ + 1];
4708 	struct ifnet *vf_ifp;
4709 
4710 	HN_LOCK(sc);
4711 	vf_name[0] = '\0';
4712 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4713 	if (vf_ifp != NULL)
4714 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4715 	HN_UNLOCK(sc);
4716 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4717 }
4718 
4719 static int
4720 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4721 {
4722 	struct rm_priotracker pt;
4723 	struct sbuf *sb;
4724 	int error, i;
4725 	bool first;
4726 
4727 	error = sysctl_wire_old_buffer(req, 0);
4728 	if (error != 0)
4729 		return (error);
4730 
4731 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4732 	if (sb == NULL)
4733 		return (ENOMEM);
4734 
4735 	rm_rlock(&hn_vfmap_lock, &pt);
4736 
4737 	first = true;
4738 	for (i = 0; i < hn_vfmap_size; ++i) {
4739 		struct ifnet *ifp;
4740 
4741 		if (hn_vfmap[i] == NULL)
4742 			continue;
4743 
4744 		ifp = ifnet_byindex(i);
4745 		if (ifp != NULL) {
4746 			if (first)
4747 				sbuf_printf(sb, "%s", ifp->if_xname);
4748 			else
4749 				sbuf_printf(sb, " %s", ifp->if_xname);
4750 			first = false;
4751 		}
4752 	}
4753 
4754 	rm_runlock(&hn_vfmap_lock, &pt);
4755 
4756 	error = sbuf_finish(sb);
4757 	sbuf_delete(sb);
4758 	return (error);
4759 }
4760 
4761 static int
4762 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4763 {
4764 	struct rm_priotracker pt;
4765 	struct sbuf *sb;
4766 	int error, i;
4767 	bool first;
4768 
4769 	error = sysctl_wire_old_buffer(req, 0);
4770 	if (error != 0)
4771 		return (error);
4772 
4773 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4774 	if (sb == NULL)
4775 		return (ENOMEM);
4776 
4777 	rm_rlock(&hn_vfmap_lock, &pt);
4778 
4779 	first = true;
4780 	for (i = 0; i < hn_vfmap_size; ++i) {
4781 		struct ifnet *ifp, *hn_ifp;
4782 
4783 		hn_ifp = hn_vfmap[i];
4784 		if (hn_ifp == NULL)
4785 			continue;
4786 
4787 		ifp = ifnet_byindex(i);
4788 		if (ifp != NULL) {
4789 			if (first) {
4790 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4791 				    hn_ifp->if_xname);
4792 			} else {
4793 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4794 				    hn_ifp->if_xname);
4795 			}
4796 			first = false;
4797 		}
4798 	}
4799 
4800 	rm_runlock(&hn_vfmap_lock, &pt);
4801 
4802 	error = sbuf_finish(sb);
4803 	sbuf_delete(sb);
4804 	return (error);
4805 }
4806 
4807 static int
4808 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4809 {
4810 	struct hn_softc *sc = arg1;
4811 	int error, onoff = 0;
4812 
4813 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4814 		onoff = 1;
4815 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4816 	if (error || req->newptr == NULL)
4817 		return (error);
4818 
4819 	HN_LOCK(sc);
4820 	/* NOTE: hn_vf_lock for hn_transmit() */
4821 	rm_wlock(&sc->hn_vf_lock);
4822 	if (onoff)
4823 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4824 	else
4825 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4826 	rm_wunlock(&sc->hn_vf_lock);
4827 	HN_UNLOCK(sc);
4828 
4829 	return (0);
4830 }
4831 
4832 static int
4833 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4834 {
4835 	struct hn_softc *sc = arg1;
4836 	int enabled = 0;
4837 
4838 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4839 		enabled = 1;
4840 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4841 }
4842 
4843 static int
4844 hn_check_iplen(const struct mbuf *m, int hoff)
4845 {
4846 	const struct ip *ip;
4847 	int len, iphlen, iplen;
4848 	const struct tcphdr *th;
4849 	int thoff;				/* TCP data offset */
4850 
4851 	len = hoff + sizeof(struct ip);
4852 
4853 	/* The packet must be at least the size of an IP header. */
4854 	if (m->m_pkthdr.len < len)
4855 		return IPPROTO_DONE;
4856 
4857 	/* The fixed IP header must reside completely in the first mbuf. */
4858 	if (m->m_len < len)
4859 		return IPPROTO_DONE;
4860 
4861 	ip = mtodo(m, hoff);
4862 
4863 	/* Bound check the packet's stated IP header length. */
4864 	iphlen = ip->ip_hl << 2;
4865 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4866 		return IPPROTO_DONE;
4867 
4868 	/* The full IP header must reside completely in the one mbuf. */
4869 	if (m->m_len < hoff + iphlen)
4870 		return IPPROTO_DONE;
4871 
4872 	iplen = ntohs(ip->ip_len);
4873 
4874 	/*
4875 	 * Check that the amount of data in the buffers is as
4876 	 * at least much as the IP header would have us expect.
4877 	 */
4878 	if (m->m_pkthdr.len < hoff + iplen)
4879 		return IPPROTO_DONE;
4880 
4881 	/*
4882 	 * Ignore IP fragments.
4883 	 */
4884 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4885 		return IPPROTO_DONE;
4886 
4887 	/*
4888 	 * The TCP/IP or UDP/IP header must be entirely contained within
4889 	 * the first fragment of a packet.
4890 	 */
4891 	switch (ip->ip_p) {
4892 	case IPPROTO_TCP:
4893 		if (iplen < iphlen + sizeof(struct tcphdr))
4894 			return IPPROTO_DONE;
4895 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4896 			return IPPROTO_DONE;
4897 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4898 		thoff = th->th_off << 2;
4899 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4900 			return IPPROTO_DONE;
4901 		if (m->m_len < hoff + iphlen + thoff)
4902 			return IPPROTO_DONE;
4903 		break;
4904 	case IPPROTO_UDP:
4905 		if (iplen < iphlen + sizeof(struct udphdr))
4906 			return IPPROTO_DONE;
4907 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4908 			return IPPROTO_DONE;
4909 		break;
4910 	default:
4911 		if (iplen < iphlen)
4912 			return IPPROTO_DONE;
4913 		break;
4914 	}
4915 	return ip->ip_p;
4916 }
4917 
4918 static void
4919 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4920 {
4921 	const struct ether_header *eh;
4922 	uint16_t etype;
4923 	int hoff;
4924 
4925 	hoff = sizeof(*eh);
4926 	/* Checked at the beginning of this function. */
4927 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4928 
4929 	eh = mtod(m_new, const struct ether_header *);
4930 	etype = ntohs(eh->ether_type);
4931 	if (etype == ETHERTYPE_VLAN) {
4932 		const struct ether_vlan_header *evl;
4933 
4934 		hoff = sizeof(*evl);
4935 		if (m_new->m_len < hoff)
4936 			return;
4937 		evl = mtod(m_new, const struct ether_vlan_header *);
4938 		etype = ntohs(evl->evl_proto);
4939 	}
4940 	*l3proto = etype;
4941 
4942 	if (etype == ETHERTYPE_IP)
4943 		*l4proto = hn_check_iplen(m_new, hoff);
4944 	else
4945 		*l4proto = IPPROTO_DONE;
4946 }
4947 
4948 static int
4949 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4950 {
4951 	struct sysctl_oid_list *child;
4952 	struct sysctl_ctx_list *ctx;
4953 	device_t dev = sc->hn_dev;
4954 #if defined(INET) || defined(INET6)
4955 #if __FreeBSD_version >= 1100095
4956 	int lroent_cnt;
4957 #endif
4958 #endif
4959 	int i;
4960 
4961 	/*
4962 	 * Create RXBUF for reception.
4963 	 *
4964 	 * NOTE:
4965 	 * - It is shared by all channels.
4966 	 * - A large enough buffer is allocated, certain version of NVSes
4967 	 *   may further limit the usable space.
4968 	 */
4969 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4970 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4971 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4972 	if (sc->hn_rxbuf == NULL) {
4973 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4974 		return (ENOMEM);
4975 	}
4976 
4977 	sc->hn_rx_ring_cnt = ring_cnt;
4978 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4979 
4980 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4981 	    M_DEVBUF, M_WAITOK | M_ZERO);
4982 
4983 #if defined(INET) || defined(INET6)
4984 #if __FreeBSD_version >= 1100095
4985 	lroent_cnt = hn_lro_entry_count;
4986 	if (lroent_cnt < TCP_LRO_ENTRIES)
4987 		lroent_cnt = TCP_LRO_ENTRIES;
4988 	if (bootverbose)
4989 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4990 #endif
4991 #endif	/* INET || INET6 */
4992 
4993 	ctx = device_get_sysctl_ctx(dev);
4994 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4995 
4996 	/* Create dev.hn.UNIT.rx sysctl tree */
4997 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4998 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4999 
5000 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5001 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5002 
5003 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5004 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5005 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5006 		if (rxr->hn_br == NULL) {
5007 			device_printf(dev, "allocate bufring failed\n");
5008 			return (ENOMEM);
5009 		}
5010 
5011 		if (hn_trust_hosttcp)
5012 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5013 		if (hn_trust_hostudp)
5014 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5015 		if (hn_trust_hostip)
5016 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5017 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5018 		rxr->hn_ifp = sc->hn_ifp;
5019 		if (i < sc->hn_tx_ring_cnt)
5020 			rxr->hn_txr = &sc->hn_tx_ring[i];
5021 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5022 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5023 		rxr->hn_rx_idx = i;
5024 		rxr->hn_rxbuf = sc->hn_rxbuf;
5025 
5026 		/*
5027 		 * Initialize LRO.
5028 		 */
5029 #if defined(INET) || defined(INET6)
5030 #if __FreeBSD_version >= 1100095
5031 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5032 		    hn_lro_mbufq_depth);
5033 #else
5034 		tcp_lro_init(&rxr->hn_lro);
5035 		rxr->hn_lro.ifp = sc->hn_ifp;
5036 #endif
5037 #if __FreeBSD_version >= 1100099
5038 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5039 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5040 #endif
5041 #endif	/* INET || INET6 */
5042 
5043 		if (sc->hn_rx_sysctl_tree != NULL) {
5044 			char name[16];
5045 
5046 			/*
5047 			 * Create per RX ring sysctl tree:
5048 			 * dev.hn.UNIT.rx.RINGID
5049 			 */
5050 			snprintf(name, sizeof(name), "%d", i);
5051 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5052 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5053 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5054 
5055 			if (rxr->hn_rx_sysctl_tree != NULL) {
5056 				SYSCTL_ADD_ULONG(ctx,
5057 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5058 				    OID_AUTO, "packets", CTLFLAG_RW,
5059 				    &rxr->hn_pkts, "# of packets received");
5060 				SYSCTL_ADD_ULONG(ctx,
5061 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5062 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5063 				    &rxr->hn_rss_pkts,
5064 				    "# of packets w/ RSS info received");
5065 				SYSCTL_ADD_ULONG(ctx,
5066 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5067 				    OID_AUTO, "rsc_pkts", CTLFLAG_RW,
5068 				    &rxr->hn_rsc_pkts,
5069 				    "# of RSC packets received");
5070 				SYSCTL_ADD_ULONG(ctx,
5071 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5072 				    OID_AUTO, "rsc_drop", CTLFLAG_RW,
5073 				    &rxr->hn_rsc_drop,
5074 				    "# of RSC fragments dropped");
5075 				SYSCTL_ADD_INT(ctx,
5076 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5077 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5078 				    &rxr->hn_pktbuf_len, 0,
5079 				    "Temporary channel packet buffer length");
5080 			}
5081 		}
5082 	}
5083 
5084 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5085 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5086 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5087 #if __FreeBSD_version < 1100095
5088 	    hn_rx_stat_int_sysctl,
5089 #else
5090 	    hn_rx_stat_u64_sysctl,
5091 #endif
5092 	    "LU", "LRO queued");
5093 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5094 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5096 #if __FreeBSD_version < 1100095
5097 	    hn_rx_stat_int_sysctl,
5098 #else
5099 	    hn_rx_stat_u64_sysctl,
5100 #endif
5101 	    "LU", "LRO flushed");
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5103 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5104 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5105 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5106 #if __FreeBSD_version >= 1100099
5107 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5108 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5109 	    hn_lro_lenlim_sysctl, "IU",
5110 	    "Max # of data bytes to be aggregated by LRO");
5111 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5112 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5113 	    hn_lro_ackcnt_sysctl, "I",
5114 	    "Max # of ACKs to be aggregated by LRO");
5115 #endif
5116 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5117 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5118 	    hn_trust_hcsum_sysctl, "I",
5119 	    "Trust tcp segement verification on host side, "
5120 	    "when csum info is missing");
5121 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5122 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5123 	    hn_trust_hcsum_sysctl, "I",
5124 	    "Trust udp datagram verification on host side, "
5125 	    "when csum info is missing");
5126 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5127 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5128 	    hn_trust_hcsum_sysctl, "I",
5129 	    "Trust ip packet verification on host side, "
5130 	    "when csum info is missing");
5131 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5132 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5133 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5134 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5135 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5136 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5137 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5138 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5139 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5140 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5141 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5142 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5143 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5144 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5145 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5146 	    hn_rx_stat_ulong_sysctl, "LU",
5147 	    "# of packets that we trust host's csum verification");
5148 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5149 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5150 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5151 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5152 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5153 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5154 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5155 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5156 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5157 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5158 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5159 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5160 
5161 	return (0);
5162 }
5163 
5164 static void
5165 hn_destroy_rx_data(struct hn_softc *sc)
5166 {
5167 	int i;
5168 
5169 	if (sc->hn_rxbuf != NULL) {
5170 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5171 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5172 		else
5173 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5174 		sc->hn_rxbuf = NULL;
5175 	}
5176 
5177 	if (sc->hn_rx_ring_cnt == 0)
5178 		return;
5179 
5180 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5181 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5182 
5183 		if (rxr->hn_br == NULL)
5184 			continue;
5185 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5186 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5187 		} else {
5188 			device_printf(sc->hn_dev,
5189 			    "%dth channel bufring is referenced", i);
5190 		}
5191 		rxr->hn_br = NULL;
5192 
5193 #if defined(INET) || defined(INET6)
5194 		tcp_lro_free(&rxr->hn_lro);
5195 #endif
5196 		free(rxr->hn_pktbuf, M_DEVBUF);
5197 	}
5198 	free(sc->hn_rx_ring, M_DEVBUF);
5199 	sc->hn_rx_ring = NULL;
5200 
5201 	sc->hn_rx_ring_cnt = 0;
5202 	sc->hn_rx_ring_inuse = 0;
5203 }
5204 
5205 static int
5206 hn_tx_ring_create(struct hn_softc *sc, int id)
5207 {
5208 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5209 	device_t dev = sc->hn_dev;
5210 	bus_dma_tag_t parent_dtag;
5211 	int error, i;
5212 
5213 	txr->hn_sc = sc;
5214 	txr->hn_tx_idx = id;
5215 
5216 #ifndef HN_USE_TXDESC_BUFRING
5217 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5218 #endif
5219 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5220 
5221 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5222 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5223 	    M_DEVBUF, M_WAITOK | M_ZERO);
5224 #ifndef HN_USE_TXDESC_BUFRING
5225 	SLIST_INIT(&txr->hn_txlist);
5226 #else
5227 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5228 	    M_WAITOK, &txr->hn_tx_lock);
5229 #endif
5230 
5231 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5232 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5233 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5234 	} else {
5235 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5236 	}
5237 
5238 #ifdef HN_IFSTART_SUPPORT
5239 	if (hn_use_if_start) {
5240 		txr->hn_txeof = hn_start_txeof;
5241 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5242 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5243 	} else
5244 #endif
5245 	{
5246 		int br_depth;
5247 
5248 		txr->hn_txeof = hn_xmit_txeof;
5249 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5250 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5251 
5252 		br_depth = hn_get_txswq_depth(txr);
5253 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5254 		    M_WAITOK, &txr->hn_tx_lock);
5255 	}
5256 
5257 	txr->hn_direct_tx_size = hn_direct_tx_size;
5258 
5259 	/*
5260 	 * Always schedule transmission instead of trying to do direct
5261 	 * transmission.  This one gives the best performance so far.
5262 	 */
5263 	txr->hn_sched_tx = 1;
5264 
5265 	parent_dtag = bus_get_dma_tag(dev);
5266 
5267 	/* DMA tag for RNDIS packet messages. */
5268 	error = bus_dma_tag_create(parent_dtag, /* parent */
5269 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5270 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5271 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5272 	    BUS_SPACE_MAXADDR,		/* highaddr */
5273 	    NULL, NULL,			/* filter, filterarg */
5274 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5275 	    1,				/* nsegments */
5276 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5277 	    0,				/* flags */
5278 	    NULL,			/* lockfunc */
5279 	    NULL,			/* lockfuncarg */
5280 	    &txr->hn_tx_rndis_dtag);
5281 	if (error) {
5282 		device_printf(dev, "failed to create rndis dmatag\n");
5283 		return error;
5284 	}
5285 
5286 	/* DMA tag for data. */
5287 	error = bus_dma_tag_create(parent_dtag, /* parent */
5288 	    1,				/* alignment */
5289 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5290 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5291 	    BUS_SPACE_MAXADDR,		/* highaddr */
5292 	    NULL, NULL,			/* filter, filterarg */
5293 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5294 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5295 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5296 	    0,				/* flags */
5297 	    NULL,			/* lockfunc */
5298 	    NULL,			/* lockfuncarg */
5299 	    &txr->hn_tx_data_dtag);
5300 	if (error) {
5301 		device_printf(dev, "failed to create data dmatag\n");
5302 		return error;
5303 	}
5304 
5305 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5306 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5307 
5308 		txd->txr = txr;
5309 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5310 		STAILQ_INIT(&txd->agg_list);
5311 
5312 		/*
5313 		 * Allocate and load RNDIS packet message.
5314 		 */
5315         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5316 		    (void **)&txd->rndis_pkt,
5317 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5318 		    &txd->rndis_pkt_dmap);
5319 		if (error) {
5320 			device_printf(dev,
5321 			    "failed to allocate rndis_packet_msg, %d\n", i);
5322 			return error;
5323 		}
5324 
5325 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5326 		    txd->rndis_pkt_dmap,
5327 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5328 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5329 		    BUS_DMA_NOWAIT);
5330 		if (error) {
5331 			device_printf(dev,
5332 			    "failed to load rndis_packet_msg, %d\n", i);
5333 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5334 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5335 			return error;
5336 		}
5337 
5338 		/* DMA map for TX data. */
5339 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5340 		    &txd->data_dmap);
5341 		if (error) {
5342 			device_printf(dev,
5343 			    "failed to allocate tx data dmamap\n");
5344 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5345 			    txd->rndis_pkt_dmap);
5346 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5347 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5348 			return error;
5349 		}
5350 
5351 		/* All set, put it to list */
5352 		txd->flags |= HN_TXD_FLAG_ONLIST;
5353 #ifndef HN_USE_TXDESC_BUFRING
5354 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5355 #else
5356 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5357 #endif
5358 	}
5359 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5360 
5361 	if (sc->hn_tx_sysctl_tree != NULL) {
5362 		struct sysctl_oid_list *child;
5363 		struct sysctl_ctx_list *ctx;
5364 		char name[16];
5365 
5366 		/*
5367 		 * Create per TX ring sysctl tree:
5368 		 * dev.hn.UNIT.tx.RINGID
5369 		 */
5370 		ctx = device_get_sysctl_ctx(dev);
5371 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5372 
5373 		snprintf(name, sizeof(name), "%d", id);
5374 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5375 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5376 
5377 		if (txr->hn_tx_sysctl_tree != NULL) {
5378 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5379 
5380 #ifdef HN_DEBUG
5381 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5382 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5383 			    "# of available TX descs");
5384 #endif
5385 #ifdef HN_IFSTART_SUPPORT
5386 			if (!hn_use_if_start)
5387 #endif
5388 			{
5389 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5390 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5391 				    "over active");
5392 			}
5393 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5394 			    CTLFLAG_RW, &txr->hn_pkts,
5395 			    "# of packets transmitted");
5396 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5397 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5398 		}
5399 	}
5400 
5401 	return 0;
5402 }
5403 
5404 static void
5405 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5406 {
5407 	struct hn_tx_ring *txr = txd->txr;
5408 
5409 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5410 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5411 
5412 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5413 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5414 	    txd->rndis_pkt_dmap);
5415 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5416 }
5417 
5418 static void
5419 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5420 {
5421 
5422 	KASSERT(txd->refs == 0 || txd->refs == 1,
5423 	    ("invalid txd refs %d", txd->refs));
5424 
5425 	/* Aggregated txds will be freed by their aggregating txd. */
5426 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5427 		int freed;
5428 
5429 		freed = hn_txdesc_put(txr, txd);
5430 		KASSERT(freed, ("can't free txdesc"));
5431 	}
5432 }
5433 
5434 static void
5435 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5436 {
5437 	int i;
5438 
5439 	if (txr->hn_txdesc == NULL)
5440 		return;
5441 
5442 	/*
5443 	 * NOTE:
5444 	 * Because the freeing of aggregated txds will be deferred
5445 	 * to the aggregating txd, two passes are used here:
5446 	 * - The first pass GCes any pending txds.  This GC is necessary,
5447 	 *   since if the channels are revoked, hypervisor will not
5448 	 *   deliver send-done for all pending txds.
5449 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5450 	 *   were freed.
5451 	 */
5452 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5453 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5454 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5455 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5456 
5457 	if (txr->hn_tx_data_dtag != NULL)
5458 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5459 	if (txr->hn_tx_rndis_dtag != NULL)
5460 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5461 
5462 #ifdef HN_USE_TXDESC_BUFRING
5463 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5464 #endif
5465 
5466 	free(txr->hn_txdesc, M_DEVBUF);
5467 	txr->hn_txdesc = NULL;
5468 
5469 	if (txr->hn_mbuf_br != NULL)
5470 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5471 
5472 #ifndef HN_USE_TXDESC_BUFRING
5473 	mtx_destroy(&txr->hn_txlist_spin);
5474 #endif
5475 	mtx_destroy(&txr->hn_tx_lock);
5476 }
5477 
5478 static int
5479 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5480 {
5481 	struct sysctl_oid_list *child;
5482 	struct sysctl_ctx_list *ctx;
5483 	int i;
5484 
5485 	/*
5486 	 * Create TXBUF for chimney sending.
5487 	 *
5488 	 * NOTE: It is shared by all channels.
5489 	 */
5490 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5491 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5492 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5493 	if (sc->hn_chim == NULL) {
5494 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5495 		return (ENOMEM);
5496 	}
5497 
5498 	sc->hn_tx_ring_cnt = ring_cnt;
5499 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5500 
5501 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5502 	    M_DEVBUF, M_WAITOK | M_ZERO);
5503 
5504 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5505 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5506 
5507 	/* Create dev.hn.UNIT.tx sysctl tree */
5508 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5509 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5510 
5511 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5512 		int error;
5513 
5514 		error = hn_tx_ring_create(sc, i);
5515 		if (error)
5516 			return error;
5517 	}
5518 
5519 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5520 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5521 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5522 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5523 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5524 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5525 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5526 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5527 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5528 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5529 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5530 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5531 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5532 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5533 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5534 	    hn_tx_stat_ulong_sysctl, "LU",
5535 	    "# of packet transmission aggregation flush failure");
5536 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5537 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5538 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5539 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5540 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5541 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5542 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5543 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5544 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5545 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5546 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5547 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5548 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5549 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5550 	    "# of total TX descs");
5551 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5552 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5553 	    "Chimney send packet size upper boundary");
5554 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5555 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5556 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5557 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5558 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5559 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5560 	    hn_tx_conf_int_sysctl, "I",
5561 	    "Size of the packet for direct transmission");
5562 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5563 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5564 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5565 	    hn_tx_conf_int_sysctl, "I",
5566 	    "Always schedule transmission "
5567 	    "instead of doing direct transmission");
5568 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5569 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5570 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5571 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5572 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5573 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5574 	    "Applied packet transmission aggregation size");
5575 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5576 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5577 	    hn_txagg_pktmax_sysctl, "I",
5578 	    "Applied packet transmission aggregation packets");
5579 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5580 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5581 	    hn_txagg_align_sysctl, "I",
5582 	    "Applied packet transmission aggregation alignment");
5583 
5584 	return 0;
5585 }
5586 
5587 static void
5588 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5589 {
5590 	int i;
5591 
5592 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5593 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5594 }
5595 
5596 static void
5597 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5598 {
5599 	struct ifnet *ifp = sc->hn_ifp;
5600 	u_int hw_tsomax;
5601 	int tso_minlen;
5602 
5603 	HN_LOCK_ASSERT(sc);
5604 
5605 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5606 		return;
5607 
5608 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5609 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5610 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5611 
5612 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5613 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5614 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5615 
5616 	if (tso_maxlen < tso_minlen)
5617 		tso_maxlen = tso_minlen;
5618 	else if (tso_maxlen > IP_MAXPACKET)
5619 		tso_maxlen = IP_MAXPACKET;
5620 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5621 		tso_maxlen = sc->hn_ndis_tso_szmax;
5622 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5623 
5624 	if (hn_xpnt_vf_isready(sc)) {
5625 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5626 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5627 	}
5628 	ifp->if_hw_tsomax = hw_tsomax;
5629 	if (bootverbose)
5630 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5631 }
5632 
5633 static void
5634 hn_fixup_tx_data(struct hn_softc *sc)
5635 {
5636 	uint64_t csum_assist;
5637 	int i;
5638 
5639 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5640 	if (hn_tx_chimney_size > 0 &&
5641 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5642 		hn_set_chim_size(sc, hn_tx_chimney_size);
5643 
5644 	csum_assist = 0;
5645 	if (sc->hn_caps & HN_CAP_IPCS)
5646 		csum_assist |= CSUM_IP;
5647 	if (sc->hn_caps & HN_CAP_TCP4CS)
5648 		csum_assist |= CSUM_IP_TCP;
5649 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5650 		csum_assist |= CSUM_IP_UDP;
5651 	if (sc->hn_caps & HN_CAP_TCP6CS)
5652 		csum_assist |= CSUM_IP6_TCP;
5653 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5654 		csum_assist |= CSUM_IP6_UDP;
5655 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5656 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5657 
5658 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5659 		/*
5660 		 * Support HASHVAL pktinfo on TX path.
5661 		 */
5662 		if (bootverbose)
5663 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5664 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5665 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5666 	}
5667 }
5668 
5669 static void
5670 hn_fixup_rx_data(struct hn_softc *sc)
5671 {
5672 
5673 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5674 		int i;
5675 
5676 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5677 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5678 	}
5679 }
5680 
5681 static void
5682 hn_destroy_tx_data(struct hn_softc *sc)
5683 {
5684 	int i;
5685 
5686 	if (sc->hn_chim != NULL) {
5687 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5688 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5689 		} else {
5690 			device_printf(sc->hn_dev,
5691 			    "chimney sending buffer is referenced");
5692 		}
5693 		sc->hn_chim = NULL;
5694 	}
5695 
5696 	if (sc->hn_tx_ring_cnt == 0)
5697 		return;
5698 
5699 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5700 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5701 
5702 	free(sc->hn_tx_ring, M_DEVBUF);
5703 	sc->hn_tx_ring = NULL;
5704 
5705 	sc->hn_tx_ring_cnt = 0;
5706 	sc->hn_tx_ring_inuse = 0;
5707 }
5708 
5709 #ifdef HN_IFSTART_SUPPORT
5710 
5711 static void
5712 hn_start_taskfunc(void *xtxr, int pending __unused)
5713 {
5714 	struct hn_tx_ring *txr = xtxr;
5715 
5716 	mtx_lock(&txr->hn_tx_lock);
5717 	hn_start_locked(txr, 0);
5718 	mtx_unlock(&txr->hn_tx_lock);
5719 }
5720 
5721 static int
5722 hn_start_locked(struct hn_tx_ring *txr, int len)
5723 {
5724 	struct hn_softc *sc = txr->hn_sc;
5725 	struct ifnet *ifp = sc->hn_ifp;
5726 	int sched = 0;
5727 
5728 	KASSERT(hn_use_if_start,
5729 	    ("hn_start_locked is called, when if_start is disabled"));
5730 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5731 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5732 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5733 
5734 	if (__predict_false(txr->hn_suspended))
5735 		return (0);
5736 
5737 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5738 	    IFF_DRV_RUNNING)
5739 		return (0);
5740 
5741 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5742 		struct hn_txdesc *txd;
5743 		struct mbuf *m_head;
5744 		int error;
5745 
5746 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5747 		if (m_head == NULL)
5748 			break;
5749 
5750 		if (len > 0 && m_head->m_pkthdr.len > len) {
5751 			/*
5752 			 * This sending could be time consuming; let callers
5753 			 * dispatch this packet sending (and sending of any
5754 			 * following up packets) to tx taskqueue.
5755 			 */
5756 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5757 			sched = 1;
5758 			break;
5759 		}
5760 
5761 #if defined(INET6) || defined(INET)
5762 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5763 			m_head = hn_tso_fixup(m_head);
5764 			if (__predict_false(m_head == NULL)) {
5765 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5766 				continue;
5767 			}
5768 		} else if (m_head->m_pkthdr.csum_flags &
5769 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5770 			m_head = hn_set_hlen(m_head);
5771 			if (__predict_false(m_head == NULL)) {
5772 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5773 				continue;
5774 			}
5775 		}
5776 #endif
5777 
5778 		txd = hn_txdesc_get(txr);
5779 		if (txd == NULL) {
5780 			txr->hn_no_txdescs++;
5781 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5782 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5783 			break;
5784 		}
5785 
5786 		error = hn_encap(ifp, txr, txd, &m_head);
5787 		if (error) {
5788 			/* Both txd and m_head are freed */
5789 			KASSERT(txr->hn_agg_txd == NULL,
5790 			    ("encap failed w/ pending aggregating txdesc"));
5791 			continue;
5792 		}
5793 
5794 		if (txr->hn_agg_pktleft == 0) {
5795 			if (txr->hn_agg_txd != NULL) {
5796 				KASSERT(m_head == NULL,
5797 				    ("pending mbuf for aggregating txdesc"));
5798 				error = hn_flush_txagg(ifp, txr);
5799 				if (__predict_false(error)) {
5800 					atomic_set_int(&ifp->if_drv_flags,
5801 					    IFF_DRV_OACTIVE);
5802 					break;
5803 				}
5804 			} else {
5805 				KASSERT(m_head != NULL, ("mbuf was freed"));
5806 				error = hn_txpkt(ifp, txr, txd);
5807 				if (__predict_false(error)) {
5808 					/* txd is freed, but m_head is not */
5809 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5810 					atomic_set_int(&ifp->if_drv_flags,
5811 					    IFF_DRV_OACTIVE);
5812 					break;
5813 				}
5814 			}
5815 		}
5816 #ifdef INVARIANTS
5817 		else {
5818 			KASSERT(txr->hn_agg_txd != NULL,
5819 			    ("no aggregating txdesc"));
5820 			KASSERT(m_head == NULL,
5821 			    ("pending mbuf for aggregating txdesc"));
5822 		}
5823 #endif
5824 	}
5825 
5826 	/* Flush pending aggerated transmission. */
5827 	if (txr->hn_agg_txd != NULL)
5828 		hn_flush_txagg(ifp, txr);
5829 	return (sched);
5830 }
5831 
5832 static void
5833 hn_start(struct ifnet *ifp)
5834 {
5835 	struct hn_softc *sc = ifp->if_softc;
5836 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5837 
5838 	if (txr->hn_sched_tx)
5839 		goto do_sched;
5840 
5841 	if (mtx_trylock(&txr->hn_tx_lock)) {
5842 		int sched;
5843 
5844 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5845 		mtx_unlock(&txr->hn_tx_lock);
5846 		if (!sched)
5847 			return;
5848 	}
5849 do_sched:
5850 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5851 }
5852 
5853 static void
5854 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5855 {
5856 	struct hn_tx_ring *txr = xtxr;
5857 
5858 	mtx_lock(&txr->hn_tx_lock);
5859 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5860 	hn_start_locked(txr, 0);
5861 	mtx_unlock(&txr->hn_tx_lock);
5862 }
5863 
5864 static void
5865 hn_start_txeof(struct hn_tx_ring *txr)
5866 {
5867 	struct hn_softc *sc = txr->hn_sc;
5868 	struct ifnet *ifp = sc->hn_ifp;
5869 
5870 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5871 
5872 	if (txr->hn_sched_tx)
5873 		goto do_sched;
5874 
5875 	if (mtx_trylock(&txr->hn_tx_lock)) {
5876 		int sched;
5877 
5878 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5879 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5880 		mtx_unlock(&txr->hn_tx_lock);
5881 		if (sched) {
5882 			taskqueue_enqueue(txr->hn_tx_taskq,
5883 			    &txr->hn_tx_task);
5884 		}
5885 	} else {
5886 do_sched:
5887 		/*
5888 		 * Release the OACTIVE earlier, with the hope, that
5889 		 * others could catch up.  The task will clear the
5890 		 * flag again with the hn_tx_lock to avoid possible
5891 		 * races.
5892 		 */
5893 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5894 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5895 	}
5896 }
5897 
5898 #endif	/* HN_IFSTART_SUPPORT */
5899 
5900 static int
5901 hn_xmit(struct hn_tx_ring *txr, int len)
5902 {
5903 	struct hn_softc *sc = txr->hn_sc;
5904 	struct ifnet *ifp = sc->hn_ifp;
5905 	struct mbuf *m_head;
5906 	int sched = 0;
5907 
5908 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5909 #ifdef HN_IFSTART_SUPPORT
5910 	KASSERT(hn_use_if_start == 0,
5911 	    ("hn_xmit is called, when if_start is enabled"));
5912 #endif
5913 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5914 
5915 	if (__predict_false(txr->hn_suspended))
5916 		return (0);
5917 
5918 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5919 		return (0);
5920 
5921 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5922 		struct hn_txdesc *txd;
5923 		int error;
5924 
5925 		if (len > 0 && m_head->m_pkthdr.len > len) {
5926 			/*
5927 			 * This sending could be time consuming; let callers
5928 			 * dispatch this packet sending (and sending of any
5929 			 * following up packets) to tx taskqueue.
5930 			 */
5931 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5932 			sched = 1;
5933 			break;
5934 		}
5935 
5936 		txd = hn_txdesc_get(txr);
5937 		if (txd == NULL) {
5938 			txr->hn_no_txdescs++;
5939 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5940 			txr->hn_oactive = 1;
5941 			break;
5942 		}
5943 
5944 		error = hn_encap(ifp, txr, txd, &m_head);
5945 		if (error) {
5946 			/* Both txd and m_head are freed; discard */
5947 			KASSERT(txr->hn_agg_txd == NULL,
5948 			    ("encap failed w/ pending aggregating txdesc"));
5949 			drbr_advance(ifp, txr->hn_mbuf_br);
5950 			continue;
5951 		}
5952 
5953 		if (txr->hn_agg_pktleft == 0) {
5954 			if (txr->hn_agg_txd != NULL) {
5955 				KASSERT(m_head == NULL,
5956 				    ("pending mbuf for aggregating txdesc"));
5957 				error = hn_flush_txagg(ifp, txr);
5958 				if (__predict_false(error)) {
5959 					txr->hn_oactive = 1;
5960 					break;
5961 				}
5962 			} else {
5963 				KASSERT(m_head != NULL, ("mbuf was freed"));
5964 				error = hn_txpkt(ifp, txr, txd);
5965 				if (__predict_false(error)) {
5966 					/* txd is freed, but m_head is not */
5967 					drbr_putback(ifp, txr->hn_mbuf_br,
5968 					    m_head);
5969 					txr->hn_oactive = 1;
5970 					break;
5971 				}
5972 			}
5973 		}
5974 #ifdef INVARIANTS
5975 		else {
5976 			KASSERT(txr->hn_agg_txd != NULL,
5977 			    ("no aggregating txdesc"));
5978 			KASSERT(m_head == NULL,
5979 			    ("pending mbuf for aggregating txdesc"));
5980 		}
5981 #endif
5982 
5983 		/* Sent */
5984 		drbr_advance(ifp, txr->hn_mbuf_br);
5985 	}
5986 
5987 	/* Flush pending aggerated transmission. */
5988 	if (txr->hn_agg_txd != NULL)
5989 		hn_flush_txagg(ifp, txr);
5990 	return (sched);
5991 }
5992 
5993 static int
5994 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5995 {
5996 	struct hn_softc *sc = ifp->if_softc;
5997 	struct hn_tx_ring *txr;
5998 	int error, idx = 0;
5999 
6000 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6001 		struct rm_priotracker pt;
6002 
6003 		rm_rlock(&sc->hn_vf_lock, &pt);
6004 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6005 			struct mbuf *m_bpf = NULL;
6006 			int obytes, omcast;
6007 
6008 			obytes = m->m_pkthdr.len;
6009 			omcast = (m->m_flags & M_MCAST) != 0;
6010 
6011 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6012 				if (bpf_peers_present(ifp->if_bpf)) {
6013 					m_bpf = m_copypacket(m, M_NOWAIT);
6014 					if (m_bpf == NULL) {
6015 						/*
6016 						 * Failed to grab a shallow
6017 						 * copy; tap now.
6018 						 */
6019 						ETHER_BPF_MTAP(ifp, m);
6020 					}
6021 				}
6022 			} else {
6023 				ETHER_BPF_MTAP(ifp, m);
6024 			}
6025 
6026 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
6027 			rm_runlock(&sc->hn_vf_lock, &pt);
6028 
6029 			if (m_bpf != NULL) {
6030 				if (!error)
6031 					ETHER_BPF_MTAP(ifp, m_bpf);
6032 				m_freem(m_bpf);
6033 			}
6034 
6035 			if (error == ENOBUFS) {
6036 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6037 			} else if (error) {
6038 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6039 			} else {
6040 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6041 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6042 				if (omcast) {
6043 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6044 					    omcast);
6045 				}
6046 			}
6047 			return (error);
6048 		}
6049 		rm_runlock(&sc->hn_vf_lock, &pt);
6050 	}
6051 
6052 #if defined(INET6) || defined(INET)
6053 	/*
6054 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6055 	 * since packet headers should be cache-hot.
6056 	 */
6057 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6058 		m = hn_tso_fixup(m);
6059 		if (__predict_false(m == NULL)) {
6060 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6061 			return EIO;
6062 		}
6063 	} else if (m->m_pkthdr.csum_flags &
6064 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6065 		m = hn_set_hlen(m);
6066 		if (__predict_false(m == NULL)) {
6067 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6068 			return EIO;
6069 		}
6070 	}
6071 #endif
6072 
6073 	/*
6074 	 * Select the TX ring based on flowid
6075 	 */
6076 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6077 #ifdef RSS
6078 		uint32_t bid;
6079 
6080 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6081 		    &bid) == 0)
6082 			idx = bid % sc->hn_tx_ring_inuse;
6083 		else
6084 #endif
6085 		{
6086 #if defined(INET6) || defined(INET)
6087 			int tcpsyn = 0;
6088 
6089 			if (m->m_pkthdr.len < 128 &&
6090 			    (m->m_pkthdr.csum_flags &
6091 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6092 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6093 				m = hn_check_tcpsyn(m, &tcpsyn);
6094 				if (__predict_false(m == NULL)) {
6095 					if_inc_counter(ifp,
6096 					    IFCOUNTER_OERRORS, 1);
6097 					return (EIO);
6098 				}
6099 			}
6100 #else
6101 			const int tcpsyn = 0;
6102 #endif
6103 			if (tcpsyn)
6104 				idx = 0;
6105 			else
6106 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6107 		}
6108 	}
6109 	txr = &sc->hn_tx_ring[idx];
6110 
6111 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6112 	if (error) {
6113 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6114 		return error;
6115 	}
6116 
6117 	if (txr->hn_oactive)
6118 		return 0;
6119 
6120 	if (txr->hn_sched_tx)
6121 		goto do_sched;
6122 
6123 	if (mtx_trylock(&txr->hn_tx_lock)) {
6124 		int sched;
6125 
6126 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6127 		mtx_unlock(&txr->hn_tx_lock);
6128 		if (!sched)
6129 			return 0;
6130 	}
6131 do_sched:
6132 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6133 	return 0;
6134 }
6135 
6136 static void
6137 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6138 {
6139 	struct mbuf *m;
6140 
6141 	mtx_lock(&txr->hn_tx_lock);
6142 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6143 		m_freem(m);
6144 	mtx_unlock(&txr->hn_tx_lock);
6145 }
6146 
6147 static void
6148 hn_xmit_qflush(struct ifnet *ifp)
6149 {
6150 	struct hn_softc *sc = ifp->if_softc;
6151 	struct rm_priotracker pt;
6152 	int i;
6153 
6154 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6155 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6156 	if_qflush(ifp);
6157 
6158 	rm_rlock(&sc->hn_vf_lock, &pt);
6159 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6160 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6161 	rm_runlock(&sc->hn_vf_lock, &pt);
6162 }
6163 
6164 static void
6165 hn_xmit_txeof(struct hn_tx_ring *txr)
6166 {
6167 
6168 	if (txr->hn_sched_tx)
6169 		goto do_sched;
6170 
6171 	if (mtx_trylock(&txr->hn_tx_lock)) {
6172 		int sched;
6173 
6174 		txr->hn_oactive = 0;
6175 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6176 		mtx_unlock(&txr->hn_tx_lock);
6177 		if (sched) {
6178 			taskqueue_enqueue(txr->hn_tx_taskq,
6179 			    &txr->hn_tx_task);
6180 		}
6181 	} else {
6182 do_sched:
6183 		/*
6184 		 * Release the oactive earlier, with the hope, that
6185 		 * others could catch up.  The task will clear the
6186 		 * oactive again with the hn_tx_lock to avoid possible
6187 		 * races.
6188 		 */
6189 		txr->hn_oactive = 0;
6190 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6191 	}
6192 }
6193 
6194 static void
6195 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6196 {
6197 	struct hn_tx_ring *txr = xtxr;
6198 
6199 	mtx_lock(&txr->hn_tx_lock);
6200 	hn_xmit(txr, 0);
6201 	mtx_unlock(&txr->hn_tx_lock);
6202 }
6203 
6204 static void
6205 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6206 {
6207 	struct hn_tx_ring *txr = xtxr;
6208 
6209 	mtx_lock(&txr->hn_tx_lock);
6210 	txr->hn_oactive = 0;
6211 	hn_xmit(txr, 0);
6212 	mtx_unlock(&txr->hn_tx_lock);
6213 }
6214 
6215 static int
6216 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6217 {
6218 	struct vmbus_chan_br cbr;
6219 	struct hn_rx_ring *rxr;
6220 	struct hn_tx_ring *txr = NULL;
6221 	int idx, error;
6222 
6223 	idx = vmbus_chan_subidx(chan);
6224 
6225 	/*
6226 	 * Link this channel to RX/TX ring.
6227 	 */
6228 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6229 	    ("invalid channel index %d, should > 0 && < %d",
6230 	     idx, sc->hn_rx_ring_inuse));
6231 	rxr = &sc->hn_rx_ring[idx];
6232 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6233 	    ("RX ring %d already attached", idx));
6234 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6235 	rxr->hn_chan = chan;
6236 
6237 	if (bootverbose) {
6238 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6239 		    idx, vmbus_chan_id(chan));
6240 	}
6241 
6242 	if (idx < sc->hn_tx_ring_inuse) {
6243 		txr = &sc->hn_tx_ring[idx];
6244 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6245 		    ("TX ring %d already attached", idx));
6246 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6247 
6248 		txr->hn_chan = chan;
6249 		if (bootverbose) {
6250 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6251 			    idx, vmbus_chan_id(chan));
6252 		}
6253 	}
6254 
6255 	/* Bind this channel to a proper CPU. */
6256 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6257 
6258 	/*
6259 	 * Open this channel
6260 	 */
6261 	cbr.cbr = rxr->hn_br;
6262 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6263 	cbr.cbr_txsz = HN_TXBR_SIZE;
6264 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6265 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6266 	if (error) {
6267 		if (error == EISCONN) {
6268 			if_printf(sc->hn_ifp, "bufring is connected after "
6269 			    "chan%u open failure\n", vmbus_chan_id(chan));
6270 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6271 		} else {
6272 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6273 			    vmbus_chan_id(chan), error);
6274 		}
6275 	}
6276 	return (error);
6277 }
6278 
6279 static void
6280 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6281 {
6282 	struct hn_rx_ring *rxr;
6283 	int idx, error;
6284 
6285 	idx = vmbus_chan_subidx(chan);
6286 
6287 	/*
6288 	 * Link this channel to RX/TX ring.
6289 	 */
6290 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6291 	    ("invalid channel index %d, should > 0 && < %d",
6292 	     idx, sc->hn_rx_ring_inuse));
6293 	rxr = &sc->hn_rx_ring[idx];
6294 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6295 	    ("RX ring %d is not attached", idx));
6296 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6297 
6298 	if (idx < sc->hn_tx_ring_inuse) {
6299 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6300 
6301 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6302 		    ("TX ring %d is not attached attached", idx));
6303 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6304 	}
6305 
6306 	/*
6307 	 * Close this channel.
6308 	 *
6309 	 * NOTE:
6310 	 * Channel closing does _not_ destroy the target channel.
6311 	 */
6312 	error = vmbus_chan_close_direct(chan);
6313 	if (error == EISCONN) {
6314 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6315 		    "after being closed\n", vmbus_chan_id(chan));
6316 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6317 	} else if (error) {
6318 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6319 		    vmbus_chan_id(chan), error);
6320 	}
6321 }
6322 
6323 static int
6324 hn_attach_subchans(struct hn_softc *sc)
6325 {
6326 	struct vmbus_channel **subchans;
6327 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6328 	int i, error = 0;
6329 
6330 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6331 
6332 	/* Attach the sub-channels. */
6333 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6334 	for (i = 0; i < subchan_cnt; ++i) {
6335 		int error1;
6336 
6337 		error1 = hn_chan_attach(sc, subchans[i]);
6338 		if (error1) {
6339 			error = error1;
6340 			/* Move on; all channels will be detached later. */
6341 		}
6342 	}
6343 	vmbus_subchan_rel(subchans, subchan_cnt);
6344 
6345 	if (error) {
6346 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6347 	} else {
6348 		if (bootverbose) {
6349 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6350 			    subchan_cnt);
6351 		}
6352 	}
6353 	return (error);
6354 }
6355 
6356 static void
6357 hn_detach_allchans(struct hn_softc *sc)
6358 {
6359 	struct vmbus_channel **subchans;
6360 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6361 	int i;
6362 
6363 	if (subchan_cnt == 0)
6364 		goto back;
6365 
6366 	/* Detach the sub-channels. */
6367 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6368 	for (i = 0; i < subchan_cnt; ++i)
6369 		hn_chan_detach(sc, subchans[i]);
6370 	vmbus_subchan_rel(subchans, subchan_cnt);
6371 
6372 back:
6373 	/*
6374 	 * Detach the primary channel, _after_ all sub-channels
6375 	 * are detached.
6376 	 */
6377 	hn_chan_detach(sc, sc->hn_prichan);
6378 
6379 	/* Wait for sub-channels to be destroyed, if any. */
6380 	vmbus_subchan_drain(sc->hn_prichan);
6381 
6382 #ifdef INVARIANTS
6383 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6384 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6385 		    HN_RX_FLAG_ATTACHED) == 0,
6386 		    ("%dth RX ring is still attached", i));
6387 	}
6388 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6389 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6390 		    HN_TX_FLAG_ATTACHED) == 0,
6391 		    ("%dth TX ring is still attached", i));
6392 	}
6393 #endif
6394 }
6395 
6396 static int
6397 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6398 {
6399 	struct vmbus_channel **subchans;
6400 	int nchan, rxr_cnt, error;
6401 
6402 	nchan = *nsubch + 1;
6403 	if (nchan == 1) {
6404 		/*
6405 		 * Multiple RX/TX rings are not requested.
6406 		 */
6407 		*nsubch = 0;
6408 		return (0);
6409 	}
6410 
6411 	/*
6412 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6413 	 * table entries.
6414 	 */
6415 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6416 	if (error) {
6417 		/* No RSS; this is benign. */
6418 		*nsubch = 0;
6419 		return (0);
6420 	}
6421 	if (bootverbose) {
6422 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6423 		    rxr_cnt, nchan);
6424 	}
6425 
6426 	if (nchan > rxr_cnt)
6427 		nchan = rxr_cnt;
6428 	if (nchan == 1) {
6429 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6430 		*nsubch = 0;
6431 		return (0);
6432 	}
6433 
6434 	/*
6435 	 * Allocate sub-channels from NVS.
6436 	 */
6437 	*nsubch = nchan - 1;
6438 	error = hn_nvs_alloc_subchans(sc, nsubch);
6439 	if (error || *nsubch == 0) {
6440 		/* Failed to allocate sub-channels. */
6441 		*nsubch = 0;
6442 		return (0);
6443 	}
6444 
6445 	/*
6446 	 * Wait for all sub-channels to become ready before moving on.
6447 	 */
6448 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6449 	vmbus_subchan_rel(subchans, *nsubch);
6450 	return (0);
6451 }
6452 
6453 static bool
6454 hn_synth_attachable(const struct hn_softc *sc)
6455 {
6456 	int i;
6457 
6458 	if (sc->hn_flags & HN_FLAG_ERRORS)
6459 		return (false);
6460 
6461 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6462 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6463 
6464 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6465 			return (false);
6466 	}
6467 	return (true);
6468 }
6469 
/*
 * Force the RX filter to zero once RNDIS has been initialized.
 *
 * NOTE:
 * On some Hyper-V versions, under certain conditions, the hypervisor
 * side RNDIS rxfilter is _not_ zero after a successful RNDIS
 * initialization.  That violates the RNDIS API contract and breaks the
 * assumptions of the code that follows.  So clear the RNDIS rxfilter
 * explicitly, then drain both the packets that sneaked through and the
 * interrupt taskqueues scheduled because of those stealth packets.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the filter first, then drain whatever got through. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
6490 
6491 static int
6492 hn_synth_attach(struct hn_softc *sc, int mtu)
6493 {
6494 #define ATTACHED_NVS		0x0002
6495 #define ATTACHED_RNDIS		0x0004
6496 
6497 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6498 	int error, nsubch, nchan = 1, i, rndis_inited;
6499 	uint32_t old_caps, attached = 0;
6500 
6501 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6502 	    ("synthetic parts were attached"));
6503 
6504 	if (!hn_synth_attachable(sc))
6505 		return (ENXIO);
6506 
6507 	/* Save capabilities for later verification. */
6508 	old_caps = sc->hn_caps;
6509 	sc->hn_caps = 0;
6510 
6511 	/* Clear RSS stuffs. */
6512 	sc->hn_rss_ind_size = 0;
6513 	sc->hn_rss_hash = 0;
6514 	sc->hn_rss_hcap = 0;
6515 
6516 	/*
6517 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6518 	 */
6519 	error = hn_chan_attach(sc, sc->hn_prichan);
6520 	if (error)
6521 		goto failed;
6522 
6523 	/*
6524 	 * Attach NVS.
6525 	 */
6526 	error = hn_nvs_attach(sc, mtu);
6527 	if (error)
6528 		goto failed;
6529 	attached |= ATTACHED_NVS;
6530 
6531 	/*
6532 	 * Attach RNDIS _after_ NVS is attached.
6533 	 */
6534 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6535 	if (rndis_inited)
6536 		attached |= ATTACHED_RNDIS;
6537 	if (error)
6538 		goto failed;
6539 
6540 	/*
6541 	 * Make sure capabilities are not changed.
6542 	 */
6543 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6544 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6545 		    old_caps, sc->hn_caps);
6546 		error = ENXIO;
6547 		goto failed;
6548 	}
6549 
6550 	/*
6551 	 * Allocate sub-channels for multi-TX/RX rings.
6552 	 *
6553 	 * NOTE:
6554 	 * The # of RX rings that can be used is equivalent to the # of
6555 	 * channels to be requested.
6556 	 */
6557 	nsubch = sc->hn_rx_ring_cnt - 1;
6558 	error = hn_synth_alloc_subchans(sc, &nsubch);
6559 	if (error)
6560 		goto failed;
6561 	/* NOTE: _Full_ synthetic parts detach is required now. */
6562 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6563 
6564 	/*
6565 	 * Set the # of TX/RX rings that could be used according to
6566 	 * the # of channels that NVS offered.
6567 	 */
6568 	nchan = nsubch + 1;
6569 	hn_set_ring_inuse(sc, nchan);
6570 	if (nchan == 1) {
6571 		/* Only the primary channel can be used; done */
6572 		goto back;
6573 	}
6574 
6575 	/*
6576 	 * Attach the sub-channels.
6577 	 *
6578 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6579 	 */
6580 	error = hn_attach_subchans(sc);
6581 	if (error)
6582 		goto failed;
6583 
6584 	/*
6585 	 * Configure RSS key and indirect table _after_ all sub-channels
6586 	 * are attached.
6587 	 */
6588 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6589 		/*
6590 		 * RSS key is not set yet; set it to the default RSS key.
6591 		 */
6592 		if (bootverbose)
6593 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6594 #ifdef RSS
6595 		rss_getkey(rss->rss_key);
6596 #else
6597 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6598 #endif
6599 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6600 	}
6601 
6602 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6603 		/*
6604 		 * RSS indirect table is not set yet; set it up in round-
6605 		 * robin fashion.
6606 		 */
6607 		if (bootverbose) {
6608 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6609 			    "table\n");
6610 		}
6611 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6612 			uint32_t subidx;
6613 
6614 #ifdef RSS
6615 			subidx = rss_get_indirection_to_bucket(i);
6616 #else
6617 			subidx = i;
6618 #endif
6619 			rss->rss_ind[i] = subidx % nchan;
6620 		}
6621 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6622 	} else {
6623 		/*
6624 		 * # of usable channels may be changed, so we have to
6625 		 * make sure that all entries in RSS indirect table
6626 		 * are valid.
6627 		 *
6628 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6629 		 */
6630 		hn_rss_ind_fixup(sc);
6631 	}
6632 
6633 	sc->hn_rss_hash = sc->hn_rss_hcap;
6634 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6635 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6636 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6637 		hn_vf_rss_fixup(sc, false);
6638 	}
6639 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6640 	if (error)
6641 		goto failed;
6642 back:
6643 	/*
6644 	 * Fixup transmission aggregation setup.
6645 	 */
6646 	hn_set_txagg(sc);
6647 	hn_rndis_init_fixat(sc, nchan);
6648 	return (0);
6649 
6650 failed:
6651 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6652 		hn_rndis_init_fixat(sc, nchan);
6653 		hn_synth_detach(sc);
6654 	} else {
6655 		if (attached & ATTACHED_RNDIS) {
6656 			hn_rndis_init_fixat(sc, nchan);
6657 			hn_rndis_detach(sc);
6658 		}
6659 		if (attached & ATTACHED_NVS)
6660 			hn_nvs_detach(sc);
6661 		hn_chan_detach(sc, sc->hn_prichan);
6662 		/* Restore old capabilities. */
6663 		sc->hn_caps = old_caps;
6664 	}
6665 	return (error);
6666 
6667 #undef ATTACHED_RNDIS
6668 #undef ATTACHED_NVS
6669 }
6670 
6671 /*
6672  * NOTE:
6673  * The interface must have been suspended though hn_suspend(), before
6674  * this function get called.
6675  */
6676 static void
6677 hn_synth_detach(struct hn_softc *sc)
6678 {
6679 
6680 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6681 	    ("synthetic parts were not attached"));
6682 
6683 	/* Detach the RNDIS first. */
6684 	hn_rndis_detach(sc);
6685 
6686 	/* Detach NVS. */
6687 	hn_nvs_detach(sc);
6688 
6689 	/* Detach all of the channels. */
6690 	hn_detach_allchans(sc);
6691 
6692 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6693 		/*
6694 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6695 		 */
6696 		int error;
6697 
6698 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6699 		    sc->hn_rxbuf_gpadl);
6700 		if (error) {
6701 			if_printf(sc->hn_ifp,
6702 			    "rxbuf gpadl disconn failed: %d\n", error);
6703 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6704 		}
6705 		sc->hn_rxbuf_gpadl = 0;
6706 	}
6707 
6708 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6709 		/*
6710 		 * Host is post-Win2016, disconnect chimney sending buffer from
6711 		 * primary channel here.
6712 		 */
6713 		int error;
6714 
6715 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6716 		    sc->hn_chim_gpadl);
6717 		if (error) {
6718 			if_printf(sc->hn_ifp,
6719 			    "chim gpadl disconn failed: %d\n", error);
6720 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6721 		}
6722 		sc->hn_chim_gpadl = 0;
6723 	}
6724 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6725 }
6726 
6727 static void
6728 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6729 {
6730 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6731 	    ("invalid ring count %d", ring_cnt));
6732 
6733 	if (sc->hn_tx_ring_cnt > ring_cnt)
6734 		sc->hn_tx_ring_inuse = ring_cnt;
6735 	else
6736 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6737 	sc->hn_rx_ring_inuse = ring_cnt;
6738 
6739 #ifdef RSS
6740 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6741 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6742 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6743 		    rss_getnumbuckets());
6744 	}
6745 #endif
6746 
6747 	if (bootverbose) {
6748 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6749 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6750 	}
6751 }
6752 
6753 static void
6754 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6755 {
6756 
6757 	/*
6758 	 * NOTE:
6759 	 * The TX bufring will not be drained by the hypervisor,
6760 	 * if the primary channel is revoked.
6761 	 */
6762 	while (!vmbus_chan_rx_empty(chan) ||
6763 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6764 	     !vmbus_chan_tx_empty(chan)))
6765 		pause("waitch", 1);
6766 	vmbus_chan_intr_drain(chan);
6767 }
6768 
6769 static void
6770 hn_disable_rx(struct hn_softc *sc)
6771 {
6772 
6773 	/*
6774 	 * Disable RX by clearing RX filter forcefully.
6775 	 */
6776 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6777 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6778 
6779 	/*
6780 	 * Give RNDIS enough time to flush all pending data packets.
6781 	 */
6782 	pause("waitrx", (200 * hz) / 1000);
6783 }
6784 
6785 /*
6786  * NOTE:
6787  * RX/TX _must_ have been suspended/disabled, before this function
6788  * is called.
6789  */
6790 static void
6791 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6792 {
6793 	struct vmbus_channel **subch = NULL;
6794 	int nsubch;
6795 
6796 	/*
6797 	 * Drain RX/TX bufrings and interrupts.
6798 	 */
6799 	nsubch = nchan - 1;
6800 	if (nsubch > 0)
6801 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6802 
6803 	if (subch != NULL) {
6804 		int i;
6805 
6806 		for (i = 0; i < nsubch; ++i)
6807 			hn_chan_drain(sc, subch[i]);
6808 	}
6809 	hn_chan_drain(sc, sc->hn_prichan);
6810 
6811 	if (subch != NULL)
6812 		vmbus_subchan_rel(subch, nsubch);
6813 }
6814 
6815 static void
6816 hn_suspend_data(struct hn_softc *sc)
6817 {
6818 	struct hn_tx_ring *txr;
6819 	int i;
6820 
6821 	HN_LOCK_ASSERT(sc);
6822 
6823 	/*
6824 	 * Suspend TX.
6825 	 */
6826 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6827 		txr = &sc->hn_tx_ring[i];
6828 
6829 		mtx_lock(&txr->hn_tx_lock);
6830 		txr->hn_suspended = 1;
6831 		mtx_unlock(&txr->hn_tx_lock);
6832 		/* No one is able send more packets now. */
6833 
6834 		/*
6835 		 * Wait for all pending sends to finish.
6836 		 *
6837 		 * NOTE:
6838 		 * We will _not_ receive all pending send-done, if the
6839 		 * primary channel is revoked.
6840 		 */
6841 		while (hn_tx_ring_pending(txr) &&
6842 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6843 			pause("hnwtx", 1 /* 1 tick */);
6844 	}
6845 
6846 	/*
6847 	 * Disable RX.
6848 	 */
6849 	hn_disable_rx(sc);
6850 
6851 	/*
6852 	 * Drain RX/TX.
6853 	 */
6854 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6855 
6856 	/*
6857 	 * Drain any pending TX tasks.
6858 	 *
6859 	 * NOTE:
6860 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6861 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6862 	 */
6863 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6864 		txr = &sc->hn_tx_ring[i];
6865 
6866 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6867 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6868 	}
6869 }
6870 
6871 static void
6872 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6873 {
6874 
6875 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6876 }
6877 
6878 static void
6879 hn_suspend_mgmt(struct hn_softc *sc)
6880 {
6881 	struct task task;
6882 
6883 	HN_LOCK_ASSERT(sc);
6884 
6885 	/*
6886 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6887 	 * through hn_mgmt_taskq.
6888 	 */
6889 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6890 	vmbus_chan_run_task(sc->hn_prichan, &task);
6891 
6892 	/*
6893 	 * Make sure that all pending management tasks are completed.
6894 	 */
6895 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6896 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6897 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6898 }
6899 
6900 static void
6901 hn_suspend(struct hn_softc *sc)
6902 {
6903 
6904 	/* Disable polling. */
6905 	hn_polling(sc, 0);
6906 
6907 	/*
6908 	 * If the non-transparent mode VF is activated, the synthetic
6909 	 * device is receiving packets, so the data path of the
6910 	 * synthetic device must be suspended.
6911 	 */
6912 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6913 	    (sc->hn_flags & HN_FLAG_RXVF))
6914 		hn_suspend_data(sc);
6915 	hn_suspend_mgmt(sc);
6916 }
6917 
6918 static void
6919 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6920 {
6921 	int i;
6922 
6923 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6924 	    ("invalid TX ring count %d", tx_ring_cnt));
6925 
6926 	for (i = 0; i < tx_ring_cnt; ++i) {
6927 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6928 
6929 		mtx_lock(&txr->hn_tx_lock);
6930 		txr->hn_suspended = 0;
6931 		mtx_unlock(&txr->hn_tx_lock);
6932 	}
6933 }
6934 
6935 static void
6936 hn_resume_data(struct hn_softc *sc)
6937 {
6938 	int i;
6939 
6940 	HN_LOCK_ASSERT(sc);
6941 
6942 	/*
6943 	 * Re-enable RX.
6944 	 */
6945 	hn_rxfilter_config(sc);
6946 
6947 	/*
6948 	 * Make sure to clear suspend status on "all" TX rings,
6949 	 * since hn_tx_ring_inuse can be changed after
6950 	 * hn_suspend_data().
6951 	 */
6952 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6953 
6954 #ifdef HN_IFSTART_SUPPORT
6955 	if (!hn_use_if_start)
6956 #endif
6957 	{
6958 		/*
6959 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6960 		 * reduced.
6961 		 */
6962 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6963 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6964 	}
6965 
6966 	/*
6967 	 * Kick start TX.
6968 	 */
6969 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6970 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6971 
6972 		/*
6973 		 * Use txeof task, so that any pending oactive can be
6974 		 * cleared properly.
6975 		 */
6976 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6977 	}
6978 }
6979 
6980 static void
6981 hn_resume_mgmt(struct hn_softc *sc)
6982 {
6983 
6984 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6985 
6986 	/*
6987 	 * Kick off network change detection, if it was pending.
6988 	 * If no network change was pending, start link status
6989 	 * checks, which is more lightweight than network change
6990 	 * detection.
6991 	 */
6992 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6993 		hn_change_network(sc);
6994 	else
6995 		hn_update_link_status(sc);
6996 }
6997 
6998 static void
6999 hn_resume(struct hn_softc *sc)
7000 {
7001 
7002 	/*
7003 	 * If the non-transparent mode VF is activated, the synthetic
7004 	 * device have to receive packets, so the data path of the
7005 	 * synthetic device must be resumed.
7006 	 */
7007 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7008 	    (sc->hn_flags & HN_FLAG_RXVF))
7009 		hn_resume_data(sc);
7010 
7011 	/*
7012 	 * Don't resume link status change if VF is attached/activated.
7013 	 * - In the non-transparent VF mode, the synthetic device marks
7014 	 *   link down until the VF is deactivated; i.e. VF is down.
7015 	 * - In transparent VF mode, VF's media status is used until
7016 	 *   the VF is detached.
7017 	 */
7018 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7019 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7020 		hn_resume_mgmt(sc);
7021 
7022 	/*
7023 	 * Re-enable polling if this interface is running and
7024 	 * the polling is requested.
7025 	 */
7026 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7027 		hn_polling(sc, sc->hn_pollhz);
7028 }
7029 
7030 static void
7031 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7032 {
7033 	const struct rndis_status_msg *msg;
7034 	int ofs;
7035 
7036 	if (dlen < sizeof(*msg)) {
7037 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7038 		return;
7039 	}
7040 	msg = data;
7041 
7042 	switch (msg->rm_status) {
7043 	case RNDIS_STATUS_MEDIA_CONNECT:
7044 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7045 		hn_update_link_status(sc);
7046 		break;
7047 
7048 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7049 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7050 		/* Not really useful; ignore. */
7051 		break;
7052 
7053 	case RNDIS_STATUS_NETWORK_CHANGE:
7054 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7055 		if (dlen < ofs + msg->rm_stbuflen ||
7056 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7057 			if_printf(sc->hn_ifp, "network changed\n");
7058 		} else {
7059 			uint32_t change;
7060 
7061 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7062 			    sizeof(change));
7063 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7064 			    change);
7065 		}
7066 		hn_change_network(sc);
7067 		break;
7068 
7069 	default:
7070 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7071 		    msg->rm_status);
7072 		break;
7073 	}
7074 }
7075 
7076 static int
7077 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7078 {
7079 	const struct rndis_pktinfo *pi = info_data;
7080 	uint32_t mask = 0;
7081 
7082 	while (info_dlen != 0) {
7083 		const void *data;
7084 		uint32_t dlen;
7085 
7086 		if (__predict_false(info_dlen < sizeof(*pi)))
7087 			return (EINVAL);
7088 		if (__predict_false(info_dlen < pi->rm_size))
7089 			return (EINVAL);
7090 		info_dlen -= pi->rm_size;
7091 
7092 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7093 			return (EINVAL);
7094 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7095 			return (EINVAL);
7096 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7097 		data = pi->rm_data;
7098 
7099 		if (pi->rm_internal == 1) {
7100 			switch (pi->rm_type) {
7101 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7102 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7103 					return (EINVAL);
7104 				info->pktinfo_id =
7105 				    (const struct packet_info_id *)data;
7106 				mask |= HN_RXINFO_PKTINFO_ID;
7107 				break;
7108 
7109 			default:
7110 				goto next;
7111 			}
7112 		} else {
7113 			switch (pi->rm_type) {
7114 			case NDIS_PKTINFO_TYPE_VLAN:
7115 				if (__predict_false(dlen
7116 				    < NDIS_VLAN_INFO_SIZE))
7117 					return (EINVAL);
7118 				info->vlan_info = (const uint32_t *)data;
7119 				mask |= HN_RXINFO_VLAN;
7120 				break;
7121 
7122 			case NDIS_PKTINFO_TYPE_CSUM:
7123 				if (__predict_false(dlen
7124 				    < NDIS_RXCSUM_INFO_SIZE))
7125 					return (EINVAL);
7126 				info->csum_info = (const uint32_t *)data;
7127 				mask |= HN_RXINFO_CSUM;
7128 				break;
7129 
7130 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7131 				if (__predict_false(dlen
7132 				    < HN_NDIS_HASH_VALUE_SIZE))
7133 					return (EINVAL);
7134 				info->hash_value = (const uint32_t *)data;
7135 				mask |= HN_RXINFO_HASHVAL;
7136 				break;
7137 
7138 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7139 				if (__predict_false(dlen
7140 				    < HN_NDIS_HASH_INFO_SIZE))
7141 					return (EINVAL);
7142 				info->hash_info = (const uint32_t *)data;
7143 				mask |= HN_RXINFO_HASHINF;
7144 				break;
7145 
7146 			default:
7147 				goto next;
7148 			}
7149 		}
7150 
7151 		if (mask == HN_RXINFO_ALL) {
7152 			/* All found; done */
7153 			break;
7154 		}
7155 next:
7156 		pi = (const struct rndis_pktinfo *)
7157 		    ((const uint8_t *)pi + pi->rm_size);
7158 	}
7159 
7160 	/*
7161 	 * Final fixup.
7162 	 * - If there is no hash value, invalidate the hash info.
7163 	 */
7164 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7165 		info->hash_info = NULL;
7166 	return (0);
7167 }
7168 
/*
 * Report whether the region [off, off + len) is considered to overlap
 * the region [check_off, check_off + check_len).
 *
 * Two regions starting at the same offset always count as overlapping,
 * whatever their lengths.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{
	bool disjoint;

	if (off < check_off) {
		/* First region must end at or before the second starts. */
		disjoint = __predict_true(off + len <= check_off);
	} else if (off > check_off) {
		/* Second region must end at or before the first starts. */
		disjoint = __predict_true(check_off + check_len <= off);
	} else {
		disjoint = false;
	}
	return (!disjoint);
}
7182 
7183 static __inline void
7184 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7185 		uint32_t len, struct hn_rxinfo *info)
7186 {
7187 	uint32_t cnt = rxr->rsc.cnt;
7188 
7189 	if (cnt) {
7190 		rxr->rsc.pktlen += len;
7191 	} else {
7192 		rxr->rsc.vlan_info = info->vlan_info;
7193 		rxr->rsc.csum_info = info->csum_info;
7194 		rxr->rsc.hash_info = info->hash_info;
7195 		rxr->rsc.hash_value = info->hash_value;
7196 		rxr->rsc.pktlen = len;
7197 	}
7198 
7199 	rxr->rsc.frag_data[cnt] = data;
7200 	rxr->rsc.frag_len[cnt] = len;
7201 	rxr->rsc.cnt++;
7202 }
7203 
7204 static void
7205 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7206 {
7207 	const struct rndis_packet_msg *pkt;
7208 	struct hn_rxinfo info;
7209 	int data_off, pktinfo_off, data_len, pktinfo_len;
7210 	bool rsc_more= false;
7211 
7212 	/*
7213 	 * Check length.
7214 	 */
7215 	if (__predict_false(dlen < sizeof(*pkt))) {
7216 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7217 		return;
7218 	}
7219 	pkt = data;
7220 
7221 	if (__predict_false(dlen < pkt->rm_len)) {
7222 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7223 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7224 		return;
7225 	}
7226 	if (__predict_false(pkt->rm_len <
7227 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7228 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7229 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7230 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7231 		    pkt->rm_pktinfolen);
7232 		return;
7233 	}
7234 	if (__predict_false(pkt->rm_datalen == 0)) {
7235 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7236 		return;
7237 	}
7238 
7239 	/*
7240 	 * Check offsets.
7241 	 */
7242 #define IS_OFFSET_INVALID(ofs)			\
7243 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7244 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7245 
7246 	/* XXX Hyper-V does not meet data offset alignment requirement */
7247 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7248 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7249 		    "data offset %u\n", pkt->rm_dataoffset);
7250 		return;
7251 	}
7252 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7253 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7254 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7255 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7256 		return;
7257 	}
7258 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7259 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7260 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7261 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7262 		return;
7263 	}
7264 
7265 #undef IS_OFFSET_INVALID
7266 
7267 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7268 	data_len = pkt->rm_datalen;
7269 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7270 	pktinfo_len = pkt->rm_pktinfolen;
7271 
7272 	/*
7273 	 * Check OOB coverage.
7274 	 */
7275 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7276 		int oob_off, oob_len;
7277 
7278 		if_printf(rxr->hn_ifp, "got oobdata\n");
7279 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7280 		oob_len = pkt->rm_oobdatalen;
7281 
7282 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7283 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7284 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7285 			    pkt->rm_len, oob_off, oob_len);
7286 			return;
7287 		}
7288 
7289 		/*
7290 		 * Check against data.
7291 		 */
7292 		if (hn_rndis_check_overlap(oob_off, oob_len,
7293 		    data_off, data_len)) {
7294 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7295 			    "oob overlaps data, oob abs %d len %d, "
7296 			    "data abs %d len %d\n",
7297 			    oob_off, oob_len, data_off, data_len);
7298 			return;
7299 		}
7300 
7301 		/*
7302 		 * Check against pktinfo.
7303 		 */
7304 		if (pktinfo_len != 0 &&
7305 		    hn_rndis_check_overlap(oob_off, oob_len,
7306 		    pktinfo_off, pktinfo_len)) {
7307 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7308 			    "oob overlaps pktinfo, oob abs %d len %d, "
7309 			    "pktinfo abs %d len %d\n",
7310 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7311 			return;
7312 		}
7313 	}
7314 
7315 	/*
7316 	 * Check per-packet-info coverage and find useful per-packet-info.
7317 	 */
7318 	info.vlan_info = NULL;
7319 	info.csum_info = NULL;
7320 	info.hash_info = NULL;
7321 	info.pktinfo_id = NULL;
7322 
7323 	if (__predict_true(pktinfo_len != 0)) {
7324 		bool overlap;
7325 		int error;
7326 
7327 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7328 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7329 			    "pktinfo overflow, msglen %u, "
7330 			    "pktinfo abs %d len %d\n",
7331 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7332 			return;
7333 		}
7334 
7335 		/*
7336 		 * Check packet info coverage.
7337 		 */
7338 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7339 		    data_off, data_len);
7340 		if (__predict_false(overlap)) {
7341 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7342 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7343 			    "data abs %d len %d\n",
7344 			    pktinfo_off, pktinfo_len, data_off, data_len);
7345 			return;
7346 		}
7347 
7348 		/*
7349 		 * Find useful per-packet-info.
7350 		 */
7351 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7352 		    pktinfo_len, &info);
7353 		if (__predict_false(error)) {
7354 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7355 			    "pktinfo\n");
7356 			return;
7357 		}
7358 	}
7359 
7360 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7361 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7362 		    "data overflow, msglen %u, data abs %d len %d\n",
7363 		    pkt->rm_len, data_off, data_len);
7364 		return;
7365 	}
7366 
7367 	/* Identify RSC fragments, drop invalid packets */
7368 	if ((info.pktinfo_id != NULL) &&
7369 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7370 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7371 			rxr->rsc.cnt = 0;
7372 			rxr->hn_rsc_pkts++;
7373 		} else if (rxr->rsc.cnt == 0)
7374 			goto drop;
7375 
7376 		rsc_more = true;
7377 
7378 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7379 			rsc_more = false;
7380 
7381 		if (rsc_more && rxr->rsc.is_last)
7382 			goto drop;
7383 	} else {
7384 		rxr->rsc.cnt = 0;
7385 	}
7386 
7387 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7388 		goto drop;
7389 
7390 	/* Store data in per rx ring structure */
7391 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7392 	    data_len, &info);
7393 
7394 	if (rsc_more)
7395 		return;
7396 
7397 	hn_rxpkt(rxr);
7398 	rxr->rsc.cnt = 0;
7399 	return;
7400 drop:
7401 	rxr->hn_rsc_drop++;
7402 	return;
7403 }
7404 
7405 static __inline void
7406 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7407 {
7408 	const struct rndis_msghdr *hdr;
7409 
7410 	if (__predict_false(dlen < sizeof(*hdr))) {
7411 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7412 		return;
7413 	}
7414 	hdr = data;
7415 
7416 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7417 		/* Hot data path. */
7418 		hn_rndis_rx_data(rxr, data, dlen);
7419 		/* Done! */
7420 		return;
7421 	}
7422 
7423 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7424 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7425 	else
7426 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7427 }
7428 
7429 static void
7430 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7431 {
7432 	const struct hn_nvs_hdr *hdr;
7433 
7434 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7435 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7436 		return;
7437 	}
7438 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7439 
7440 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7441 		/* Useless; ignore */
7442 		return;
7443 	}
7444 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7445 }
7446 
7447 static void
7448 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7449     const struct vmbus_chanpkt_hdr *pkt)
7450 {
7451 	struct hn_nvs_sendctx *sndc;
7452 
7453 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7454 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7455 	    VMBUS_CHANPKT_DATALEN(pkt));
7456 	/*
7457 	 * NOTE:
7458 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7459 	 * its callback.
7460 	 */
7461 }
7462 
7463 static void
7464 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7465     const struct vmbus_chanpkt_hdr *pkthdr)
7466 {
7467 	struct epoch_tracker et;
7468 	const struct vmbus_chanpkt_rxbuf *pkt;
7469 	const struct hn_nvs_hdr *nvs_hdr;
7470 	int count, i, hlen;
7471 
7472 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7473 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7474 		return;
7475 	}
7476 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7477 
7478 	/* Make sure that this is a RNDIS message. */
7479 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7480 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7481 		    nvs_hdr->nvs_type);
7482 		return;
7483 	}
7484 
7485 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7486 	if (__predict_false(hlen < sizeof(*pkt))) {
7487 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7488 		return;
7489 	}
7490 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7491 
7492 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7493 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7494 		    pkt->cp_rxbuf_id);
7495 		return;
7496 	}
7497 
7498 	count = pkt->cp_rxbuf_cnt;
7499 	if (__predict_false(hlen <
7500 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7501 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7502 		return;
7503 	}
7504 
7505 	NET_EPOCH_ENTER(et);
7506 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7507 	for (i = 0; i < count; ++i) {
7508 		int ofs, len;
7509 
7510 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7511 		len = pkt->cp_rxbuf[i].rb_len;
7512 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7513 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7514 			    "ofs %d, len %d\n", i, ofs, len);
7515 			continue;
7516 		}
7517 
7518 		rxr->rsc.is_last = (i == (count - 1));
7519 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7520 	}
7521 	NET_EPOCH_EXIT(et);
7522 
7523 	/*
7524 	 * Ack the consumed RXBUF associated w/ this channel packet,
7525 	 * so that this RXBUF can be recycled by the hypervisor.
7526 	 */
7527 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7528 }
7529 
7530 static void
7531 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7532     uint64_t tid)
7533 {
7534 	struct hn_nvs_rndis_ack ack;
7535 	int retries, error;
7536 
7537 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7538 	ack.nvs_status = HN_NVS_STATUS_OK;
7539 
7540 	retries = 0;
7541 again:
7542 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7543 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7544 	if (__predict_false(error == EAGAIN)) {
7545 		/*
7546 		 * NOTE:
7547 		 * This should _not_ happen in real world, since the
7548 		 * consumption of the TX bufring from the TX path is
7549 		 * controlled.
7550 		 */
7551 		if (rxr->hn_ack_failed == 0)
7552 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7553 		rxr->hn_ack_failed++;
7554 		retries++;
7555 		if (retries < 10) {
7556 			DELAY(100);
7557 			goto again;
7558 		}
7559 		/* RXBUF leaks! */
7560 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7561 	}
7562 }
7563 
7564 static void
7565 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7566 {
7567 	struct hn_rx_ring *rxr = xrxr;
7568 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7569 
7570 	for (;;) {
7571 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7572 		int error, pktlen;
7573 
7574 		pktlen = rxr->hn_pktbuf_len;
7575 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7576 		if (__predict_false(error == ENOBUFS)) {
7577 			void *nbuf;
7578 			int nlen;
7579 
7580 			/*
7581 			 * Expand channel packet buffer.
7582 			 *
7583 			 * XXX
7584 			 * Use M_WAITOK here, since allocation failure
7585 			 * is fatal.
7586 			 */
7587 			nlen = rxr->hn_pktbuf_len * 2;
7588 			while (nlen < pktlen)
7589 				nlen *= 2;
7590 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7591 
7592 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7593 			    rxr->hn_pktbuf_len, nlen);
7594 
7595 			free(rxr->hn_pktbuf, M_DEVBUF);
7596 			rxr->hn_pktbuf = nbuf;
7597 			rxr->hn_pktbuf_len = nlen;
7598 			/* Retry! */
7599 			continue;
7600 		} else if (__predict_false(error == EAGAIN)) {
7601 			/* No more channel packets; done! */
7602 			break;
7603 		}
7604 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7605 
7606 		switch (pkt->cph_type) {
7607 		case VMBUS_CHANPKT_TYPE_COMP:
7608 			hn_nvs_handle_comp(sc, chan, pkt);
7609 			break;
7610 
7611 		case VMBUS_CHANPKT_TYPE_RXBUF:
7612 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7613 			break;
7614 
7615 		case VMBUS_CHANPKT_TYPE_INBAND:
7616 			hn_nvs_handle_notify(sc, pkt);
7617 			break;
7618 
7619 		default:
7620 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7621 			    pkt->cph_type);
7622 			break;
7623 		}
7624 	}
7625 	hn_chan_rollup(rxr, rxr->hn_txr);
7626 }
7627 
7628 static void
7629 hn_sysinit(void *arg __unused)
7630 {
7631 	int i;
7632 
7633 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7634 
7635 #ifdef HN_IFSTART_SUPPORT
7636 	/*
7637 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7638 	 * mainly due to the IFF_DRV_OACTIVE flag.
7639 	 */
7640 	if (hn_xpnt_vf && hn_use_if_start) {
7641 		hn_use_if_start = 0;
7642 		printf("hn: tranparent VF mode, if_transmit will be used, "
7643 		    "instead of if_start\n");
7644 	}
7645 #endif
7646 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7647 		printf("hn: invalid transparent VF attach routing "
7648 		    "wait timeout %d, reset to %d\n",
7649 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7650 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7651 	}
7652 
7653 	/*
7654 	 * Initialize VF map.
7655 	 */
7656 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7657 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7658 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7659 	    M_WAITOK | M_ZERO);
7660 
7661 	/*
7662 	 * Fix the # of TX taskqueues.
7663 	 */
7664 	if (hn_tx_taskq_cnt <= 0)
7665 		hn_tx_taskq_cnt = 1;
7666 	else if (hn_tx_taskq_cnt > mp_ncpus)
7667 		hn_tx_taskq_cnt = mp_ncpus;
7668 
7669 	/*
7670 	 * Fix the TX taskqueue mode.
7671 	 */
7672 	switch (hn_tx_taskq_mode) {
7673 	case HN_TX_TASKQ_M_INDEP:
7674 	case HN_TX_TASKQ_M_GLOBAL:
7675 	case HN_TX_TASKQ_M_EVTTQ:
7676 		break;
7677 	default:
7678 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7679 		break;
7680 	}
7681 
7682 	if (vm_guest != VM_GUEST_HV)
7683 		return;
7684 
7685 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7686 		return;
7687 
7688 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7689 	    M_DEVBUF, M_WAITOK);
7690 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7691 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7692 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7693 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7694 		    "hn tx%d", i);
7695 	}
7696 }
7697 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7698 
7699 static void
7700 hn_sysuninit(void *arg __unused)
7701 {
7702 
7703 	if (hn_tx_taskque != NULL) {
7704 		int i;
7705 
7706 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7707 			taskqueue_free(hn_tx_taskque[i]);
7708 		free(hn_tx_taskque, M_DEVBUF);
7709 	}
7710 
7711 	if (hn_vfmap != NULL)
7712 		free(hn_vfmap, M_DEVBUF);
7713 	rm_destroy(&hn_vfmap_lock);
7714 
7715 	counter_u64_free(hn_udpcs_fixup);
7716 }
7717 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7718