xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 7ef62cebc2f965b0f640263e179276928885e33d)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <vm/vm.h>
89 #include <vm/vm_extern.h>
90 #include <vm/pmap.h>
91 
92 #include <machine/atomic.h>
93 #include <machine/in_cksum.h>
94 
95 #include <net/bpf.h>
96 #include <net/ethernet.h>
97 #include <net/if.h>
98 #include <net/if_dl.h>
99 #include <net/if_media.h>
100 #include <net/if_types.h>
101 #include <net/if_var.h>
102 #include <net/rndis.h>
103 #ifdef RSS
104 #include <net/rss_config.h>
105 #endif
106 
107 #include <netinet/in_systm.h>
108 #include <netinet/in.h>
109 #include <netinet/ip.h>
110 #include <netinet/ip6.h>
111 #include <netinet/tcp.h>
112 #include <netinet/tcp_lro.h>
113 #include <netinet/udp.h>
114 
115 #include <dev/hyperv/include/hyperv.h>
116 #include <dev/hyperv/include/hyperv_busdma.h>
117 #include <dev/hyperv/include/vmbus.h>
118 #include <dev/hyperv/include/vmbus_xact.h>
119 
120 #include <dev/hyperv/netvsc/ndis.h>
121 #include <dev/hyperv/netvsc/if_hnreg.h>
122 #include <dev/hyperv/netvsc/if_hnvar.h>
123 #include <dev/hyperv/netvsc/hn_nvs.h>
124 #include <dev/hyperv/netvsc/hn_rndis.h>
125 
126 #include "vmbus_if.h"
127 
128 #define HN_IFSTART_SUPPORT
129 
130 #define HN_RING_CNT_DEF_MAX		8
131 
132 #define HN_VFMAP_SIZE_DEF		8
133 
134 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
135 
136 /* YYY should get it from the underlying channel */
137 #define HN_TX_DESC_CNT			512
138 
139 #define HN_RNDIS_PKT_LEN					\
140 	(sizeof(struct rndis_packet_msg) +			\
141 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
142 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
143 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
144 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
145 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
146 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
147 
148 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
149 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
150 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
151 /* -1 for RNDIS packet message */
152 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
153 
154 #define HN_DIRECT_TX_SIZE_DEF		128
155 
156 #define HN_EARLY_TXEOF_THRESH		8
157 
158 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
159 
160 #define HN_LROENT_CNT_DEF		128
161 
162 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
163 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
164 /* YYY 2*MTU is a bit rough, but should be good enough. */
165 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
166 
167 #define HN_LRO_ACKCNT_DEF		1
168 
169 #define HN_LOCK_INIT(sc)		\
170 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
171 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
172 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
173 #define HN_LOCK(sc)					\
174 do {							\
175 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
176 		/* Relinquish cpu to avoid deadlock */	\
177 		sched_relinquish(curthread);		\
178 		DELAY(1000);				\
179 	}						\
180 } while (0)
181 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
182 
183 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
184 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
185 #define HN_CSUM_IP_HWASSIST(sc)		\
186 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
187 #define HN_CSUM_IP6_HWASSIST(sc)	\
188 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
189 
190 #define HN_PKTSIZE_MIN(align)		\
191 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
192 	    HN_RNDIS_PKT_LEN, (align))
193 #define HN_PKTSIZE(m, align)		\
194 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
195 
196 #ifdef RSS
197 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
198 #else
199 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
200 #endif
201 
202 struct hn_txdesc {
203 #ifndef HN_USE_TXDESC_BUFRING
204 	SLIST_ENTRY(hn_txdesc)		link;
205 #endif
206 	STAILQ_ENTRY(hn_txdesc)		agg_link;
207 
208 	/* Aggregated txdescs, in sending order. */
209 	STAILQ_HEAD(, hn_txdesc)	agg_list;
210 
211 	/* The oldest packet, if transmission aggregation happens. */
212 	struct mbuf			*m;
213 	struct hn_tx_ring		*txr;
214 	int				refs;
215 	uint32_t			flags;	/* HN_TXD_FLAG_ */
216 	struct hn_nvs_sendctx		send_ctx;
217 	uint32_t			chim_index;
218 	int				chim_size;
219 
220 	bus_dmamap_t			data_dmap;
221 
222 	bus_addr_t			rndis_pkt_paddr;
223 	struct rndis_packet_msg		*rndis_pkt;
224 	bus_dmamap_t			rndis_pkt_dmap;
225 };
226 
227 #define HN_TXD_FLAG_ONLIST		0x0001
228 #define HN_TXD_FLAG_DMAMAP		0x0002
229 #define HN_TXD_FLAG_ONAGG		0x0004
230 
231 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
232 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
233 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
234 
235 struct packet_info_id {
236 	uint8_t				ver;
237 	uint8_t				flag;
238 	uint16_t			pkt_id;
239 };
240 
241 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
242 
243 
244 struct hn_rxinfo {
245 	const uint32_t			*vlan_info;
246 	const uint32_t			*csum_info;
247 	const uint32_t			*hash_info;
248 	const uint32_t			*hash_value;
249 	const struct packet_info_id	*pktinfo_id;
250 };
251 
252 struct hn_rxvf_setarg {
253 	struct hn_rx_ring	*rxr;
254 	if_t			vf_ifp;
255 };
256 
257 #define HN_RXINFO_VLAN			0x0001
258 #define HN_RXINFO_CSUM			0x0002
259 #define HN_RXINFO_HASHINF		0x0004
260 #define HN_RXINFO_HASHVAL		0x0008
261 #define HN_RXINFO_PKTINFO_ID		0x0010
262 #define HN_RXINFO_ALL			\
263 	(HN_RXINFO_VLAN |		\
264 	 HN_RXINFO_CSUM |		\
265 	 HN_RXINFO_HASHINF |		\
266 	 HN_RXINFO_HASHVAL |		\
267 	 HN_RXINFO_PKTINFO_ID)
268 
269 static int			hn_probe(device_t);
270 static int			hn_attach(device_t);
271 static int			hn_detach(device_t);
272 static int			hn_shutdown(device_t);
273 static void			hn_chan_callback(struct vmbus_channel *,
274 				    void *);
275 
276 static void			hn_init(void *);
277 static int			hn_ioctl(if_t, u_long, caddr_t);
278 #ifdef HN_IFSTART_SUPPORT
279 static void			hn_start(if_t);
280 #endif
281 static int			hn_transmit(if_t, struct mbuf *);
282 static void			hn_xmit_qflush(if_t);
283 static int			hn_ifmedia_upd(if_t);
284 static void			hn_ifmedia_sts(if_t,
285 				    struct ifmediareq *);
286 
287 static void			hn_ifnet_event(void *, if_t, int);
288 static void			hn_ifaddr_event(void *, if_t);
289 static void			hn_ifnet_attevent(void *, if_t);
290 static void			hn_ifnet_detevent(void *, if_t);
291 static void			hn_ifnet_lnkevent(void *, if_t, int);
292 
293 static bool			hn_ismyvf(const struct hn_softc *,
294 				    const if_t);
295 static void			hn_rxvf_change(struct hn_softc *,
296 				    if_t, bool);
297 static void			hn_rxvf_set(struct hn_softc *, if_t);
298 static void			hn_rxvf_set_task(void *, int);
299 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
300 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
301 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
302 				    struct ifreq *);
303 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
304 static bool			hn_xpnt_vf_isready(struct hn_softc *);
305 static void			hn_xpnt_vf_setready(struct hn_softc *);
306 static void			hn_xpnt_vf_init_taskfunc(void *, int);
307 static void			hn_xpnt_vf_init(struct hn_softc *);
308 static void			hn_xpnt_vf_setenable(struct hn_softc *);
309 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
310 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
311 static void			hn_vf_rss_restore(struct hn_softc *);
312 
313 static int			hn_rndis_rxinfo(const void *, int,
314 				    struct hn_rxinfo *);
315 static void			hn_rndis_rx_data(struct hn_rx_ring *,
316 				    const void *, int);
317 static void			hn_rndis_rx_status(struct hn_softc *,
318 				    const void *, int);
319 static void			hn_rndis_init_fixat(struct hn_softc *, int);
320 
321 static void			hn_nvs_handle_notify(struct hn_softc *,
322 				    const struct vmbus_chanpkt_hdr *);
323 static void			hn_nvs_handle_comp(struct hn_softc *,
324 				    struct vmbus_channel *,
325 				    const struct vmbus_chanpkt_hdr *);
326 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
327 				    struct vmbus_channel *,
328 				    const struct vmbus_chanpkt_hdr *);
329 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
330 				    struct vmbus_channel *, uint64_t);
331 
332 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
344 #ifndef RSS
345 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
346 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
347 #endif
348 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
360 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
361 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
362 
363 static void			hn_stop(struct hn_softc *, bool);
364 static void			hn_init_locked(struct hn_softc *);
365 static int			hn_chan_attach(struct hn_softc *,
366 				    struct vmbus_channel *);
367 static void			hn_chan_detach(struct hn_softc *,
368 				    struct vmbus_channel *);
369 static int			hn_attach_subchans(struct hn_softc *);
370 static void			hn_detach_allchans(struct hn_softc *);
371 static void			hn_chan_rollup(struct hn_rx_ring *,
372 				    struct hn_tx_ring *);
373 static void			hn_set_ring_inuse(struct hn_softc *, int);
374 static int			hn_synth_attach(struct hn_softc *, int);
375 static void			hn_synth_detach(struct hn_softc *);
376 static int			hn_synth_alloc_subchans(struct hn_softc *,
377 				    int *);
378 static bool			hn_synth_attachable(const struct hn_softc *);
379 static void			hn_suspend(struct hn_softc *);
380 static void			hn_suspend_data(struct hn_softc *);
381 static void			hn_suspend_mgmt(struct hn_softc *);
382 static void			hn_resume(struct hn_softc *);
383 static void			hn_resume_data(struct hn_softc *);
384 static void			hn_resume_mgmt(struct hn_softc *);
385 static void			hn_suspend_mgmt_taskfunc(void *, int);
386 static void			hn_chan_drain(struct hn_softc *,
387 				    struct vmbus_channel *);
388 static void			hn_disable_rx(struct hn_softc *);
389 static void			hn_drain_rxtx(struct hn_softc *, int);
390 static void			hn_polling(struct hn_softc *, u_int);
391 static void			hn_chan_polling(struct vmbus_channel *, u_int);
392 static void			hn_mtu_change_fixup(struct hn_softc *);
393 
394 static void			hn_update_link_status(struct hn_softc *);
395 static void			hn_change_network(struct hn_softc *);
396 static void			hn_link_taskfunc(void *, int);
397 static void			hn_netchg_init_taskfunc(void *, int);
398 static void			hn_netchg_status_taskfunc(void *, int);
399 static void			hn_link_status(struct hn_softc *);
400 
401 static int			hn_create_rx_data(struct hn_softc *, int);
402 static void			hn_destroy_rx_data(struct hn_softc *);
403 static int			hn_check_iplen(const struct mbuf *, int);
404 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
405 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
406 static int			hn_rxfilter_config(struct hn_softc *);
407 static int			hn_rss_reconfig(struct hn_softc *);
408 static void			hn_rss_ind_fixup(struct hn_softc *);
409 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
410 static int			hn_rxpkt(struct hn_rx_ring *);
411 static uint32_t			hn_rss_type_fromndis(uint32_t);
412 static uint32_t			hn_rss_type_tondis(uint32_t);
413 
414 static int			hn_tx_ring_create(struct hn_softc *, int);
415 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
416 static int			hn_create_tx_data(struct hn_softc *, int);
417 static void			hn_fixup_tx_data(struct hn_softc *);
418 static void			hn_fixup_rx_data(struct hn_softc *);
419 static void			hn_destroy_tx_data(struct hn_softc *);
420 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
421 static void			hn_txdesc_gc(struct hn_tx_ring *,
422 				    struct hn_txdesc *);
423 static int			hn_encap(if_t, struct hn_tx_ring *,
424 				    struct hn_txdesc *, struct mbuf **);
425 static int			hn_txpkt(if_t, struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static void			hn_set_chim_size(struct hn_softc *, int);
428 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
429 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
430 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
431 static void			hn_resume_tx(struct hn_softc *, int);
432 static void			hn_set_txagg(struct hn_softc *);
433 static void			*hn_try_txagg(if_t,
434 				    struct hn_tx_ring *, struct hn_txdesc *,
435 				    int);
436 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
437 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
438 				    struct hn_softc *, struct vmbus_channel *,
439 				    const void *, int);
440 static int			hn_txpkt_sglist(struct hn_tx_ring *,
441 				    struct hn_txdesc *);
442 static int			hn_txpkt_chim(struct hn_tx_ring *,
443 				    struct hn_txdesc *);
444 static int			hn_xmit(struct hn_tx_ring *, int);
445 static void			hn_xmit_taskfunc(void *, int);
446 static void			hn_xmit_txeof(struct hn_tx_ring *);
447 static void			hn_xmit_txeof_taskfunc(void *, int);
448 #ifdef HN_IFSTART_SUPPORT
449 static int			hn_start_locked(struct hn_tx_ring *, int);
450 static void			hn_start_taskfunc(void *, int);
451 static void			hn_start_txeof(struct hn_tx_ring *);
452 static void			hn_start_txeof_taskfunc(void *, int);
453 #endif
454 
455 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
456 
457 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
458     "Hyper-V network interface");
459 
460 /* Trust tcp segment verification on host side. */
461 static int			hn_trust_hosttcp = 1;
462 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
463     &hn_trust_hosttcp, 0,
464     "Trust tcp segment verification on host side, "
465     "when csum info is missing (global setting)");
466 
467 /* Trust udp datagrams verification on host side. */
468 static int			hn_trust_hostudp = 1;
469 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
470     &hn_trust_hostudp, 0,
471     "Trust udp datagram verification on host side, "
472     "when csum info is missing (global setting)");
473 
474 /* Trust ip packets verification on host side. */
475 static int			hn_trust_hostip = 1;
476 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
477     &hn_trust_hostip, 0,
478     "Trust ip packet verification on host side, "
479     "when csum info is missing (global setting)");
480 
481 /*
482  * Offload UDP/IPv4 checksum.
483  */
484 static int			hn_enable_udp4cs = 1;
485 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
486     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
487 
488 /*
489  * Offload UDP/IPv6 checksum.
490  */
491 static int			hn_enable_udp6cs = 1;
492 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
493     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
494 
495 /* Stats. */
496 static counter_u64_t		hn_udpcs_fixup;
497 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
498     &hn_udpcs_fixup, "# of UDP checksum fixup");
499 
500 /*
501  * See hn_set_hlen().
502  *
503  * This value is for Azure.  For Hyper-V, set this above
504  * 65536 to disable UDP datagram checksum fixup.
505  */
506 static int			hn_udpcs_fixup_mtu = 1420;
507 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
508     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
509 
510 /* Limit TSO burst size */
511 static int			hn_tso_maxlen = IP_MAXPACKET;
512 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
513     &hn_tso_maxlen, 0, "TSO burst limit");
514 
515 /* Limit chimney send size */
516 static int			hn_tx_chimney_size = 0;
517 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
518     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
519 
520 /* Limit the size of packet for direct transmission */
521 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
522 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
523     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
524 
525 /* # of LRO entries per RX ring */
526 #if defined(INET) || defined(INET6)
527 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
528 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
529     &hn_lro_entry_count, 0, "LRO entry count");
530 #endif
531 
532 static int			hn_tx_taskq_cnt = 1;
533 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
534     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
535 
536 #define HN_TX_TASKQ_M_INDEP	0
537 #define HN_TX_TASKQ_M_GLOBAL	1
538 #define HN_TX_TASKQ_M_EVTTQ	2
539 
540 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
541 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
542     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
543     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
544 
545 #ifndef HN_USE_TXDESC_BUFRING
546 static int			hn_use_txdesc_bufring = 0;
547 #else
548 static int			hn_use_txdesc_bufring = 1;
549 #endif
550 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
551     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
552 
553 #ifdef HN_IFSTART_SUPPORT
554 /* Use ifnet.if_start instead of ifnet.if_transmit */
555 static int			hn_use_if_start = 0;
556 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
557     &hn_use_if_start, 0, "Use if_start TX method");
558 #endif
559 
560 /* # of channels to use */
561 static int			hn_chan_cnt = 0;
562 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
563     &hn_chan_cnt, 0,
564     "# of channels to use; each channel has one RX ring and one TX ring");
565 
566 /* # of transmit rings to use */
567 static int			hn_tx_ring_cnt = 0;
568 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
569     &hn_tx_ring_cnt, 0, "# of TX rings to use");
570 
571 /* Software TX ring depth */
572 static int			hn_tx_swq_depth = 0;
573 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
574     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
575 
576 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
577 static u_int			hn_lro_mbufq_depth = 0;
578 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
579     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
580 
581 /* Packet transmission aggregation size limit */
582 static int			hn_tx_agg_size = -1;
583 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
584     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
585 
586 /* Packet transmission aggregation count limit */
587 static int			hn_tx_agg_pkts = -1;
588 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
589     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
590 
591 /* VF list */
592 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
593     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
594     hn_vflist_sysctl, "A",
595     "VF list");
596 
597 /* VF mapping */
598 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
599     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
600     hn_vfmap_sysctl, "A",
601     "VF mapping");
602 
603 /* Transparent VF */
604 static int			hn_xpnt_vf = 1;
605 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
606     &hn_xpnt_vf, 0, "Transparent VF mode");
607 
608 /* Accurate BPF support for Transparent VF */
609 static int			hn_xpnt_vf_accbpf = 0;
610 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
611     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
612 
613 /* Extra wait for the transparent VF attach routine; unit: seconds. */
614 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
615 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
616     &hn_xpnt_vf_attwait, 0,
617     "Extra wait for transparent VF attach routing; unit: seconds");
618 
619 static u_int			hn_cpu_index;	/* next CPU for channel */
620 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
621 
622 static struct rmlock		hn_vfmap_lock;
623 static int			hn_vfmap_size;
624 static if_t			*hn_vfmap;
625 
626 #ifndef RSS
627 static const uint8_t
628 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
629 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
630 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
631 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
632 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
633 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
634 };
635 #endif	/* !RSS */
636 
637 static const struct hyperv_guid	hn_guid = {
638 	.hv_guid = {
639 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
640 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
641 };
642 
643 static device_method_t hn_methods[] = {
644 	/* Device interface */
645 	DEVMETHOD(device_probe,		hn_probe),
646 	DEVMETHOD(device_attach,	hn_attach),
647 	DEVMETHOD(device_detach,	hn_detach),
648 	DEVMETHOD(device_shutdown,	hn_shutdown),
649 	DEVMETHOD_END
650 };
651 
652 static driver_t hn_driver = {
653 	"hn",
654 	hn_methods,
655 	sizeof(struct hn_softc)
656 };
657 
658 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
659 MODULE_VERSION(hn, 1);
660 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
661 
662 static void
663 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
664 {
665 	int i;
666 
667 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
668 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
669 }
670 
671 static int
672 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
673 {
674 
675 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
676 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
677 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
678 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
679 }
680 
681 static int
682 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
683 {
684 	struct hn_nvs_rndis rndis;
685 
686 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
687 	    txd->chim_size > 0, ("invalid rndis chim txd"));
688 
689 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
690 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
691 	rndis.nvs_chim_idx = txd->chim_index;
692 	rndis.nvs_chim_sz = txd->chim_size;
693 
694 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
695 	    &rndis, sizeof(rndis), &txd->send_ctx));
696 }
697 
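/*
 * Allocate a chimney sending buffer slot: scan the allocation bitmap
 * for a clear bit and claim it with an atomic test-and-set.  Returns
 * HN_NVS_CHIM_IDX_INVALID if no slot is available.
 */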
698 static __inline uint32_t
699 hn_chim_alloc(struct hn_softc *sc)
700 {
701 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
702 	u_long *bmap = sc->hn_chim_bmap;
703 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
704 
705 	for (i = 0; i < bmap_cnt; ++i) {
706 		int idx;
707 
708 		idx = ffsl(~bmap[i]);
709 		if (idx == 0)
710 			continue;
711 
712 		--idx; /* ffsl is 1-based */
713 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
714 		    ("invalid i %d and idx %d", i, idx));
715 
716 		if (atomic_testandset_long(&bmap[i], idx))
717 			continue;
718 
719 		ret = i * LONG_BIT + idx;
720 		break;
721 	}
722 	return (ret);
723 }
724 
725 static __inline void
726 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
727 {
728 	u_long mask;
729 	uint32_t idx;
730 
731 	idx = chim_idx / LONG_BIT;
732 	KASSERT(idx < sc->hn_chim_bmap_cnt,
733 	    ("invalid chimney index 0x%x", chim_idx));
734 
735 	mask = 1UL << (chim_idx % LONG_BIT);
736 	KASSERT(sc->hn_chim_bmap[idx] & mask,
737 	    ("index bitmap 0x%lx, chimney index %u, "
738 	     "bitmap idx %d, bitmask 0x%lx",
739 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
740 
741 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
742 }
743 
744 #if defined(INET6) || defined(INET)
745 
746 #define PULLUP_HDR(m, len)				\
747 do {							\
748 	if (__predict_false((m)->m_len < (len))) {	\
749 		(m) = m_pullup((m), (len));		\
750 		if ((m) == NULL)			\
751 			return (NULL);			\
752 	}						\
753 } while (0)
754 
755 /*
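 * Prepare a TSO mbuf: record the L2/L3 header lengths in the packet
 * header, zero ip_len/ip_sum (or ip6_plen), and store the TCP
 * pseudo-header checksum in th_sum.
 *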
756  * NOTE: If this function fails, m_head will be freed.
757  */
758 static __inline struct mbuf *
759 hn_tso_fixup(struct mbuf *m_head)
760 {
761 	struct ether_vlan_header *evl;
762 	struct tcphdr *th;
763 	int ehlen;
764 
765 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
766 
767 	PULLUP_HDR(m_head, sizeof(*evl));
768 	evl = mtod(m_head, struct ether_vlan_header *);
769 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
770 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
771 	else
772 		ehlen = ETHER_HDR_LEN;
773 	m_head->m_pkthdr.l2hlen = ehlen;
774 
775 #ifdef INET
776 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
777 		struct ip *ip;
778 		int iphlen;
779 
780 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
781 		ip = mtodo(m_head, ehlen);
782 		iphlen = ip->ip_hl << 2;
783 		m_head->m_pkthdr.l3hlen = iphlen;
784 
785 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
786 		th = mtodo(m_head, ehlen + iphlen);
787 
788 		ip->ip_len = 0;
789 		ip->ip_sum = 0;
790 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
791 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
792 	}
793 #endif
794 #if defined(INET6) && defined(INET)
795 	else
796 #endif
797 #ifdef INET6
798 	{
799 		struct ip6_hdr *ip6;
800 
801 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
802 		ip6 = mtodo(m_head, ehlen);
803 		if (ip6->ip6_nxt != IPPROTO_TCP) {
804 			m_freem(m_head);
805 			return (NULL);
806 		}
807 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
808 
809 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
810 		th = mtodo(m_head, ehlen + sizeof(*ip6));
811 
812 		ip6->ip6_plen = 0;
813 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
814 	}
815 #endif
816 	return (m_head);
817 }
818 
819 /*
820  * NOTE: If this function failed, the m_head would be freed.
821  */
822 static __inline struct mbuf *
823 hn_set_hlen(struct mbuf *m_head)
824 {
825 	const struct ether_vlan_header *evl;
826 	int ehlen;
827 
828 	PULLUP_HDR(m_head, sizeof(*evl));
829 	evl = mtod(m_head, const struct ether_vlan_header *);
830 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
831 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
832 	else
833 		ehlen = ETHER_HDR_LEN;
834 	m_head->m_pkthdr.l2hlen = ehlen;
835 
836 #ifdef INET
837 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
838 		const struct ip *ip;
839 		int iphlen;
840 
841 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
842 		ip = mtodo(m_head, ehlen);
843 		iphlen = ip->ip_hl << 2;
844 		m_head->m_pkthdr.l3hlen = iphlen;
845 
846 		/*
847 		 * UDP checksum offload does not work in Azure if both of the
848 		 * following conditions are met:
849 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
850 		 * - IP_DF is not set in the IP hdr.
851 		 *
852 		 * Fall back to software checksum for these UDP datagrams.
853 		 */
854 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
855 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
856 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
857 			uint16_t off = ehlen + iphlen;
858 
859 			counter_u64_add(hn_udpcs_fixup, 1);
860 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
861 			*(uint16_t *)(m_head->m_data + off +
862                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
863 			    m_head, m_head->m_pkthdr.len, off);
864 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
865 		}
866 	}
867 #endif
868 #if defined(INET6) && defined(INET)
869 	else
870 #endif
871 #ifdef INET6
872 	{
873 		const struct ip6_hdr *ip6;
874 
875 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
876 		ip6 = mtodo(m_head, ehlen);
877 		if (ip6->ip6_nxt != IPPROTO_TCP &&
878 		    ip6->ip6_nxt != IPPROTO_UDP) {
879 			m_freem(m_head);
880 			return (NULL);
881 		}
882 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
883 	}
884 #endif
885 	return (m_head);
886 }
887 
888 /*
889  * NOTE: If this function fails, m_head will be freed.
890  */
891 static __inline struct mbuf *
892 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
893 {
894 	const struct tcphdr *th;
895 	int ehlen, iphlen;
896 
897 	*tcpsyn = 0;
898 	ehlen = m_head->m_pkthdr.l2hlen;
899 	iphlen = m_head->m_pkthdr.l3hlen;
900 
901 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
902 	th = mtodo(m_head, ehlen + iphlen);
903 	if (th->th_flags & TH_SYN)
904 		*tcpsyn = 1;
905 	return (m_head);
906 }
907 
908 #undef PULLUP_HDR
909 
910 #endif	/* INET6 || INET */
911 
912 static int
913 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
914 {
915 	int error = 0;
916 
917 	HN_LOCK_ASSERT(sc);
918 
919 	if (sc->hn_rx_filter != filter) {
920 		error = hn_rndis_set_rxfilter(sc, filter);
921 		if (!error)
922 			sc->hn_rx_filter = filter;
923 	}
924 	return (error);
925 }
926 
927 static int
928 hn_rxfilter_config(struct hn_softc *sc)
929 {
930 	if_t ifp = sc->hn_ifp;
931 	uint32_t filter;
932 
933 	HN_LOCK_ASSERT(sc);
934 
935 	/*
936 	 * If the non-transparent mode VF is activated, we don't know how
937 	 * its RX filter is configured, so stick the synthetic device in
938 	 * promiscuous mode.
939 	 */
940 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
941 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
942 	} else {
943 		filter = NDIS_PACKET_TYPE_DIRECTED;
944 		if (if_getflags(ifp) & IFF_BROADCAST)
945 			filter |= NDIS_PACKET_TYPE_BROADCAST;
946 		/* TODO: support multicast list */
947 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
948 		    !if_maddr_empty(ifp))
949 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
950 	}
951 	return (hn_set_rxfilter(sc, filter));
952 }
953 
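/*
 * Compute the TX aggregation size/packet limits from the configured
 * values, the host's RNDIS limits, and the chimney buffer size, then
 * propagate the results to all TX rings.
 */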
954 static void
955 hn_set_txagg(struct hn_softc *sc)
956 {
957 	uint32_t size, pkts;
958 	int i;
959 
960 	/*
961 	 * Setup aggregation size.
962 	 */
963 	if (sc->hn_agg_size < 0)
964 		size = UINT32_MAX;
965 	else
966 		size = sc->hn_agg_size;
967 
968 	if (sc->hn_rndis_agg_size < size)
969 		size = sc->hn_rndis_agg_size;
970 
971 	/* NOTE: We only aggregate packets using chimney sending buffers. */
972 	if (size > (uint32_t)sc->hn_chim_szmax)
973 		size = sc->hn_chim_szmax;
974 
975 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
976 		/* Disable */
977 		size = 0;
978 		pkts = 0;
979 		goto done;
980 	}
981 
982 	/* NOTE: Type of the per TX ring setting is 'int'. */
983 	if (size > INT_MAX)
984 		size = INT_MAX;
985 
986 	/*
987 	 * Setup aggregation packet count.
988 	 */
989 	if (sc->hn_agg_pkts < 0)
990 		pkts = UINT32_MAX;
991 	else
992 		pkts = sc->hn_agg_pkts;
993 
994 	if (sc->hn_rndis_agg_pkts < pkts)
995 		pkts = sc->hn_rndis_agg_pkts;
996 
997 	if (pkts <= 1) {
998 		/* Disable */
999 		size = 0;
1000 		pkts = 0;
1001 		goto done;
1002 	}
1003 
1004 	/* NOTE: Type of the per TX ring setting is 'short'. */
1005 	if (pkts > SHRT_MAX)
1006 		pkts = SHRT_MAX;
1007 
1008 done:
1009 	/* NOTE: Type of the per TX ring setting is 'short'. */
1010 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1011 		/* Disable */
1012 		size = 0;
1013 		pkts = 0;
1014 	}
1015 
1016 	if (bootverbose) {
1017 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1018 		    size, pkts, sc->hn_rndis_agg_align);
1019 	}
1020 
1021 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1022 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1023 
1024 		mtx_lock(&txr->hn_tx_lock);
1025 		txr->hn_agg_szmax = size;
1026 		txr->hn_agg_pktmax = pkts;
1027 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1028 		mtx_unlock(&txr->hn_tx_lock);
1029 	}
1030 }
1031 
1032 static int
1033 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1034 {
1035 
1036 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1037 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1038 		return txr->hn_txdesc_cnt;
1039 	return hn_tx_swq_depth;
1040 }
1041 
1042 static int
1043 hn_rss_reconfig(struct hn_softc *sc)
1044 {
1045 	int error;
1046 
1047 	HN_LOCK_ASSERT(sc);
1048 
1049 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1050 		return (ENXIO);
1051 
1052 	/*
1053 	 * Disable RSS first.
1054 	 *
1055 	 * NOTE:
1056 	 * Direct reconfiguration by setting the UNCHG flags does
1057 	 * _not_ work properly.
1058 	 */
1059 	if (bootverbose)
1060 		if_printf(sc->hn_ifp, "disable RSS\n");
1061 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1062 	if (error) {
1063 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1064 		return (error);
1065 	}
1066 
1067 	/*
1068 	 * Reenable the RSS w/ the updated RSS key or indirect
1069 	 * table.
1070 	 */
1071 	if (bootverbose)
1072 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1073 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1074 	if (error) {
1075 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1076 		return (error);
1077 	}
1078 	return (0);
1079 }
1080 
1081 static void
1082 hn_rss_ind_fixup(struct hn_softc *sc)
1083 {
1084 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1085 	int i, nchan;
1086 
1087 	nchan = sc->hn_rx_ring_inuse;
1088 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1089 
1090 	/*
1091 	 * Check indirect table to make sure that all channels in it
1092 	 * can be used.
1093 	 */
1094 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1095 		if (rss->rss_ind[i] >= nchan) {
1096 			if_printf(sc->hn_ifp,
1097 			    "RSS indirect table %d fixup: %u -> %d\n",
1098 			    i, rss->rss_ind[i], nchan - 1);
1099 			rss->rss_ind[i] = nchan - 1;
1100 		}
1101 	}
1102 }
1103 
1104 static int
1105 hn_ifmedia_upd(if_t ifp __unused)
1106 {
1107 
1108 	return EOPNOTSUPP;
1109 }
1110 
1111 static void
1112 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1113 {
1114 	struct hn_softc *sc = if_getsoftc(ifp);
1115 
1116 	ifmr->ifm_status = IFM_AVALID;
1117 	ifmr->ifm_active = IFM_ETHER;
1118 
1119 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1120 		ifmr->ifm_active |= IFM_NONE;
1121 		return;
1122 	}
1123 	ifmr->ifm_status |= IFM_ACTIVE;
1124 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1125 }
1126 
1127 static void
1128 hn_rxvf_set_task(void *xarg, int pending __unused)
1129 {
1130 	struct hn_rxvf_setarg *arg = xarg;
1131 
1132 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1133 }
1134 
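/*
 * Point all RX rings at the given VF ifnet.  For rings currently in
 * use, the update runs on the ring's channel task so that it is
 * serialized with that channel's RX processing.
 */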
1135 static void
1136 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1137 {
1138 	struct hn_rx_ring *rxr;
1139 	struct hn_rxvf_setarg arg;
1140 	struct task task;
1141 	int i;
1142 
1143 	HN_LOCK_ASSERT(sc);
1144 
1145 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1146 
1147 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1148 		rxr = &sc->hn_rx_ring[i];
1149 
1150 		if (i < sc->hn_rx_ring_inuse) {
1151 			arg.rxr = rxr;
1152 			arg.vf_ifp = vf_ifp;
1153 			vmbus_chan_run_task(rxr->hn_chan, &task);
1154 		} else {
1155 			rxr->hn_rxvf_ifp = vf_ifp;
1156 		}
1157 	}
1158 }
1159 
1160 static bool
1161 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1162 {
1163 	if_t hn_ifp;
1164 
1165 	hn_ifp = sc->hn_ifp;
1166 
1167 	if (ifp == hn_ifp)
1168 		return (false);
1169 
1170 	if (if_getalloctype(ifp) != IFT_ETHER)
1171 		return (false);
1172 
1173 	/* Ignore lagg/vlan interfaces */
1174 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1175 	    strcmp(if_getdname(ifp), "vlan") == 0)
1176 		return (false);
1177 
1178 	/*
1179 	 * During detach events if_getifaddr(ifp) might be NULL.
1180 	 * Make sure the bcmp() below doesn't panic on that:
1181 	 */
1182 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1183 		return (false);
1184 
1185 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1186 		return (false);
1187 
1188 	return (true);
1189 }
1190 
1191 static void
1192 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1193 {
1194 	if_t hn_ifp;
1195 
1196 	HN_LOCK(sc);
1197 
1198 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1199 		goto out;
1200 
1201 	if (!hn_ismyvf(sc, ifp))
1202 		goto out;
1203 	hn_ifp = sc->hn_ifp;
1204 
1205 	if (rxvf) {
1206 		if (sc->hn_flags & HN_FLAG_RXVF)
1207 			goto out;
1208 
1209 		sc->hn_flags |= HN_FLAG_RXVF;
1210 		hn_rxfilter_config(sc);
1211 	} else {
1212 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1213 			goto out;
1214 
1215 		sc->hn_flags &= ~HN_FLAG_RXVF;
1216 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1217 			hn_rxfilter_config(sc);
1218 		else
1219 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1220 	}
1221 
1222 	hn_nvs_set_datapath(sc,
1223 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1224 
1225 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1226 
1227 	if (rxvf) {
1228 		hn_vf_rss_fixup(sc, true);
1229 		hn_suspend_mgmt(sc);
1230 		sc->hn_link_flags &=
1231 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1232 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1233 	} else {
1234 		hn_vf_rss_restore(sc);
1235 		hn_resume_mgmt(sc);
1236 	}
1237 
1238 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1239 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1240 
1241 	if (bootverbose) {
1242 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1243 		    rxvf ? "to" : "from", if_name(ifp));
1244 	}
1245 out:
1246 	HN_UNLOCK(sc);
1247 }
1248 
1249 static void
1250 hn_ifnet_event(void *arg, if_t ifp, int event)
1251 {
1252 
1253 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1254 		return;
1255 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1256 }
1257 
1258 static void
1259 hn_ifaddr_event(void *arg, if_t ifp)
1260 {
1261 
1262 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1263 }
1264 
1265 static int
1266 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1267 {
1268 	if_t ifp, vf_ifp;
1269 	uint64_t tmp;
1270 	int error;
1271 
1272 	HN_LOCK_ASSERT(sc);
1273 	ifp = sc->hn_ifp;
1274 	vf_ifp = sc->hn_vf_ifp;
1275 
1276 	/*
1277 	 * Fix up requested capabilities w/ supported capabilities,
1278 	 * since the supported capabilities could have been changed.
1279 	 */
1280 	ifr->ifr_reqcap &= if_getcapabilities(ifp);
1281 	/* Pass SIOCSIFCAP to VF. */
1282 	error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread);
1283 
1284 	/*
1285 	 * NOTE:
1286 	 * The error will be propagated to the callers; however, it
1287 	 * is _not_ useful here.
1288 	 */
1289 
1290 	/*
1291 	 * Merge VF's enabled capabilities.
1292 	 */
1293 	if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp));
1294 
1295 	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc);
1296 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
1297 		if_sethwassistbits(ifp, tmp, 0);
1298 	else
1299 		if_sethwassistbits(ifp, 0, tmp);
1300 
1301 	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc);
1302 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
1303 		if_sethwassistbits(ifp, tmp, 0);
1304 	else
1305 		if_sethwassistbits(ifp, 0, tmp);
1306 
1307 	tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO;
1308 	if (if_getcapenable(ifp) & IFCAP_TSO4)
1309 		if_sethwassistbits(ifp, tmp, 0);
1310 	else
1311 		if_sethwassistbits(ifp, 0, tmp);
1312 
1313 	tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO;
1314 	if (if_getcapenable(ifp) & IFCAP_TSO6)
1315 		if_sethwassistbits(ifp, tmp, 0);
1316 	else
1317 		if_sethwassistbits(ifp, 0, tmp);
1318 
1319 	return (error);
1320 }
1321 
1322 static int
1323 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1324 {
1325 	if_t vf_ifp;
1326 	struct ifreq ifr;
1327 
1328 	HN_LOCK_ASSERT(sc);
1329 	vf_ifp = sc->hn_vf_ifp;
1330 
1331 	memset(&ifr, 0, sizeof(ifr));
1332 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1333 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1334 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1335 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1336 }
1337 
1338 static void
1339 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1340 {
1341 	if_t ifp = sc->hn_ifp;
1342 	int allmulti = 0;
1343 
1344 	HN_LOCK_ASSERT(sc);
1345 
1346 	/* XXX vlan(4) style mcast addr maintenance */
1347 	if (!if_maddr_empty(ifp))
1348 		allmulti = IFF_ALLMULTI;
1349 
1350 	/* Always set the VF's if_flags */
1351 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1352 }
1353 
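/*
 * Input path for packets received on the VF in transparent VF mode:
 * tap BPF and update byte counters on the VF, then re-attribute the
 * packets to the corresponding hn(4) interface and feed them to its
 * if_input.
 */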
1354 static void
1355 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1356 {
1357 	struct rm_priotracker pt;
1358 	if_t hn_ifp = NULL;
1359 	struct mbuf *mn;
1360 
1361 	/*
1362 	 * XXX racy, if hn(4) ever detached.
1363 	 */
1364 	rm_rlock(&hn_vfmap_lock, &pt);
1365 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1366 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1367 	rm_runlock(&hn_vfmap_lock, &pt);
1368 
1369 	if (hn_ifp != NULL) {
1370 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1371 			/*
1372 			 * Allow tapping on the VF.
1373 			 */
1374 			ETHER_BPF_MTAP(vf_ifp, mn);
1375 
1376 			/*
1377 			 * Update VF stats.
1378 			 */
1379 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1380 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1381 				    mn->m_pkthdr.len);
1382 			}
1383 			/*
1384 			 * XXX IFCOUNTER_IMCAST
1385 			 * This stat updating is kinda invasive, since it
1386 			 * requires two checks on the mbuf: the length check
1387 			 * and the ethernet header check.  As of this writing,
1388 			 * all multicast packets go directly to hn(4), which
1389 			 * makes imcast stat updating in the VF a try in vain.
1390 			 */
1391 
1392 			/*
1393 			 * Fix up rcvif and increase hn(4)'s ipackets.
1394 			 */
1395 			mn->m_pkthdr.rcvif = hn_ifp;
1396 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1397 		}
1398 		/*
1399 		 * Go through hn(4)'s if_input.
1400 		 */
1401 		if_input(hn_ifp, m);
1402 	} else {
1403 		/*
1404 		 * In the middle of the transition; free this
1405 		 * mbuf chain.
1406 		 */
1407 		while (m != NULL) {
1408 			mn = m->m_nextpkt;
1409 			m->m_nextpkt = NULL;
1410 			m_freem(m);
1411 			m = mn;
1412 		}
1413 	}
1414 }
1415 
1416 static void
1417 hn_mtu_change_fixup(struct hn_softc *sc)
1418 {
1419 	if_t ifp;
1420 
1421 	HN_LOCK_ASSERT(sc);
1422 	ifp = sc->hn_ifp;
1423 
1424 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1425 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1426 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1427 }
1428 
1429 static uint32_t
1430 hn_rss_type_fromndis(uint32_t rss_hash)
1431 {
1432 	uint32_t types = 0;
1433 
1434 	if (rss_hash & NDIS_HASH_IPV4)
1435 		types |= RSS_TYPE_IPV4;
1436 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1437 		types |= RSS_TYPE_TCP_IPV4;
1438 	if (rss_hash & NDIS_HASH_IPV6)
1439 		types |= RSS_TYPE_IPV6;
1440 	if (rss_hash & NDIS_HASH_IPV6_EX)
1441 		types |= RSS_TYPE_IPV6_EX;
1442 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1443 		types |= RSS_TYPE_TCP_IPV6;
1444 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1445 		types |= RSS_TYPE_TCP_IPV6_EX;
1446 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1447 		types |= RSS_TYPE_UDP_IPV4;
1448 	return (types);
1449 }
1450 
1451 static uint32_t
1452 hn_rss_type_tondis(uint32_t types)
1453 {
1454 	uint32_t rss_hash = 0;
1455 
1456 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1457 	    ("UDP6 and UDP6EX are not supported"));
1458 
1459 	if (types & RSS_TYPE_IPV4)
1460 		rss_hash |= NDIS_HASH_IPV4;
1461 	if (types & RSS_TYPE_TCP_IPV4)
1462 		rss_hash |= NDIS_HASH_TCP_IPV4;
1463 	if (types & RSS_TYPE_IPV6)
1464 		rss_hash |= NDIS_HASH_IPV6;
1465 	if (types & RSS_TYPE_IPV6_EX)
1466 		rss_hash |= NDIS_HASH_IPV6_EX;
1467 	if (types & RSS_TYPE_TCP_IPV6)
1468 		rss_hash |= NDIS_HASH_TCP_IPV6;
1469 	if (types & RSS_TYPE_TCP_IPV6_EX)
1470 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1471 	if (types & RSS_TYPE_UDP_IPV4)
1472 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1473 	return (rss_hash);
1474 }
1475 
1476 static void
1477 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1478 {
1479 	int i;
1480 
1481 	HN_LOCK_ASSERT(sc);
1482 
1483 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1484 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1485 }
1486 
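/*
 * Align the synthetic NIC's RSS settings with the VF's: adopt the VF's
 * Toeplitz key, restrict the hash types to the intersection of both,
 * and stop delivering mbuf hash values for types that would conflict.
 */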
1487 static void
1488 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1489 {
1490 	if_t ifp, vf_ifp;
1491 	struct ifrsshash ifrh;
1492 	struct ifrsskey ifrk;
1493 	int error;
1494 	uint32_t my_types, diff_types, mbuf_types = 0;
1495 
1496 	HN_LOCK_ASSERT(sc);
1497 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1498 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1499 
1500 	if (sc->hn_rx_ring_inuse == 1) {
1501 		/* No RSS on synthetic parts; done. */
1502 		return;
1503 	}
1504 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1505 		/* Synthetic parts do not support Toeplitz; done. */
1506 		return;
1507 	}
1508 
1509 	ifp = sc->hn_ifp;
1510 	vf_ifp = sc->hn_vf_ifp;
1511 
1512 	/*
1513 	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
1514 	 * supported.
1515 	 */
1516 	memset(&ifrk, 0, sizeof(ifrk));
1517 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1518 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1519 	if (error) {
1520 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1521 		    if_name(vf_ifp), error);
1522 		goto done;
1523 	}
1524 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1525 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1526 		    if_name(vf_ifp), ifrk.ifrk_func);
1527 		goto done;
1528 	}
1529 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1530 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1531 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1532 		goto done;
1533 	}
1534 
1535 	/*
1536 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1537 	 */
1538 	memset(&ifrh, 0, sizeof(ifrh));
1539 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1540 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1541 	if (error) {
1542 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1543 		    if_name(vf_ifp), error);
1544 		goto done;
1545 	}
1546 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1547 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1548 		    if_name(vf_ifp), ifrh.ifrh_func);
1549 		goto done;
1550 	}
1551 
1552 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1553 	if ((ifrh.ifrh_types & my_types) == 0) {
1554 		/* This disables RSS; ignore it then */
1555 		if_printf(ifp, "%s intersection of RSS types failed.  "
1556 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1557 		    ifrh.ifrh_types, my_types);
1558 		goto done;
1559 	}
1560 
1561 	diff_types = my_types ^ ifrh.ifrh_types;
1562 	my_types &= ifrh.ifrh_types;
1563 	mbuf_types = my_types;
1564 
1565 	/*
1566 	 * Detect RSS hash value/type conflicts.
1567 	 *
1568 	 * NOTE:
1569 	 * We don't disable the hash type, but stop delivering the hash
1570 	 * value/type through mbufs on the RX path.
1571 	 *
1572 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1573 	 * hash is delivered with type of TCP_IPV4.  This means if
1574 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1575 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1576 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1577 	 * here.
1578 	 */
1579 	if ((my_types & RSS_TYPE_IPV4) &&
1580 	    (diff_types & ifrh.ifrh_types &
1581 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1582 		/* Conflict; disable IPV4 hash type/value delivery. */
1583 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1584 		mbuf_types &= ~RSS_TYPE_IPV4;
1585 	}
1586 	if ((my_types & RSS_TYPE_IPV6) &&
1587 	    (diff_types & ifrh.ifrh_types &
1588 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1589 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1590 	      RSS_TYPE_IPV6_EX))) {
1591 		/* Conflict; disable IPV6 hash type/value delivery. */
1592 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1593 		mbuf_types &= ~RSS_TYPE_IPV6;
1594 	}
1595 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1596 	    (diff_types & ifrh.ifrh_types &
1597 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1598 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1599 	      RSS_TYPE_IPV6))) {
1600 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1601 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1602 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1603 	}
1604 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1605 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1606 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1607 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1608 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1609 	}
1610 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1611 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1612 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1613 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1614 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1615 	}
1616 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1617 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1618 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1619 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1620 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1621 	}
1622 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1623 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1624 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1625 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1626 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1627 	}
1628 
1629 	/*
1630 	 * Indirect table does not matter.
1631 	 */
1632 
1633 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1634 	    hn_rss_type_tondis(my_types);
1635 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1636 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1637 
1638 	if (reconf) {
1639 		error = hn_rss_reconfig(sc);
1640 		if (error) {
1641 			/* XXX roll-back? */
1642 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1643 			/* XXX keep going. */
1644 		}
1645 	}
1646 done:
1647 	/* Hash deliverability for mbufs. */
1648 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1649 }
1650 
1651 static void
1652 hn_vf_rss_restore(struct hn_softc *sc)
1653 {
1654 
1655 	HN_LOCK_ASSERT(sc);
1656 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1657 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1658 
1659 	if (sc->hn_rx_ring_inuse == 1)
1660 		goto done;
1661 
1662 	/*
1663 	 * Restore hash types.  Key does _not_ matter.
1664 	 */
1665 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1666 		int error;
1667 
1668 		sc->hn_rss_hash = sc->hn_rss_hcap;
1669 		error = hn_rss_reconfig(sc);
1670 		if (error) {
1671 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1672 			    error);
1673 			/* XXX keep going. */
1674 		}
1675 	}
1676 done:
1677 	/* Hash deliverability for mbufs. */
1678 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1679 }
1680 
1681 static void
1682 hn_xpnt_vf_setready(struct hn_softc *sc)
1683 {
1684 	if_t ifp, vf_ifp;
1685 	struct ifreq ifr;
1686 
1687 	HN_LOCK_ASSERT(sc);
1688 	ifp = sc->hn_ifp;
1689 	vf_ifp = sc->hn_vf_ifp;
1690 
1691 	/*
1692 	 * Mark the VF ready.
1693 	 */
1694 	sc->hn_vf_rdytick = 0;
1695 
1696 	/*
1697 	 * Save information for restoration.
1698 	 */
1699 	sc->hn_saved_caps = if_getcapabilities(ifp);
1700 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1701 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1702 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1703 
1704 	/*
1705 	 * Intersect supported/enabled capabilities.
1706 	 *
1707 	 * NOTE:
1708 	 * if_hwassist is not changed here.
1709 	 */
1710 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1711 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1712 
1713 	/*
1714 	 * Fix TSO settings.
1715 	 */
1716 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1717 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1718 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1719 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1720 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1721 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1722 
1723 	/*
1724 	 * Change VF's enabled capabilities.
1725 	 */
1726 	memset(&ifr, 0, sizeof(ifr));
1727 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1728 	ifr.ifr_reqcap = if_getcapenable(ifp);
1729 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1730 
1731 	if (if_getmtu(ifp) != ETHERMTU) {
1732 		int error;
1733 
1734 		/*
1735 		 * Change VF's MTU.
1736 		 */
1737 		memset(&ifr, 0, sizeof(ifr));
1738 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1739 		ifr.ifr_mtu = if_getmtu(ifp);
1740 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1741 		if (error) {
1742 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1743 			    if_name(vf_ifp), if_getmtu(ifp));
1744 			if (if_getmtu(ifp) > ETHERMTU) {
1745 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1746 
1747 				/*
1748 				 * XXX
1749 				 * No need to adjust the synthetic parts' MTU;
1750 				 * failure of the adjustment would cause us
1751 				 * endless headaches.
1752 				 */
1753 				if_setmtu(ifp, ETHERMTU);
1754 				hn_mtu_change_fixup(sc);
1755 			}
1756 		}
1757 	}
1758 }
1759 
1760 static bool
1761 hn_xpnt_vf_isready(struct hn_softc *sc)
1762 {
1763 
1764 	HN_LOCK_ASSERT(sc);
1765 
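	/*
	 * hn_vf_rdytick == 0 means the VF has already been marked ready;
	 * otherwise it holds the tick at which the attach grace period,
	 * armed in hn_ifnet_attevent(), expires.
	 */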
1766 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1767 		return (false);
1768 
1769 	if (sc->hn_vf_rdytick == 0)
1770 		return (true);
1771 
1772 	if (sc->hn_vf_rdytick > ticks)
1773 		return (false);
1774 
1775 	/* Mark VF as ready. */
1776 	hn_xpnt_vf_setready(sc);
1777 	return (true);
1778 }
1779 
1780 static void
1781 hn_xpnt_vf_setenable(struct hn_softc *sc)
1782 {
1783 	int i;
1784 
1785 	HN_LOCK_ASSERT(sc);
1786 
1787 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1788 	rm_wlock(&sc->hn_vf_lock);
1789 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1790 	rm_wunlock(&sc->hn_vf_lock);
1791 
1792 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1793 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1794 }
1795 
1796 static void
1797 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1798 {
1799 	int i;
1800 
1801 	HN_LOCK_ASSERT(sc);
1802 
1803 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1804 	rm_wlock(&sc->hn_vf_lock);
1805 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1806 	if (clear_vf)
1807 		sc->hn_vf_ifp = NULL;
1808 	rm_wunlock(&sc->hn_vf_lock);
1809 
1810 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1811 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1812 }
1813 
1814 static void
1815 hn_xpnt_vf_init(struct hn_softc *sc)
1816 {
1817 	int error;
1818 
1819 	HN_LOCK_ASSERT(sc);
1820 
1821 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1822 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1823 
1824 	if (bootverbose) {
1825 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1826 		    if_name(sc->hn_vf_ifp));
1827 	}
1828 
1829 	/*
1830 	 * Bring the VF up.
1831 	 */
1832 	hn_xpnt_vf_saveifflags(sc);
1833 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1834 	error = hn_xpnt_vf_iocsetflags(sc);
1835 	if (error) {
1836 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1837 		    if_name(sc->hn_vf_ifp), error);
1838 		return;
1839 	}
1840 
1841 	/*
1842 	 * NOTE:
1843 	 * Datapath setting must happen _after_ bringing the VF up.
1844 	 */
1845 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1846 
1847 	/*
1848 	 * NOTE:
1849 	 * Fixup RSS related bits _after_ the VF is brought up, since
1850 	 * many VFs generate RSS key during it's initialization.
1851 	 * many VFs generate their RSS key during initialization.
1852 	hn_vf_rss_fixup(sc, true);
1853 
1854 	/* Mark transparent mode VF as enabled. */
1855 	hn_xpnt_vf_setenable(sc);
1856 }
1857 
1858 static void
1859 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1860 {
1861 	struct hn_softc *sc = xsc;
1862 
1863 	HN_LOCK(sc);
1864 
1865 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1866 		goto done;
1867 	if (sc->hn_vf_ifp == NULL)
1868 		goto done;
1869 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1870 		goto done;
1871 
1872 	if (sc->hn_vf_rdytick != 0) {
1873 		/* Mark VF as ready. */
1874 		hn_xpnt_vf_setready(sc);
1875 	}
1876 
1877 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1878 		/*
1879 		 * Delayed VF initialization.
1880 		 */
1881 		if (bootverbose) {
1882 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1883 			    if_name(sc->hn_vf_ifp));
1884 		}
1885 		hn_xpnt_vf_init(sc);
1886 	}
1887 done:
1888 	HN_UNLOCK(sc);
1889 }
1890 
1891 static void
1892 hn_ifnet_attevent(void *xsc, if_t ifp)
1893 {
1894 	struct hn_softc *sc = xsc;
1895 
1896 	HN_LOCK(sc);
1897 
1898 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1899 		goto done;
1900 
1901 	if (!hn_ismyvf(sc, ifp))
1902 		goto done;
1903 
1904 	if (sc->hn_vf_ifp != NULL) {
1905 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1906 		    if_name(sc->hn_vf_ifp));
1907 		goto done;
1908 	}
1909 
1910 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1911 		/*
1912 		 * ifnet.if_start is _not_ supported by transparent
1913 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1914 		 */
1915 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1916 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1917 
1918 		goto done;
1919 	}
1920 
1921 	rm_wlock(&hn_vfmap_lock);
1922 
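	/*
	 * Grow the global hn_vfmap (VF ifindex -> hn_ifp) if this VF's
	 * ifindex does not fit into the current map.
	 */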
1923 	if (if_getindex(ifp) >= hn_vfmap_size) {
1924 		if_t *newmap;
1925 		int newsize;
1926 
1927 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1928 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1929 		    M_WAITOK | M_ZERO);
1930 
1931 		memcpy(newmap, hn_vfmap,
1932 		    sizeof(if_t) * hn_vfmap_size);
1933 		free(hn_vfmap, M_DEVBUF);
1934 		hn_vfmap = newmap;
1935 		hn_vfmap_size = newsize;
1936 	}
1937 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1938 	    ("%s: ifindex %d was mapped to %s",
1939 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1940 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1941 
1942 	rm_wunlock(&hn_vfmap_lock);
1943 
1944 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1945 	rm_wlock(&sc->hn_vf_lock);
1946 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1947 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1948 	sc->hn_vf_ifp = ifp;
1949 	rm_wunlock(&sc->hn_vf_lock);
1950 
1951 	if (hn_xpnt_vf) {
1952 		int wait_ticks;
1953 
1954 		/*
1955 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1956 		 * Save vf_ifp's current if_input for later restoration.
1957 		 */
1958 		sc->hn_vf_input = if_getinputfn(ifp);
1959 		if_setinputfn(ifp, hn_xpnt_vf_input);
1960 
1961 		/*
1962 		 * Stop link status management; use the VF's.
1963 		 */
1964 		hn_suspend_mgmt(sc);
1965 
1966 		/*
1967 		 * Give the VF some time to complete its attach routine.
1968 		 */
1969 		wait_ticks = hn_xpnt_vf_attwait * hz;
1970 		sc->hn_vf_rdytick = ticks + wait_ticks;
1971 
1972 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1973 		    wait_ticks);
1974 	}
1975 done:
1976 	HN_UNLOCK(sc);
1977 }
1978 
1979 static void
1980 hn_ifnet_detevent(void *xsc, if_t ifp)
1981 {
1982 	struct hn_softc *sc = xsc;
1983 
1984 	HN_LOCK(sc);
1985 
1986 	if (sc->hn_vf_ifp == NULL)
1987 		goto done;
1988 
1989 	if (!hn_ismyvf(sc, ifp))
1990 		goto done;
1991 
1992 	if (hn_xpnt_vf) {
1993 		/*
1994 		 * Make sure that the delayed initialization is not running.
1995 		 *
1996 		 * NOTE:
1997 		 * - This lock _must_ be released, since the hn_vf_init task
1998 		 *   will try holding this lock.
1999 		 * - It is safe to release this lock here, since the
2000 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2001 		 *
2002 		 * XXX racy, if hn(4) is ever detached.
2003 		 */
2004 		HN_UNLOCK(sc);
2005 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2006 		HN_LOCK(sc);
2007 
2008 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2009 		    if_name(sc->hn_ifp)));
2010 		if_setinputfn(ifp, sc->hn_vf_input);
2011 		sc->hn_vf_input = NULL;
2012 
2013 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2014 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2015 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2016 
2017 		if (sc->hn_vf_rdytick == 0) {
2018 			/*
2019 			 * The VF was ready; restore some settings.
2020 			 */
2021 			if_setcapabilities(sc->hn_ifp, sc->hn_saved_caps);
2022 			/*
2023 			 * NOTE:
2024 			 * There is _no_ need to fixup if_capenable and
2025 			 * if_hwassist, since the if_capabilities before
2026 			 * restoration was an intersection of the VF's
2027 			 * if_capabilities and the synthetic device's
2028 			 * if_capabilities.
2029 			 */
2030 			if_sethwtsomax(sc->hn_ifp, sc->hn_saved_tsomax);
2031 			if_sethwtsomaxsegcount(sc->hn_ifp,
2032 			    sc->hn_saved_tsosegcnt);
2033 			if_sethwtsomaxsegsize(sc->hn_ifp, sc->hn_saved_tsosegsz);
2034 		}
2035 
2036 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2037 			/*
2038 			 * Restore RSS settings.
2039 			 */
2040 			hn_vf_rss_restore(sc);
2041 
2042 			/*
2043 			 * Resume link status management, which was suspended
2044 			 * by hn_ifnet_attevent().
2045 			 */
2046 			hn_resume_mgmt(sc);
2047 		}
2048 	}
2049 
2050 	/* Mark transparent mode VF as disabled. */
2051 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2052 
2053 	rm_wlock(&hn_vfmap_lock);
2054 
2055 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2056 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2057 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2058 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2059 		    ("%s: ifindex %d was mapped to %s",
2060 		     if_name(ifp), if_getindex(ifp),
2061 		     if_name(hn_vfmap[if_getindex(ifp)])));
2062 		hn_vfmap[if_getindex(ifp)] = NULL;
2063 	}
2064 
2065 	rm_wunlock(&hn_vfmap_lock);
2066 done:
2067 	HN_UNLOCK(sc);
2068 }
2069 
2070 static void
2071 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2072 {
2073 	struct hn_softc *sc = xsc;
2074 
2075 	if (sc->hn_vf_ifp == ifp)
2076 		if_link_state_change(sc->hn_ifp, link_state);
2077 }
2078 
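/*
 * Read-only sysctl handlers reporting the ifnet's current TSO limits;
 * the values are snapshots taken at query time.
 */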
2079 static int
2080 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2081 {
2082 	struct hn_softc *sc = arg1;
2083 	unsigned int tsomax;
2084 	int error;
2085 
2086 	tsomax = if_gethwtsomax(sc->hn_ifp);
2087 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2088 	return error;
2089 }
2090 
2091 static int
2092 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2093 {
2094 	struct hn_softc *sc = arg1;
2095 	unsigned int tsomaxsegcnt;
2096 	int error;
2097 
2098 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2099 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2100 	return error;
2101 }
2102 
2103 static int
2104 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2105 {
2106 	struct hn_softc *sc = arg1;
2107 	unsigned int tsomaxsegsz;
2108 	int error;
2109 
2110 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2111 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2112 	return error;
2113 }
2114 
2115 static int
2116 hn_probe(device_t dev)
2117 {
2118 
2119 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2120 		device_set_desc(dev, "Hyper-V Network Interface");
2121 		return BUS_PROBE_DEFAULT;
2122 	}
2123 	return ENXIO;
2124 }
2125 
2126 static int
2127 hn_attach(device_t dev)
2128 {
2129 	struct hn_softc *sc = device_get_softc(dev);
2130 	struct sysctl_oid_list *child;
2131 	struct sysctl_ctx_list *ctx;
2132 	uint8_t eaddr[ETHER_ADDR_LEN];
2133 	if_t ifp = NULL;
2134 	int error, ring_cnt, tx_ring_cnt;
2135 	uint32_t mtu;
2136 
2137 	sc->hn_dev = dev;
2138 	sc->hn_prichan = vmbus_get_channel(dev);
2139 	HN_LOCK_INIT(sc);
2140 	rm_init(&sc->hn_vf_lock, "hnvf");
2141 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2142 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2143 
2144 	/*
2145 	 * Initialize these tunables once.
2146 	 */
2147 	sc->hn_agg_size = hn_tx_agg_size;
2148 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2149 
2150 	/*
2151 	 * Setup taskqueue for transmission.
2152 	 */
2153 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2154 		int i;
2155 
2156 		sc->hn_tx_taskqs =
2157 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2158 		    M_DEVBUF, M_WAITOK);
2159 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2160 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2161 			    M_WAITOK, taskqueue_thread_enqueue,
2162 			    &sc->hn_tx_taskqs[i]);
2163 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2164 			    "%s tx%d", device_get_nameunit(dev), i);
2165 		}
2166 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2167 		sc->hn_tx_taskqs = hn_tx_taskque;
2168 	}
2169 
2170 	/*
2171 	 * Setup taskqueue for management tasks, e.g. link status.
2172 	 */
2173 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2174 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2175 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2176 	    device_get_nameunit(dev));
2177 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2178 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2179 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2180 	    hn_netchg_status_taskfunc, sc);
2181 
2182 	if (hn_xpnt_vf) {
2183 		/*
2184 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2185 		 */
2186 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2187 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2188 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2189 		    device_get_nameunit(dev));
2190 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2191 		    hn_xpnt_vf_init_taskfunc, sc);
2192 	}
2193 
2194 	/*
2195 	 * Allocate ifnet and set up its name earlier, so that if_printf
2196 	 * can be used by functions called both before and after
2197 	 * ether_ifattach().
2198 	 */
2199 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2200 	if_setsoftc(ifp, sc);
2201 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2202 
2203 	/*
2204 	 * Initialize ifmedia earlier so that it can be unconditionally
2205 	 * destroyed if an error happens later on.
2206 	 */
2207 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2208 
2209 	/*
2210 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2211 	 * to use (tx_ring_cnt).
2212 	 *
2213 	 * NOTE:
2214 	 * The # of RX rings to use is the same as the # of channels to use.
2215 	 */
2216 	ring_cnt = hn_chan_cnt;
2217 	if (ring_cnt <= 0) {
2218 		/* Default */
2219 		ring_cnt = mp_ncpus;
2220 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2221 			ring_cnt = HN_RING_CNT_DEF_MAX;
2222 	} else if (ring_cnt > mp_ncpus) {
2223 		ring_cnt = mp_ncpus;
2224 	}
2225 #ifdef RSS
2226 	if (ring_cnt > rss_getnumbuckets())
2227 		ring_cnt = rss_getnumbuckets();
2228 #endif
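	/*
	 * ring_cnt is now bounded by mp_ncpus (and by the RSS bucket count
	 * when the kernel RSS option is enabled).
	 */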
2229 
2230 	tx_ring_cnt = hn_tx_ring_cnt;
2231 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2232 		tx_ring_cnt = ring_cnt;
2233 #ifdef HN_IFSTART_SUPPORT
2234 	if (hn_use_if_start) {
2235 		/* ifnet.if_start only needs one TX ring. */
2236 		tx_ring_cnt = 1;
2237 	}
2238 #endif
2239 
2240 	/*
2241 	 * Set the leader CPU for channels.
2242 	 */
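	/*
	 * hn_cpu_index advances by ring_cnt per device, so successive hn(4)
	 * instances spread their channels round-robin across the CPUs.
	 */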
2243 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2244 
2245 	/*
2246 	 * Create enough TX/RX rings, even if only a limited number of
2247 	 * channels can be allocated.
2248 	 */
2249 	error = hn_create_tx_data(sc, tx_ring_cnt);
2250 	if (error)
2251 		goto failed;
2252 	error = hn_create_rx_data(sc, ring_cnt);
2253 	if (error)
2254 		goto failed;
2255 
2256 	/*
2257 	 * Create transaction context for NVS and RNDIS transactions.
2258 	 */
2259 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2260 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2261 	if (sc->hn_xact == NULL) {
2262 		error = ENXIO;
2263 		goto failed;
2264 	}
2265 
2266 	/*
2267 	 * Install orphan handler for the revocation of this device's
2268 	 * primary channel.
2269 	 *
2270 	 * NOTE:
2271 	 * The processing order is critical here:
2272 	 * Install the orphan handler, _before_ testing whether this
2273 	 * device's primary channel has been revoked or not.
2274 	 */
2275 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2276 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2277 		error = ENXIO;
2278 		goto failed;
2279 	}
2280 
2281 	/*
2282 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2283 	 */
2284 	error = hn_synth_attach(sc, ETHERMTU);
2285 	if (error)
2286 		goto failed;
2287 
2288 	error = hn_rndis_get_eaddr(sc, eaddr);
2289 	if (error)
2290 		goto failed;
2291 
2292 	error = hn_rndis_get_mtu(sc, &mtu);
2293 	if (error)
2294 		mtu = ETHERMTU;
2295 	else if (bootverbose)
2296 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2297 
2298 	if (sc->hn_rx_ring_inuse > 1) {
2299 		/*
2300 		 * Reduce TCP segment aggregation limit for multiple
2301 		 * RX rings to increase ACK timeliness.
2302 		 */
2303 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2304 	}
2305 
2306 	/*
2307 	 * Fixup TX/RX settings after the synthetic parts are attached.
2308 	 */
2309 	hn_fixup_tx_data(sc);
2310 	hn_fixup_rx_data(sc);
2311 
2312 	ctx = device_get_sysctl_ctx(dev);
2313 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2314 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2315 	    &sc->hn_nvs_ver, 0, "NVS version");
2316 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2317 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2318 	    hn_ndis_version_sysctl, "A", "NDIS version");
2319 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2320 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2321 	    hn_caps_sysctl, "A", "capabilities");
2322 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2323 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2324 	    hn_hwassist_sysctl, "A", "hwassist");
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2326 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2327 	    "IU", "max TSO size");
2328 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2329 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2330 	    "IU", "max # of TSO segments");
2331 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2332 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2333 	    "IU", "max size of TSO segment");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2335 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_rxfilter_sysctl, "A", "rxfilter");
2337 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2338 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2339 	    hn_rss_hash_sysctl, "A", "RSS hash");
2340 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2341 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2342 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2343 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2344 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2345 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2346 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2347 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2348 #ifndef RSS
2349 	/*
2350 	 * Don't allow RSS key/indirect table changes if RSS is defined.
2351 	 */
2352 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2353 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2354 	    hn_rss_key_sysctl, "IU", "RSS key");
2355 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2356 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2357 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2358 #endif
2359 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2360 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2361 	    "RNDIS offered packet transmission aggregation size limit");
2362 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2363 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2364 	    "RNDIS offered packet transmission aggregation count limit");
2365 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2366 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2367 	    "RNDIS packet transmission aggregation alignment");
2368 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2369 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2370 	    hn_txagg_size_sysctl, "I",
2371 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2372 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2373 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2374 	    hn_txagg_pkts_sysctl, "I",
2375 	    "Packet transmission aggregation packets, "
2376 	    "0 -- disable, -1 -- auto");
2377 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2378 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2379 	    hn_polling_sysctl, "I",
2380 	    "Polling frequency: [100,1000000], 0 disable polling");
2381 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2382 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2383 	    hn_vf_sysctl, "A", "Virtual Function's name");
2384 	if (!hn_xpnt_vf) {
2385 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2386 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2387 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2388 	} else {
2389 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2390 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2391 		    hn_xpnt_vf_enabled_sysctl, "I",
2392 		    "Transparent VF enabled");
2393 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2394 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2395 		    hn_xpnt_vf_accbpf_sysctl, "I",
2396 		    "Accurate BPF for transparent VF");
2397 	}
2398 
2399 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2400 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2401 	    "switch to rsc");
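
	/*
	 * The sysctls above are attached to this device's sysctl tree and
	 * show up as, e.g. (assuming the first instance is hn0),
	 * "dev.hn.0.caps" and "dev.hn.0.rss_hash".
	 */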
2402 
2403 	/*
2404 	 * Setup the ifmedia, which has been initialized earlier.
2405 	 */
2406 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2407 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2408 	/* XXX ifmedia_set really should do this for us */
2409 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2410 
2411 	/*
2412 	 * Setup the ifnet for this interface.
2413 	 */
2414 
2415 	if_setbaudrate(ifp, IF_Gbps(10));
2416 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2417 	if_setioctlfn(ifp, hn_ioctl);
2418 	if_setinitfn(ifp, hn_init);
2419 #ifdef HN_IFSTART_SUPPORT
2420 	if (hn_use_if_start) {
2421 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2422 
2423 		if_setstartfn(ifp, hn_start);
2424 		if_setsendqlen(ifp, qdepth);
2425 		if_setsendqready(ifp);
2426 	} else
2427 #endif
2428 	{
2429 		if_settransmitfn(ifp, hn_transmit);
2430 		if_setqflushfn(ifp, hn_xmit_qflush);
2431 	}
2432 
2433 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2434 #ifdef foo
2435 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2436 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2437 #endif
2438 	if (sc->hn_caps & HN_CAP_VLAN) {
2439 		/* XXX not sure about VLAN_MTU. */
2440 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2441 	}
2442 
2443 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2444 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2445 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2446 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2447 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2448 	if (sc->hn_caps & HN_CAP_TSO4) {
2449 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2450 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2451 	}
2452 	if (sc->hn_caps & HN_CAP_TSO6) {
2453 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2454 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2455 	}
2456 
2457 	/* Enable all available capabilities by default. */
2458 	if_setcapenable(ifp, if_getcapabilities(ifp));
2459 
2460 	/*
2461 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2462 	 * be enabled through SIOCSIFCAP.
2463 	 */
2464 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2465 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2466 
2467 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2468 		/*
2469 		 * Lock hn_set_tso_maxsize() to simplify its
2470 		 * internal logic.
2471 		 */
2472 		HN_LOCK(sc);
2473 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2474 		HN_UNLOCK(sc);
2475 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2476 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2477 	}
2478 
2479 	ether_ifattach(ifp, eaddr);
2480 
2481 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2482 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2483 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2484 	}
2485 	if (mtu < ETHERMTU) {
2486 
2487 		if_setmtu(ifp, mtu);
2488 	}
2489 
2490 	/* Inform the upper layer about the long frame support. */
2491 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2492 
2493 	/*
2494 	 * Kick off link status check.
2495 	 */
2496 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2497 	hn_update_link_status(sc);
2498 
2499 	if (!hn_xpnt_vf) {
2500 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2501 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2502 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2503 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2504 	} else {
2505 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2506 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2507 	}
2508 
2509 	/*
2510 	 * NOTE:
2511 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2512 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2513 	 * yet available when the ifnet_arrival event is triggered.
2514 	 */
2515 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2516 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2517 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2518 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2519 
2520 	return (0);
2521 failed:
2522 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2523 		hn_synth_detach(sc);
2524 	hn_detach(dev);
2525 	return (error);
2526 }
2527 
2528 static int
2529 hn_detach(device_t dev)
2530 {
2531 	struct hn_softc *sc = device_get_softc(dev);
2532 	if_t ifp = sc->hn_ifp, vf_ifp;
2533 
2534 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2535 		/*
2536 		 * In case the vmbus missed the orphan handler
2537 		 * installation.
2538 		 */
2539 		vmbus_xact_ctx_orphan(sc->hn_xact);
2540 	}
2541 
2542 	if (sc->hn_ifaddr_evthand != NULL)
2543 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2544 	if (sc->hn_ifnet_evthand != NULL)
2545 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2546 	if (sc->hn_ifnet_atthand != NULL) {
2547 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2548 		    sc->hn_ifnet_atthand);
2549 	}
2550 	if (sc->hn_ifnet_dethand != NULL) {
2551 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2552 		    sc->hn_ifnet_dethand);
2553 	}
2554 	if (sc->hn_ifnet_lnkhand != NULL)
2555 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2556 
2557 	vf_ifp = sc->hn_vf_ifp;
2558 	__compiler_membar();
2559 	if (vf_ifp != NULL)
2560 		hn_ifnet_detevent(sc, vf_ifp);
2561 
2562 	if (device_is_attached(dev)) {
2563 		HN_LOCK(sc);
2564 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2565 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2566 				hn_stop(sc, true);
2567 			/*
2568 			 * NOTE:
2569 			 * hn_stop() only suspends data, so management
2570 			 * tasks have to be suspended manually here.
2571 			 */
2572 			hn_suspend_mgmt(sc);
2573 			hn_synth_detach(sc);
2574 		}
2575 		HN_UNLOCK(sc);
2576 		ether_ifdetach(ifp);
2577 	}
2578 
2579 	ifmedia_removeall(&sc->hn_media);
2580 	hn_destroy_rx_data(sc);
2581 	hn_destroy_tx_data(sc);
2582 
2583 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2584 		int i;
2585 
2586 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2587 			taskqueue_free(sc->hn_tx_taskqs[i]);
2588 		free(sc->hn_tx_taskqs, M_DEVBUF);
2589 	}
2590 	taskqueue_free(sc->hn_mgmt_taskq0);
2591 	if (sc->hn_vf_taskq != NULL)
2592 		taskqueue_free(sc->hn_vf_taskq);
2593 
2594 	if (sc->hn_xact != NULL) {
2595 		/*
2596 		 * Uninstall the orphan handler _before_ the xact is
2597 		 * destructed.
2598 		 */
2599 		vmbus_chan_unset_orphan(sc->hn_prichan);
2600 		vmbus_xact_ctx_destroy(sc->hn_xact);
2601 	}
2602 
2603 	if_free(ifp);
2604 
2605 	HN_LOCK_DESTROY(sc);
2606 	rm_destroy(&sc->hn_vf_lock);
2607 	return (0);
2608 }
2609 
2610 static int
2611 hn_shutdown(device_t dev)
2612 {
2613 
2614 	return (0);
2615 }
2616 
2617 static void
2618 hn_link_status(struct hn_softc *sc)
2619 {
2620 	uint32_t link_status;
2621 	int error;
2622 
2623 	error = hn_rndis_get_linkstatus(sc, &link_status);
2624 	if (error) {
2625 		/* XXX what to do? */
2626 		return;
2627 	}
2628 
2629 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2630 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2631 	else
2632 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2633 	if_link_state_change(sc->hn_ifp,
2634 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2635 	    LINK_STATE_UP : LINK_STATE_DOWN);
2636 }
2637 
2638 static void
2639 hn_link_taskfunc(void *xsc, int pending __unused)
2640 {
2641 	struct hn_softc *sc = xsc;
2642 
2643 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2644 		return;
2645 	hn_link_status(sc);
2646 }
2647 
2648 static void
2649 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2650 {
2651 	struct hn_softc *sc = xsc;
2652 
2653 	/* Prevent any link status checks from running. */
2654 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2655 
2656 	/*
2657 	 * Fake up a [link down --> link up] state change; 5 seconds
2658 	 * delay is used, which closely simulates miibus reaction
2659 	 * upon link down event.
2660 	 */
2661 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2662 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2663 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2664 	    &sc->hn_netchg_status, 5 * hz);
2665 }
2666 
2667 static void
2668 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2669 {
2670 	struct hn_softc *sc = xsc;
2671 
2672 	/* Re-allow link status checks. */
2673 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2674 	hn_link_status(sc);
2675 }
2676 
2677 static void
2678 hn_update_link_status(struct hn_softc *sc)
2679 {
2680 
2681 	if (sc->hn_mgmt_taskq != NULL)
2682 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2683 }
2684 
2685 static void
2686 hn_change_network(struct hn_softc *sc)
2687 {
2688 
2689 	if (sc->hn_mgmt_taskq != NULL)
2690 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2691 }
2692 
2693 static __inline int
2694 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2695     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2696 {
2697 	struct mbuf *m = *m_head;
2698 	int error;
2699 
2700 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2701 
2702 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2703 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2704 	if (error == EFBIG) {
2705 		struct mbuf *m_new;
2706 
2707 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2708 		if (m_new == NULL)
2709 			return ENOBUFS;
2710 		else
2711 			*m_head = m = m_new;
2712 		txr->hn_tx_collapsed++;
2713 
2714 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2715 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2716 	}
2717 	if (!error) {
2718 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2719 		    BUS_DMASYNC_PREWRITE);
2720 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2721 	}
2722 	return error;
2723 }
2724 
2725 static __inline int
2726 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2727 {
2728 
2729 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2730 	    ("put an onlist txd %#x", txd->flags));
2731 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2732 	    ("put an onagg txd %#x", txd->flags));
2733 
2734 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
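	/*
	 * Drop one reference; only the holder of the last reference tears
	 * the descriptor down and returns it to the free list/ring.
	 */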
2735 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2736 		return 0;
2737 
2738 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2739 		struct hn_txdesc *tmp_txd;
2740 
2741 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2742 			int freed __diagused;
2743 
2744 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2745 			    ("recursive aggregation on aggregated txdesc"),
2746 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2747 			    ("not aggregated txdesc"));
2748 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2749 			    ("aggregated txdesc uses dmamap"));
2750 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2751 			    ("aggregated txdesc consumes "
2752 			     "chimney sending buffer"));
2753 			KASSERT(tmp_txd->chim_size == 0,
2754 			    ("aggregated txdesc has non-zero "
2755 			     "chimney sending size"));
2756 
2757 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2758 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2759 			freed = hn_txdesc_put(txr, tmp_txd);
2760 			KASSERT(freed, ("failed to free aggregated txdesc"));
2761 		}
2762 	}
2763 
2764 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2765 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2766 		    ("chim txd uses dmamap"));
2767 		hn_chim_free(txr->hn_sc, txd->chim_index);
2768 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2769 		txd->chim_size = 0;
2770 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2771 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2772 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2773 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2774 		    txd->data_dmap);
2775 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2776 	}
2777 
2778 	if (txd->m != NULL) {
2779 		m_freem(txd->m);
2780 		txd->m = NULL;
2781 	}
2782 
2783 	txd->flags |= HN_TXD_FLAG_ONLIST;
2784 #ifndef HN_USE_TXDESC_BUFRING
2785 	mtx_lock_spin(&txr->hn_txlist_spin);
2786 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2787 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2788 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2789 	txr->hn_txdesc_avail++;
2790 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2791 	mtx_unlock_spin(&txr->hn_txlist_spin);
2792 #else	/* HN_USE_TXDESC_BUFRING */
2793 #ifdef HN_DEBUG
2794 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2795 #endif
2796 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2797 #endif	/* !HN_USE_TXDESC_BUFRING */
2798 
2799 	return 1;
2800 }
2801 
2802 static __inline struct hn_txdesc *
2803 hn_txdesc_get(struct hn_tx_ring *txr)
2804 {
2805 	struct hn_txdesc *txd;
2806 
2807 #ifndef HN_USE_TXDESC_BUFRING
2808 	mtx_lock_spin(&txr->hn_txlist_spin);
2809 	txd = SLIST_FIRST(&txr->hn_txlist);
2810 	if (txd != NULL) {
2811 		KASSERT(txr->hn_txdesc_avail > 0,
2812 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2813 		txr->hn_txdesc_avail--;
2814 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2815 	}
2816 	mtx_unlock_spin(&txr->hn_txlist_spin);
2817 #else
2818 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2819 #endif
2820 
2821 	if (txd != NULL) {
2822 #ifdef HN_USE_TXDESC_BUFRING
2823 #ifdef HN_DEBUG
2824 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2825 #endif
2826 #endif	/* HN_USE_TXDESC_BUFRING */
2827 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2828 		    STAILQ_EMPTY(&txd->agg_list) &&
2829 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2830 		    txd->chim_size == 0 &&
2831 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2832 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2833 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2834 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2835 		txd->refs = 1;
2836 	}
2837 	return txd;
2838 }
2839 
2840 static __inline void
2841 hn_txdesc_hold(struct hn_txdesc *txd)
2842 {
2843 
2844 	/* 0->1 transition will never work */
2845 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2846 	atomic_add_int(&txd->refs, 1);
2847 }
2848 
2849 static __inline void
2850 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2851 {
2852 
2853 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2854 	    ("recursive aggregation on aggregating txdesc"));
2855 
2856 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2857 	    ("already aggregated"));
2858 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2859 	    ("recursive aggregation on to-be-aggregated txdesc"));
2860 
2861 	txd->flags |= HN_TXD_FLAG_ONAGG;
2862 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2863 }
2864 
2865 static bool
2866 hn_tx_ring_pending(struct hn_tx_ring *txr)
2867 {
2868 	bool pending = false;
2869 
2870 #ifndef HN_USE_TXDESC_BUFRING
2871 	mtx_lock_spin(&txr->hn_txlist_spin);
2872 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2873 		pending = true;
2874 	mtx_unlock_spin(&txr->hn_txlist_spin);
2875 #else
2876 	if (!buf_ring_full(txr->hn_txdesc_br))
2877 		pending = true;
2878 #endif
2879 	return (pending);
2880 }
2881 
2882 static __inline void
2883 hn_txeof(struct hn_tx_ring *txr)
2884 {
2885 	txr->hn_has_txeof = 0;
2886 	txr->hn_txeof(txr);
2887 }
2888 
2889 static void
2890 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2891     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2892 {
2893 	struct hn_txdesc *txd = sndc->hn_cbarg;
2894 	struct hn_tx_ring *txr;
2895 
2896 	txr = txd->txr;
2897 	KASSERT(txr->hn_chan == chan,
2898 	    ("channel mismatch, on chan%u, should be chan%u",
2899 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2900 
2901 	txr->hn_has_txeof = 1;
2902 	hn_txdesc_put(txr, txd);
2903 
2904 	++txr->hn_txdone_cnt;
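	/*
	 * Once enough completions have accumulated, run the txeof handler
	 * early (if the ring is marked oactive) instead of waiting for the
	 * next channel rollup.
	 */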
2905 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2906 		txr->hn_txdone_cnt = 0;
2907 		if (txr->hn_oactive)
2908 			hn_txeof(txr);
2909 	}
2910 }
2911 
2912 static void
2913 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2914 {
2915 #if defined(INET) || defined(INET6)
2916 	struct epoch_tracker et;
2917 
2918 	NET_EPOCH_ENTER(et);
2919 	tcp_lro_flush_all(&rxr->hn_lro);
2920 	NET_EPOCH_EXIT(et);
2921 #endif
2922 
2923 	/*
2924 	 * NOTE:
2925 	 * 'txr' could be NULL, if multiple channels and the
2926 	 * ifnet.if_start method are enabled.
2927 	 */
2928 	if (txr == NULL || !txr->hn_has_txeof)
2929 		return;
2930 
2931 	txr->hn_txdone_cnt = 0;
2932 	hn_txeof(txr);
2933 }
2934 
2935 static __inline uint32_t
2936 hn_rndis_pktmsg_offset(uint32_t ofs)
2937 {
2938 
2939 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2940 	    ("invalid RNDIS packet msg offset %u", ofs));
2941 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2942 }
2943 
2944 static __inline void *
2945 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2946     size_t pi_dlen, uint32_t pi_type)
2947 {
2948 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2949 	struct rndis_pktinfo *pi;
2950 
2951 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2952 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2953 
2954 	/*
2955 	 * Per-packet-info does not move; it only grows.
2956 	 *
2957 	 * NOTE:
2958 	 * rm_pktinfooffset in this phase counts from the beginning
2959 	 * of rndis_packet_msg.
2960 	 */
2961 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2962 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2963 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2964 	    pkt->rm_pktinfolen);
2965 	pkt->rm_pktinfolen += pi_size;
2966 
2967 	pi->rm_size = pi_size;
2968 	pi->rm_type = pi_type;
2969 	pi->rm_internal = 0;
2970 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2971 
2972 	return (pi->rm_data);
2973 }
2974 
2975 static __inline int
2976 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2977 {
2978 	struct hn_txdesc *txd;
2979 	struct mbuf *m;
2980 	int error, pkts;
2981 
2982 	txd = txr->hn_agg_txd;
2983 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2984 
2985 	/*
2986 	 * Since hn_txpkt() will reset this temporary stat, save
2987 	 * it now, so that oerrors can be updated properly, if
2988 	 * hn_txpkt() ever fails.
2989 	 */
2990 	pkts = txr->hn_stat_pkts;
2991 
2992 	/*
2993 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2994 	 * failure, save it for later freeing, if hn_txpkt() ever
2995 	 * fails.
2996 	 */
2997 	m = txd->m;
2998 	error = hn_txpkt(ifp, txr, txd);
2999 	if (__predict_false(error)) {
3000 		/* txd is freed, but m is not. */
3001 		m_freem(m);
3002 
3003 		txr->hn_flush_failed++;
3004 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
3005 	}
3006 
3007 	/* Reset all aggregation states. */
3008 	txr->hn_agg_txd = NULL;
3009 	txr->hn_agg_szleft = 0;
3010 	txr->hn_agg_pktleft = 0;
3011 	txr->hn_agg_prevpkt = NULL;
3012 
3013 	return (error);
3014 }
3015 
3016 static void *
3017 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3018     int pktsize)
3019 {
3020 	void *chim;
3021 
3022 	if (txr->hn_agg_txd != NULL) {
3023 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
3024 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
3025 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
3026 			int olen;
3027 
3028 			/*
3029 			 * Update the previous RNDIS packet's total length,
3030 			 * it can be increased due to the mandatory alignment
3031 			 * padding for this RNDIS packet.  And update the
3032 			 * aggregating txdesc's chimney sending buffer size
3033 			 * accordingly.
3034 			 *
3035 			 * XXX
3036 			 * Zero-out the padding, as required by the RNDIS spec.
3037 			 */
3038 			olen = pkt->rm_len;
3039 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3040 			agg_txd->chim_size += pkt->rm_len - olen;
3041 
3042 			/* Link this txdesc to the parent. */
3043 			hn_txdesc_agg(agg_txd, txd);
3044 
3045 			chim = (uint8_t *)pkt + pkt->rm_len;
3046 			/* Save the current packet for later fixup. */
3047 			txr->hn_agg_prevpkt = chim;
3048 
3049 			txr->hn_agg_pktleft--;
3050 			txr->hn_agg_szleft -= pktsize;
3051 			if (txr->hn_agg_szleft <=
3052 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3053 				/*
3054 				 * Probably can't aggregate more packets,
3055 				 * flush this aggregating txdesc proactively.
3056 				 */
3057 				txr->hn_agg_pktleft = 0;
3058 			}
3059 			/* Done! */
3060 			return (chim);
3061 		}
3062 		hn_flush_txagg(ifp, txr);
3063 	}
3064 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3065 
3066 	txr->hn_tx_chimney_tried++;
3067 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3068 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3069 		return (NULL);
3070 	txr->hn_tx_chimney++;
3071 
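	/* Locate this txdesc's slot in the shared chimney sending buffer. */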
3072 	chim = txr->hn_sc->hn_chim +
3073 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3074 
3075 	if (txr->hn_agg_pktmax > 1 &&
3076 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3077 		txr->hn_agg_txd = txd;
3078 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3079 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3080 		txr->hn_agg_prevpkt = chim;
3081 	}
3082 	return (chim);
3083 }
3084 
3085 /*
3086  * NOTE:
3087  * If this function fails, then both txd and m_head0 will be freed.
3088  */
3089 static int
3090 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3091     struct mbuf **m_head0)
3092 {
3093 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3094 	int error, nsegs, i;
3095 	struct mbuf *m_head = *m_head0;
3096 	struct rndis_packet_msg *pkt;
3097 	uint32_t *pi_data;
3098 	void *chim = NULL;
3099 	int pkt_hlen, pkt_size;
3100 
3101 	pkt = txd->rndis_pkt;
3102 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3103 	if (pkt_size < txr->hn_chim_size) {
3104 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3105 		if (chim != NULL)
3106 			pkt = chim;
3107 	} else {
3108 		if (txr->hn_agg_txd != NULL)
3109 			hn_flush_txagg(ifp, txr);
3110 	}
3111 
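	/*
	 * Fill in the RNDIS packet message header.  Offsets are written
	 * relative to the start of the message here and converted to the
	 * RNDIS on-wire convention further below.
	 */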
3112 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3113 	pkt->rm_len = m_head->m_pkthdr.len;
3114 	pkt->rm_dataoffset = 0;
3115 	pkt->rm_datalen = m_head->m_pkthdr.len;
3116 	pkt->rm_oobdataoffset = 0;
3117 	pkt->rm_oobdatalen = 0;
3118 	pkt->rm_oobdataelements = 0;
3119 	pkt->rm_pktinfooffset = sizeof(*pkt);
3120 	pkt->rm_pktinfolen = 0;
3121 	pkt->rm_vchandle = 0;
3122 	pkt->rm_reserved = 0;
3123 
3124 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3125 		/*
3126 		 * Set the hash value for this packet.
3127 		 */
3128 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3129 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3130 
3131 		if (M_HASHTYPE_ISHASH(m_head))
3132 			/*
3133 			 * The flowid field contains the hash value the host
3134 			 * set in the RX queue if this is an IP forwarding packet.
3135 			 * Set the same hash value so the host can send it on
3136 			 * the CPU where it was received.
3137 			 */
3138 			*pi_data = m_head->m_pkthdr.flowid;
3139 		else
3140 			/*
3141 			 * Otherwise just put the tx queue index.
3142 			 */
3143 			*pi_data = txr->hn_tx_idx;
3144 	}
3145 
3146 	if (m_head->m_flags & M_VLANTAG) {
3147 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3148 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3149 		*pi_data = NDIS_VLAN_INFO_MAKE(
3150 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3151 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3152 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3153 	}
3154 
3155 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3156 #if defined(INET6) || defined(INET)
3157 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3158 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3159 #ifdef INET
3160 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3161 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3162 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3163 			    m_head->m_pkthdr.tso_segsz);
3164 		}
3165 #endif
3166 #if defined(INET6) && defined(INET)
3167 		else
3168 #endif
3169 #ifdef INET6
3170 		{
3171 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3172 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3173 			    m_head->m_pkthdr.tso_segsz);
3174 		}
3175 #endif
3176 #endif	/* INET6 || INET */
3177 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3178 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3179 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3180 		if (m_head->m_pkthdr.csum_flags &
3181 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3182 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3183 		} else {
3184 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3185 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3186 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3187 		}
3188 
3189 		if (m_head->m_pkthdr.csum_flags &
3190 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3191 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3192 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3193 		} else if (m_head->m_pkthdr.csum_flags &
3194 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3195 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3196 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3197 		}
3198 	}
3199 
3200 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3201 	/* Fixup RNDIS packet message total length */
3202 	pkt->rm_len += pkt_hlen;
3203 	/* Convert RNDIS packet message offsets */
3204 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3205 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3206 
3207 	/*
3208 	 * Fast path: Chimney sending.
3209 	 */
3210 	if (chim != NULL) {
3211 		struct hn_txdesc *tgt_txd = txd;
3212 
3213 		if (txr->hn_agg_txd != NULL) {
3214 			tgt_txd = txr->hn_agg_txd;
3215 #ifdef INVARIANTS
3216 			*m_head0 = NULL;
3217 #endif
3218 		}
3219 
3220 		KASSERT(pkt == chim,
3221 		    ("RNDIS pkt not in chimney sending buffer"));
3222 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3223 		    ("chimney sending buffer is not used"));
3224 		tgt_txd->chim_size += pkt->rm_len;
3225 
3226 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3227 		    ((uint8_t *)chim) + pkt_hlen);
3228 
3229 		txr->hn_gpa_cnt = 0;
3230 		txr->hn_sendpkt = hn_txpkt_chim;
3231 		goto done;
3232 	}
3233 
3234 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3235 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3236 	    ("chimney buffer is used"));
3237 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3238 
3239 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3240 	if (__predict_false(error)) {
3241 		int freed __diagused;
3242 
3243 		/*
3244 		 * This mbuf is not linked w/ the txd yet, so free it now.
3245 		 */
3246 		m_freem(m_head);
3247 		*m_head0 = NULL;
3248 
3249 		freed = hn_txdesc_put(txr, txd);
3250 		KASSERT(freed != 0,
3251 		    ("fail to free txd upon txdma error"));
3252 
3253 		txr->hn_txdma_failed++;
3254 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3255 		return error;
3256 	}
3257 	*m_head0 = m_head;
3258 
3259 	/* +1 RNDIS packet message */
3260 	txr->hn_gpa_cnt = nsegs + 1;
3261 
3262 	/* send packet with page buffer */
3263 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3264 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3265 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3266 
3267 	/*
3268 	 * Fill the page buffers with mbuf info after the page
3269 	 * buffer for RNDIS packet message.
3270 	 */
3271 	for (i = 0; i < nsegs; ++i) {
3272 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3273 
3274 		gpa->gpa_page = atop(segs[i].ds_addr);
3275 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3276 		gpa->gpa_len = segs[i].ds_len;
3277 	}
3278 
3279 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3280 	txd->chim_size = 0;
3281 	txr->hn_sendpkt = hn_txpkt_sglist;
3282 done:
3283 	txd->m = m_head;
3284 
3285 	/* Set the completion routine */
3286 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3287 
3288 	/* Update temporary stats for later use. */
3289 	txr->hn_stat_pkts++;
3290 	txr->hn_stat_size += m_head->m_pkthdr.len;
3291 	if (m_head->m_flags & M_MCAST)
3292 		txr->hn_stat_mcasts++;
3293 
3294 	return 0;
3295 }
3296 
3297 /*
3298  * NOTE:
3299  * If this function fails, then txd will be freed, but the mbuf
3300  * associated w/ the txd will _not_ be freed.
3301  */
3302 static int
3303 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3304 {
3305 	int error, send_failed = 0, has_bpf;
3306 
3307 again:
3308 	has_bpf = bpf_peers_present(if_getbpf(ifp));
3309 	if (has_bpf) {
3310 		/*
3311 		 * Make sure that this txd and any aggregated txds are not
3312 		 * freed before ETHER_BPF_MTAP.
3313 		 */
3314 		hn_txdesc_hold(txd);
3315 	}
3316 	error = txr->hn_sendpkt(txr, txd);
3317 	if (!error) {
3318 		if (has_bpf) {
3319 			const struct hn_txdesc *tmp_txd;
3320 
3321 			ETHER_BPF_MTAP(ifp, txd->m);
3322 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3323 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3324 		}
3325 
3326 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3327 #ifdef HN_IFSTART_SUPPORT
3328 		if (!hn_use_if_start)
3329 #endif
3330 		{
3331 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3332 			    txr->hn_stat_size);
3333 			if (txr->hn_stat_mcasts != 0) {
3334 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3335 				    txr->hn_stat_mcasts);
3336 			}
3337 		}
3338 		txr->hn_pkts += txr->hn_stat_pkts;
3339 		txr->hn_sends++;
3340 	}
3341 	if (has_bpf)
3342 		hn_txdesc_put(txr, txd);
3343 
3344 	if (__predict_false(error)) {
3345 		int freed __diagused;
3346 
3347 		/*
3348 		 * This should "really rarely" happen.
3349 		 *
3350 		 * XXX Too many RX to be acked or too many sideband
3351 		 * commands to run?  Ask netvsc_channel_rollup()
3352 		 * to kick start later.
3353 		 */
3354 		txr->hn_has_txeof = 1;
3355 		if (!send_failed) {
3356 			txr->hn_send_failed++;
3357 			send_failed = 1;
3358 			/*
3359 			 * Try sending again after setting hn_has_txeof,
3360 			 * in case we missed the last
3361 			 * netvsc_channel_rollup().
3362 			 */
3363 			goto again;
3364 		}
3365 		if_printf(ifp, "send failed\n");
3366 
3367 		/*
3368 		 * Caller will perform further processing on the
3369 		 * associated mbuf, so don't free it in hn_txdesc_put();
3370 		 * only unload it from the DMA map in hn_txdesc_put(),
3371 		 * if it was loaded.
3372 		 */
3373 		txd->m = NULL;
3374 		freed = hn_txdesc_put(txr, txd);
3375 		KASSERT(freed != 0,
3376 		    ("fail to free txd upon send error"));
3377 
3378 		txr->hn_send_failed++;
3379 	}
3380 
3381 	/* Reset temporary stats, after this sending is done. */
3382 	txr->hn_stat_size = 0;
3383 	txr->hn_stat_pkts = 0;
3384 	txr->hn_stat_mcasts = 0;
3385 
3386 	return (error);
3387 }
3388 
3389 /*
3390  * Append the specified data to the indicated mbuf chain.
3391  * Extend the mbuf chain if the new data does not fit in
3392  * existing space.
3393  *
3394  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3395  * There should be an equivalent in the kernel mbuf code,
3396  * but there does not appear to be one yet.
3397  *
3398  * Differs from m_append() in that additional mbufs are
3399  * allocated with cluster size MJUMPAGESIZE, and filled
3400  * accordingly.
3401  *
3402  * Return the last mbuf in the chain or NULL if failed to
3403  * allocate new mbuf.
3404  */
3405 static struct mbuf *
3406 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3407 {
3408 	struct mbuf *m, *n;
3409 	int remainder, space;
3410 
3411 	for (m = m0; m->m_next != NULL; m = m->m_next)
3412 		;
3413 	remainder = len;
3414 	space = M_TRAILINGSPACE(m);
3415 	if (space > 0) {
3416 		/*
3417 		 * Copy into available space.
3418 		 */
3419 		if (space > remainder)
3420 			space = remainder;
3421 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3422 		m->m_len += space;
3423 		cp += space;
3424 		remainder -= space;
3425 	}
3426 	while (remainder > 0) {
3427 		/*
3428 		 * Allocate a new mbuf; could check space
3429 		 * and allocate a cluster instead.
3430 		 */
3431 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3432 		if (n == NULL)
3433 			return NULL;
3434 		n->m_len = min(MJUMPAGESIZE, remainder);
3435 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3436 		cp += n->m_len;
3437 		remainder -= n->m_len;
3438 		m->m_next = n;
3439 		m = n;
3440 	}
3441 
3442 	return m;
3443 }
3444 
3445 #if defined(INET) || defined(INET6)
3446 static __inline int
3447 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3448 {
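	/*
	 * With a non-zero LRO mbuf queue depth, queue the mbuf and let
	 * tcp_lro_flush_all() in hn_chan_rollup() do the work; otherwise
	 * run LRO inline.
	 */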
3449 	if (hn_lro_mbufq_depth) {
3450 		tcp_lro_queue_mbuf(lc, m);
3451 		return 0;
3452 	}
3453 	return tcp_lro_rx(lc, m, 0);
3454 }
3455 #endif
3456 
3457 static int
3458 hn_rxpkt(struct hn_rx_ring *rxr)
3459 {
3460 	if_t ifp, hn_ifp = rxr->hn_ifp;
3461 	struct mbuf *m_new, *n;
3462 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3463 	int hash_type = M_HASHTYPE_NONE;
3464 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3465 	int i;
3466 
3467 	ifp = hn_ifp;
3468 	if (rxr->hn_rxvf_ifp != NULL) {
3469 		/*
3470 		 * Non-transparent mode VF; pretend this packet is from
3471 		 * the VF.
3472 		 */
3473 		ifp = rxr->hn_rxvf_ifp;
3474 		is_vf = 1;
3475 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3476 		/* Transparent mode VF. */
3477 		is_vf = 1;
3478 	}
3479 
3480 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3481 		/*
3482 		 * NOTE:
3483 		 * See the NOTE of hn_rndis_init_fixat().  This
3484 		 * function can be reached immediately after the
3485 		 * RNDIS is initialized but before the ifnet is
3486 		 * set up on the hn_attach() path; drop the unexpected
3487 		 * packets.
3488 		 */
3489 		return (0);
3490 	}
3491 
3492 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3493 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3494 		return (0);
3495 	}
3496 
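	/*
	 * A single-fragment packet that fits in MHLEN is copied straight
	 * into a plain mbuf header; larger or multi-fragment packets go
	 * through the cluster path below.
	 */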
3497 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3498 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3499 		if (m_new == NULL) {
3500 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3501 			return (0);
3502 		}
3503 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3504 		    rxr->rsc.frag_len[0]);
3505 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3506 	} else {
3507 		/*
3508 		 * Get an mbuf with a cluster.  For packets 2K or less,
3509 		 * get a standard 2K cluster.  For anything larger, get a
3510 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3511 		 * if looped around to the Hyper-V TX channel, so avoid them.
3512 		 */
3513 		size = MCLBYTES;
3514 		if (rxr->rsc.pktlen > MCLBYTES) {
3515 			/* 4096 */
3516 			size = MJUMPAGESIZE;
3517 		}
3518 
3519 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3520 		if (m_new == NULL) {
3521 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3522 			return (0);
3523 		}
3524 
3525 		n = m_new;
3526 		for (i = 0; i < rxr->rsc.cnt; i++) {
3527 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3528 			    rxr->rsc.frag_data[i]);
3529 			if (n == NULL) {
3530 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3531 				return (0);
3532 			} else {
3533 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3534 			}
3535 		}
3536 	}
3537 	if (rxr->rsc.pktlen <= MHLEN)
3538 		rxr->hn_small_pkts++;
3539 
3540 	m_new->m_pkthdr.rcvif = ifp;
3541 
3542 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3543 		do_csum = 0;
3544 
3545 	/* receive side checksum offload */
3546 	if (rxr->rsc.csum_info != NULL) {
3547 		/* IP csum offload */
3548 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3549 			m_new->m_pkthdr.csum_flags |=
3550 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3551 			rxr->hn_csum_ip++;
3552 		}
3553 
3554 		/* TCP/UDP csum offload */
3555 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3556 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3557 			m_new->m_pkthdr.csum_flags |=
3558 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3559 			m_new->m_pkthdr.csum_data = 0xffff;
3560 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3561 				rxr->hn_csum_tcp++;
3562 			else
3563 				rxr->hn_csum_udp++;
3564 		}
3565 
3566 		/*
3567 		 * XXX
3568 		 * As of this writing (Oct 28th, 2016), the host side will turn
3569 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3570 		 * the do_lro setting here is actually _not_ accurate.  We
3571 		 * depend on the RSS hash type check to reset do_lro.
3572 		 */
3573 		if ((*(rxr->rsc.csum_info) &
3574 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3575 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3576 			do_lro = 1;
3577 	} else {
3578 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3579 		if (l3proto == ETHERTYPE_IP) {
3580 			if (l4proto == IPPROTO_TCP) {
3581 				if (do_csum &&
3582 				    (rxr->hn_trust_hcsum &
3583 				     HN_TRUST_HCSUM_TCP)) {
3584 					rxr->hn_csum_trusted++;
3585 					m_new->m_pkthdr.csum_flags |=
3586 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3587 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3588 					m_new->m_pkthdr.csum_data = 0xffff;
3589 				}
3590 				do_lro = 1;
3591 			} else if (l4proto == IPPROTO_UDP) {
3592 				if (do_csum &&
3593 				    (rxr->hn_trust_hcsum &
3594 				     HN_TRUST_HCSUM_UDP)) {
3595 					rxr->hn_csum_trusted++;
3596 					m_new->m_pkthdr.csum_flags |=
3597 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3598 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3599 					m_new->m_pkthdr.csum_data = 0xffff;
3600 				}
3601 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3602 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3603 				rxr->hn_csum_trusted++;
3604 				m_new->m_pkthdr.csum_flags |=
3605 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3606 			}
3607 		}
3608 	}
3609 
3610 	if (rxr->rsc.vlan_info != NULL) {
3611 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3612 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3613 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3614 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3615 		m_new->m_flags |= M_VLANTAG;
3616 	}
3617 
3618 	/*
3619 	 * If VF is activated (transparent/non-transparent mode does not
3620 	 * matter here).
3621 	 *
3622 	 * - Disable LRO
3623 	 *
3624 	 *   hn(4) will only receive broadcast packets, multicast packets,
3625 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3626 	 *   packet types.
3627 	 *
3628 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3629 	 *   all, since the LRO flush will use hn(4) as the receiving
3630 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3631 	 */
3632 	if (is_vf)
3633 		do_lro = 0;
3634 
3635 	/*
3636 	 * If VF is activated (transparent/non-transparent mode does not
3637 	 * matter here), do _not_ mess with unsupported hash types or
3638 	 * functions.
3639 	 */
3640 	if (rxr->rsc.hash_info != NULL) {
3641 		rxr->hn_rss_pkts++;
3642 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3643 		if (!is_vf)
3644 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3645 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3646 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3647 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3648 			    rxr->hn_mbuf_hash);
3649 
3650 			/*
3651 			 * NOTE:
3652 			 * do_lro is reset if the hash types are not TCP
3653 			 * related.  See the comment in the above csum_flags
3654 			 * setup section.
3655 			 */
3656 			switch (type) {
3657 			case NDIS_HASH_IPV4:
3658 				hash_type = M_HASHTYPE_RSS_IPV4;
3659 				do_lro = 0;
3660 				break;
3661 
3662 			case NDIS_HASH_TCP_IPV4:
3663 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3664 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3665 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3666 
3667 					if (is_vf)
3668 						def_htype = M_HASHTYPE_NONE;
3669 
3670 					/*
3671 					 * UDP 4-tuple hash is delivered as
3672 					 * TCP 4-tuple hash.
3673 					 */
3674 					if (l3proto == ETHERTYPE_MAX) {
3675 						hn_rxpkt_proto(m_new,
3676 						    &l3proto, &l4proto);
3677 					}
3678 					if (l3proto == ETHERTYPE_IP) {
3679 						if (l4proto == IPPROTO_UDP &&
3680 						    (rxr->hn_mbuf_hash &
3681 						     NDIS_HASH_UDP_IPV4_X)) {
3682 							hash_type =
3683 							M_HASHTYPE_RSS_UDP_IPV4;
3684 							do_lro = 0;
3685 						} else if (l4proto !=
3686 						    IPPROTO_TCP) {
3687 							hash_type = def_htype;
3688 							do_lro = 0;
3689 						}
3690 					} else {
3691 						hash_type = def_htype;
3692 						do_lro = 0;
3693 					}
3694 				}
3695 				break;
3696 
3697 			case NDIS_HASH_IPV6:
3698 				hash_type = M_HASHTYPE_RSS_IPV6;
3699 				do_lro = 0;
3700 				break;
3701 
3702 			case NDIS_HASH_IPV6_EX:
3703 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3704 				do_lro = 0;
3705 				break;
3706 
3707 			case NDIS_HASH_TCP_IPV6:
3708 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3709 				break;
3710 
3711 			case NDIS_HASH_TCP_IPV6_EX:
3712 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3713 				break;
3714 			}
3715 		}
3716 	} else if (!is_vf) {
3717 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3718 		hash_type = M_HASHTYPE_OPAQUE;
3719 	}
3720 	M_HASHTYPE_SET(m_new, hash_type);
3721 
3722 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3723 	if (hn_ifp != ifp) {
3724 		const struct ether_header *eh;
3725 
3726 		/*
3727 		 * Non-transparent mode VF is activated.
3728 		 */
3729 
3730 		/*
3731 		 * Allow tapping on hn(4).
3732 		 */
3733 		ETHER_BPF_MTAP(hn_ifp, m_new);
3734 
3735 		/*
3736 		 * Update hn(4)'s stats.
3737 		 */
3738 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3739 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3740 		/* Checked at the beginning of this function. */
3741 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3742 		eh = mtod(m_new, struct ether_header *);
3743 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3744 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3745 	}
3746 	rxr->hn_pkts++;
3747 
3748 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3749 #if defined(INET) || defined(INET6)
3750 		struct lro_ctrl *lro = &rxr->hn_lro;
3751 
3752 		if (lro->lro_cnt) {
3753 			rxr->hn_lro_tried++;
3754 			if (hn_lro_rx(lro, m_new) == 0) {
3755 				/* DONE! */
3756 				return 0;
3757 			}
3758 		}
3759 #endif
3760 	}
3761 	if_input(ifp, m_new);
3762 
3763 	return (0);
3764 }
3765 
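/*
 * Interface ioctl handler.  Requests that also affect an attached
 * transparent-mode VF (MTU, flags, capabilities, media, multicast)
 * are propagated to the VF.  An MTU change requires the synthetic
 * parts (NVS/RNDIS) to be detached and reattached, so the interface
 * is suspended and resumed around it.
 */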
3766 static int
3767 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3768 {
3769 	struct hn_softc *sc = if_getsoftc(ifp);
3770 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3771 	if_t vf_ifp;
3772 	int mask, error = 0;
3773 	struct ifrsskey *ifrk;
3774 	struct ifrsshash *ifrh;
3775 	uint32_t mtu;
3776 
3777 	switch (cmd) {
3778 	case SIOCSIFMTU:
3779 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3780 			error = EINVAL;
3781 			break;
3782 		}
3783 
3784 		HN_LOCK(sc);
3785 
3786 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3787 			HN_UNLOCK(sc);
3788 			break;
3789 		}
3790 
3791 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3792 			/* Can't change MTU */
3793 			HN_UNLOCK(sc);
3794 			error = EOPNOTSUPP;
3795 			break;
3796 		}
3797 
3798 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3799 			HN_UNLOCK(sc);
3800 			break;
3801 		}
3802 
3803 		if (hn_xpnt_vf_isready(sc)) {
3804 			vf_ifp = sc->hn_vf_ifp;
3805 			ifr_vf = *ifr;
3806 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3807 			    sizeof(ifr_vf.ifr_name));
3808 			error = ifhwioctl(SIOCSIFMTU,vf_ifp,
3809 			    (caddr_t)&ifr_vf, curthread);
3810 			if (error) {
3811 				HN_UNLOCK(sc);
3812 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3813 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3814 				break;
3815 			}
3816 		}
3817 
3818 		/*
3819 		 * Suspend this interface before the synthetic parts
3820 		 * are torn down.
3821 		 */
3822 		hn_suspend(sc);
3823 
3824 		/*
3825 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3826 		 */
3827 		hn_synth_detach(sc);
3828 
3829 		/*
3830 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3831 		 * with the new MTU setting.
3832 		 */
3833 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3834 		if (error) {
3835 			HN_UNLOCK(sc);
3836 			break;
3837 		}
3838 
3839 		error = hn_rndis_get_mtu(sc, &mtu);
3840 		if (error)
3841 			mtu = ifr->ifr_mtu;
3842 		else if (bootverbose)
3843 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3844 
3845 		/*
3846 		 * Commit the requested MTU, after the synthetic parts
3847 		 * have been successfully attached.
3848 		 */
3849 		if (mtu >= ifr->ifr_mtu) {
3850 			mtu = ifr->ifr_mtu;
3851 		} else {
3852 			if_printf(ifp, "fixup mtu %d -> %u\n",
3853 			    ifr->ifr_mtu, mtu);
3854 		}
3855 		if_setmtu(ifp, mtu);
3856 
3857 		/*
3858 		 * Synthetic parts' reattach may change the chimney
3859 		 * sending size; update it.
3860 		 */
3861 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3862 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3863 
3864 		/*
3865 		 * Make sure that various parameters based on MTU are
3866 		 * still valid, after the MTU change.
3867 		 */
3868 		hn_mtu_change_fixup(sc);
3869 
3870 		/*
3871 		 * All done!  Resume the interface now.
3872 		 */
3873 		hn_resume(sc);
3874 
3875 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3876 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3877 			/*
3878 			 * Since we have reattached the NVS part,
3879 			 * change the datapath to VF again, in case the
3880 			 * setting was lost when the NVS was detached.
3881 			 */
3882 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3883 		}
3884 
3885 		HN_UNLOCK(sc);
3886 		break;
3887 
3888 	case SIOCSIFFLAGS:
3889 		HN_LOCK(sc);
3890 
3891 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3892 			HN_UNLOCK(sc);
3893 			break;
3894 		}
3895 
3896 		if (hn_xpnt_vf_isready(sc))
3897 			hn_xpnt_vf_saveifflags(sc);
3898 
3899 		if (if_getflags(ifp) & IFF_UP) {
3900 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3901 				/*
3902 				 * Caller might hold a mutex, e.g.
3903 				 * bpf; use busy-wait for the RNDIS
3904 				 * reply.
3905 				 */
3906 				HN_NO_SLEEPING(sc);
3907 				hn_rxfilter_config(sc);
3908 				HN_SLEEPING_OK(sc);
3909 
3910 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3911 					error = hn_xpnt_vf_iocsetflags(sc);
3912 			} else {
3913 				hn_init_locked(sc);
3914 			}
3915 		} else {
3916 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3917 				hn_stop(sc, false);
3918 		}
3919 		sc->hn_if_flags = if_getflags(ifp);
3920 
3921 		HN_UNLOCK(sc);
3922 		break;
3923 
3924 	case SIOCSIFCAP:
3925 		HN_LOCK(sc);
3926 
3927 		if (hn_xpnt_vf_isready(sc)) {
3928 			ifr_vf = *ifr;
3929 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3930 			    sizeof(ifr_vf.ifr_name));
3931 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3932 			HN_UNLOCK(sc);
3933 			break;
3934 		}
3935 
3936 		/*
3937 		 * Fix up requested capabilities w/ supported capabilities,
3938 		 * since the supported capabilities could have been changed.
3939 		 */
3940 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3941 		    if_getcapenable(ifp);
3942 
3943 		if (mask & IFCAP_TXCSUM) {
3944 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3945 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3946 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3947 			else
3948 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3949 		}
3950 		if (mask & IFCAP_TXCSUM_IPV6) {
3951 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3952 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3953 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3954 			else
3955 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3956 		}
3957 
3958 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3959 		if (mask & IFCAP_RXCSUM)
3960 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3961 #ifdef foo
3962 		/* We can't tell IPv6 packets from IPv4 packets on the RX path. */
3963 		if (mask & IFCAP_RXCSUM_IPV6)
3964 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3965 #endif
3966 
3967 		if (mask & IFCAP_LRO)
3968 			if_togglecapenable(ifp, IFCAP_LRO);
3969 
3970 		if (mask & IFCAP_TSO4) {
3971 			if_togglecapenable(ifp, IFCAP_TSO4);
3972 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3973 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3974 			else
3975 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3976 		}
3977 		if (mask & IFCAP_TSO6) {
3978 			if_togglecapenable(ifp, IFCAP_TSO6);
3979 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3980 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3981 			else
3982 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3983 		}
3984 
3985 		HN_UNLOCK(sc);
3986 		break;
3987 
3988 	case SIOCADDMULTI:
3989 	case SIOCDELMULTI:
3990 		HN_LOCK(sc);
3991 
3992 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3993 			HN_UNLOCK(sc);
3994 			break;
3995 		}
3996 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3997 			/*
3998 			 * Multicast uses mutex; use busy-wait for
3999 			 * the RNDIS reply.
4000 			 */
4001 			HN_NO_SLEEPING(sc);
4002 			hn_rxfilter_config(sc);
4003 			HN_SLEEPING_OK(sc);
4004 		}
4005 
4006 		/* XXX vlan(4) style mcast addr maintenance */
4007 		if (hn_xpnt_vf_isready(sc)) {
4008 			int old_if_flags;
4009 
4010 			old_if_flags = if_getflags(sc->hn_vf_ifp);
4011 			hn_xpnt_vf_saveifflags(sc);
4012 
4013 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
4014 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
4015 			     IFF_ALLMULTI))
4016 				error = hn_xpnt_vf_iocsetflags(sc);
4017 		}
4018 
4019 		HN_UNLOCK(sc);
4020 		break;
4021 
4022 	case SIOCSIFMEDIA:
4023 	case SIOCGIFMEDIA:
4024 		HN_LOCK(sc);
4025 		if (hn_xpnt_vf_isready(sc)) {
4026 			/*
4027 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4028 			 * create and pass ifr_vf to the VF here; just
4029 			 * replace the ifr_name.
4030 			 */
4031 			vf_ifp = sc->hn_vf_ifp;
4032 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
4033 			    sizeof(ifr->ifr_name));
4034 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
4035 			/* Restore the ifr_name. */
4036 			strlcpy(ifr->ifr_name, if_name(ifp),
4037 			    sizeof(ifr->ifr_name));
4038 			HN_UNLOCK(sc);
4039 			break;
4040 		}
4041 		HN_UNLOCK(sc);
4042 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4043 		break;
4044 
4045 	case SIOCGIFRSSHASH:
4046 		ifrh = (struct ifrsshash *)data;
4047 		HN_LOCK(sc);
4048 		if (sc->hn_rx_ring_inuse == 1) {
4049 			HN_UNLOCK(sc);
4050 			ifrh->ifrh_func = RSS_FUNC_NONE;
4051 			ifrh->ifrh_types = 0;
4052 			break;
4053 		}
4054 
4055 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4056 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4057 		else
4058 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4059 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4060 		HN_UNLOCK(sc);
4061 		break;
4062 
4063 	case SIOCGIFRSSKEY:
4064 		ifrk = (struct ifrsskey *)data;
4065 		HN_LOCK(sc);
4066 		if (sc->hn_rx_ring_inuse == 1) {
4067 			HN_UNLOCK(sc);
4068 			ifrk->ifrk_func = RSS_FUNC_NONE;
4069 			ifrk->ifrk_keylen = 0;
4070 			break;
4071 		}
4072 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4073 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4074 		else
4075 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4076 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4077 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4078 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4079 		HN_UNLOCK(sc);
4080 		break;
4081 
4082 	default:
4083 		error = ether_ioctl(ifp, cmd, data);
4084 		break;
4085 	}
4086 	return (error);
4087 }
4088 
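/*
 * Stop the interface: clear IFF_DRV_RUNNING, disable channel polling,
 * switch the datapath back to the synthetic device and bring down a
 * transparent-mode VF if one is enabled, then suspend data transfers.
 * 'detaching' is true on the detach path, in which case the RX filter
 * is not re-programmed for a non-transparent mode VF.
 */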
4089 static void
4090 hn_stop(struct hn_softc *sc, bool detaching)
4091 {
4092 	if_t ifp = sc->hn_ifp;
4093 	int i;
4094 
4095 	HN_LOCK_ASSERT(sc);
4096 
4097 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4098 	    ("synthetic parts were not attached"));
4099 
4100 	/* Clear RUNNING bit ASAP. */
4101 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4102 
4103 	/* Disable polling. */
4104 	hn_polling(sc, 0);
4105 
4106 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4107 		KASSERT(sc->hn_vf_ifp != NULL,
4108 		    ("%s: VF is not attached", if_name(ifp)));
4109 
4110 		/* Mark transparent mode VF as disabled. */
4111 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4112 
4113 		/*
4114 		 * NOTE:
4115 		 * Datapath setting must happen _before_ bringing
4116 		 * the VF down.
4117 		 */
4118 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4119 
4120 		/*
4121 		 * Bring the VF down.
4122 		 */
4123 		hn_xpnt_vf_saveifflags(sc);
4124 		if_setflagbits(ifp, 0, IFF_UP);
4125 		hn_xpnt_vf_iocsetflags(sc);
4126 	}
4127 
4128 	/* Suspend data transfers. */
4129 	hn_suspend_data(sc);
4130 
4131 	/* Clear OACTIVE bit. */
4132 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4133 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4134 		sc->hn_tx_ring[i].hn_oactive = 0;
4135 
4136 	/*
4137 	 * If the non-transparent mode VF is active, make sure
4138 	 * that the RX filter still allows packet reception.
4139 	 */
4140 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4141 		hn_rxfilter_config(sc);
4142 }
4143 
4144 static void
4145 hn_init_locked(struct hn_softc *sc)
4146 {
4147 	if_t ifp = sc->hn_ifp;
4148 	int i;
4149 
4150 	HN_LOCK_ASSERT(sc);
4151 
4152 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4153 		return;
4154 
4155 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4156 		return;
4157 
4158 	/* Configure RX filter */
4159 	hn_rxfilter_config(sc);
4160 
4161 	/* Clear OACTIVE bit. */
4162 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4163 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4164 		sc->hn_tx_ring[i].hn_oactive = 0;
4165 
4166 	/* Clear TX 'suspended' bit. */
4167 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4168 
4169 	if (hn_xpnt_vf_isready(sc)) {
4170 		/* Initialize transparent VF. */
4171 		hn_xpnt_vf_init(sc);
4172 	}
4173 
4174 	/* Everything is ready; unleash! */
4175 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4176 
4177 	/* Re-enable polling if requested. */
4178 	if (sc->hn_pollhz > 0)
4179 		hn_polling(sc, sc->hn_pollhz);
4180 }
4181 
4182 static void
4183 hn_init(void *xsc)
4184 {
4185 	struct hn_softc *sc = xsc;
4186 
4187 	HN_LOCK(sc);
4188 	hn_init_locked(sc);
4189 	HN_UNLOCK(sc);
4190 }
4191 
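/*
 * The sysctl handlers below back the per-interface tunables and
 * statistics created under dev.hn.UNIT by hn_create_rx_data() and
 * hn_create_tx_data().  For example (node names taken from the
 * SYSCTL_ADD_PROC calls there, unit 0 assumed):
 *
 *	sysctl dev.hn.0.lro_length_lim	# LRO aggregation byte limit
 *	sysctl dev.hn.0.lro_ackcnt_lim	# LRO ACK aggregation limit
 *	sysctl dev.hn.0.tx_chimney_size	# chimney send size limit
 */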
4192 static int
4193 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4194 {
4195 	struct hn_softc *sc = arg1;
4196 	unsigned int lenlim;
4197 	int error;
4198 
4199 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4200 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4201 	if (error || req->newptr == NULL)
4202 		return error;
4203 
4204 	HN_LOCK(sc);
4205 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4206 	    lenlim > TCP_LRO_LENGTH_MAX) {
4207 		HN_UNLOCK(sc);
4208 		return EINVAL;
4209 	}
4210 	hn_set_lro_lenlim(sc, lenlim);
4211 	HN_UNLOCK(sc);
4212 
4213 	return 0;
4214 }
4215 
4216 static int
4217 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4218 {
4219 	struct hn_softc *sc = arg1;
4220 	int ackcnt, error, i;
4221 
4222 	/*
4223 	 * lro_ackcnt_lim is the append count limit;
4224 	 * +1 turns it into the aggregation limit.
4225 	 */
4226 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4227 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4228 	if (error || req->newptr == NULL)
4229 		return error;
4230 
4231 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4232 		return EINVAL;
4233 
4234 	/*
4235 	 * Convert aggregation limit back to append
4236 	 * count limit.
4237 	 */
4238 	--ackcnt;
4239 	HN_LOCK(sc);
4240 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4241 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4242 	HN_UNLOCK(sc);
4243 	return 0;
4244 }
4245 
4246 static int
4247 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4248 {
4249 	struct hn_softc *sc = arg1;
4250 	int hcsum = arg2;
4251 	int on, error, i;
4252 
4253 	on = 0;
4254 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4255 		on = 1;
4256 
4257 	error = sysctl_handle_int(oidp, &on, 0, req);
4258 	if (error || req->newptr == NULL)
4259 		return error;
4260 
4261 	HN_LOCK(sc);
4262 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4263 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4264 
4265 		if (on)
4266 			rxr->hn_trust_hcsum |= hcsum;
4267 		else
4268 			rxr->hn_trust_hcsum &= ~hcsum;
4269 	}
4270 	HN_UNLOCK(sc);
4271 	return 0;
4272 }
4273 
4274 static int
4275 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4276 {
4277 	struct hn_softc *sc = arg1;
4278 	int chim_size, error;
4279 
4280 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4281 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4282 	if (error || req->newptr == NULL)
4283 		return error;
4284 
4285 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4286 		return EINVAL;
4287 
4288 	HN_LOCK(sc);
4289 	hn_set_chim_size(sc, chim_size);
4290 	HN_UNLOCK(sc);
4291 	return 0;
4292 }
4293 
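/*
 * Generic per-ring statistics handlers: arg2 is the byte offset of the
 * counter within struct hn_rx_ring (or struct hn_tx_ring for the TX
 * variant).  A read returns the sum of the counter over all rings; any
 * write (the new value itself is ignored) zeroes the counter on every
 * ring.
 */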
4294 static int
4295 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4296 {
4297 	struct hn_softc *sc = arg1;
4298 	int ofs = arg2, i, error;
4299 	struct hn_rx_ring *rxr;
4300 	uint64_t stat;
4301 
4302 	stat = 0;
4303 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4304 		rxr = &sc->hn_rx_ring[i];
4305 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4306 	}
4307 
4308 	error = sysctl_handle_64(oidp, &stat, 0, req);
4309 	if (error || req->newptr == NULL)
4310 		return error;
4311 
4312 	/* Zero out this stat. */
4313 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4314 		rxr = &sc->hn_rx_ring[i];
4315 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4316 	}
4317 	return 0;
4318 }
4319 
4320 static int
4321 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4322 {
4323 	struct hn_softc *sc = arg1;
4324 	int ofs = arg2, i, error;
4325 	struct hn_rx_ring *rxr;
4326 	u_long stat;
4327 
4328 	stat = 0;
4329 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4330 		rxr = &sc->hn_rx_ring[i];
4331 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4332 	}
4333 
4334 	error = sysctl_handle_long(oidp, &stat, 0, req);
4335 	if (error || req->newptr == NULL)
4336 		return error;
4337 
4338 	/* Zero out this stat. */
4339 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4340 		rxr = &sc->hn_rx_ring[i];
4341 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4342 	}
4343 	return 0;
4344 }
4345 
4346 static int
4347 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4348 {
4349 	struct hn_softc *sc = arg1;
4350 	int ofs = arg2, i, error;
4351 	struct hn_tx_ring *txr;
4352 	u_long stat;
4353 
4354 	stat = 0;
4355 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4356 		txr = &sc->hn_tx_ring[i];
4357 		stat += *((u_long *)((uint8_t *)txr + ofs));
4358 	}
4359 
4360 	error = sysctl_handle_long(oidp, &stat, 0, req);
4361 	if (error || req->newptr == NULL)
4362 		return error;
4363 
4364 	/* Zero out this stat. */
4365 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4366 		txr = &sc->hn_tx_ring[i];
4367 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4368 	}
4369 	return 0;
4370 }
4371 
4372 static int
4373 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4374 {
4375 	struct hn_softc *sc = arg1;
4376 	int ofs = arg2, i, error, conf;
4377 	struct hn_tx_ring *txr;
4378 
4379 	txr = &sc->hn_tx_ring[0];
4380 	conf = *((int *)((uint8_t *)txr + ofs));
4381 
4382 	error = sysctl_handle_int(oidp, &conf, 0, req);
4383 	if (error || req->newptr == NULL)
4384 		return error;
4385 
4386 	HN_LOCK(sc);
4387 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4388 		txr = &sc->hn_tx_ring[i];
4389 		*((int *)((uint8_t *)txr + ofs)) = conf;
4390 	}
4391 	HN_UNLOCK(sc);
4392 
4393 	return 0;
4394 }
4395 
4396 static int
4397 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4398 {
4399 	struct hn_softc *sc = arg1;
4400 	int error, size;
4401 
4402 	size = sc->hn_agg_size;
4403 	error = sysctl_handle_int(oidp, &size, 0, req);
4404 	if (error || req->newptr == NULL)
4405 		return (error);
4406 
4407 	HN_LOCK(sc);
4408 	sc->hn_agg_size = size;
4409 	hn_set_txagg(sc);
4410 	HN_UNLOCK(sc);
4411 
4412 	return (0);
4413 }
4414 
4415 static int
4416 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4417 {
4418 	struct hn_softc *sc = arg1;
4419 	int error, pkts;
4420 
4421 	pkts = sc->hn_agg_pkts;
4422 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4423 	if (error || req->newptr == NULL)
4424 		return (error);
4425 
4426 	HN_LOCK(sc);
4427 	sc->hn_agg_pkts = pkts;
4428 	hn_set_txagg(sc);
4429 	HN_UNLOCK(sc);
4430 
4431 	return (0);
4432 }
4433 
4434 static int
4435 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4436 {
4437 	struct hn_softc *sc = arg1;
4438 	int pkts;
4439 
4440 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4441 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4442 }
4443 
4444 static int
4445 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4446 {
4447 	struct hn_softc *sc = arg1;
4448 	int align;
4449 
4450 	align = sc->hn_tx_ring[0].hn_agg_align;
4451 	return (sysctl_handle_int(oidp, &align, 0, req));
4452 }
4453 
4454 static void
4455 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4456 {
4457 	if (pollhz == 0)
4458 		vmbus_chan_poll_disable(chan);
4459 	else
4460 		vmbus_chan_poll_enable(chan, pollhz);
4461 }
4462 
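/*
 * Apply the requested polling frequency (0 disables polling) to the
 * primary channel and to every sub-channel currently in use.
 */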
4463 static void
4464 hn_polling(struct hn_softc *sc, u_int pollhz)
4465 {
4466 	int nsubch = sc->hn_rx_ring_inuse - 1;
4467 
4468 	HN_LOCK_ASSERT(sc);
4469 
4470 	if (nsubch > 0) {
4471 		struct vmbus_channel **subch;
4472 		int i;
4473 
4474 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4475 		for (i = 0; i < nsubch; ++i)
4476 			hn_chan_polling(subch[i], pollhz);
4477 		vmbus_subchan_rel(subch, nsubch);
4478 	}
4479 	hn_chan_polling(sc->hn_prichan, pollhz);
4480 }
4481 
4482 static int
4483 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4484 {
4485 	struct hn_softc *sc = arg1;
4486 	int pollhz, error;
4487 
4488 	pollhz = sc->hn_pollhz;
4489 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4490 	if (error || req->newptr == NULL)
4491 		return (error);
4492 
4493 	if (pollhz != 0 &&
4494 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4495 		return (EINVAL);
4496 
4497 	HN_LOCK(sc);
4498 	if (sc->hn_pollhz != pollhz) {
4499 		sc->hn_pollhz = pollhz;
4500 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4501 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4502 			hn_polling(sc, sc->hn_pollhz);
4503 	}
4504 	HN_UNLOCK(sc);
4505 
4506 	return (0);
4507 }
4508 
4509 static int
4510 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4511 {
4512 	struct hn_softc *sc = arg1;
4513 	char verstr[16];
4514 
4515 	snprintf(verstr, sizeof(verstr), "%u.%u",
4516 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4517 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4518 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4519 }
4520 
4521 static int
4522 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4523 {
4524 	struct hn_softc *sc = arg1;
4525 	char caps_str[128];
4526 	uint32_t caps;
4527 
4528 	HN_LOCK(sc);
4529 	caps = sc->hn_caps;
4530 	HN_UNLOCK(sc);
4531 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4532 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4533 }
4534 
4535 static int
4536 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4537 {
4538 	struct hn_softc *sc = arg1;
4539 	char assist_str[128];
4540 	uint32_t hwassist;
4541 
4542 	HN_LOCK(sc);
4543 	hwassist = if_gethwassist(sc->hn_ifp);
4544 	HN_UNLOCK(sc);
4545 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4546 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4547 }
4548 
4549 static int
4550 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4551 {
4552 	struct hn_softc *sc = arg1;
4553 	char filter_str[128];
4554 	uint32_t filter;
4555 
4556 	HN_LOCK(sc);
4557 	filter = sc->hn_rx_filter;
4558 	HN_UNLOCK(sc);
4559 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4560 	    NDIS_PACKET_TYPES);
4561 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4562 }
4563 
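/*
 * Read or update the RSC (receive segment coalescing) control value
 * and re-negotiate the RNDIS offload parameters with the current MTU
 * so that the new setting takes effect.
 */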
4564 static int
4565 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4566 {
4567 	struct hn_softc *sc = arg1;
4568 	uint32_t mtu;
4569 	int error;
4570 	HN_LOCK(sc);
4571 	error = hn_rndis_get_mtu(sc, &mtu);
4572 	if (error) {
4573 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4574 		goto back;
4575 	}
4576 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4577 	if (error || req->newptr == NULL)
4578 		goto back;
4579 
4580 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4581 	if (error)
4582 		goto back;
4583 	error = hn_rndis_reconf_offload(sc, mtu);
4584 back:
4585 	HN_UNLOCK(sc);
4586 	return (error);
4587 }
4588 #ifndef RSS
4589 
4590 static int
4591 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4592 {
4593 	struct hn_softc *sc = arg1;
4594 	int error;
4595 
4596 	HN_LOCK(sc);
4597 
4598 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4599 	if (error || req->newptr == NULL)
4600 		goto back;
4601 
4602 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4603 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4604 		/*
4605 		 * RSS key is synchronized with the VF's; don't allow users
4606 		 * to change it.
4607 		 */
4608 		error = EBUSY;
4609 		goto back;
4610 	}
4611 
4612 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4613 	if (error)
4614 		goto back;
4615 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4616 
4617 	if (sc->hn_rx_ring_inuse > 1) {
4618 		error = hn_rss_reconfig(sc);
4619 	} else {
4620 		/* Not RSS capable, at least for now; just save the RSS key. */
4621 		error = 0;
4622 	}
4623 back:
4624 	HN_UNLOCK(sc);
4625 	return (error);
4626 }
4627 
4628 static int
4629 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4630 {
4631 	struct hn_softc *sc = arg1;
4632 	int error;
4633 
4634 	HN_LOCK(sc);
4635 
4636 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4637 	if (error || req->newptr == NULL)
4638 		goto back;
4639 
4640 	/*
4641 	 * Don't allow the RSS indirect table to be changed if this
4642 	 * interface is not currently RSS capable.
4643 	 */
4644 	if (sc->hn_rx_ring_inuse == 1) {
4645 		error = EOPNOTSUPP;
4646 		goto back;
4647 	}
4648 
4649 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4650 	if (error)
4651 		goto back;
4652 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4653 
4654 	hn_rss_ind_fixup(sc);
4655 	error = hn_rss_reconfig(sc);
4656 back:
4657 	HN_UNLOCK(sc);
4658 	return (error);
4659 }
4660 
4661 #endif	/* !RSS */
4662 
4663 static int
4664 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4665 {
4666 	struct hn_softc *sc = arg1;
4667 	char hash_str[128];
4668 	uint32_t hash;
4669 
4670 	HN_LOCK(sc);
4671 	hash = sc->hn_rss_hash;
4672 	HN_UNLOCK(sc);
4673 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4674 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4675 }
4676 
4677 static int
4678 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4679 {
4680 	struct hn_softc *sc = arg1;
4681 	char hash_str[128];
4682 	uint32_t hash;
4683 
4684 	HN_LOCK(sc);
4685 	hash = sc->hn_rss_hcap;
4686 	HN_UNLOCK(sc);
4687 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4688 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4689 }
4690 
4691 static int
4692 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4693 {
4694 	struct hn_softc *sc = arg1;
4695 	char hash_str[128];
4696 	uint32_t hash;
4697 
4698 	HN_LOCK(sc);
4699 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4700 	HN_UNLOCK(sc);
4701 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4702 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4703 }
4704 
4705 static int
4706 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4707 {
4708 	struct hn_softc *sc = arg1;
4709 	char vf_name[IFNAMSIZ + 1];
4710 	if_t vf_ifp;
4711 
4712 	HN_LOCK(sc);
4713 	vf_name[0] = '\0';
4714 	vf_ifp = sc->hn_vf_ifp;
4715 	if (vf_ifp != NULL)
4716 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4717 	HN_UNLOCK(sc);
4718 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4719 }
4720 
4721 static int
4722 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4723 {
4724 	struct hn_softc *sc = arg1;
4725 	char vf_name[IFNAMSIZ + 1];
4726 	if_t vf_ifp;
4727 
4728 	HN_LOCK(sc);
4729 	vf_name[0] = '\0';
4730 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4731 	if (vf_ifp != NULL)
4732 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4733 	HN_UNLOCK(sc);
4734 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4735 }
4736 
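/*
 * Global (not per-interface) sysctls describing VF pairing: the first
 * lists the VF ifnets currently registered in hn_vfmap, the second
 * prints "VF:hn" name pairs showing which hn(4) interface each VF is
 * mapped to.
 */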
4737 static int
4738 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4739 {
4740 	struct rm_priotracker pt;
4741 	struct sbuf *sb;
4742 	int error, i;
4743 	bool first;
4744 
4745 	error = sysctl_wire_old_buffer(req, 0);
4746 	if (error != 0)
4747 		return (error);
4748 
4749 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4750 	if (sb == NULL)
4751 		return (ENOMEM);
4752 
4753 	rm_rlock(&hn_vfmap_lock, &pt);
4754 
4755 	first = true;
4756 	for (i = 0; i < hn_vfmap_size; ++i) {
4757 		struct epoch_tracker et;
4758 		if_t ifp;
4759 
4760 		if (hn_vfmap[i] == NULL)
4761 			continue;
4762 
4763 		NET_EPOCH_ENTER(et);
4764 		ifp = ifnet_byindex(i);
4765 		if (ifp != NULL) {
4766 			if (first)
4767 				sbuf_printf(sb, "%s", if_name(ifp));
4768 			else
4769 				sbuf_printf(sb, " %s", if_name(ifp));
4770 			first = false;
4771 		}
4772 		NET_EPOCH_EXIT(et);
4773 	}
4774 
4775 	rm_runlock(&hn_vfmap_lock, &pt);
4776 
4777 	error = sbuf_finish(sb);
4778 	sbuf_delete(sb);
4779 	return (error);
4780 }
4781 
4782 static int
4783 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4784 {
4785 	struct rm_priotracker pt;
4786 	struct sbuf *sb;
4787 	int error, i;
4788 	bool first;
4789 
4790 	error = sysctl_wire_old_buffer(req, 0);
4791 	if (error != 0)
4792 		return (error);
4793 
4794 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4795 	if (sb == NULL)
4796 		return (ENOMEM);
4797 
4798 	rm_rlock(&hn_vfmap_lock, &pt);
4799 
4800 	first = true;
4801 	for (i = 0; i < hn_vfmap_size; ++i) {
4802 		struct epoch_tracker et;
4803 		if_t ifp, hn_ifp;
4804 
4805 		hn_ifp = hn_vfmap[i];
4806 		if (hn_ifp == NULL)
4807 			continue;
4808 
4809 		NET_EPOCH_ENTER(et);
4810 		ifp = ifnet_byindex(i);
4811 		if (ifp != NULL) {
4812 			if (first) {
4813 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4814 				    if_name(hn_ifp));
4815 			} else {
4816 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4817 				    if_name(hn_ifp));
4818 			}
4819 			first = false;
4820 		}
4821 		NET_EPOCH_EXIT(et);
4822 	}
4823 
4824 	rm_runlock(&hn_vfmap_lock, &pt);
4825 
4826 	error = sbuf_finish(sb);
4827 	sbuf_delete(sb);
4828 	return (error);
4829 }
4830 
4831 static int
4832 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4833 {
4834 	struct hn_softc *sc = arg1;
4835 	int error, onoff = 0;
4836 
4837 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4838 		onoff = 1;
4839 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4840 	if (error || req->newptr == NULL)
4841 		return (error);
4842 
4843 	HN_LOCK(sc);
4844 	/* NOTE: hn_vf_lock for hn_transmit() */
4845 	rm_wlock(&sc->hn_vf_lock);
4846 	if (onoff)
4847 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4848 	else
4849 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4850 	rm_wunlock(&sc->hn_vf_lock);
4851 	HN_UNLOCK(sc);
4852 
4853 	return (0);
4854 }
4855 
4856 static int
4857 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4858 {
4859 	struct hn_softc *sc = arg1;
4860 	int enabled = 0;
4861 
4862 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4863 		enabled = 1;
4864 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4865 }
4866 
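/*
 * Sanity-check the IPv4 packet starting at offset 'hoff' into the
 * mbuf: the fixed and full IP headers must sit in the first mbuf, the
 * stated IP length must fit in the chain, fragments are rejected, and
 * the full TCP/UDP header must be present.  Returns the IP protocol
 * on success, or IPPROTO_DONE if any check fails.
 */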
4867 static int
4868 hn_check_iplen(const struct mbuf *m, int hoff)
4869 {
4870 	const struct ip *ip;
4871 	int len, iphlen, iplen;
4872 	const struct tcphdr *th;
4873 	int thoff;				/* TCP data offset */
4874 
4875 	len = hoff + sizeof(struct ip);
4876 
4877 	/* The packet must be at least the size of an IP header. */
4878 	if (m->m_pkthdr.len < len)
4879 		return IPPROTO_DONE;
4880 
4881 	/* The fixed IP header must reside completely in the first mbuf. */
4882 	if (m->m_len < len)
4883 		return IPPROTO_DONE;
4884 
4885 	ip = mtodo(m, hoff);
4886 
4887 	/* Bound check the packet's stated IP header length. */
4888 	iphlen = ip->ip_hl << 2;
4889 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4890 		return IPPROTO_DONE;
4891 
4892 	/* The full IP header must reside completely in the one mbuf. */
4893 	if (m->m_len < hoff + iphlen)
4894 		return IPPROTO_DONE;
4895 
4896 	iplen = ntohs(ip->ip_len);
4897 
4898 	/*
4899 	 * Check that the amount of data in the buffers is at
4900 	 * least as much as the IP header would have us expect.
4901 	 */
4902 	if (m->m_pkthdr.len < hoff + iplen)
4903 		return IPPROTO_DONE;
4904 
4905 	/*
4906 	 * Ignore IP fragments.
4907 	 */
4908 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4909 		return IPPROTO_DONE;
4910 
4911 	/*
4912 	 * The TCP/IP or UDP/IP header must be entirely contained within
4913 	 * the first fragment of a packet.
4914 	 */
4915 	switch (ip->ip_p) {
4916 	case IPPROTO_TCP:
4917 		if (iplen < iphlen + sizeof(struct tcphdr))
4918 			return IPPROTO_DONE;
4919 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4920 			return IPPROTO_DONE;
4921 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4922 		thoff = th->th_off << 2;
4923 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4924 			return IPPROTO_DONE;
4925 		if (m->m_len < hoff + iphlen + thoff)
4926 			return IPPROTO_DONE;
4927 		break;
4928 	case IPPROTO_UDP:
4929 		if (iplen < iphlen + sizeof(struct udphdr))
4930 			return IPPROTO_DONE;
4931 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4932 			return IPPROTO_DONE;
4933 		break;
4934 	default:
4935 		if (iplen < iphlen)
4936 			return IPPROTO_DONE;
4937 		break;
4938 	}
4939 	return ip->ip_p;
4940 }
4941 
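/*
 * Extract the L3 (Ethernet type) and L4 (IP protocol) of a received
 * frame, looking through an optional VLAN header.  On early return the
 * outputs are left untouched; the caller pre-initializes them to
 * ETHERTYPE_MAX / IPPROTO_DONE.
 */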
4942 static void
4943 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4944 {
4945 	const struct ether_header *eh;
4946 	uint16_t etype;
4947 	int hoff;
4948 
4949 	hoff = sizeof(*eh);
4950 	/* Checked at the beginning of the caller, hn_rxpkt(). */
4951 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4952 
4953 	eh = mtod(m_new, const struct ether_header *);
4954 	etype = ntohs(eh->ether_type);
4955 	if (etype == ETHERTYPE_VLAN) {
4956 		const struct ether_vlan_header *evl;
4957 
4958 		hoff = sizeof(*evl);
4959 		if (m_new->m_len < hoff)
4960 			return;
4961 		evl = mtod(m_new, const struct ether_vlan_header *);
4962 		etype = ntohs(evl->evl_proto);
4963 	}
4964 	*l3proto = etype;
4965 
4966 	if (etype == ETHERTYPE_IP)
4967 		*l4proto = hn_check_iplen(m_new, hoff);
4968 	else
4969 		*l4proto = IPPROTO_DONE;
4970 }
4971 
4972 static int
4973 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4974 {
4975 	struct sysctl_oid_list *child;
4976 	struct sysctl_ctx_list *ctx;
4977 	device_t dev = sc->hn_dev;
4978 #if defined(INET) || defined(INET6)
4979 	int lroent_cnt;
4980 #endif
4981 	int i;
4982 
4983 	/*
4984 	 * Create RXBUF for reception.
4985 	 *
4986 	 * NOTE:
4987 	 * - It is shared by all channels.
4988 	 * - A large enough buffer is allocated; certain versions of the NVS
4989 	 *   may further limit the usable space.
4990 	 */
4991 	sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4992 	    0ul, ~0ul, PAGE_SIZE, 0);
4993 	if (sc->hn_rxbuf == NULL) {
4994 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4995 		return (ENOMEM);
4996 	}
4997 
4998 	sc->hn_rx_ring_cnt = ring_cnt;
4999 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
5000 
5001 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
5002 	    M_DEVBUF, M_WAITOK | M_ZERO);
5003 
5004 #if defined(INET) || defined(INET6)
5005 	lroent_cnt = hn_lro_entry_count;
5006 	if (lroent_cnt < TCP_LRO_ENTRIES)
5007 		lroent_cnt = TCP_LRO_ENTRIES;
5008 	if (bootverbose)
5009 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
5010 #endif	/* INET || INET6 */
5011 
5012 	ctx = device_get_sysctl_ctx(dev);
5013 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5014 
5015 	/* Create dev.hn.UNIT.rx sysctl tree */
5016 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5017 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5018 
5019 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5020 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5021 
5022 		rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
5023 		    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
5024 		if (rxr->hn_br == NULL) {
5025 			device_printf(dev, "allocate bufring failed\n");
5026 			return (ENOMEM);
5027 		}
5028 
5029 		if (hn_trust_hosttcp)
5030 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5031 		if (hn_trust_hostudp)
5032 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5033 		if (hn_trust_hostip)
5034 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5035 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5036 		rxr->hn_ifp = sc->hn_ifp;
5037 		if (i < sc->hn_tx_ring_cnt)
5038 			rxr->hn_txr = &sc->hn_tx_ring[i];
5039 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5040 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5041 		rxr->hn_rx_idx = i;
5042 		rxr->hn_rxbuf = sc->hn_rxbuf;
5043 
5044 		/*
5045 		 * Initialize LRO.
5046 		 */
5047 #if defined(INET) || defined(INET6)
5048 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5049 		    hn_lro_mbufq_depth);
5050 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5051 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5052 #endif	/* INET || INET6 */
5053 
5054 		if (sc->hn_rx_sysctl_tree != NULL) {
5055 			char name[16];
5056 
5057 			/*
5058 			 * Create per RX ring sysctl tree:
5059 			 * dev.hn.UNIT.rx.RINGID
5060 			 */
5061 			snprintf(name, sizeof(name), "%d", i);
5062 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5063 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5064 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5065 
5066 			if (rxr->hn_rx_sysctl_tree != NULL) {
5067 				SYSCTL_ADD_ULONG(ctx,
5068 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5069 				    OID_AUTO, "packets",
5070 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5071 				    "# of packets received");
5072 				SYSCTL_ADD_ULONG(ctx,
5073 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5074 				    OID_AUTO, "rss_pkts",
5075 				    CTLFLAG_RW | CTLFLAG_STATS,
5076 				    &rxr->hn_rss_pkts,
5077 				    "# of packets w/ RSS info received");
5078 				SYSCTL_ADD_ULONG(ctx,
5079 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5080 				    OID_AUTO, "rsc_pkts",
5081 				    CTLFLAG_RW | CTLFLAG_STATS,
5082 				    &rxr->hn_rsc_pkts,
5083 				    "# of RSC packets received");
5084 				SYSCTL_ADD_ULONG(ctx,
5085 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5086 				    OID_AUTO, "rsc_drop",
5087 				    CTLFLAG_RW | CTLFLAG_STATS,
5088 				    &rxr->hn_rsc_drop,
5089 				    "# of RSC fragments dropped");
5090 				SYSCTL_ADD_INT(ctx,
5091 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5092 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5093 				    &rxr->hn_pktbuf_len, 0,
5094 				    "Temporary channel packet buffer length");
5095 			}
5096 		}
5097 	}
5098 
5099 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5100 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5101 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5102 	    hn_rx_stat_u64_sysctl,
5103 	    "LU", "LRO queued");
5104 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5105 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5106 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5107 	    hn_rx_stat_u64_sysctl,
5108 	    "LU", "LRO flushed");
5109 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5110 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5111 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5112 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5113 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5114 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5115 	    hn_lro_lenlim_sysctl, "IU",
5116 	    "Max # of data bytes to be aggregated by LRO");
5117 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5118 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5119 	    hn_lro_ackcnt_sysctl, "I",
5120 	    "Max # of ACKs to be aggregated by LRO");
5121 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5122 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5123 	    hn_trust_hcsum_sysctl, "I",
5124 	    "Trust tcp segment verification on host side, "
5125 	    "when csum info is missing");
5126 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5127 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5128 	    hn_trust_hcsum_sysctl, "I",
5129 	    "Trust udp datagram verification on host side, "
5130 	    "when csum info is missing");
5131 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5132 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5133 	    hn_trust_hcsum_sysctl, "I",
5134 	    "Trust ip packet verification on host side, "
5135 	    "when csum info is missing");
5136 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5137 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5138 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5139 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5140 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5141 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5142 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5143 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5144 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5145 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5146 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5147 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5148 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5149 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5150 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5151 	    hn_rx_stat_ulong_sysctl, "LU",
5152 	    "# of packets that we trust host's csum verification");
5153 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5154 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5155 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5156 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5157 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5158 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5159 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5160 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5161 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5162 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5163 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5164 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5165 
5166 	return (0);
5167 }
5168 
5169 static void
5170 hn_destroy_rx_data(struct hn_softc *sc)
5171 {
5172 	int i;
5173 
5174 	if (sc->hn_rxbuf != NULL) {
5175 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5176 			contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF);
5177 		else
5178 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5179 		sc->hn_rxbuf = NULL;
5180 	}
5181 
5182 	if (sc->hn_rx_ring_cnt == 0)
5183 		return;
5184 
5185 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5186 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5187 
5188 		if (rxr->hn_br == NULL)
5189 			continue;
5190 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5191 			contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE,
5192 			    M_DEVBUF);
5193 		} else {
5194 			device_printf(sc->hn_dev,
5195 			    "%dth channel bufring is referenced\n", i);
5196 		}
5197 		rxr->hn_br = NULL;
5198 
5199 #if defined(INET) || defined(INET6)
5200 		tcp_lro_free(&rxr->hn_lro);
5201 #endif
5202 		free(rxr->hn_pktbuf, M_DEVBUF);
5203 	}
5204 	free(sc->hn_rx_ring, M_DEVBUF);
5205 	sc->hn_rx_ring = NULL;
5206 
5207 	sc->hn_rx_ring_cnt = 0;
5208 	sc->hn_rx_ring_inuse = 0;
5209 }
5210 
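/*
 * Set up one TX ring: its locks, the pre-allocated TX descriptors with
 * busdma tags/maps for the RNDIS packet message and the packet data,
 * the transmit taskqueue binding, and the per-ring sysctl nodes under
 * dev.hn.UNIT.tx.RINGID.
 */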
5211 static int
5212 hn_tx_ring_create(struct hn_softc *sc, int id)
5213 {
5214 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5215 	device_t dev = sc->hn_dev;
5216 	bus_dma_tag_t parent_dtag;
5217 	int error, i;
5218 
5219 	txr->hn_sc = sc;
5220 	txr->hn_tx_idx = id;
5221 
5222 #ifndef HN_USE_TXDESC_BUFRING
5223 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5224 #endif
5225 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5226 
5227 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5228 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5229 	    M_DEVBUF, M_WAITOK | M_ZERO);
5230 #ifndef HN_USE_TXDESC_BUFRING
5231 	SLIST_INIT(&txr->hn_txlist);
5232 #else
5233 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5234 	    M_WAITOK, &txr->hn_tx_lock);
5235 #endif
5236 
5237 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5238 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5239 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5240 	} else {
5241 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5242 	}
5243 
5244 #ifdef HN_IFSTART_SUPPORT
5245 	if (hn_use_if_start) {
5246 		txr->hn_txeof = hn_start_txeof;
5247 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5248 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5249 	} else
5250 #endif
5251 	{
5252 		int br_depth;
5253 
5254 		txr->hn_txeof = hn_xmit_txeof;
5255 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5256 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5257 
5258 		br_depth = hn_get_txswq_depth(txr);
5259 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5260 		    M_WAITOK, &txr->hn_tx_lock);
5261 	}
5262 
5263 	txr->hn_direct_tx_size = hn_direct_tx_size;
5264 
5265 	/*
5266 	 * Always schedule transmission instead of trying to do direct
5267 	 * transmission.  This one gives the best performance so far.
5268 	 */
5269 	txr->hn_sched_tx = 1;
5270 
5271 	parent_dtag = bus_get_dma_tag(dev);
5272 
5273 	/* DMA tag for RNDIS packet messages. */
5274 	error = bus_dma_tag_create(parent_dtag, /* parent */
5275 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5276 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5277 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5278 	    BUS_SPACE_MAXADDR,		/* highaddr */
5279 	    NULL, NULL,			/* filter, filterarg */
5280 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5281 	    1,				/* nsegments */
5282 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5283 	    0,				/* flags */
5284 	    NULL,			/* lockfunc */
5285 	    NULL,			/* lockfuncarg */
5286 	    &txr->hn_tx_rndis_dtag);
5287 	if (error) {
5288 		device_printf(dev, "failed to create rndis dmatag\n");
5289 		return error;
5290 	}
5291 
5292 	/* DMA tag for data. */
5293 	error = bus_dma_tag_create(parent_dtag, /* parent */
5294 	    1,				/* alignment */
5295 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5296 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5297 	    BUS_SPACE_MAXADDR,		/* highaddr */
5298 	    NULL, NULL,			/* filter, filterarg */
5299 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5300 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5301 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5302 	    0,				/* flags */
5303 	    NULL,			/* lockfunc */
5304 	    NULL,			/* lockfuncarg */
5305 	    &txr->hn_tx_data_dtag);
5306 	if (error) {
5307 		device_printf(dev, "failed to create data dmatag\n");
5308 		return error;
5309 	}
5310 
5311 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5312 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5313 
5314 		txd->txr = txr;
5315 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5316 		STAILQ_INIT(&txd->agg_list);
5317 
5318 		/*
5319 		 * Allocate and load RNDIS packet message.
5320 		 */
5321 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5322 		    (void **)&txd->rndis_pkt,
5323 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5324 		    &txd->rndis_pkt_dmap);
5325 		if (error) {
5326 			device_printf(dev,
5327 			    "failed to allocate rndis_packet_msg, %d\n", i);
5328 			return error;
5329 		}
5330 
5331 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5332 		    txd->rndis_pkt_dmap,
5333 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5334 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5335 		    BUS_DMA_NOWAIT);
5336 		if (error) {
5337 			device_printf(dev,
5338 			    "failed to load rndis_packet_msg, %d\n", i);
5339 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5340 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5341 			return error;
5342 		}
5343 
5344 		/* DMA map for TX data. */
5345 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5346 		    &txd->data_dmap);
5347 		if (error) {
5348 			device_printf(dev,
5349 			    "failed to allocate tx data dmamap\n");
5350 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5351 			    txd->rndis_pkt_dmap);
5352 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5353 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5354 			return error;
5355 		}
5356 
5357 		/* All set, put it to list */
5358 		txd->flags |= HN_TXD_FLAG_ONLIST;
5359 #ifndef HN_USE_TXDESC_BUFRING
5360 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5361 #else
5362 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5363 #endif
5364 	}
5365 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5366 
5367 	if (sc->hn_tx_sysctl_tree != NULL) {
5368 		struct sysctl_oid_list *child;
5369 		struct sysctl_ctx_list *ctx;
5370 		char name[16];
5371 
5372 		/*
5373 		 * Create per TX ring sysctl tree:
5374 		 * dev.hn.UNIT.tx.RINGID
5375 		 */
5376 		ctx = device_get_sysctl_ctx(dev);
5377 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5378 
5379 		snprintf(name, sizeof(name), "%d", id);
5380 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5381 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5382 
5383 		if (txr->hn_tx_sysctl_tree != NULL) {
5384 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5385 
5386 #ifdef HN_DEBUG
5387 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5388 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5389 			    "# of available TX descs");
5390 #endif
5391 #ifdef HN_IFSTART_SUPPORT
5392 			if (!hn_use_if_start)
5393 #endif
5394 			{
5395 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5396 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5397 				    "over active");
5398 			}
5399 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5400 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5401 			    "# of packets transmitted");
5402 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5403 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5404 			    "# of sends");
5405 		}
5406 	}
5407 
5408 	return 0;
5409 }
5410 
5411 static void
5412 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5413 {
5414 	struct hn_tx_ring *txr = txd->txr;
5415 
5416 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5417 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5418 
5419 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5420 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5421 	    txd->rndis_pkt_dmap);
5422 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5423 }
5424 
5425 static void
5426 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5427 {
5428 
5429 	KASSERT(txd->refs == 0 || txd->refs == 1,
5430 	    ("invalid txd refs %d", txd->refs));
5431 
5432 	/* Aggregated txds will be freed by their aggregating txd. */
5433 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5434 		int freed __diagused;
5435 
5436 		freed = hn_txdesc_put(txr, txd);
5437 		KASSERT(freed, ("can't free txdesc"));
5438 	}
5439 }
5440 
5441 static void
5442 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5443 {
5444 	int i;
5445 
5446 	if (txr->hn_txdesc == NULL)
5447 		return;
5448 
5449 	/*
5450 	 * NOTE:
5451 	 * Because the freeing of aggregated txds will be deferred
5452 	 * to the aggregating txd, two passes are used here:
5453 	 * - The first pass GCes any pending txds.  This GC is necessary,
5454 	 *   since if the channels are revoked, the hypervisor will not
5455 	 *   deliver send-done for all pending txds.
5456 	 * - The second pass frees the busdma resources, i.e. it runs after
5457 	 *   all txds have been freed.
5458 	 */
5459 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5460 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5461 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5462 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5463 
5464 	if (txr->hn_tx_data_dtag != NULL)
5465 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5466 	if (txr->hn_tx_rndis_dtag != NULL)
5467 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5468 
5469 #ifdef HN_USE_TXDESC_BUFRING
5470 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5471 #endif
5472 
5473 	free(txr->hn_txdesc, M_DEVBUF);
5474 	txr->hn_txdesc = NULL;
5475 
5476 	if (txr->hn_mbuf_br != NULL)
5477 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5478 
5479 #ifndef HN_USE_TXDESC_BUFRING
5480 	mtx_destroy(&txr->hn_txlist_spin);
5481 #endif
5482 	mtx_destroy(&txr->hn_tx_lock);
5483 }
5484 
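/*
 * Allocate the shared chimney sending buffer (TXBUF), create all TX
 * rings, and attach the aggregate TX statistics and tuning sysctls.
 */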
5485 static int
5486 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5487 {
5488 	struct sysctl_oid_list *child;
5489 	struct sysctl_ctx_list *ctx;
5490 	int i;
5491 
5492 	/*
5493 	 * Create TXBUF for chimney sending.
5494 	 *
5495 	 * NOTE: It is shared by all channels.
5496 	 */
5497 	sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5498 	    0ul, ~0ul, PAGE_SIZE, 0);
5499 	if (sc->hn_chim == NULL) {
5500 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5501 		return (ENOMEM);
5502 	}
5503 
5504 	sc->hn_tx_ring_cnt = ring_cnt;
5505 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5506 
5507 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5508 	    M_DEVBUF, M_WAITOK | M_ZERO);
5509 
5510 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5511 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5512 
5513 	/* Create dev.hn.UNIT.tx sysctl tree */
5514 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5515 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5516 
5517 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5518 		int error;
5519 
5520 		error = hn_tx_ring_create(sc, i);
5521 		if (error)
5522 			return error;
5523 	}
5524 
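	/*
	 * Per-adapter TX statistics.  The sysctl arg2 carries the
	 * offset of a counter within struct hn_tx_ring, so the proc
	 * handler can locate that counter in each TX ring.
	 */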
5525 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5526 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5527 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5528 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5529 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5530 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5531 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5532 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5533 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5534 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5535 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5536 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5537 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5538 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5539 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5540 	    hn_tx_stat_ulong_sysctl, "LU",
5541 	    "# of packet transmission aggregation flush failures");
5542 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5543 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5544 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5545 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5546 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5547 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5548 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5549 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5550 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5551 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5552 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5553 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5554 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5555 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5556 	    "# of total TX descs");
5557 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5558 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5559 	    "Chimney send packet size upper boundary");
5560 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5561 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5562 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5563 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5564 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5565 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5566 	    hn_tx_conf_int_sysctl, "I",
5567 	    "Size of the packet for direct transmission");
5568 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5569 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5570 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5571 	    hn_tx_conf_int_sysctl, "I",
5572 	    "Always schedule transmission "
5573 	    "instead of doing direct transmission");
5574 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5575 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5576 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5577 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5578 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5579 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5580 	    "Applied packet transmission aggregation size");
5581 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5582 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5583 	    hn_txagg_pktmax_sysctl, "I",
5584 	    "Applied packet transmission aggregation packets");
5585 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5586 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5587 	    hn_txagg_align_sysctl, "I",
5588 	    "Applied packet transmission aggregation alignment");
5589 
5590 	return 0;
5591 }
5592 
5593 static void
5594 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5595 {
5596 	int i;
5597 
5598 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5599 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5600 }
5601 
5602 static void
5603 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5604 {
5605 	if_t ifp = sc->hn_ifp;
5606 	u_int hw_tsomax;
5607 	int tso_minlen;
5608 
5609 	HN_LOCK_ASSERT(sc);
5610 
5611 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5612 		return;
5613 
5614 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5615 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5616 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5617 
5618 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5619 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5620 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5621 
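	/*
	 * Clamp the requested TSO length into
	 * [tso_minlen, min(IP_MAXPACKET, hn_ndis_tso_szmax)] before
	 * deriving the hardware TSO limit advertised to the stack.
	 */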
5622 	if (tso_maxlen < tso_minlen)
5623 		tso_maxlen = tso_minlen;
5624 	else if (tso_maxlen > IP_MAXPACKET)
5625 		tso_maxlen = IP_MAXPACKET;
5626 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5627 		tso_maxlen = sc->hn_ndis_tso_szmax;
5628 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5629 
5630 	if (hn_xpnt_vf_isready(sc)) {
5631 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5632 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5633 	}
5634 	if_sethwtsomax(ifp, hw_tsomax);
5635 	if (bootverbose)
5636 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5637 }
5638 
5639 static void
5640 hn_fixup_tx_data(struct hn_softc *sc)
5641 {
5642 	uint64_t csum_assist;
5643 	int i;
5644 
5645 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5646 	if (hn_tx_chimney_size > 0 &&
5647 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5648 		hn_set_chim_size(sc, hn_tx_chimney_size);
5649 
5650 	csum_assist = 0;
5651 	if (sc->hn_caps & HN_CAP_IPCS)
5652 		csum_assist |= CSUM_IP;
5653 	if (sc->hn_caps & HN_CAP_TCP4CS)
5654 		csum_assist |= CSUM_IP_TCP;
5655 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5656 		csum_assist |= CSUM_IP_UDP;
5657 	if (sc->hn_caps & HN_CAP_TCP6CS)
5658 		csum_assist |= CSUM_IP6_TCP;
5659 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5660 		csum_assist |= CSUM_IP6_UDP;
5661 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5662 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5663 
5664 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5665 		/*
5666 		 * Support HASHVAL pktinfo on TX path.
5667 		 */
5668 		if (bootverbose)
5669 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5670 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5671 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5672 	}
5673 }
5674 
5675 static void
5676 hn_fixup_rx_data(struct hn_softc *sc)
5677 {
5678 
5679 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5680 		int i;
5681 
5682 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5683 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5684 	}
5685 }
5686 
5687 static void
5688 hn_destroy_tx_data(struct hn_softc *sc)
5689 {
5690 	int i;
5691 
5692 	if (sc->hn_chim != NULL) {
5693 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5694 			contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF);
5695 		} else {
5696 			device_printf(sc->hn_dev,
5697 			    "chimney sending buffer is referenced\n");
5698 		}
5699 		sc->hn_chim = NULL;
5700 	}
5701 
5702 	if (sc->hn_tx_ring_cnt == 0)
5703 		return;
5704 
5705 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5706 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5707 
5708 	free(sc->hn_tx_ring, M_DEVBUF);
5709 	sc->hn_tx_ring = NULL;
5710 
5711 	sc->hn_tx_ring_cnt = 0;
5712 	sc->hn_tx_ring_inuse = 0;
5713 }
5714 
5715 #ifdef HN_IFSTART_SUPPORT
5716 
5717 static void
5718 hn_start_taskfunc(void *xtxr, int pending __unused)
5719 {
5720 	struct hn_tx_ring *txr = xtxr;
5721 
5722 	mtx_lock(&txr->hn_tx_lock);
5723 	hn_start_locked(txr, 0);
5724 	mtx_unlock(&txr->hn_tx_lock);
5725 }
5726 
5727 static int
5728 hn_start_locked(struct hn_tx_ring *txr, int len)
5729 {
5730 	struct hn_softc *sc = txr->hn_sc;
5731 	if_t ifp = sc->hn_ifp;
5732 	int sched = 0;
5733 
5734 	KASSERT(hn_use_if_start,
5735 	    ("hn_start_locked is called when if_start is disabled"));
5736 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5737 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5738 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5739 
5740 	if (__predict_false(txr->hn_suspended))
5741 		return (0);
5742 
5743 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5744 	    IFF_DRV_RUNNING)
5745 		return (0);
5746 
5747 	while (!if_sendq_empty(ifp)) {
5748 		struct hn_txdesc *txd;
5749 		struct mbuf *m_head;
5750 		int error;
5751 
5752 		m_head = if_dequeue(ifp);
5753 		if (m_head == NULL)
5754 			break;
5755 
5756 		if (len > 0 && m_head->m_pkthdr.len > len) {
5757 			/*
5758 			 * This send could be time consuming; let callers
5759 			 * dispatch the sending of this packet (and of any
5760 			 * follow-up packets) to the tx taskqueue.
5761 			 */
5762 			if_sendq_prepend(ifp, m_head);
5763 			sched = 1;
5764 			break;
5765 		}
5766 
5767 #if defined(INET6) || defined(INET)
5768 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5769 			m_head = hn_tso_fixup(m_head);
5770 			if (__predict_false(m_head == NULL)) {
5771 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5772 				continue;
5773 			}
5774 		} else if (m_head->m_pkthdr.csum_flags &
5775 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5776 			m_head = hn_set_hlen(m_head);
5777 			if (__predict_false(m_head == NULL)) {
5778 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5779 				continue;
5780 			}
5781 		}
5782 #endif
5783 
5784 		txd = hn_txdesc_get(txr);
5785 		if (txd == NULL) {
5786 			txr->hn_no_txdescs++;
5787 			if_sendq_prepend(ifp, m_head);
5788 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5789 			break;
5790 		}
5791 
5792 		error = hn_encap(ifp, txr, txd, &m_head);
5793 		if (error) {
5794 			/* Both txd and m_head are freed */
5795 			KASSERT(txr->hn_agg_txd == NULL,
5796 			    ("encap failed w/ pending aggregating txdesc"));
5797 			continue;
5798 		}
5799 
5800 		if (txr->hn_agg_pktleft == 0) {
5801 			if (txr->hn_agg_txd != NULL) {
5802 				KASSERT(m_head == NULL,
5803 				    ("pending mbuf for aggregating txdesc"));
5804 				error = hn_flush_txagg(ifp, txr);
5805 				if (__predict_false(error)) {
5806 					if_setdrvflagbits(ifp,
5807 					    IFF_DRV_OACTIVE, 0);
5808 					break;
5809 				}
5810 			} else {
5811 				KASSERT(m_head != NULL, ("mbuf was freed"));
5812 				error = hn_txpkt(ifp, txr, txd);
5813 				if (__predict_false(error)) {
5814 					/* txd is freed, but m_head is not */
5815 					if_sendq_prepend(ifp, m_head);
5816 					if_setdrvflagbits(ifp,
5817 					    IFF_DRV_OACTIVE, 0);
5818 					break;
5819 				}
5820 			}
5821 		}
5822 #ifdef INVARIANTS
5823 		else {
5824 			KASSERT(txr->hn_agg_txd != NULL,
5825 			    ("no aggregating txdesc"));
5826 			KASSERT(m_head == NULL,
5827 			    ("pending mbuf for aggregating txdesc"));
5828 		}
5829 #endif
5830 	}
5831 
5832 	/* Flush pending aggregated transmission. */
5833 	if (txr->hn_agg_txd != NULL)
5834 		hn_flush_txagg(ifp, txr);
5835 	return (sched);
5836 }
5837 
5838 static void
5839 hn_start(if_t ifp)
5840 {
5841 	struct hn_softc *sc = if_getsoftc(ifp);
5842 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5843 
5844 	if (txr->hn_sched_tx)
5845 		goto do_sched;
5846 
5847 	if (mtx_trylock(&txr->hn_tx_lock)) {
5848 		int sched;
5849 
5850 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5851 		mtx_unlock(&txr->hn_tx_lock);
5852 		if (!sched)
5853 			return;
5854 	}
5855 do_sched:
5856 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5857 }
5858 
5859 static void
5860 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5861 {
5862 	struct hn_tx_ring *txr = xtxr;
5863 
5864 	mtx_lock(&txr->hn_tx_lock);
5865 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5866 	hn_start_locked(txr, 0);
5867 	mtx_unlock(&txr->hn_tx_lock);
5868 }
5869 
5870 static void
5871 hn_start_txeof(struct hn_tx_ring *txr)
5872 {
5873 	struct hn_softc *sc = txr->hn_sc;
5874 	if_t ifp = sc->hn_ifp;
5875 
5876 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5877 
5878 	if (txr->hn_sched_tx)
5879 		goto do_sched;
5880 
5881 	if (mtx_trylock(&txr->hn_tx_lock)) {
5882 		int sched;
5883 
5884 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5885 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5886 		mtx_unlock(&txr->hn_tx_lock);
5887 		if (sched) {
5888 			taskqueue_enqueue(txr->hn_tx_taskq,
5889 			    &txr->hn_tx_task);
5890 		}
5891 	} else {
5892 do_sched:
5893 		/*
5894 		 * Release OACTIVE earlier, in the hope that others
5895 		 * can catch up.  The task will clear the flag again,
5896 		 * while holding hn_tx_lock, to avoid possible
5897 		 * races.
5898 		 */
5899 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5900 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5901 	}
5902 }
5903 
5904 #endif	/* HN_IFSTART_SUPPORT */
5905 
5906 static int
5907 hn_xmit(struct hn_tx_ring *txr, int len)
5908 {
5909 	struct hn_softc *sc = txr->hn_sc;
5910 	if_t ifp = sc->hn_ifp;
5911 	struct mbuf *m_head;
5912 	int sched = 0;
5913 
5914 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5915 #ifdef HN_IFSTART_SUPPORT
5916 	KASSERT(hn_use_if_start == 0,
5917 	    ("hn_xmit is called when if_start is enabled"));
5918 #endif
5919 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5920 
5921 	if (__predict_false(txr->hn_suspended))
5922 		return (0);
5923 
5924 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5925 		return (0);
5926 
5927 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5928 		struct hn_txdesc *txd;
5929 		int error;
5930 
5931 		if (len > 0 && m_head->m_pkthdr.len > len) {
5932 			/*
5933 			 * This send could be time consuming; let callers
5934 			 * dispatch the sending of this packet (and of any
5935 			 * follow-up packets) to the tx taskqueue.
5936 			 */
5937 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5938 			sched = 1;
5939 			break;
5940 		}
5941 
5942 		txd = hn_txdesc_get(txr);
5943 		if (txd == NULL) {
5944 			txr->hn_no_txdescs++;
5945 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5946 			txr->hn_oactive = 1;
5947 			break;
5948 		}
5949 
5950 		error = hn_encap(ifp, txr, txd, &m_head);
5951 		if (error) {
5952 			/* Both txd and m_head are freed; discard */
5953 			KASSERT(txr->hn_agg_txd == NULL,
5954 			    ("encap failed w/ pending aggregating txdesc"));
5955 			drbr_advance(ifp, txr->hn_mbuf_br);
5956 			continue;
5957 		}
5958 
5959 		if (txr->hn_agg_pktleft == 0) {
5960 			if (txr->hn_agg_txd != NULL) {
5961 				KASSERT(m_head == NULL,
5962 				    ("pending mbuf for aggregating txdesc"));
5963 				error = hn_flush_txagg(ifp, txr);
5964 				if (__predict_false(error)) {
5965 					txr->hn_oactive = 1;
5966 					break;
5967 				}
5968 			} else {
5969 				KASSERT(m_head != NULL, ("mbuf was freed"));
5970 				error = hn_txpkt(ifp, txr, txd);
5971 				if (__predict_false(error)) {
5972 					/* txd is freed, but m_head is not */
5973 					drbr_putback(ifp, txr->hn_mbuf_br,
5974 					    m_head);
5975 					txr->hn_oactive = 1;
5976 					break;
5977 				}
5978 			}
5979 		}
5980 #ifdef INVARIANTS
5981 		else {
5982 			KASSERT(txr->hn_agg_txd != NULL,
5983 			    ("no aggregating txdesc"));
5984 			KASSERT(m_head == NULL,
5985 			    ("pending mbuf for aggregating txdesc"));
5986 		}
5987 #endif
5988 
5989 		/* Sent */
5990 		drbr_advance(ifp, txr->hn_mbuf_br);
5991 	}
5992 
5993 	/* Flush pending aggregated transmission. */
5994 	if (txr->hn_agg_txd != NULL)
5995 		hn_flush_txagg(ifp, txr);
5996 	return (sched);
5997 }
5998 
5999 static int
6000 hn_transmit(if_t ifp, struct mbuf *m)
6001 {
6002 	struct hn_softc *sc = if_getsoftc(ifp);
6003 	struct hn_tx_ring *txr;
6004 	int error, idx = 0;
6005 
6006 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6007 		struct rm_priotracker pt;
6008 
6009 		rm_rlock(&sc->hn_vf_lock, &pt);
6010 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6011 			struct mbuf *m_bpf = NULL;
6012 			int obytes, omcast;
6013 
6014 			obytes = m->m_pkthdr.len;
6015 			omcast = (m->m_flags & M_MCAST) != 0;
6016 
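			/*
			 * With HN_XVFFLAG_ACCBPF, tap BPF with a shallow
			 * copy only after the VF transmit succeeds; fall
			 * back to tapping immediately if the copy cannot
			 * be obtained.
			 */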
6017 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6018 				if (bpf_peers_present(if_getbpf(ifp))) {
6019 					m_bpf = m_copypacket(m, M_NOWAIT);
6020 					if (m_bpf == NULL) {
6021 						/*
6022 						 * Failed to grab a shallow
6023 						 * copy; tap now.
6024 						 */
6025 						ETHER_BPF_MTAP(ifp, m);
6026 					}
6027 				}
6028 			} else {
6029 				ETHER_BPF_MTAP(ifp, m);
6030 			}
6031 
6032 			error = if_transmit(sc->hn_vf_ifp, m);
6033 			rm_runlock(&sc->hn_vf_lock, &pt);
6034 
6035 			if (m_bpf != NULL) {
6036 				if (!error)
6037 					ETHER_BPF_MTAP(ifp, m_bpf);
6038 				m_freem(m_bpf);
6039 			}
6040 
6041 			if (error == ENOBUFS) {
6042 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6043 			} else if (error) {
6044 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6045 			} else {
6046 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6047 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6048 				if (omcast) {
6049 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6050 					    omcast);
6051 				}
6052 			}
6053 			return (error);
6054 		}
6055 		rm_runlock(&sc->hn_vf_lock, &pt);
6056 	}
6057 
6058 #if defined(INET6) || defined(INET)
6059 	/*
6060 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6061 	 * since packet headers should be cache-hot.
6062 	 */
6063 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6064 		m = hn_tso_fixup(m);
6065 		if (__predict_false(m == NULL)) {
6066 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6067 			return EIO;
6068 		}
6069 	} else if (m->m_pkthdr.csum_flags &
6070 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6071 		m = hn_set_hlen(m);
6072 		if (__predict_false(m == NULL)) {
6073 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6074 			return EIO;
6075 		}
6076 	}
6077 #endif
6078 
6079 	/*
6080 	 * Select the TX ring based on flowid
6081 	 */
6082 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
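	/*
	 * Small TCP segments with checksum offload requested (and no
	 * TSO) are checked for SYN below; a SYN is steered to ring 0,
	 * presumably to keep connection setup on the primary channel.
	 */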
6083 #ifdef RSS
6084 		uint32_t bid;
6085 
6086 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6087 		    &bid) == 0)
6088 			idx = bid % sc->hn_tx_ring_inuse;
6089 		else
6090 #endif
6091 		{
6092 #if defined(INET6) || defined(INET)
6093 			int tcpsyn = 0;
6094 
6095 			if (m->m_pkthdr.len < 128 &&
6096 			    (m->m_pkthdr.csum_flags &
6097 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6098 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6099 				m = hn_check_tcpsyn(m, &tcpsyn);
6100 				if (__predict_false(m == NULL)) {
6101 					if_inc_counter(ifp,
6102 					    IFCOUNTER_OERRORS, 1);
6103 					return (EIO);
6104 				}
6105 			}
6106 #else
6107 			const int tcpsyn = 0;
6108 #endif
6109 			if (tcpsyn)
6110 				idx = 0;
6111 			else
6112 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6113 		}
6114 	}
6115 	txr = &sc->hn_tx_ring[idx];
6116 
6117 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6118 	if (error) {
6119 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6120 		return error;
6121 	}
6122 
6123 	if (txr->hn_oactive)
6124 		return 0;
6125 
6126 	if (txr->hn_sched_tx)
6127 		goto do_sched;
6128 
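	/*
	 * Try a direct transmission if the TX lock is uncontended.
	 * hn_xmit() requests deferral (non-zero return) once it hits
	 * a packet longer than hn_direct_tx_size.
	 */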
6129 	if (mtx_trylock(&txr->hn_tx_lock)) {
6130 		int sched;
6131 
6132 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6133 		mtx_unlock(&txr->hn_tx_lock);
6134 		if (!sched)
6135 			return 0;
6136 	}
6137 do_sched:
6138 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6139 	return 0;
6140 }
6141 
6142 static void
6143 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6144 {
6145 	struct mbuf *m;
6146 
6147 	mtx_lock(&txr->hn_tx_lock);
6148 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6149 		m_freem(m);
6150 	mtx_unlock(&txr->hn_tx_lock);
6151 }
6152 
6153 static void
6154 hn_xmit_qflush(if_t ifp)
6155 {
6156 	struct hn_softc *sc = if_getsoftc(ifp);
6157 	struct rm_priotracker pt;
6158 	int i;
6159 
6160 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6161 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6162 	if_qflush(ifp);
6163 
6164 	rm_rlock(&sc->hn_vf_lock, &pt);
6165 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6166 		if_qflush(sc->hn_vf_ifp);
6167 	rm_runlock(&sc->hn_vf_lock, &pt);
6168 }
6169 
6170 static void
6171 hn_xmit_txeof(struct hn_tx_ring *txr)
6172 {
6173 
6174 	if (txr->hn_sched_tx)
6175 		goto do_sched;
6176 
6177 	if (mtx_trylock(&txr->hn_tx_lock)) {
6178 		int sched;
6179 
6180 		txr->hn_oactive = 0;
6181 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6182 		mtx_unlock(&txr->hn_tx_lock);
6183 		if (sched) {
6184 			taskqueue_enqueue(txr->hn_tx_taskq,
6185 			    &txr->hn_tx_task);
6186 		}
6187 	} else {
6188 do_sched:
6189 		/*
6190 		 * Release oactive earlier, in the hope that others
6191 		 * can catch up.  The task will clear oactive again,
6192 		 * while holding hn_tx_lock, to avoid possible
6193 		 * races.
6194 		 */
6195 		txr->hn_oactive = 0;
6196 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6197 	}
6198 }
6199 
6200 static void
6201 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6202 {
6203 	struct hn_tx_ring *txr = xtxr;
6204 
6205 	mtx_lock(&txr->hn_tx_lock);
6206 	hn_xmit(txr, 0);
6207 	mtx_unlock(&txr->hn_tx_lock);
6208 }
6209 
6210 static void
6211 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6212 {
6213 	struct hn_tx_ring *txr = xtxr;
6214 
6215 	mtx_lock(&txr->hn_tx_lock);
6216 	txr->hn_oactive = 0;
6217 	hn_xmit(txr, 0);
6218 	mtx_unlock(&txr->hn_tx_lock);
6219 }
6220 
6221 static int
6222 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6223 {
6224 	struct vmbus_chan_br cbr;
6225 	struct hn_rx_ring *rxr;
6226 	struct hn_tx_ring *txr = NULL;
6227 	int idx, error;
6228 
6229 	idx = vmbus_chan_subidx(chan);
6230 
6231 	/*
6232 	 * Link this channel to RX/TX ring.
6233 	 */
6234 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6235 	    ("invalid channel index %d, should be >= 0 && < %d",
6236 	     idx, sc->hn_rx_ring_inuse));
6237 	rxr = &sc->hn_rx_ring[idx];
6238 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6239 	    ("RX ring %d already attached", idx));
6240 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6241 	rxr->hn_chan = chan;
6242 
6243 	if (bootverbose) {
6244 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6245 		    idx, vmbus_chan_id(chan));
6246 	}
6247 
6248 	if (idx < sc->hn_tx_ring_inuse) {
6249 		txr = &sc->hn_tx_ring[idx];
6250 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6251 		    ("TX ring %d already attached", idx));
6252 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6253 
6254 		txr->hn_chan = chan;
6255 		if (bootverbose) {
6256 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6257 			    idx, vmbus_chan_id(chan));
6258 		}
6259 	}
6260 
6261 	/* Bind this channel to a proper CPU. */
6262 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6263 
6264 	/*
6265 	 * Open this channel
6266 	 */
6267 	cbr.cbr = rxr->hn_br;
6268 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6269 	cbr.cbr_txsz = HN_TXBR_SIZE;
6270 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6271 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6272 	if (error) {
6273 		if (error == EISCONN) {
6274 			if_printf(sc->hn_ifp, "bufring is connected after "
6275 			    "chan%u open failure\n", vmbus_chan_id(chan));
6276 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6277 		} else {
6278 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6279 			    vmbus_chan_id(chan), error);
6280 		}
6281 	}
6282 	return (error);
6283 }
6284 
6285 static void
6286 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6287 {
6288 	struct hn_rx_ring *rxr;
6289 	int idx, error;
6290 
6291 	idx = vmbus_chan_subidx(chan);
6292 
6293 	/*
6294 	 * Unlink this channel from the RX/TX ring.
6295 	 */
6296 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6297 	    ("invalid channel index %d, should be >= 0 && < %d",
6298 	     idx, sc->hn_rx_ring_inuse));
6299 	rxr = &sc->hn_rx_ring[idx];
6300 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6301 	    ("RX ring %d is not attached", idx));
6302 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6303 
6304 	if (idx < sc->hn_tx_ring_inuse) {
6305 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6306 
6307 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6308 		    ("TX ring %d is not attached", idx));
6309 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6310 	}
6311 
6312 	/*
6313 	 * Close this channel.
6314 	 *
6315 	 * NOTE:
6316 	 * Channel closing does _not_ destroy the target channel.
6317 	 */
6318 	error = vmbus_chan_close_direct(chan);
6319 	if (error == EISCONN) {
6320 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6321 		    "after being closed\n", vmbus_chan_id(chan));
6322 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6323 	} else if (error) {
6324 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6325 		    vmbus_chan_id(chan), error);
6326 	}
6327 }
6328 
6329 static int
6330 hn_attach_subchans(struct hn_softc *sc)
6331 {
6332 	struct vmbus_channel **subchans;
6333 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6334 	int i, error = 0;
6335 
6336 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6337 
6338 	/* Attach the sub-channels. */
6339 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6340 	for (i = 0; i < subchan_cnt; ++i) {
6341 		int error1;
6342 
6343 		error1 = hn_chan_attach(sc, subchans[i]);
6344 		if (error1) {
6345 			error = error1;
6346 			/* Move on; all channels will be detached later. */
6347 		}
6348 	}
6349 	vmbus_subchan_rel(subchans, subchan_cnt);
6350 
6351 	if (error) {
6352 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6353 	} else {
6354 		if (bootverbose) {
6355 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6356 			    subchan_cnt);
6357 		}
6358 	}
6359 	return (error);
6360 }
6361 
6362 static void
6363 hn_detach_allchans(struct hn_softc *sc)
6364 {
6365 	struct vmbus_channel **subchans;
6366 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6367 	int i;
6368 
6369 	if (subchan_cnt == 0)
6370 		goto back;
6371 
6372 	/* Detach the sub-channels. */
6373 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6374 	for (i = 0; i < subchan_cnt; ++i)
6375 		hn_chan_detach(sc, subchans[i]);
6376 	vmbus_subchan_rel(subchans, subchan_cnt);
6377 
6378 back:
6379 	/*
6380 	 * Detach the primary channel, _after_ all sub-channels
6381 	 * are detached.
6382 	 */
6383 	hn_chan_detach(sc, sc->hn_prichan);
6384 
6385 	/* Wait for sub-channels to be destroyed, if any. */
6386 	vmbus_subchan_drain(sc->hn_prichan);
6387 
6388 #ifdef INVARIANTS
6389 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6390 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6391 		    HN_RX_FLAG_ATTACHED) == 0,
6392 		    ("%dth RX ring is still attached", i));
6393 	}
6394 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6395 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6396 		    HN_TX_FLAG_ATTACHED) == 0,
6397 		    ("%dth TX ring is still attached", i));
6398 	}
6399 #endif
6400 }
6401 
6402 static int
6403 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6404 {
6405 	struct vmbus_channel **subchans;
6406 	int nchan, rxr_cnt, error;
6407 
6408 	nchan = *nsubch + 1;
6409 	if (nchan == 1) {
6410 		/*
6411 		 * Multiple RX/TX rings are not requested.
6412 		 */
6413 		*nsubch = 0;
6414 		return (0);
6415 	}
6416 
6417 	/*
6418 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6419 	 * table entries.
6420 	 */
6421 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6422 	if (error) {
6423 		/* No RSS; this is benign. */
6424 		*nsubch = 0;
6425 		return (0);
6426 	}
6427 	if (bootverbose) {
6428 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6429 		    rxr_cnt, nchan);
6430 	}
6431 
6432 	if (nchan > rxr_cnt)
6433 		nchan = rxr_cnt;
6434 	if (nchan == 1) {
6435 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6436 		*nsubch = 0;
6437 		return (0);
6438 	}
6439 
6440 	/*
6441 	 * Allocate sub-channels from NVS.
6442 	 */
6443 	*nsubch = nchan - 1;
6444 	error = hn_nvs_alloc_subchans(sc, nsubch);
6445 	if (error || *nsubch == 0) {
6446 		/* Failed to allocate sub-channels. */
6447 		*nsubch = 0;
6448 		return (0);
6449 	}
6450 
6451 	/*
6452 	 * Wait for all sub-channels to become ready before moving on.
6453 	 */
6454 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6455 	vmbus_subchan_rel(subchans, *nsubch);
6456 	return (0);
6457 }
6458 
6459 static bool
6460 hn_synth_attachable(const struct hn_softc *sc)
6461 {
6462 	int i;
6463 
6464 	if (sc->hn_flags & HN_FLAG_ERRORS)
6465 		return (false);
6466 
6467 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6468 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6469 
6470 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6471 			return (false);
6472 	}
6473 	return (true);
6474 }
6475 
6476 /*
6477  * Make sure that the RX filter is zero after the successful
6478  * RNDIS initialization.
6479  *
6480  * NOTE:
6481  * Under certain conditions on certain versions of Hyper-V,
6482  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6483  * after the successful RNDIS initialization, which breaks
6484  * the assumption of any following code (well, it breaks the
6485  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6486  * explicitly, drain packets sneaking through, and drain the
6487  * interrupt taskqueues scheduled due to the stealth packets.
6488  */
6489 static void
6490 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6491 {
6492 
6493 	hn_disable_rx(sc);
6494 	hn_drain_rxtx(sc, nchan);
6495 }
6496 
6497 static int
6498 hn_synth_attach(struct hn_softc *sc, int mtu)
6499 {
6500 #define ATTACHED_NVS		0x0002
6501 #define ATTACHED_RNDIS		0x0004
6502 
6503 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6504 	int error, nsubch, nchan = 1, i, rndis_inited;
6505 	uint32_t old_caps, attached = 0;
6506 
6507 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6508 	    ("synthetic parts were attached"));
6509 
6510 	if (!hn_synth_attachable(sc))
6511 		return (ENXIO);
6512 
6513 	/* Save capabilities for later verification. */
6514 	old_caps = sc->hn_caps;
6515 	sc->hn_caps = 0;
6516 
6517 	/* Clear RSS stuffs. */
6518 	sc->hn_rss_ind_size = 0;
6519 	sc->hn_rss_hash = 0;
6520 	sc->hn_rss_hcap = 0;
6521 
6522 	/*
6523 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6524 	 */
6525 	error = hn_chan_attach(sc, sc->hn_prichan);
6526 	if (error)
6527 		goto failed;
6528 
6529 	/*
6530 	 * Attach NVS.
6531 	 */
6532 	error = hn_nvs_attach(sc, mtu);
6533 	if (error)
6534 		goto failed;
6535 	attached |= ATTACHED_NVS;
6536 
6537 	/*
6538 	 * Attach RNDIS _after_ NVS is attached.
6539 	 */
6540 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6541 	if (rndis_inited)
6542 		attached |= ATTACHED_RNDIS;
6543 	if (error)
6544 		goto failed;
6545 
6546 	/*
6547 	 * Make sure capabilities are not changed.
6548 	 */
6549 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6550 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6551 		    old_caps, sc->hn_caps);
6552 		error = ENXIO;
6553 		goto failed;
6554 	}
6555 
6556 	/*
6557 	 * Allocate sub-channels for multi-TX/RX rings.
6558 	 *
6559 	 * NOTE:
6560 	 * The # of RX rings that can be used is equivalent to the # of
6561 	 * channels to be requested.
6562 	 */
6563 	nsubch = sc->hn_rx_ring_cnt - 1;
6564 	error = hn_synth_alloc_subchans(sc, &nsubch);
6565 	if (error)
6566 		goto failed;
6567 	/* NOTE: _Full_ synthetic parts detach is required now. */
6568 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6569 
6570 	/*
6571 	 * Set the # of TX/RX rings that could be used according to
6572 	 * the # of channels that NVS offered.
6573 	 */
6574 	nchan = nsubch + 1;
6575 	hn_set_ring_inuse(sc, nchan);
6576 	if (nchan == 1) {
6577 		/* Only the primary channel can be used; done */
6578 		goto back;
6579 	}
6580 
6581 	/*
6582 	 * Attach the sub-channels.
6583 	 *
6584 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6585 	 */
6586 	error = hn_attach_subchans(sc);
6587 	if (error)
6588 		goto failed;
6589 
6590 	/*
6591 	 * Configure RSS key and indirect table _after_ all sub-channels
6592 	 * are attached.
6593 	 */
6594 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6595 		/*
6596 		 * RSS key is not set yet; set it to the default RSS key.
6597 		 */
6598 		if (bootverbose)
6599 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6600 #ifdef RSS
6601 		rss_getkey(rss->rss_key);
6602 #else
6603 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6604 #endif
6605 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6606 	}
6607 
6608 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6609 		/*
6610 		 * RSS indirect table is not set yet; set it up in round-
6611 		 * robin fashion.
6612 		 */
6613 		if (bootverbose) {
6614 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6615 			    "table\n");
6616 		}
6617 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6618 			uint32_t subidx;
6619 
6620 #ifdef RSS
6621 			subidx = rss_get_indirection_to_bucket(i);
6622 #else
6623 			subidx = i;
6624 #endif
6625 			rss->rss_ind[i] = subidx % nchan;
6626 		}
6627 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6628 	} else {
6629 		/*
6630 		 * # of usable channels may be changed, so we have to
6631 		 * make sure that all entries in RSS indirect table
6632 		 * are valid.
6633 		 *
6634 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6635 		 */
6636 		hn_rss_ind_fixup(sc);
6637 	}
6638 
6639 	sc->hn_rss_hash = sc->hn_rss_hcap;
6640 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6641 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6642 		/* NOTE: Don't reconfigure RSS; done immediately below. */
6643 		hn_vf_rss_fixup(sc, false);
6644 	}
6645 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6646 	if (error)
6647 		goto failed;
6648 back:
6649 	/*
6650 	 * Fixup transmission aggregation setup.
6651 	 */
6652 	hn_set_txagg(sc);
6653 	hn_rndis_init_fixat(sc, nchan);
6654 	return (0);
6655 
6656 failed:
6657 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6658 		hn_rndis_init_fixat(sc, nchan);
6659 		hn_synth_detach(sc);
6660 	} else {
6661 		if (attached & ATTACHED_RNDIS) {
6662 			hn_rndis_init_fixat(sc, nchan);
6663 			hn_rndis_detach(sc);
6664 		}
6665 		if (attached & ATTACHED_NVS)
6666 			hn_nvs_detach(sc);
6667 		hn_chan_detach(sc, sc->hn_prichan);
6668 		/* Restore old capabilities. */
6669 		sc->hn_caps = old_caps;
6670 	}
6671 	return (error);
6672 
6673 #undef ATTACHED_RNDIS
6674 #undef ATTACHED_NVS
6675 }
6676 
6677 /*
6678  * NOTE:
6679  * The interface must have been suspended through hn_suspend(), before
6680  * this function gets called.
6681  */
6682 static void
6683 hn_synth_detach(struct hn_softc *sc)
6684 {
6685 
6686 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6687 	    ("synthetic parts were not attached"));
6688 
6689 	/* Detach the RNDIS first. */
6690 	hn_rndis_detach(sc);
6691 
6692 	/* Detach NVS. */
6693 	hn_nvs_detach(sc);
6694 
6695 	/* Detach all of the channels. */
6696 	hn_detach_allchans(sc);
6697 
6698 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6699 		/*
6700 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6701 		 */
6702 		int error;
6703 
6704 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6705 		    sc->hn_rxbuf_gpadl);
6706 		if (error) {
6707 			if_printf(sc->hn_ifp,
6708 			    "rxbuf gpadl disconn failed: %d\n", error);
6709 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6710 		}
6711 		sc->hn_rxbuf_gpadl = 0;
6712 	}
6713 
6714 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6715 		/*
6716 		 * Host is post-Win2016, disconnect chimney sending buffer from
6717 		 * primary channel here.
6718 		 */
6719 		int error;
6720 
6721 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6722 		    sc->hn_chim_gpadl);
6723 		if (error) {
6724 			if_printf(sc->hn_ifp,
6725 			    "chim gpadl disconn failed: %d\n", error);
6726 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6727 		}
6728 		sc->hn_chim_gpadl = 0;
6729 	}
6730 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6731 }
6732 
6733 static void
6734 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6735 {
6736 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6737 	    ("invalid ring count %d", ring_cnt));
6738 
6739 	if (sc->hn_tx_ring_cnt > ring_cnt)
6740 		sc->hn_tx_ring_inuse = ring_cnt;
6741 	else
6742 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6743 	sc->hn_rx_ring_inuse = ring_cnt;
6744 
6745 #ifdef RSS
6746 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6747 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6748 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6749 		    rss_getnumbuckets());
6750 	}
6751 #endif
6752 
6753 	if (bootverbose) {
6754 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6755 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6756 	}
6757 }
6758 
6759 static void
6760 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6761 {
6762 
6763 	/*
6764 	 * NOTE:
6765 	 * The TX bufring will not be drained by the hypervisor
6766 	 * if the primary channel is revoked.
6767 	 */
6768 	while (!vmbus_chan_rx_empty(chan) ||
6769 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6770 	     !vmbus_chan_tx_empty(chan)))
6771 		pause("waitch", 1);
6772 	vmbus_chan_intr_drain(chan);
6773 }
6774 
6775 static void
6776 hn_disable_rx(struct hn_softc *sc)
6777 {
6778 
6779 	/*
6780 	 * Disable RX by clearing RX filter forcefully.
6781 	 */
6782 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6783 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6784 
6785 	/*
6786 	 * Give RNDIS enough time to flush all pending data packets.
6787 	 */
6788 	pause("waitrx", (200 * hz) / 1000);
6789 }
6790 
6791 /*
6792  * NOTE:
6793  * RX/TX _must_ have been suspended/disabled, before this function
6794  * is called.
6795  */
6796 static void
6797 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6798 {
6799 	struct vmbus_channel **subch = NULL;
6800 	int nsubch;
6801 
6802 	/*
6803 	 * Drain RX/TX bufrings and interrupts.
6804 	 */
6805 	nsubch = nchan - 1;
6806 	if (nsubch > 0)
6807 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6808 
6809 	if (subch != NULL) {
6810 		int i;
6811 
6812 		for (i = 0; i < nsubch; ++i)
6813 			hn_chan_drain(sc, subch[i]);
6814 	}
6815 	hn_chan_drain(sc, sc->hn_prichan);
6816 
6817 	if (subch != NULL)
6818 		vmbus_subchan_rel(subch, nsubch);
6819 }
6820 
6821 static void
6822 hn_suspend_data(struct hn_softc *sc)
6823 {
6824 	struct hn_tx_ring *txr;
6825 	int i;
6826 
6827 	HN_LOCK_ASSERT(sc);
6828 
6829 	/*
6830 	 * Suspend TX.
6831 	 */
6832 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6833 		txr = &sc->hn_tx_ring[i];
6834 
6835 		mtx_lock(&txr->hn_tx_lock);
6836 		txr->hn_suspended = 1;
6837 		mtx_unlock(&txr->hn_tx_lock);
6838 		/* No one is able to send more packets now. */
6839 
6840 		/*
6841 		 * Wait for all pending sends to finish.
6842 		 *
6843 		 * NOTE:
6844 		 * We will _not_ receive send-done for all pending
6845 		 * sends if the primary channel is revoked.
6846 		 */
6847 		while (hn_tx_ring_pending(txr) &&
6848 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6849 			pause("hnwtx", 1 /* 1 tick */);
6850 	}
6851 
6852 	/*
6853 	 * Disable RX.
6854 	 */
6855 	hn_disable_rx(sc);
6856 
6857 	/*
6858 	 * Drain RX/TX.
6859 	 */
6860 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6861 
6862 	/*
6863 	 * Drain any pending TX tasks.
6864 	 *
6865 	 * NOTE:
6866 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6867 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6868 	 */
6869 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6870 		txr = &sc->hn_tx_ring[i];
6871 
6872 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6873 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6874 	}
6875 }
6876 
6877 static void
6878 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6879 {
6880 
6881 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6882 }
6883 
6884 static void
6885 hn_suspend_mgmt(struct hn_softc *sc)
6886 {
6887 	struct task task;
6888 
6889 	HN_LOCK_ASSERT(sc);
6890 
6891 	/*
6892 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6893 	 * through hn_mgmt_taskq.
6894 	 */
6895 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6896 	vmbus_chan_run_task(sc->hn_prichan, &task);
6897 
6898 	/*
6899 	 * Make sure that all pending management tasks are completed.
6900 	 */
6901 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6902 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6903 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6904 }
6905 
6906 static void
6907 hn_suspend(struct hn_softc *sc)
6908 {
6909 
6910 	/* Disable polling. */
6911 	hn_polling(sc, 0);
6912 
6913 	/*
6914 	 * If the non-transparent mode VF is activated, the synthetic
6915 	 * device is receiving packets, so the data path of the
6916 	 * synthetic device must be suspended.
6917 	 */
6918 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6919 	    (sc->hn_flags & HN_FLAG_RXVF))
6920 		hn_suspend_data(sc);
6921 	hn_suspend_mgmt(sc);
6922 }
6923 
6924 static void
6925 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6926 {
6927 	int i;
6928 
6929 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6930 	    ("invalid TX ring count %d", tx_ring_cnt));
6931 
6932 	for (i = 0; i < tx_ring_cnt; ++i) {
6933 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6934 
6935 		mtx_lock(&txr->hn_tx_lock);
6936 		txr->hn_suspended = 0;
6937 		mtx_unlock(&txr->hn_tx_lock);
6938 	}
6939 }
6940 
6941 static void
6942 hn_resume_data(struct hn_softc *sc)
6943 {
6944 	int i;
6945 
6946 	HN_LOCK_ASSERT(sc);
6947 
6948 	/*
6949 	 * Re-enable RX.
6950 	 */
6951 	hn_rxfilter_config(sc);
6952 
6953 	/*
6954 	 * Make sure to clear suspend status on "all" TX rings,
6955 	 * since hn_tx_ring_inuse can be changed after
6956 	 * hn_suspend_data().
6957 	 */
6958 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6959 
6960 #ifdef HN_IFSTART_SUPPORT
6961 	if (!hn_use_if_start)
6962 #endif
6963 	{
6964 		/*
6965 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6966 		 * reduced.
6967 		 */
6968 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6969 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6970 	}
6971 
6972 	/*
6973 	 * Kick start TX.
6974 	 */
6975 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6976 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6977 
6978 		/*
6979 		 * Use txeof task, so that any pending oactive can be
6980 		 * cleared properly.
6981 		 */
6982 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6983 	}
6984 }
6985 
6986 static void
6987 hn_resume_mgmt(struct hn_softc *sc)
6988 {
6989 
6990 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6991 
6992 	/*
6993 	 * Kick off network change detection, if it was pending.
6994 	 * If no network change was pending, start link status
6995 	 * checks, which is more lightweight than network change
6996 	 * detection.
6997 	 */
6998 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6999 		hn_change_network(sc);
7000 	else
7001 		hn_update_link_status(sc);
7002 }
7003 
7004 static void
7005 hn_resume(struct hn_softc *sc)
7006 {
7007 
7008 	/*
7009 	 * If the non-transparent mode VF is activated, the synthetic
7010 	 * device has to receive packets, so the data path of the
7011 	 * synthetic device must be resumed.
7012 	 */
7013 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
7014 	    (sc->hn_flags & HN_FLAG_RXVF))
7015 		hn_resume_data(sc);
7016 
7017 	/*
7018 	 * Don't resume link status change if VF is attached/activated.
7019 	 * - In the non-transparent VF mode, the synthetic device marks
7020 	 *   link down until the VF is deactivated; i.e. VF is down.
7021 	 * - In transparent VF mode, VF's media status is used until
7022 	 *   the VF is detached.
7023 	 */
7024 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7025 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7026 		hn_resume_mgmt(sc);
7027 
7028 	/*
7029 	 * Re-enable polling if this interface is running and
7030 	 * the polling is requested.
7031 	 */
7032 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7033 		hn_polling(sc, sc->hn_pollhz);
7034 }
7035 
7036 static void
7037 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7038 {
7039 	const struct rndis_status_msg *msg;
7040 	int ofs;
7041 
7042 	if (dlen < sizeof(*msg)) {
7043 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7044 		return;
7045 	}
7046 	msg = data;
7047 
7048 	switch (msg->rm_status) {
7049 	case RNDIS_STATUS_MEDIA_CONNECT:
7050 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7051 		hn_update_link_status(sc);
7052 		break;
7053 
7054 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7055 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7056 		/* Not really useful; ignore. */
7057 		break;
7058 
7059 	case RNDIS_STATUS_NETWORK_CHANGE:
7060 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7061 		if (dlen < ofs + msg->rm_stbuflen ||
7062 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7063 			if_printf(sc->hn_ifp, "network changed\n");
7064 		} else {
7065 			uint32_t change;
7066 
7067 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7068 			    sizeof(change));
7069 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7070 			    change);
7071 		}
7072 		hn_change_network(sc);
7073 		break;
7074 
7075 	default:
7076 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7077 		    msg->rm_status);
7078 		break;
7079 	}
7080 }
7081 
7082 static int
7083 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7084 {
7085 	const struct rndis_pktinfo *pi = info_data;
7086 	uint32_t mask = 0;
7087 
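	/*
	 * Walk the per-packet-info list: each element starts with a
	 * rndis_pktinfo header and spans rm_size bytes, which must be
	 * properly aligned and fit within the remaining info area.
	 */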
7088 	while (info_dlen != 0) {
7089 		const void *data;
7090 		uint32_t dlen;
7091 
7092 		if (__predict_false(info_dlen < sizeof(*pi)))
7093 			return (EINVAL);
7094 		if (__predict_false(info_dlen < pi->rm_size))
7095 			return (EINVAL);
7096 		info_dlen -= pi->rm_size;
7097 
7098 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7099 			return (EINVAL);
7100 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7101 			return (EINVAL);
7102 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7103 		data = pi->rm_data;
7104 
7105 		if (pi->rm_internal == 1) {
7106 			switch (pi->rm_type) {
7107 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7108 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7109 					return (EINVAL);
7110 				info->pktinfo_id =
7111 				    (const struct packet_info_id *)data;
7112 				mask |= HN_RXINFO_PKTINFO_ID;
7113 				break;
7114 
7115 			default:
7116 				goto next;
7117 			}
7118 		} else {
7119 			switch (pi->rm_type) {
7120 			case NDIS_PKTINFO_TYPE_VLAN:
7121 				if (__predict_false(dlen
7122 				    < NDIS_VLAN_INFO_SIZE))
7123 					return (EINVAL);
7124 				info->vlan_info = (const uint32_t *)data;
7125 				mask |= HN_RXINFO_VLAN;
7126 				break;
7127 
7128 			case NDIS_PKTINFO_TYPE_CSUM:
7129 				if (__predict_false(dlen
7130 				    < NDIS_RXCSUM_INFO_SIZE))
7131 					return (EINVAL);
7132 				info->csum_info = (const uint32_t *)data;
7133 				mask |= HN_RXINFO_CSUM;
7134 				break;
7135 
7136 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7137 				if (__predict_false(dlen
7138 				    < HN_NDIS_HASH_VALUE_SIZE))
7139 					return (EINVAL);
7140 				info->hash_value = (const uint32_t *)data;
7141 				mask |= HN_RXINFO_HASHVAL;
7142 				break;
7143 
7144 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7145 				if (__predict_false(dlen
7146 				    < HN_NDIS_HASH_INFO_SIZE))
7147 					return (EINVAL);
7148 				info->hash_info = (const uint32_t *)data;
7149 				mask |= HN_RXINFO_HASHINF;
7150 				break;
7151 
7152 			default:
7153 				goto next;
7154 			}
7155 		}
7156 
7157 		if (mask == HN_RXINFO_ALL) {
7158 			/* All found; done */
7159 			break;
7160 		}
7161 next:
7162 		pi = (const struct rndis_pktinfo *)
7163 		    ((const uint8_t *)pi + pi->rm_size);
7164 	}
7165 
7166 	/*
7167 	 * Final fixup.
7168 	 * - If there is no hash value, invalidate the hash info.
7169 	 */
7170 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7171 		info->hash_info = NULL;
7172 	return (0);
7173 }
7174 
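/*
 * Return true if the byte range [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */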
7175 static __inline bool
7176 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7177 {
7178 
7179 	if (off < check_off) {
7180 		if (__predict_true(off + len <= check_off))
7181 			return (false);
7182 	} else if (off > check_off) {
7183 		if (__predict_true(check_off + check_len <= off))
7184 			return (false);
7185 	}
7186 	return (true);
7187 }
7188 
7189 static __inline void
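/*
 * Append one RSC fragment to the per-RX-ring reassembly state.  The
 * VLAN/checksum/hash metadata is taken from the first fragment only;
 * subsequent fragments just extend the total packet length.
 */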
7190 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7191 		uint32_t len, struct hn_rxinfo *info)
7192 {
7193 	uint32_t cnt = rxr->rsc.cnt;
7194 
7195 	if (cnt) {
7196 		rxr->rsc.pktlen += len;
7197 	} else {
7198 		rxr->rsc.vlan_info = info->vlan_info;
7199 		rxr->rsc.csum_info = info->csum_info;
7200 		rxr->rsc.hash_info = info->hash_info;
7201 		rxr->rsc.hash_value = info->hash_value;
7202 		rxr->rsc.pktlen = len;
7203 	}
7204 
7205 	rxr->rsc.frag_data[cnt] = data;
7206 	rxr->rsc.frag_len[cnt] = len;
7207 	rxr->rsc.cnt++;
7208 }
7209 
7210 static void
7211 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7212 {
7213 	const struct rndis_packet_msg *pkt;
7214 	struct hn_rxinfo info;
7215 	int data_off, pktinfo_off, data_len, pktinfo_len;
7216 	bool rsc_more = false;
7217 
7218 	/*
7219 	 * Check length.
7220 	 */
7221 	if (__predict_false(dlen < sizeof(*pkt))) {
7222 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7223 		return;
7224 	}
7225 	pkt = data;
7226 
7227 	if (__predict_false(dlen < pkt->rm_len)) {
7228 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7229 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7230 		return;
7231 	}
7232 	if (__predict_false(pkt->rm_len <
7233 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7234 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7235 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7236 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7237 		    pkt->rm_pktinfolen);
7238 		return;
7239 	}
7240 	if (__predict_false(pkt->rm_datalen == 0)) {
7241 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7242 		return;
7243 	}
7244 
7245 	/*
7246 	 * Check offsets.
7247 	 */
7248 #define IS_OFFSET_INVALID(ofs)			\
7249 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7250 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7251 
7252 	/* XXX Hyper-V does not meet data offset alignment requirement */
7253 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7254 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7255 		    "data offset %u\n", pkt->rm_dataoffset);
7256 		return;
7257 	}
7258 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7259 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7260 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7261 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7262 		return;
7263 	}
7264 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7265 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7266 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7267 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7268 		return;
7269 	}
7270 
7271 #undef IS_OFFSET_INVALID
7272 
7273 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7274 	data_len = pkt->rm_datalen;
7275 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7276 	pktinfo_len = pkt->rm_pktinfolen;
7277 
7278 	/*
7279 	 * Check OOB coverage.
7280 	 */
7281 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7282 		int oob_off, oob_len;
7283 
7284 		if_printf(rxr->hn_ifp, "got oobdata\n");
7285 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7286 		oob_len = pkt->rm_oobdatalen;
7287 
7288 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7289 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7290 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7291 			    pkt->rm_len, oob_off, oob_len);
7292 			return;
7293 		}
7294 
7295 		/*
7296 		 * Check against data.
7297 		 */
7298 		if (hn_rndis_check_overlap(oob_off, oob_len,
7299 		    data_off, data_len)) {
7300 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7301 			    "oob overlaps data, oob abs %d len %d, "
7302 			    "data abs %d len %d\n",
7303 			    oob_off, oob_len, data_off, data_len);
7304 			return;
7305 		}
7306 
7307 		/*
7308 		 * Check against pktinfo.
7309 		 */
7310 		if (pktinfo_len != 0 &&
7311 		    hn_rndis_check_overlap(oob_off, oob_len,
7312 		    pktinfo_off, pktinfo_len)) {
7313 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7314 			    "oob overlaps pktinfo, oob abs %d len %d, "
7315 			    "pktinfo abs %d len %d\n",
7316 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7317 			return;
7318 		}
7319 	}
7320 
7321 	/*
7322 	 * Check per-packet-info coverage and find useful per-packet-info.
7323 	 */
7324 	info.vlan_info = NULL;
7325 	info.csum_info = NULL;
7326 	info.hash_info = NULL;
7327 	info.pktinfo_id = NULL;
7328 
7329 	if (__predict_true(pktinfo_len != 0)) {
7330 		bool overlap;
7331 		int error;
7332 
7333 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7334 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7335 			    "pktinfo overflow, msglen %u, "
7336 			    "pktinfo abs %d len %d\n",
7337 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7338 			return;
7339 		}
7340 
7341 		/*
7342 		 * Check packet info coverage.
7343 		 */
7344 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7345 		    data_off, data_len);
7346 		if (__predict_false(overlap)) {
7347 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7348 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7349 			    "data abs %d len %d\n",
7350 			    pktinfo_off, pktinfo_len, data_off, data_len);
7351 			return;
7352 		}
7353 
7354 		/*
7355 		 * Find useful per-packet-info.
7356 		 */
7357 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7358 		    pktinfo_len, &info);
7359 		if (__predict_false(error)) {
7360 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7361 			    "pktinfo\n");
7362 			return;
7363 		}
7364 	}
7365 
7366 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7367 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7368 		    "data overflow, msglen %u, data abs %d len %d\n",
7369 		    pkt->rm_len, data_off, data_len);
7370 		return;
7371 	}
7372 
7373 	/* Identify RSC fragments, drop invalid packets */
7374 	if ((info.pktinfo_id != NULL) &&
7375 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7376 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7377 			rxr->rsc.cnt = 0;
7378 			rxr->hn_rsc_pkts++;
7379 		} else if (rxr->rsc.cnt == 0)
7380 			goto drop;
7381 
7382 		rsc_more = true;
7383 
7384 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7385 			rsc_more = false;
7386 
7387 		if (rsc_more && rxr->rsc.is_last)
7388 			goto drop;
7389 	} else {
7390 		rxr->rsc.cnt = 0;
7391 	}
7392 
7393 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7394 		goto drop;
7395 
7396 	/* Store data in per rx ring structure */
7397 	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7398 	    data_len, &info);
7399 
7400 	if (rsc_more)
7401 		return;
7402 
7403 	hn_rxpkt(rxr);
7404 	rxr->rsc.cnt = 0;
7405 	return;
7406 drop:
7407 	rxr->hn_rsc_drop++;
7408 	return;
7409 }
7410 
7411 static __inline void
7412 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7413 {
7414 	const struct rndis_msghdr *hdr;
7415 
7416 	if (__predict_false(dlen < sizeof(*hdr))) {
7417 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7418 		return;
7419 	}
7420 	hdr = data;
7421 
7422 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7423 		/* Hot data path. */
7424 		hn_rndis_rx_data(rxr, data, dlen);
7425 		/* Done! */
7426 		return;
7427 	}
7428 
7429 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7430 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7431 	else
7432 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7433 }
7434 
7435 static void
7436 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7437 {
7438 	const struct hn_nvs_hdr *hdr;
7439 
7440 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7441 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7442 		return;
7443 	}
7444 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7445 
7446 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7447 		/* Useless; ignore */
7448 		return;
7449 	}
7450 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7451 }
7452 
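/*
 * Handle an NVS completion packet by invoking the callback of the
 * send context recorded in the channel packet's transaction id.
 */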
7453 static void
7454 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7455     const struct vmbus_chanpkt_hdr *pkt)
7456 {
7457 	struct hn_nvs_sendctx *sndc;
7458 
7459 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7460 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7461 	    VMBUS_CHANPKT_DATALEN(pkt));
7462 	/*
7463 	 * NOTE:
7464 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7465 	 * its callback.
7466 	 */
7467 }
7468 
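/*
 * Validate an RXBUF channel packet, feed each RNDIS message it
 * references to hn_rndis_rxpkt() under the network epoch, then ack
 * the RXBUF so that the hypervisor can recycle it.
 */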
7469 static void
7470 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7471     const struct vmbus_chanpkt_hdr *pkthdr)
7472 {
7473 	struct epoch_tracker et;
7474 	const struct vmbus_chanpkt_rxbuf *pkt;
7475 	const struct hn_nvs_hdr *nvs_hdr;
7476 	int count, i, hlen;
7477 
7478 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7479 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7480 		return;
7481 	}
7482 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7483 
7484 	/* Make sure that this is a RNDIS message. */
7485 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7486 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7487 		    nvs_hdr->nvs_type);
7488 		return;
7489 	}
7490 
7491 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7492 	if (__predict_false(hlen < sizeof(*pkt))) {
7493 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7494 		return;
7495 	}
7496 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7497 
7498 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7499 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7500 		    pkt->cp_rxbuf_id);
7501 		return;
7502 	}
7503 
7504 	count = pkt->cp_rxbuf_cnt;
7505 	if (__predict_false(hlen <
7506 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7507 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7508 		return;
7509 	}
7510 
7511 	NET_EPOCH_ENTER(et);
7512 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7513 	for (i = 0; i < count; ++i) {
7514 		int ofs, len;
7515 
7516 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7517 		len = pkt->cp_rxbuf[i].rb_len;
7518 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7519 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
7520 			    "ofs %d, len %d\n", i, ofs, len);
7521 			continue;
7522 		}
7523 
7524 		rxr->rsc.is_last = (i == (count - 1));
7525 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7526 	}
7527 	NET_EPOCH_EXIT(et);
7528 
7529 	/*
7530 	 * Ack the consumed RXBUF associated w/ this channel packet,
7531 	 * so that this RXBUF can be recycled by the hypervisor.
7532 	 */
7533 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7534 }
7535 
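/*
 * Ack a consumed RXBUF back to the host, retrying a limited number of
 * times if the channel bufring is transiently full.
 */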
7536 static void
7537 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7538     uint64_t tid)
7539 {
7540 	struct hn_nvs_rndis_ack ack;
7541 	int retries, error;
7542 
7543 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7544 	ack.nvs_status = HN_NVS_STATUS_OK;
7545 
7546 	retries = 0;
7547 again:
7548 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7549 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7550 	if (__predict_false(error == EAGAIN)) {
7551 		/*
7552 		 * NOTE:
7553 		 * This should _not_ happen in the real world, since the
7554 		 * consumption of the TX bufring from the TX path is
7555 		 * controlled.
7556 		 */
7557 		if (rxr->hn_ack_failed == 0)
7558 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7559 		rxr->hn_ack_failed++;
7560 		retries++;
7561 		if (retries < 10) {
7562 			DELAY(100);
7563 			goto again;
7564 		}
7565 		/* RXBUF leaks! */
7566 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7567 	}
7568 }
7569 
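/*
 * Per-channel receive callback: drain all pending channel packets,
 * growing the packet buffer on ENOBUFS, and dispatch each packet
 * according to its type.
 */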
7570 static void
7571 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7572 {
7573 	struct hn_rx_ring *rxr = xrxr;
7574 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7575 
7576 	for (;;) {
7577 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7578 		int error, pktlen;
7579 
7580 		pktlen = rxr->hn_pktbuf_len;
7581 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7582 		if (__predict_false(error == ENOBUFS)) {
7583 			void *nbuf;
7584 			int nlen;
7585 
7586 			/*
7587 			 * Expand channel packet buffer.
7588 			 *
7589 			 * XXX
7590 			 * Use M_WAITOK here, since allocation failure
7591 			 * is fatal.
7592 			 */
7593 			nlen = rxr->hn_pktbuf_len * 2;
7594 			while (nlen < pktlen)
7595 				nlen *= 2;
7596 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7597 
7598 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7599 			    rxr->hn_pktbuf_len, nlen);
7600 
7601 			free(rxr->hn_pktbuf, M_DEVBUF);
7602 			rxr->hn_pktbuf = nbuf;
7603 			rxr->hn_pktbuf_len = nlen;
7604 			/* Retry! */
7605 			continue;
7606 		} else if (__predict_false(error == EAGAIN)) {
7607 			/* No more channel packets; done! */
7608 			break;
7609 		}
7610 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7611 
7612 		switch (pkt->cph_type) {
7613 		case VMBUS_CHANPKT_TYPE_COMP:
7614 			hn_nvs_handle_comp(sc, chan, pkt);
7615 			break;
7616 
7617 		case VMBUS_CHANPKT_TYPE_RXBUF:
7618 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7619 			break;
7620 
7621 		case VMBUS_CHANPKT_TYPE_INBAND:
7622 			hn_nvs_handle_notify(sc, pkt);
7623 			break;
7624 
7625 		default:
7626 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7627 			    pkt->cph_type);
7628 			break;
7629 		}
7630 	}
7631 	hn_chan_rollup(rxr, rxr->hn_txr);
7632 }
7633 
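/*
 * Module-wide initialization: sanitize the tunables, set up the VF
 * map and, when the global TX taskqueue mode is selected on Hyper-V,
 * create the shared set of TX taskqueues.
 */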
7634 static void
7635 hn_sysinit(void *arg __unused)
7636 {
7637 	int i;
7638 
7639 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7640 
7641 #ifdef HN_IFSTART_SUPPORT
7642 	/*
7643 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7644 	 * mainly due to the IFF_DRV_OACTIVE flag.
7645 	 */
7646 	if (hn_xpnt_vf && hn_use_if_start) {
7647 		hn_use_if_start = 0;
7648 		printf("hn: transparent VF mode, if_transmit will be used "
7649 		    "instead of if_start\n");
7650 	}
7651 #endif
7652 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7653 		printf("hn: invalid transparent VF attach routine "
7654 		    "wait timeout %d, reset to %d\n",
7655 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7656 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7657 	}
7658 
7659 	/*
7660 	 * Initialize VF map.
7661 	 */
7662 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7663 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7664 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7665 	    M_WAITOK | M_ZERO);
7666 
7667 	/*
7668 	 * Fix the # of TX taskqueues.
7669 	 */
7670 	if (hn_tx_taskq_cnt <= 0)
7671 		hn_tx_taskq_cnt = 1;
7672 	else if (hn_tx_taskq_cnt > mp_ncpus)
7673 		hn_tx_taskq_cnt = mp_ncpus;
7674 
7675 	/*
7676 	 * Fix the TX taskqueue mode.
7677 	 */
7678 	switch (hn_tx_taskq_mode) {
7679 	case HN_TX_TASKQ_M_INDEP:
7680 	case HN_TX_TASKQ_M_GLOBAL:
7681 	case HN_TX_TASKQ_M_EVTTQ:
7682 		break;
7683 	default:
7684 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7685 		break;
7686 	}
7687 
7688 	if (vm_guest != VM_GUEST_HV)
7689 		return;
7690 
7691 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7692 		return;
7693 
7694 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7695 	    M_DEVBUF, M_WAITOK);
7696 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7697 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7698 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7699 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7700 		    "hn tx%d", i);
7701 	}
7702 }
7703 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7704 
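/*
 * Module-wide teardown: release the global TX taskqueues, the VF map
 * and its lock, and the UDP checksum fixup counter.
 */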
7705 static void
7706 hn_sysuninit(void *arg __unused)
7707 {
7708 
7709 	if (hn_tx_taskque != NULL) {
7710 		int i;
7711 
7712 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7713 			taskqueue_free(hn_tx_taskque[i]);
7714 		free(hn_tx_taskque, M_DEVBUF);
7715 	}
7716 
7717 	if (hn_vfmap != NULL)
7718 		free(hn_vfmap, M_DEVBUF);
7719 	rm_destroy(&hn_vfmap_lock);
7720 
7721 	counter_u64_free(hn_udpcs_fixup);
7722 }
7723 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7724