xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 #include "opt_hn.h"
57 #include "opt_inet6.h"
58 #include "opt_inet.h"
59 #include "opt_rss.h"
60 
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/bus.h>
64 #include <sys/counter.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/proc.h>
73 #include <sys/rmlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 #include <sys/epoch.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_extern.h>
88 #include <vm/pmap.h>
89 
90 #include <machine/atomic.h>
91 #include <machine/in_cksum.h>
92 
93 #include <net/bpf.h>
94 #include <net/ethernet.h>
95 #include <net/if.h>
96 #include <net/if_dl.h>
97 #include <net/if_media.h>
98 #include <net/if_types.h>
99 #include <net/if_var.h>
100 #include <net/rndis.h>
101 #ifdef RSS
102 #include <net/rss_config.h>
103 #endif
104 
105 #include <netinet/in_systm.h>
106 #include <netinet/in.h>
107 #include <netinet/ip.h>
108 #include <netinet/ip6.h>
109 #include <netinet/tcp.h>
110 #include <netinet/tcp_lro.h>
111 #include <netinet/udp.h>
112 
113 #include <dev/hyperv/include/hyperv.h>
114 #include <dev/hyperv/include/hyperv_busdma.h>
115 #include <dev/hyperv/include/vmbus.h>
116 #include <dev/hyperv/include/vmbus_xact.h>
117 
118 #include <dev/hyperv/netvsc/ndis.h>
119 #include <dev/hyperv/netvsc/if_hnreg.h>
120 #include <dev/hyperv/netvsc/if_hnvar.h>
121 #include <dev/hyperv/netvsc/hn_nvs.h>
122 #include <dev/hyperv/netvsc/hn_rndis.h>
123 
124 #include "vmbus_if.h"
125 
126 #define HN_IFSTART_SUPPORT
127 
128 #define HN_RING_CNT_DEF_MAX		8
129 
130 #define HN_VFMAP_SIZE_DEF		8
131 
132 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
133 
134 /* YYY should get it from the underlying channel */
135 #define HN_TX_DESC_CNT			512
136 
137 #define HN_RNDIS_PKT_LEN					\
138 	(sizeof(struct rndis_packet_msg) +			\
139 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
141 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
142 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
143 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
144 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
145 
146 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
147 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
148 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
149 /* -1 for RNDIS packet message */
150 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
151 
152 #define HN_DIRECT_TX_SIZE_DEF		128
153 
154 #define HN_EARLY_TXEOF_THRESH		8
155 
156 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
157 
158 #define HN_LROENT_CNT_DEF		128
159 
160 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
161 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
162 /* YYY 2*MTU is a bit rough, but should be good enough. */
163 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
164 
165 #define HN_LRO_ACKCNT_DEF		1
166 
167 #define HN_LOCK_INIT(sc)		\
168 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
169 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
170 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
171 #define HN_LOCK(sc)					\
172 do {							\
173 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
174 		/* Relinquish cpu to avoid deadlock */	\
175 		sched_relinquish(curthread);		\
176 		DELAY(1000);				\
177 	}						\
178 } while (0)
179 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
180 
181 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
182 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
183 #define HN_CSUM_IP_HWASSIST(sc)		\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
185 #define HN_CSUM_IP6_HWASSIST(sc)	\
186 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
187 
188 #define HN_PKTSIZE_MIN(align)		\
189 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
190 	    HN_RNDIS_PKT_LEN, (align))
191 #define HN_PKTSIZE(m, align)		\
192 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
193 
194 #ifdef RSS
195 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
196 #else
197 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
198 #endif
199 
200 struct hn_txdesc {
201 #ifndef HN_USE_TXDESC_BUFRING
202 	SLIST_ENTRY(hn_txdesc)		link;
203 #endif
204 	STAILQ_ENTRY(hn_txdesc)		agg_link;
205 
206 	/* Aggregated txdescs, in sending order. */
207 	STAILQ_HEAD(, hn_txdesc)	agg_list;
208 
209 	/* The oldest packet, if transmission aggregation happens. */
210 	struct mbuf			*m;
211 	struct hn_tx_ring		*txr;
212 	int				refs;
213 	uint32_t			flags;	/* HN_TXD_FLAG_ */
214 	struct hn_nvs_sendctx		send_ctx;
215 	uint32_t			chim_index;
216 	int				chim_size;
217 
218 	bus_dmamap_t			data_dmap;
219 
220 	bus_addr_t			rndis_pkt_paddr;
221 	struct rndis_packet_msg		*rndis_pkt;
222 	bus_dmamap_t			rndis_pkt_dmap;
223 };
224 
225 #define HN_TXD_FLAG_ONLIST		0x0001
226 #define HN_TXD_FLAG_DMAMAP		0x0002
227 #define HN_TXD_FLAG_ONAGG		0x0004
228 
229 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
230 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
231 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
232 
233 struct packet_info_id {
234 	uint8_t				ver;
235 	uint8_t				flag;
236 	uint16_t			pkt_id;
237 };
238 
239 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
240 
241 
242 struct hn_rxinfo {
243 	const uint32_t			*vlan_info;
244 	const uint32_t			*csum_info;
245 	const uint32_t			*hash_info;
246 	const uint32_t			*hash_value;
247 	const struct packet_info_id	*pktinfo_id;
248 };
249 
250 struct hn_rxvf_setarg {
251 	struct hn_rx_ring	*rxr;
252 	if_t			vf_ifp;
253 };
254 
255 #define HN_RXINFO_VLAN			0x0001
256 #define HN_RXINFO_CSUM			0x0002
257 #define HN_RXINFO_HASHINF		0x0004
258 #define HN_RXINFO_HASHVAL		0x0008
259 #define HN_RXINFO_PKTINFO_ID		0x0010
260 #define HN_RXINFO_ALL			\
261 	(HN_RXINFO_VLAN |		\
262 	 HN_RXINFO_CSUM |		\
263 	 HN_RXINFO_HASHINF |		\
264 	 HN_RXINFO_HASHVAL |		\
265 	 HN_RXINFO_PKTINFO_ID)
266 
267 static int			hn_probe(device_t);
268 static int			hn_attach(device_t);
269 static int			hn_detach(device_t);
270 static int			hn_shutdown(device_t);
271 static void			hn_chan_callback(struct vmbus_channel *,
272 				    void *);
273 
274 static void			hn_init(void *);
275 static int			hn_ioctl(if_t, u_long, caddr_t);
276 #ifdef HN_IFSTART_SUPPORT
277 static void			hn_start(if_t);
278 #endif
279 static int			hn_transmit(if_t, struct mbuf *);
280 static void			hn_xmit_qflush(if_t);
281 static int			hn_ifmedia_upd(if_t);
282 static void			hn_ifmedia_sts(if_t,
283 				    struct ifmediareq *);
284 
285 static void			hn_ifnet_event(void *, if_t, int);
286 static void			hn_ifaddr_event(void *, if_t);
287 static void			hn_ifnet_attevent(void *, if_t);
288 static void			hn_ifnet_detevent(void *, if_t);
289 static void			hn_ifnet_lnkevent(void *, if_t, int);
290 
291 static bool			hn_ismyvf(const struct hn_softc *,
292 				    const if_t);
293 static void			hn_rxvf_change(struct hn_softc *,
294 				    if_t, bool);
295 static void			hn_rxvf_set(struct hn_softc *, if_t);
296 static void			hn_rxvf_set_task(void *, int);
297 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
298 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
299 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
300 				    struct ifreq *);
301 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
302 static bool			hn_xpnt_vf_isready(struct hn_softc *);
303 static void			hn_xpnt_vf_setready(struct hn_softc *);
304 static void			hn_xpnt_vf_init_taskfunc(void *, int);
305 static void			hn_xpnt_vf_init(struct hn_softc *);
306 static void			hn_xpnt_vf_setenable(struct hn_softc *);
307 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
308 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
309 static void			hn_vf_rss_restore(struct hn_softc *);
310 
311 static int			hn_rndis_rxinfo(const void *, int,
312 				    struct hn_rxinfo *);
313 static void			hn_rndis_rx_data(struct hn_rx_ring *,
314 				    const void *, int);
315 static void			hn_rndis_rx_status(struct hn_softc *,
316 				    const void *, int);
317 static void			hn_rndis_init_fixat(struct hn_softc *, int);
318 
319 static void			hn_nvs_handle_notify(struct hn_softc *,
320 				    const struct vmbus_chanpkt_hdr *);
321 static void			hn_nvs_handle_comp(struct hn_softc *,
322 				    struct vmbus_channel *,
323 				    const struct vmbus_chanpkt_hdr *);
324 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
325 				    struct vmbus_channel *,
326 				    const struct vmbus_chanpkt_hdr *);
327 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
328 				    struct vmbus_channel *, uint64_t);
329 
330 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
342 #ifndef RSS
343 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
345 #endif
346 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
360 
361 static void			hn_stop(struct hn_softc *, bool);
362 static void			hn_init_locked(struct hn_softc *);
363 static int			hn_chan_attach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static void			hn_chan_detach(struct hn_softc *,
366 				    struct vmbus_channel *);
367 static int			hn_attach_subchans(struct hn_softc *);
368 static void			hn_detach_allchans(struct hn_softc *);
369 static void			hn_chan_rollup(struct hn_rx_ring *,
370 				    struct hn_tx_ring *);
371 static void			hn_set_ring_inuse(struct hn_softc *, int);
372 static int			hn_synth_attach(struct hn_softc *, int);
373 static void			hn_synth_detach(struct hn_softc *);
374 static int			hn_synth_alloc_subchans(struct hn_softc *,
375 				    int *);
376 static bool			hn_synth_attachable(const struct hn_softc *);
377 static void			hn_suspend(struct hn_softc *);
378 static void			hn_suspend_data(struct hn_softc *);
379 static void			hn_suspend_mgmt(struct hn_softc *);
380 static void			hn_resume(struct hn_softc *);
381 static void			hn_resume_data(struct hn_softc *);
382 static void			hn_resume_mgmt(struct hn_softc *);
383 static void			hn_suspend_mgmt_taskfunc(void *, int);
384 static void			hn_chan_drain(struct hn_softc *,
385 				    struct vmbus_channel *);
386 static void			hn_disable_rx(struct hn_softc *);
387 static void			hn_drain_rxtx(struct hn_softc *, int);
388 static void			hn_polling(struct hn_softc *, u_int);
389 static void			hn_chan_polling(struct vmbus_channel *, u_int);
390 static void			hn_mtu_change_fixup(struct hn_softc *);
391 
392 static void			hn_update_link_status(struct hn_softc *);
393 static void			hn_change_network(struct hn_softc *);
394 static void			hn_link_taskfunc(void *, int);
395 static void			hn_netchg_init_taskfunc(void *, int);
396 static void			hn_netchg_status_taskfunc(void *, int);
397 static void			hn_link_status(struct hn_softc *);
398 
399 static int			hn_create_rx_data(struct hn_softc *, int);
400 static void			hn_destroy_rx_data(struct hn_softc *);
401 static int			hn_check_iplen(const struct mbuf *, int);
402 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
403 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
404 static int			hn_rxfilter_config(struct hn_softc *);
405 static int			hn_rss_reconfig(struct hn_softc *);
406 static void			hn_rss_ind_fixup(struct hn_softc *);
407 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
408 static int			hn_rxpkt(struct hn_rx_ring *);
409 static uint32_t			hn_rss_type_fromndis(uint32_t);
410 static uint32_t			hn_rss_type_tondis(uint32_t);
411 
412 static int			hn_tx_ring_create(struct hn_softc *, int);
413 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
414 static int			hn_create_tx_data(struct hn_softc *, int);
415 static void			hn_fixup_tx_data(struct hn_softc *);
416 static void			hn_fixup_rx_data(struct hn_softc *);
417 static void			hn_destroy_tx_data(struct hn_softc *);
418 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
419 static void			hn_txdesc_gc(struct hn_tx_ring *,
420 				    struct hn_txdesc *);
421 static int			hn_encap(if_t, struct hn_tx_ring *,
422 				    struct hn_txdesc *, struct mbuf **);
423 static int			hn_txpkt(if_t, struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static void			hn_set_chim_size(struct hn_softc *, int);
426 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
427 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
428 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
429 static void			hn_resume_tx(struct hn_softc *, int);
430 static void			hn_set_txagg(struct hn_softc *);
431 static void			*hn_try_txagg(if_t,
432 				    struct hn_tx_ring *, struct hn_txdesc *,
433 				    int);
434 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
435 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
436 				    struct hn_softc *, struct vmbus_channel *,
437 				    const void *, int);
438 static int			hn_txpkt_sglist(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_txpkt_chim(struct hn_tx_ring *,
441 				    struct hn_txdesc *);
442 static int			hn_xmit(struct hn_tx_ring *, int);
443 static void			hn_xmit_taskfunc(void *, int);
444 static void			hn_xmit_txeof(struct hn_tx_ring *);
445 static void			hn_xmit_txeof_taskfunc(void *, int);
446 #ifdef HN_IFSTART_SUPPORT
447 static int			hn_start_locked(struct hn_tx_ring *, int);
448 static void			hn_start_taskfunc(void *, int);
449 static void			hn_start_txeof(struct hn_tx_ring *);
450 static void			hn_start_txeof_taskfunc(void *, int);
451 #endif
452 
453 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
454 
455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
456     "Hyper-V network interface");
457 
458 /* Trust tcp segment verification on host side. */
459 static int			hn_trust_hosttcp = 1;
460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
461     &hn_trust_hosttcp, 0,
462     "Trust tcp segment verification on host side, "
463     "when csum info is missing (global setting)");
464 
465 /* Trust udp datagrams verification on host side. */
466 static int			hn_trust_hostudp = 1;
467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
468     &hn_trust_hostudp, 0,
469     "Trust udp datagram verification on host side, "
470     "when csum info is missing (global setting)");
471 
472 /* Trust ip packets verification on host side. */
473 static int			hn_trust_hostip = 1;
474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
475     &hn_trust_hostip, 0,
476     "Trust ip packet verification on host side, "
477     "when csum info is missing (global setting)");
478 
479 /*
480  * Offload UDP/IPv4 checksum.
481  */
482 static int			hn_enable_udp4cs = 1;
483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
484     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
485 
486 /*
487  * Offload UDP/IPv6 checksum.
488  */
489 static int			hn_enable_udp6cs = 1;
490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
491     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
492 
493 /* Stats. */
494 static counter_u64_t		hn_udpcs_fixup;
495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
496     &hn_udpcs_fixup, "# of UDP checksum fixup");
497 
498 /*
499  * See hn_set_hlen().
500  *
501  * This value is for Azure.  For Hyper-V, set this above
502  * 65536 to disable UDP datagram checksum fixup.
503  */
504 static int			hn_udpcs_fixup_mtu = 1420;
505 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
506     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
507 
508 /* Limit TSO burst size */
509 static int			hn_tso_maxlen = IP_MAXPACKET;
510 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
511     &hn_tso_maxlen, 0, "TSO burst limit");
512 
513 /* Limit chimney send size */
514 static int			hn_tx_chimney_size = 0;
515 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
516     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
517 
518 /* Limit the size of packet for direct transmission */
519 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
520 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
521     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
522 
523 /* # of LRO entries per RX ring */
524 #if defined(INET) || defined(INET6)
525 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
526 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
527     &hn_lro_entry_count, 0, "LRO entry count");
528 #endif
529 
530 static int			hn_tx_taskq_cnt = 1;
531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
532     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
533 
534 #define HN_TX_TASKQ_M_INDEP	0
535 #define HN_TX_TASKQ_M_GLOBAL	1
536 #define HN_TX_TASKQ_M_EVTTQ	2
537 
538 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
539 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
540     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
541     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
542 
543 #ifndef HN_USE_TXDESC_BUFRING
544 static int			hn_use_txdesc_bufring = 0;
545 #else
546 static int			hn_use_txdesc_bufring = 1;
547 #endif
548 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
549     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
550 
551 #ifdef HN_IFSTART_SUPPORT
552 /* Use ifnet.if_start instead of ifnet.if_transmit */
553 static int			hn_use_if_start = 0;
554 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
555     &hn_use_if_start, 0, "Use if_start TX method");
556 #endif
557 
558 /* # of channels to use */
559 static int			hn_chan_cnt = 0;
560 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
561     &hn_chan_cnt, 0,
562     "# of channels to use; each channel has one RX ring and one TX ring");
563 
564 /* # of transmit rings to use */
565 static int			hn_tx_ring_cnt = 0;
566 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
567     &hn_tx_ring_cnt, 0, "# of TX rings to use");
568 
569 /* Software TX ring deptch */
570 static int			hn_tx_swq_depth = 0;
571 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
572     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
573 
574 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
575 static u_int			hn_lro_mbufq_depth = 0;
576 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
577     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
578 
579 /* Packet transmission aggregation size limit */
580 static int			hn_tx_agg_size = -1;
581 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
582     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
583 
584 /* Packet transmission aggregation count limit */
585 static int			hn_tx_agg_pkts = -1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
587     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
588 
589 /* VF list */
590 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
591     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
592     hn_vflist_sysctl, "A",
593     "VF list");
594 
595 /* VF mapping */
596 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
597     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
598     hn_vfmap_sysctl, "A",
599     "VF mapping");
600 
601 /* Transparent VF */
602 static int			hn_xpnt_vf = 1;
603 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
604     &hn_xpnt_vf, 0, "Transparent VF mod");
605 
606 /* Accurate BPF support for Transparent VF */
607 static int			hn_xpnt_vf_accbpf = 0;
608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
609     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
610 
611 /* Extra wait for transparent VF attach routing; unit seconds. */
612 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
614     &hn_xpnt_vf_attwait, 0,
615     "Extra wait for transparent VF attach routing; unit: seconds");
616 
617 static u_int			hn_cpu_index;	/* next CPU for channel */
618 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
619 
620 static struct rmlock		hn_vfmap_lock;
621 static int			hn_vfmap_size;
622 static if_t			*hn_vfmap;
623 
624 #ifndef RSS
625 static const uint8_t
626 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
627 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
628 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
629 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
630 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
631 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
632 };
633 #endif	/* !RSS */
634 
635 static const struct hyperv_guid	hn_guid = {
636 	.hv_guid = {
637 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
638 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
639 };
640 
641 static device_method_t hn_methods[] = {
642 	/* Device interface */
643 	DEVMETHOD(device_probe,		hn_probe),
644 	DEVMETHOD(device_attach,	hn_attach),
645 	DEVMETHOD(device_detach,	hn_detach),
646 	DEVMETHOD(device_shutdown,	hn_shutdown),
647 	DEVMETHOD_END
648 };
649 
650 static driver_t hn_driver = {
651 	"hn",
652 	hn_methods,
653 	sizeof(struct hn_softc)
654 };
655 
656 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
657 MODULE_VERSION(hn, 1);
658 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
659 
660 static void
661 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
662 {
663 	int i;
664 
665 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
666 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
667 }
668 
669 static int
670 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
671 {
672 
673 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
675 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
676 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
677 }
678 
679 static int
680 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 	struct hn_nvs_rndis rndis;
683 
684 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
685 	    txd->chim_size > 0, ("invalid rndis chim txd"));
686 
687 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
688 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
689 	rndis.nvs_chim_idx = txd->chim_index;
690 	rndis.nvs_chim_sz = txd->chim_size;
691 
692 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
693 	    &rndis, sizeof(rndis), &txd->send_ctx));
694 }
695 
696 static __inline uint32_t
697 hn_chim_alloc(struct hn_softc *sc)
698 {
699 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
700 	u_long *bmap = sc->hn_chim_bmap;
701 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
702 
703 	for (i = 0; i < bmap_cnt; ++i) {
704 		int idx;
705 
706 		idx = ffsl(~bmap[i]);
707 		if (idx == 0)
708 			continue;
709 
710 		--idx; /* ffsl is 1-based */
711 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
712 		    ("invalid i %d and idx %d", i, idx));
713 
714 		if (atomic_testandset_long(&bmap[i], idx))
715 			continue;
716 
717 		ret = i * LONG_BIT + idx;
718 		break;
719 	}
720 	return (ret);
721 }
722 
723 static __inline void
724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
725 {
726 	u_long mask;
727 	uint32_t idx;
728 
729 	idx = chim_idx / LONG_BIT;
730 	KASSERT(idx < sc->hn_chim_bmap_cnt,
731 	    ("invalid chimney index 0x%x", chim_idx));
732 
733 	mask = 1UL << (chim_idx % LONG_BIT);
734 	KASSERT(sc->hn_chim_bmap[idx] & mask,
735 	    ("index bitmap 0x%lx, chimney index %u, "
736 	     "bitmap idx %d, bitmask 0x%lx",
737 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
738 
739 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
740 }
741 
742 #if defined(INET6) || defined(INET)
743 
744 #define PULLUP_HDR(m, len)				\
745 do {							\
746 	if (__predict_false((m)->m_len < (len))) {	\
747 		(m) = m_pullup((m), (len));		\
748 		if ((m) == NULL)			\
749 			return (NULL);			\
750 	}						\
751 } while (0)
752 
753 /*
754  * NOTE: If this function failed, the m_head would be freed.
755  */
756 static __inline struct mbuf *
757 hn_tso_fixup(struct mbuf *m_head)
758 {
759 	struct ether_vlan_header *evl;
760 	struct tcphdr *th;
761 	int ehlen;
762 
763 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
764 
765 	PULLUP_HDR(m_head, sizeof(*evl));
766 	evl = mtod(m_head, struct ether_vlan_header *);
767 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
768 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
769 	else
770 		ehlen = ETHER_HDR_LEN;
771 	m_head->m_pkthdr.l2hlen = ehlen;
772 
773 #ifdef INET
774 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
775 		struct ip *ip;
776 		int iphlen;
777 
778 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
779 		ip = mtodo(m_head, ehlen);
780 		iphlen = ip->ip_hl << 2;
781 		m_head->m_pkthdr.l3hlen = iphlen;
782 
783 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
784 		th = mtodo(m_head, ehlen + iphlen);
785 
786 		ip->ip_len = 0;
787 		ip->ip_sum = 0;
788 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
789 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
790 	}
791 #endif
792 #if defined(INET6) && defined(INET)
793 	else
794 #endif
795 #ifdef INET6
796 	{
797 		struct ip6_hdr *ip6;
798 
799 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
800 		ip6 = mtodo(m_head, ehlen);
801 		if (ip6->ip6_nxt != IPPROTO_TCP) {
802 			m_freem(m_head);
803 			return (NULL);
804 		}
805 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
806 
807 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
808 		th = mtodo(m_head, ehlen + sizeof(*ip6));
809 
810 		ip6->ip6_plen = 0;
811 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
812 	}
813 #endif
814 	return (m_head);
815 }
816 
817 /*
818  * NOTE: If this function failed, the m_head would be freed.
819  */
820 static __inline struct mbuf *
821 hn_set_hlen(struct mbuf *m_head)
822 {
823 	const struct ether_vlan_header *evl;
824 	int ehlen;
825 
826 	PULLUP_HDR(m_head, sizeof(*evl));
827 	evl = mtod(m_head, const struct ether_vlan_header *);
828 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
829 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
830 	else
831 		ehlen = ETHER_HDR_LEN;
832 	m_head->m_pkthdr.l2hlen = ehlen;
833 
834 #ifdef INET
835 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
836 		const struct ip *ip;
837 		int iphlen;
838 
839 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
840 		ip = mtodo(m_head, ehlen);
841 		iphlen = ip->ip_hl << 2;
842 		m_head->m_pkthdr.l3hlen = iphlen;
843 
844 		/*
845 		 * UDP checksum offload does not work in Azure, if the
846 		 * following conditions meet:
847 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
848 		 * - IP_DF is not set in the IP hdr.
849 		 *
850 		 * Fallback to software checksum for these UDP datagrams.
851 		 */
852 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
853 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
854 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
855 			uint16_t off = ehlen + iphlen;
856 
857 			counter_u64_add(hn_udpcs_fixup, 1);
858 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
859 			*(uint16_t *)(m_head->m_data + off +
860                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
861 			    m_head, m_head->m_pkthdr.len, off);
862 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
863 		}
864 	}
865 #endif
866 #if defined(INET6) && defined(INET)
867 	else
868 #endif
869 #ifdef INET6
870 	{
871 		const struct ip6_hdr *ip6;
872 
873 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
874 		ip6 = mtodo(m_head, ehlen);
875 		if (ip6->ip6_nxt != IPPROTO_TCP &&
876 		    ip6->ip6_nxt != IPPROTO_UDP) {
877 			m_freem(m_head);
878 			return (NULL);
879 		}
880 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
881 	}
882 #endif
883 	return (m_head);
884 }
885 
886 /*
887  * NOTE: If this function failed, the m_head would be freed.
888  */
889 static __inline struct mbuf *
890 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
891 {
892 	const struct tcphdr *th;
893 	int ehlen, iphlen;
894 
895 	*tcpsyn = 0;
896 	ehlen = m_head->m_pkthdr.l2hlen;
897 	iphlen = m_head->m_pkthdr.l3hlen;
898 
899 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
900 	th = mtodo(m_head, ehlen + iphlen);
901 	if (th->th_flags & TH_SYN)
902 		*tcpsyn = 1;
903 	return (m_head);
904 }
905 
906 #undef PULLUP_HDR
907 
908 #endif	/* INET6 || INET */
909 
910 static int
911 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
912 {
913 	int error = 0;
914 
915 	HN_LOCK_ASSERT(sc);
916 
917 	if (sc->hn_rx_filter != filter) {
918 		error = hn_rndis_set_rxfilter(sc, filter);
919 		if (!error)
920 			sc->hn_rx_filter = filter;
921 	}
922 	return (error);
923 }
924 
925 static int
926 hn_rxfilter_config(struct hn_softc *sc)
927 {
928 	if_t ifp = sc->hn_ifp;
929 	uint32_t filter;
930 
931 	HN_LOCK_ASSERT(sc);
932 
933 	/*
934 	 * If the non-transparent mode VF is activated, we don't know how
935 	 * its RX filter is configured, so stick the synthetic device in
936 	 * the promiscous mode.
937 	 */
938 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
939 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
940 	} else {
941 		filter = NDIS_PACKET_TYPE_DIRECTED;
942 		if (if_getflags(ifp) & IFF_BROADCAST)
943 			filter |= NDIS_PACKET_TYPE_BROADCAST;
944 		/* TODO: support multicast list */
945 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
946 		    !if_maddr_empty(ifp))
947 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
948 	}
949 	return (hn_set_rxfilter(sc, filter));
950 }
951 
952 static void
953 hn_set_txagg(struct hn_softc *sc)
954 {
955 	uint32_t size, pkts;
956 	int i;
957 
958 	/*
959 	 * Setup aggregation size.
960 	 */
961 	if (sc->hn_agg_size < 0)
962 		size = UINT32_MAX;
963 	else
964 		size = sc->hn_agg_size;
965 
966 	if (sc->hn_rndis_agg_size < size)
967 		size = sc->hn_rndis_agg_size;
968 
969 	/* NOTE: We only aggregate packets using chimney sending buffers. */
970 	if (size > (uint32_t)sc->hn_chim_szmax)
971 		size = sc->hn_chim_szmax;
972 
973 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
974 		/* Disable */
975 		size = 0;
976 		pkts = 0;
977 		goto done;
978 	}
979 
980 	/* NOTE: Type of the per TX ring setting is 'int'. */
981 	if (size > INT_MAX)
982 		size = INT_MAX;
983 
984 	/*
985 	 * Setup aggregation packet count.
986 	 */
987 	if (sc->hn_agg_pkts < 0)
988 		pkts = UINT32_MAX;
989 	else
990 		pkts = sc->hn_agg_pkts;
991 
992 	if (sc->hn_rndis_agg_pkts < pkts)
993 		pkts = sc->hn_rndis_agg_pkts;
994 
995 	if (pkts <= 1) {
996 		/* Disable */
997 		size = 0;
998 		pkts = 0;
999 		goto done;
1000 	}
1001 
1002 	/* NOTE: Type of the per TX ring setting is 'short'. */
1003 	if (pkts > SHRT_MAX)
1004 		pkts = SHRT_MAX;
1005 
1006 done:
1007 	/* NOTE: Type of the per TX ring setting is 'short'. */
1008 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1009 		/* Disable */
1010 		size = 0;
1011 		pkts = 0;
1012 	}
1013 
1014 	if (bootverbose) {
1015 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1016 		    size, pkts, sc->hn_rndis_agg_align);
1017 	}
1018 
1019 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1020 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1021 
1022 		mtx_lock(&txr->hn_tx_lock);
1023 		txr->hn_agg_szmax = size;
1024 		txr->hn_agg_pktmax = pkts;
1025 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1026 		mtx_unlock(&txr->hn_tx_lock);
1027 	}
1028 }
1029 
1030 static int
1031 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1032 {
1033 
1034 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1035 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1036 		return txr->hn_txdesc_cnt;
1037 	return hn_tx_swq_depth;
1038 }
1039 
1040 static int
1041 hn_rss_reconfig(struct hn_softc *sc)
1042 {
1043 	int error;
1044 
1045 	HN_LOCK_ASSERT(sc);
1046 
1047 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1048 		return (ENXIO);
1049 
1050 	/*
1051 	 * Disable RSS first.
1052 	 *
1053 	 * NOTE:
1054 	 * Direct reconfiguration by setting the UNCHG flags does
1055 	 * _not_ work properly.
1056 	 */
1057 	if (bootverbose)
1058 		if_printf(sc->hn_ifp, "disable RSS\n");
1059 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1060 	if (error) {
1061 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1062 		return (error);
1063 	}
1064 
1065 	/*
1066 	 * Reenable the RSS w/ the updated RSS key or indirect
1067 	 * table.
1068 	 */
1069 	if (bootverbose)
1070 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1071 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1072 	if (error) {
1073 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1074 		return (error);
1075 	}
1076 	return (0);
1077 }
1078 
1079 static void
1080 hn_rss_ind_fixup(struct hn_softc *sc)
1081 {
1082 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1083 	int i, nchan;
1084 
1085 	nchan = sc->hn_rx_ring_inuse;
1086 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1087 
1088 	/*
1089 	 * Check indirect table to make sure that all channels in it
1090 	 * can be used.
1091 	 */
1092 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1093 		if (rss->rss_ind[i] >= nchan) {
1094 			if_printf(sc->hn_ifp,
1095 			    "RSS indirect table %d fixup: %u -> %d\n",
1096 			    i, rss->rss_ind[i], nchan - 1);
1097 			rss->rss_ind[i] = nchan - 1;
1098 		}
1099 	}
1100 }
1101 
1102 static int
1103 hn_ifmedia_upd(if_t ifp __unused)
1104 {
1105 
1106 	return EOPNOTSUPP;
1107 }
1108 
1109 static void
1110 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1111 {
1112 	struct hn_softc *sc = if_getsoftc(ifp);
1113 
1114 	ifmr->ifm_status = IFM_AVALID;
1115 	ifmr->ifm_active = IFM_ETHER;
1116 
1117 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1118 		ifmr->ifm_active |= IFM_NONE;
1119 		return;
1120 	}
1121 	ifmr->ifm_status |= IFM_ACTIVE;
1122 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1123 }
1124 
1125 static void
1126 hn_rxvf_set_task(void *xarg, int pending __unused)
1127 {
1128 	struct hn_rxvf_setarg *arg = xarg;
1129 
1130 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1131 }
1132 
1133 static void
1134 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1135 {
1136 	struct hn_rx_ring *rxr;
1137 	struct hn_rxvf_setarg arg;
1138 	struct task task;
1139 	int i;
1140 
1141 	HN_LOCK_ASSERT(sc);
1142 
1143 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1144 
1145 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1146 		rxr = &sc->hn_rx_ring[i];
1147 
1148 		if (i < sc->hn_rx_ring_inuse) {
1149 			arg.rxr = rxr;
1150 			arg.vf_ifp = vf_ifp;
1151 			vmbus_chan_run_task(rxr->hn_chan, &task);
1152 		} else {
1153 			rxr->hn_rxvf_ifp = vf_ifp;
1154 		}
1155 	}
1156 }
1157 
1158 static bool
1159 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1160 {
1161 	if_t hn_ifp;
1162 
1163 	hn_ifp = sc->hn_ifp;
1164 
1165 	if (ifp == hn_ifp)
1166 		return (false);
1167 
1168 	if (if_getalloctype(ifp) != IFT_ETHER)
1169 		return (false);
1170 
1171 	/* Ignore lagg/vlan interfaces */
1172 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1173 	    strcmp(if_getdname(ifp), "vlan") == 0)
1174 		return (false);
1175 
1176 	/*
1177 	 * During detach events if_getifaddr(ifp) might be NULL.
1178 	 * Make sure the bcmp() below doesn't panic on that:
1179 	 */
1180 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1181 		return (false);
1182 
1183 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1184 		return (false);
1185 
1186 	return (true);
1187 }
1188 
1189 static void
1190 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1191 {
1192 	if_t hn_ifp;
1193 
1194 	HN_LOCK(sc);
1195 
1196 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1197 		goto out;
1198 
1199 	if (!hn_ismyvf(sc, ifp))
1200 		goto out;
1201 	hn_ifp = sc->hn_ifp;
1202 
1203 	if (rxvf) {
1204 		if (sc->hn_flags & HN_FLAG_RXVF)
1205 			goto out;
1206 
1207 		sc->hn_flags |= HN_FLAG_RXVF;
1208 		hn_rxfilter_config(sc);
1209 	} else {
1210 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1211 			goto out;
1212 
1213 		sc->hn_flags &= ~HN_FLAG_RXVF;
1214 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1215 			hn_rxfilter_config(sc);
1216 		else
1217 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1218 	}
1219 
1220 	hn_nvs_set_datapath(sc,
1221 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1222 
1223 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1224 
1225 	if (rxvf) {
1226 		hn_vf_rss_fixup(sc, true);
1227 		hn_suspend_mgmt(sc);
1228 		sc->hn_link_flags &=
1229 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1230 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1231 	} else {
1232 		hn_vf_rss_restore(sc);
1233 		hn_resume_mgmt(sc);
1234 	}
1235 
1236 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1237 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1238 
1239 	if (bootverbose) {
1240 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1241 		    rxvf ? "to" : "from", if_name(ifp));
1242 	}
1243 out:
1244 	HN_UNLOCK(sc);
1245 }
1246 
1247 static void
1248 hn_ifnet_event(void *arg, if_t ifp, int event)
1249 {
1250 
1251 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1252 		return;
1253 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1254 }
1255 
1256 static void
1257 hn_ifaddr_event(void *arg, if_t ifp)
1258 {
1259 
1260 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1261 }
1262 
1263 static int
1264 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1265 {
1266 	if_t ifp, vf_ifp;
1267 	uint64_t tmp;
1268 	int error;
1269 
1270 	HN_LOCK_ASSERT(sc);
1271 	ifp = sc->hn_ifp;
1272 	vf_ifp = sc->hn_vf_ifp;
1273 
1274 	/*
1275 	 * Fix up requested capabilities w/ supported capabilities,
1276 	 * since the supported capabilities could have been changed.
1277 	 */
1278 	ifr->ifr_reqcap &= if_getcapabilities(ifp);
1279 	/* Pass SIOCSIFCAP to VF. */
1280 	error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread);
1281 
1282 	/*
1283 	 * NOTE:
1284 	 * The error will be propagated to the callers, however, it
1285 	 * is _not_ useful here.
1286 	 */
1287 
1288 	/*
1289 	 * Merge VF's enabled capabilities.
1290 	 */
1291 	if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp));
1292 
1293 	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc);
1294 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
1295 		if_sethwassistbits(ifp, tmp, 0);
1296 	else
1297 		if_sethwassistbits(ifp, 0, tmp);
1298 
1299 	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc);
1300 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
1301 		if_sethwassistbits(ifp, tmp, 0);
1302 	else
1303 		if_sethwassistbits(ifp, 0, tmp);
1304 
1305 	tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO;
1306 	if (if_getcapenable(ifp) & IFCAP_TSO4)
1307 		if_sethwassistbits(ifp, tmp, 0);
1308 	else
1309 		if_sethwassistbits(ifp, 0, tmp);
1310 
1311 	tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO;
1312 	if (if_getcapenable(ifp) & IFCAP_TSO6)
1313 		if_sethwassistbits(ifp, tmp, 0);
1314 	else
1315 		if_sethwassistbits(ifp, 0, tmp);
1316 
1317 	return (error);
1318 }
1319 
1320 static int
1321 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1322 {
1323 	if_t vf_ifp;
1324 	struct ifreq ifr;
1325 
1326 	HN_LOCK_ASSERT(sc);
1327 	vf_ifp = sc->hn_vf_ifp;
1328 
1329 	memset(&ifr, 0, sizeof(ifr));
1330 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1331 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1332 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1333 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1334 }
1335 
1336 static void
1337 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1338 {
1339 	if_t ifp = sc->hn_ifp;
1340 	int allmulti = 0;
1341 
1342 	HN_LOCK_ASSERT(sc);
1343 
1344 	/* XXX vlan(4) style mcast addr maintenance */
1345 	if (!if_maddr_empty(ifp))
1346 		allmulti = IFF_ALLMULTI;
1347 
1348 	/* Always set the VF's if_flags */
1349 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1350 }
1351 
1352 static void
1353 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1354 {
1355 	struct rm_priotracker pt;
1356 	if_t hn_ifp = NULL;
1357 	struct mbuf *mn;
1358 
1359 	/*
1360 	 * XXX racy, if hn(4) ever detached.
1361 	 */
1362 	rm_rlock(&hn_vfmap_lock, &pt);
1363 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1364 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1365 	rm_runlock(&hn_vfmap_lock, &pt);
1366 
1367 	if (hn_ifp != NULL) {
1368 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1369 			/*
1370 			 * Allow tapping on the VF.
1371 			 */
1372 			ETHER_BPF_MTAP(vf_ifp, mn);
1373 
1374 			/*
1375 			 * Update VF stats.
1376 			 */
1377 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1378 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1379 				    mn->m_pkthdr.len);
1380 			}
1381 			/*
1382 			 * XXX IFCOUNTER_IMCAST
1383 			 * This stat updating is kinda invasive, since it
1384 			 * requires two checks on the mbuf: the length check
1385 			 * and the ethernet header check.  As of this write,
1386 			 * all multicast packets go directly to hn(4), which
1387 			 * makes imcast stat updating in the VF a try in vian.
1388 			 */
1389 
1390 			/*
1391 			 * Fix up rcvif and increase hn(4)'s ipackets.
1392 			 */
1393 			mn->m_pkthdr.rcvif = hn_ifp;
1394 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1395 		}
1396 		/*
1397 		 * Go through hn(4)'s if_input.
1398 		 */
1399 		if_input(hn_ifp, m);
1400 	} else {
1401 		/*
1402 		 * In the middle of the transition; free this
1403 		 * mbuf chain.
1404 		 */
1405 		while (m != NULL) {
1406 			mn = m->m_nextpkt;
1407 			m->m_nextpkt = NULL;
1408 			m_freem(m);
1409 			m = mn;
1410 		}
1411 	}
1412 }
1413 
1414 static void
1415 hn_mtu_change_fixup(struct hn_softc *sc)
1416 {
1417 	if_t ifp;
1418 
1419 	HN_LOCK_ASSERT(sc);
1420 	ifp = sc->hn_ifp;
1421 
1422 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1423 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1424 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1425 }
1426 
1427 static uint32_t
1428 hn_rss_type_fromndis(uint32_t rss_hash)
1429 {
1430 	uint32_t types = 0;
1431 
1432 	if (rss_hash & NDIS_HASH_IPV4)
1433 		types |= RSS_TYPE_IPV4;
1434 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1435 		types |= RSS_TYPE_TCP_IPV4;
1436 	if (rss_hash & NDIS_HASH_IPV6)
1437 		types |= RSS_TYPE_IPV6;
1438 	if (rss_hash & NDIS_HASH_IPV6_EX)
1439 		types |= RSS_TYPE_IPV6_EX;
1440 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1441 		types |= RSS_TYPE_TCP_IPV6;
1442 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1443 		types |= RSS_TYPE_TCP_IPV6_EX;
1444 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1445 		types |= RSS_TYPE_UDP_IPV4;
1446 	return (types);
1447 }
1448 
1449 static uint32_t
1450 hn_rss_type_tondis(uint32_t types)
1451 {
1452 	uint32_t rss_hash = 0;
1453 
1454 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1455 	    ("UDP6 and UDP6EX are not supported"));
1456 
1457 	if (types & RSS_TYPE_IPV4)
1458 		rss_hash |= NDIS_HASH_IPV4;
1459 	if (types & RSS_TYPE_TCP_IPV4)
1460 		rss_hash |= NDIS_HASH_TCP_IPV4;
1461 	if (types & RSS_TYPE_IPV6)
1462 		rss_hash |= NDIS_HASH_IPV6;
1463 	if (types & RSS_TYPE_IPV6_EX)
1464 		rss_hash |= NDIS_HASH_IPV6_EX;
1465 	if (types & RSS_TYPE_TCP_IPV6)
1466 		rss_hash |= NDIS_HASH_TCP_IPV6;
1467 	if (types & RSS_TYPE_TCP_IPV6_EX)
1468 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1469 	if (types & RSS_TYPE_UDP_IPV4)
1470 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1471 	return (rss_hash);
1472 }
1473 
1474 static void
1475 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1476 {
1477 	int i;
1478 
1479 	HN_LOCK_ASSERT(sc);
1480 
1481 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1482 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1483 }
1484 
1485 static void
1486 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1487 {
1488 	if_t ifp, vf_ifp;
1489 	struct ifrsshash ifrh;
1490 	struct ifrsskey ifrk;
1491 	int error;
1492 	uint32_t my_types, diff_types, mbuf_types = 0;
1493 
1494 	HN_LOCK_ASSERT(sc);
1495 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1496 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1497 
1498 	if (sc->hn_rx_ring_inuse == 1) {
1499 		/* No RSS on synthetic parts; done. */
1500 		return;
1501 	}
1502 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1503 		/* Synthetic parts do not support Toeplitz; done. */
1504 		return;
1505 	}
1506 
1507 	ifp = sc->hn_ifp;
1508 	vf_ifp = sc->hn_vf_ifp;
1509 
1510 	/*
1511 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1512 	 * supported.
1513 	 */
1514 	memset(&ifrk, 0, sizeof(ifrk));
1515 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1516 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1517 	if (error) {
1518 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1519 		    if_name(vf_ifp), error);
1520 		goto done;
1521 	}
1522 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1523 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1524 		    if_name(vf_ifp), ifrk.ifrk_func);
1525 		goto done;
1526 	}
1527 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1528 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1529 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1530 		goto done;
1531 	}
1532 
1533 	/*
1534 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1535 	 */
1536 	memset(&ifrh, 0, sizeof(ifrh));
1537 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1538 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1539 	if (error) {
1540 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1541 		    if_name(vf_ifp), error);
1542 		goto done;
1543 	}
1544 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1545 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1546 		    if_name(vf_ifp), ifrh.ifrh_func);
1547 		goto done;
1548 	}
1549 
1550 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1551 	if ((ifrh.ifrh_types & my_types) == 0) {
1552 		/* This disables RSS; ignore it then */
1553 		if_printf(ifp, "%s intersection of RSS types failed.  "
1554 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1555 		    ifrh.ifrh_types, my_types);
1556 		goto done;
1557 	}
1558 
1559 	diff_types = my_types ^ ifrh.ifrh_types;
1560 	my_types &= ifrh.ifrh_types;
1561 	mbuf_types = my_types;
1562 
1563 	/*
1564 	 * Detect RSS hash value/type confliction.
1565 	 *
1566 	 * NOTE:
1567 	 * We don't disable the hash type, but stop delivery the hash
1568 	 * value/type through mbufs on RX path.
1569 	 *
1570 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1571 	 * hash is delivered with type of TCP_IPV4.  This means if
1572 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1573 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1574 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1575 	 * here.
1576 	 */
1577 	if ((my_types & RSS_TYPE_IPV4) &&
1578 	    (diff_types & ifrh.ifrh_types &
1579 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1580 		/* Conflict; disable IPV4 hash type/value delivery. */
1581 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1582 		mbuf_types &= ~RSS_TYPE_IPV4;
1583 	}
1584 	if ((my_types & RSS_TYPE_IPV6) &&
1585 	    (diff_types & ifrh.ifrh_types &
1586 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1587 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1588 	      RSS_TYPE_IPV6_EX))) {
1589 		/* Conflict; disable IPV6 hash type/value delivery. */
1590 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1591 		mbuf_types &= ~RSS_TYPE_IPV6;
1592 	}
1593 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1594 	    (diff_types & ifrh.ifrh_types &
1595 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1596 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1597 	      RSS_TYPE_IPV6))) {
1598 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1599 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1600 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1601 	}
1602 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1603 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1604 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1605 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1606 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1607 	}
1608 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1609 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1610 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1611 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1612 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1613 	}
1614 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1615 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1616 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1617 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1618 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1619 	}
1620 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1621 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1622 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1623 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1624 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1625 	}
1626 
1627 	/*
1628 	 * Indirect table does not matter.
1629 	 */
1630 
1631 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1632 	    hn_rss_type_tondis(my_types);
1633 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1634 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1635 
1636 	if (reconf) {
1637 		error = hn_rss_reconfig(sc);
1638 		if (error) {
1639 			/* XXX roll-back? */
1640 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1641 			/* XXX keep going. */
1642 		}
1643 	}
1644 done:
1645 	/* Hash deliverability for mbufs. */
1646 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1647 }
1648 
1649 static void
1650 hn_vf_rss_restore(struct hn_softc *sc)
1651 {
1652 
1653 	HN_LOCK_ASSERT(sc);
1654 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1655 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1656 
1657 	if (sc->hn_rx_ring_inuse == 1)
1658 		goto done;
1659 
1660 	/*
1661 	 * Restore hash types.  Key does _not_ matter.
1662 	 */
1663 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1664 		int error;
1665 
1666 		sc->hn_rss_hash = sc->hn_rss_hcap;
1667 		error = hn_rss_reconfig(sc);
1668 		if (error) {
1669 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1670 			    error);
1671 			/* XXX keep going. */
1672 		}
1673 	}
1674 done:
1675 	/* Hash deliverability for mbufs. */
1676 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1677 }
1678 
1679 static void
1680 hn_xpnt_vf_setready(struct hn_softc *sc)
1681 {
1682 	if_t ifp, vf_ifp;
1683 	struct ifreq ifr;
1684 
1685 	HN_LOCK_ASSERT(sc);
1686 	ifp = sc->hn_ifp;
1687 	vf_ifp = sc->hn_vf_ifp;
1688 
1689 	/*
1690 	 * Mark the VF ready.
1691 	 */
1692 	sc->hn_vf_rdytick = 0;
1693 
1694 	/*
1695 	 * Save information for restoration.
1696 	 */
1697 	sc->hn_saved_caps = if_getcapabilities(ifp);
1698 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1699 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1700 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1701 
1702 	/*
1703 	 * Intersect supported/enabled capabilities.
1704 	 *
1705 	 * NOTE:
1706 	 * if_hwassist is not changed here.
1707 	 */
1708 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1709 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1710 
1711 	/*
1712 	 * Fix TSO settings.
1713 	 */
1714 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1715 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1716 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1717 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1718 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1719 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1720 
1721 	/*
1722 	 * Change VF's enabled capabilities.
1723 	 */
1724 	memset(&ifr, 0, sizeof(ifr));
1725 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1726 	ifr.ifr_reqcap = if_getcapenable(ifp);
1727 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1728 
1729 	if (if_getmtu(ifp) != ETHERMTU) {
1730 		int error;
1731 
1732 		/*
1733 		 * Change VF's MTU.
1734 		 */
1735 		memset(&ifr, 0, sizeof(ifr));
1736 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1737 		ifr.ifr_mtu = if_getmtu(ifp);
1738 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1739 		if (error) {
1740 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1741 			    if_name(vf_ifp), if_getmtu(ifp));
1742 			if (if_getmtu(ifp) > ETHERMTU) {
1743 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1744 
1745 				/*
1746 				 * XXX
1747 				 * No need to adjust the synthetic parts' MTU;
1748 				 * failure of the adjustment will cause us
1749 				 * infinite headache.
1750 				 */
1751 				if_setmtu(ifp, ETHERMTU);
1752 				hn_mtu_change_fixup(sc);
1753 			}
1754 		}
1755 	}
1756 }
1757 
1758 static bool
1759 hn_xpnt_vf_isready(struct hn_softc *sc)
1760 {
1761 
1762 	HN_LOCK_ASSERT(sc);
1763 
1764 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1765 		return (false);
1766 
1767 	if (sc->hn_vf_rdytick == 0)
1768 		return (true);
1769 
1770 	if (sc->hn_vf_rdytick > ticks)
1771 		return (false);
1772 
1773 	/* Mark VF as ready. */
1774 	hn_xpnt_vf_setready(sc);
1775 	return (true);
1776 }
1777 
1778 static void
1779 hn_xpnt_vf_setenable(struct hn_softc *sc)
1780 {
1781 	int i;
1782 
1783 	HN_LOCK_ASSERT(sc);
1784 
1785 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1786 	rm_wlock(&sc->hn_vf_lock);
1787 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1788 	rm_wunlock(&sc->hn_vf_lock);
1789 
1790 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1791 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1792 }
1793 
1794 static void
1795 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1796 {
1797 	int i;
1798 
1799 	HN_LOCK_ASSERT(sc);
1800 
1801 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1802 	rm_wlock(&sc->hn_vf_lock);
1803 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1804 	if (clear_vf)
1805 		sc->hn_vf_ifp = NULL;
1806 	rm_wunlock(&sc->hn_vf_lock);
1807 
1808 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1809 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1810 }
1811 
1812 static void
1813 hn_xpnt_vf_init(struct hn_softc *sc)
1814 {
1815 	int error;
1816 
1817 	HN_LOCK_ASSERT(sc);
1818 
1819 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1820 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1821 
1822 	if (bootverbose) {
1823 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1824 		    if_name(sc->hn_vf_ifp));
1825 	}
1826 
1827 	/*
1828 	 * Bring the VF up.
1829 	 */
1830 	hn_xpnt_vf_saveifflags(sc);
1831 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1832 	error = hn_xpnt_vf_iocsetflags(sc);
1833 	if (error) {
1834 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1835 		    if_name(sc->hn_vf_ifp), error);
1836 		return;
1837 	}
1838 
1839 	/*
1840 	 * NOTE:
1841 	 * Datapath setting must happen _after_ bringing the VF up.
1842 	 */
1843 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1844 
1845 	/*
1846 	 * NOTE:
1847 	 * Fixup RSS related bits _after_ the VF is brought up, since
1848 	 * many VFs generate RSS key during it's initialization.
1849 	 */
1850 	hn_vf_rss_fixup(sc, true);
1851 
1852 	/* Mark transparent mode VF as enabled. */
1853 	hn_xpnt_vf_setenable(sc);
1854 }
1855 
1856 static void
1857 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1858 {
1859 	struct hn_softc *sc = xsc;
1860 
1861 	HN_LOCK(sc);
1862 
1863 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1864 		goto done;
1865 	if (sc->hn_vf_ifp == NULL)
1866 		goto done;
1867 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1868 		goto done;
1869 
1870 	if (sc->hn_vf_rdytick != 0) {
1871 		/* Mark VF as ready. */
1872 		hn_xpnt_vf_setready(sc);
1873 	}
1874 
1875 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1876 		/*
1877 		 * Delayed VF initialization.
1878 		 */
1879 		if (bootverbose) {
1880 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1881 			    if_name(sc->hn_vf_ifp));
1882 		}
1883 		hn_xpnt_vf_init(sc);
1884 	}
1885 done:
1886 	HN_UNLOCK(sc);
1887 }
1888 
1889 static void
1890 hn_ifnet_attevent(void *xsc, if_t ifp)
1891 {
1892 	struct hn_softc *sc = xsc;
1893 
1894 	HN_LOCK(sc);
1895 
1896 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1897 		goto done;
1898 
1899 	if (!hn_ismyvf(sc, ifp))
1900 		goto done;
1901 
1902 	if (sc->hn_vf_ifp != NULL) {
1903 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1904 		    if_name(sc->hn_vf_ifp));
1905 		goto done;
1906 	}
1907 
1908 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1909 		/*
1910 		 * ifnet.if_start is _not_ supported by transparent
1911 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1912 		 */
1913 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1914 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1915 
1916 		goto done;
1917 	}
1918 
1919 	rm_wlock(&hn_vfmap_lock);
1920 
1921 	if (if_getindex(ifp) >= hn_vfmap_size) {
1922 		if_t *newmap;
1923 		int newsize;
1924 
1925 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1926 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1927 		    M_WAITOK | M_ZERO);
1928 
1929 		memcpy(newmap, hn_vfmap,
1930 		    sizeof(if_t) * hn_vfmap_size);
1931 		free(hn_vfmap, M_DEVBUF);
1932 		hn_vfmap = newmap;
1933 		hn_vfmap_size = newsize;
1934 	}
1935 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1936 	    ("%s: ifindex %d was mapped to %s",
1937 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1938 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1939 
1940 	rm_wunlock(&hn_vfmap_lock);
1941 
1942 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1943 	rm_wlock(&sc->hn_vf_lock);
1944 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1945 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1946 	sc->hn_vf_ifp = ifp;
1947 	rm_wunlock(&sc->hn_vf_lock);
1948 
1949 	if (hn_xpnt_vf) {
1950 		int wait_ticks;
1951 
1952 		/*
1953 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1954 		 * Save vf_ifp's current if_input for later restoration.
1955 		 */
1956 		sc->hn_vf_input = if_getinputfn(ifp);
1957 		if_setinputfn(ifp, hn_xpnt_vf_input);
1958 
1959 		/*
1960 		 * Stop link status management; use the VF's.
1961 		 */
1962 		hn_suspend_mgmt(sc);
1963 
1964 		/*
1965 		 * Give VF sometime to complete its attach routing.
1966 		 */
1967 		wait_ticks = hn_xpnt_vf_attwait * hz;
1968 		sc->hn_vf_rdytick = ticks + wait_ticks;
1969 
1970 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1971 		    wait_ticks);
1972 	}
1973 done:
1974 	HN_UNLOCK(sc);
1975 }
1976 
1977 static void
1978 hn_ifnet_detevent(void *xsc, if_t ifp)
1979 {
1980 	struct hn_softc *sc = xsc;
1981 
1982 	HN_LOCK(sc);
1983 
1984 	if (sc->hn_vf_ifp == NULL)
1985 		goto done;
1986 
1987 	if (!hn_ismyvf(sc, ifp))
1988 		goto done;
1989 
1990 	if (hn_xpnt_vf) {
1991 		/*
1992 		 * Make sure that the delayed initialization is not running.
1993 		 *
1994 		 * NOTE:
1995 		 * - This lock _must_ be released, since the hn_vf_init task
1996 		 *   will try holding this lock.
1997 		 * - It is safe to release this lock here, since the
1998 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1999 		 *
2000 		 * XXX racy, if hn(4) ever detached.
2001 		 */
2002 		HN_UNLOCK(sc);
2003 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2004 		HN_LOCK(sc);
2005 
2006 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2007 		    if_name(sc->hn_ifp)));
2008 		if_setinputfn(ifp, sc->hn_vf_input);
2009 		sc->hn_vf_input = NULL;
2010 
2011 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2012 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2013 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2014 
2015 		if (sc->hn_vf_rdytick == 0) {
2016 			/*
2017 			 * The VF was ready; restore some settings.
2018 			 */
2019 			if_setcapabilities(ifp, sc->hn_saved_caps);
2020 			/*
2021 			 * NOTE:
2022 			 * There is _no_ need to fixup if_capenable and
2023 			 * if_hwassist, since the if_capabilities before
2024 			 * restoration was an intersection of the VF's
2025 			 * if_capabilites and the synthetic device's
2026 			 * if_capabilites.
2027 			 */
2028 			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
2029 			if_sethwtsomaxsegcount(sc->hn_ifp,
2030 			    sc->hn_saved_tsosegcnt);
2031 			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
2032 		}
2033 
2034 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2035 			/*
2036 			 * Restore RSS settings.
2037 			 */
2038 			hn_vf_rss_restore(sc);
2039 
2040 			/*
2041 			 * Resume link status management, which was suspended
2042 			 * by hn_ifnet_attevent().
2043 			 */
2044 			hn_resume_mgmt(sc);
2045 		}
2046 	}
2047 
2048 	/* Mark transparent mode VF as disabled. */
2049 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2050 
2051 	rm_wlock(&hn_vfmap_lock);
2052 
2053 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2054 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2055 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2056 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2057 		    ("%s: ifindex %d was mapped to %s",
2058 		     if_name(ifp), if_getindex(ifp),
2059 		     if_name(hn_vfmap[if_getindex(ifp)])));
2060 		hn_vfmap[if_getindex(ifp)] = NULL;
2061 	}
2062 
2063 	rm_wunlock(&hn_vfmap_lock);
2064 done:
2065 	HN_UNLOCK(sc);
2066 }
2067 
2068 static void
2069 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2070 {
2071 	struct hn_softc *sc = xsc;
2072 
2073 	if (sc->hn_vf_ifp == ifp)
2074 		if_link_state_change(sc->hn_ifp, link_state);
2075 }
2076 
2077 static int
2078 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2079 {
2080 	struct hn_softc *sc = arg1;
2081 	unsigned int tsomax;
2082 	int error;
2083 
2084 	tsomax = if_gethwtsomax(sc->hn_ifp);
2085 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2086 	return error;
2087 }
2088 
2089 static int
2090 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2091 {
2092 	struct hn_softc *sc = arg1;
2093 	unsigned int tsomaxsegcnt;
2094 	int error;
2095 
2096 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2097 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2098 	return error;
2099 }
2100 
2101 static int
2102 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2103 {
2104 	struct hn_softc *sc = arg1;
2105 	unsigned int tsomaxsegsz;
2106 	int error;
2107 
2108 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2109 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2110 	return error;
2111 }
2112 
2113 static int
2114 hn_probe(device_t dev)
2115 {
2116 
2117 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2118 		device_set_desc(dev, "Hyper-V Network Interface");
2119 		return BUS_PROBE_DEFAULT;
2120 	}
2121 	return ENXIO;
2122 }
2123 
2124 static int
2125 hn_attach(device_t dev)
2126 {
2127 	struct hn_softc *sc = device_get_softc(dev);
2128 	struct sysctl_oid_list *child;
2129 	struct sysctl_ctx_list *ctx;
2130 	uint8_t eaddr[ETHER_ADDR_LEN];
2131 	if_t ifp = NULL;
2132 	int error, ring_cnt, tx_ring_cnt;
2133 	uint32_t mtu;
2134 
2135 	sc->hn_dev = dev;
2136 	sc->hn_prichan = vmbus_get_channel(dev);
2137 	HN_LOCK_INIT(sc);
2138 	rm_init(&sc->hn_vf_lock, "hnvf");
2139 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2140 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2141 
2142 	/*
2143 	 * Initialize these tunables once.
2144 	 */
2145 	sc->hn_agg_size = hn_tx_agg_size;
2146 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2147 
2148 	/*
2149 	 * Setup taskqueue for transmission.
2150 	 */
2151 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2152 		int i;
2153 
2154 		sc->hn_tx_taskqs =
2155 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2156 		    M_DEVBUF, M_WAITOK);
2157 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2158 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2159 			    M_WAITOK, taskqueue_thread_enqueue,
2160 			    &sc->hn_tx_taskqs[i]);
2161 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2162 			    "%s tx%d", device_get_nameunit(dev), i);
2163 		}
2164 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2165 		sc->hn_tx_taskqs = hn_tx_taskque;
2166 	}
2167 
2168 	/*
2169 	 * Setup taskqueue for mangement tasks, e.g. link status.
2170 	 */
2171 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2172 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2173 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2174 	    device_get_nameunit(dev));
2175 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2176 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2177 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2178 	    hn_netchg_status_taskfunc, sc);
2179 
2180 	if (hn_xpnt_vf) {
2181 		/*
2182 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2183 		 */
2184 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2185 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2186 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2187 		    device_get_nameunit(dev));
2188 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2189 		    hn_xpnt_vf_init_taskfunc, sc);
2190 	}
2191 
2192 	/*
2193 	 * Allocate ifnet and setup its name earlier, so that if_printf
2194 	 * can be used by functions, which will be called after
2195 	 * ether_ifattach().
2196 	 */
2197 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2198 	if_setsoftc(ifp, sc);
2199 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2200 
2201 	/*
2202 	 * Initialize ifmedia earlier so that it can be unconditionally
2203 	 * destroyed, if error happened later on.
2204 	 */
2205 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2206 
2207 	/*
2208 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2209 	 * to use (tx_ring_cnt).
2210 	 *
2211 	 * NOTE:
2212 	 * The # of RX rings to use is same as the # of channels to use.
2213 	 */
2214 	ring_cnt = hn_chan_cnt;
2215 	if (ring_cnt <= 0) {
2216 		/* Default */
2217 		ring_cnt = mp_ncpus;
2218 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2219 			ring_cnt = HN_RING_CNT_DEF_MAX;
2220 	} else if (ring_cnt > mp_ncpus) {
2221 		ring_cnt = mp_ncpus;
2222 	}
2223 #ifdef RSS
2224 	if (ring_cnt > rss_getnumbuckets())
2225 		ring_cnt = rss_getnumbuckets();
2226 #endif
2227 
2228 	tx_ring_cnt = hn_tx_ring_cnt;
2229 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2230 		tx_ring_cnt = ring_cnt;
2231 #ifdef HN_IFSTART_SUPPORT
2232 	if (hn_use_if_start) {
2233 		/* ifnet.if_start only needs one TX ring. */
2234 		tx_ring_cnt = 1;
2235 	}
2236 #endif
2237 
2238 	/*
2239 	 * Set the leader CPU for channels.
2240 	 */
2241 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2242 
2243 	/*
2244 	 * Create enough TX/RX rings, even if only limited number of
2245 	 * channels can be allocated.
2246 	 */
2247 	error = hn_create_tx_data(sc, tx_ring_cnt);
2248 	if (error)
2249 		goto failed;
2250 	error = hn_create_rx_data(sc, ring_cnt);
2251 	if (error)
2252 		goto failed;
2253 
2254 	/*
2255 	 * Create transaction context for NVS and RNDIS transactions.
2256 	 */
2257 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2258 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2259 	if (sc->hn_xact == NULL) {
2260 		error = ENXIO;
2261 		goto failed;
2262 	}
2263 
2264 	/*
2265 	 * Install orphan handler for the revocation of this device's
2266 	 * primary channel.
2267 	 *
2268 	 * NOTE:
2269 	 * The processing order is critical here:
2270 	 * Install the orphan handler, _before_ testing whether this
2271 	 * device's primary channel has been revoked or not.
2272 	 */
2273 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2274 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2275 		error = ENXIO;
2276 		goto failed;
2277 	}
2278 
2279 	/*
2280 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2281 	 */
2282 	error = hn_synth_attach(sc, ETHERMTU);
2283 	if (error)
2284 		goto failed;
2285 
2286 	error = hn_rndis_get_eaddr(sc, eaddr);
2287 	if (error)
2288 		goto failed;
2289 
2290 	error = hn_rndis_get_mtu(sc, &mtu);
2291 	if (error)
2292 		mtu = ETHERMTU;
2293 	else if (bootverbose)
2294 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2295 
2296 	if (sc->hn_rx_ring_inuse > 1) {
2297 		/*
2298 		 * Reduce TCP segment aggregation limit for multiple
2299 		 * RX rings to increase ACK timeliness.
2300 		 */
2301 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2302 	}
2303 
2304 	/*
2305 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2306 	 */
2307 	hn_fixup_tx_data(sc);
2308 	hn_fixup_rx_data(sc);
2309 
2310 	ctx = device_get_sysctl_ctx(dev);
2311 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2312 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2313 	    &sc->hn_nvs_ver, 0, "NVS version");
2314 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2315 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2316 	    hn_ndis_version_sysctl, "A", "NDIS version");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2318 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_caps_sysctl, "A", "capabilities");
2320 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2321 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2322 	    hn_hwassist_sysctl, "A", "hwassist");
2323 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2324 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2325 	    "IU", "max TSO size");
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2327 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2328 	    "IU", "max # of TSO segments");
2329 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2330 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2331 	    "IU", "max size of TSO segment");
2332 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2333 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2334 	    hn_rxfilter_sysctl, "A", "rxfilter");
2335 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2336 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2337 	    hn_rss_hash_sysctl, "A", "RSS hash");
2338 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2339 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2340 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2341 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2342 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2343 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2344 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2345 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2346 #ifndef RSS
2347 	/*
2348 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2349 	 */
2350 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2351 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2352 	    hn_rss_key_sysctl, "IU", "RSS key");
2353 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2354 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2355 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2356 #endif
2357 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2358 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2359 	    "RNDIS offered packet transmission aggregation size limit");
2360 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2361 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2362 	    "RNDIS offered packet transmission aggregation count limit");
2363 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2364 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2365 	    "RNDIS packet transmission aggregation alignment");
2366 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2367 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2368 	    hn_txagg_size_sysctl, "I",
2369 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2370 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2371 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2372 	    hn_txagg_pkts_sysctl, "I",
2373 	    "Packet transmission aggregation packets, "
2374 	    "0 -- disable, -1 -- auto");
2375 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2376 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2377 	    hn_polling_sysctl, "I",
2378 	    "Polling frequency: [100,1000000], 0 disable polling");
2379 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2380 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2381 	    hn_vf_sysctl, "A", "Virtual Function's name");
2382 	if (!hn_xpnt_vf) {
2383 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2384 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2385 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2386 	} else {
2387 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2388 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2389 		    hn_xpnt_vf_enabled_sysctl, "I",
2390 		    "Transparent VF enabled");
2391 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2392 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2393 		    hn_xpnt_vf_accbpf_sysctl, "I",
2394 		    "Accurate BPF for transparent VF");
2395 	}
2396 
2397 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2398 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2399 	    "switch to rsc");
2400 
2401 	/*
2402 	 * Setup the ifmedia, which has been initialized earlier.
2403 	 */
2404 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2405 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2406 	/* XXX ifmedia_set really should do this for us */
2407 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2408 
2409 	/*
2410 	 * Setup the ifnet for this interface.
2411 	 */
2412 
2413 	if_setbaudrate(ifp, IF_Gbps(10));
2414 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2415 	if_setioctlfn(ifp, hn_ioctl);
2416 	if_setinitfn(ifp, hn_init);
2417 #ifdef HN_IFSTART_SUPPORT
2418 	if (hn_use_if_start) {
2419 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2420 
2421 		if_setstartfn(ifp, hn_start);
2422 		if_setsendqlen(ifp, qdepth);
2423 		if_setsendqready(ifp);
2424 	} else
2425 #endif
2426 	{
2427 		if_settransmitfn(ifp, hn_transmit);
2428 		if_setqflushfn(ifp, hn_xmit_qflush);
2429 	}
2430 
2431 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2432 #ifdef foo
2433 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2434 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2435 #endif
2436 	if (sc->hn_caps & HN_CAP_VLAN) {
2437 		/* XXX not sure about VLAN_MTU. */
2438 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2439 	}
2440 
2441 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2442 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2443 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2444 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2445 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2446 	if (sc->hn_caps & HN_CAP_TSO4) {
2447 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2448 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2449 	}
2450 	if (sc->hn_caps & HN_CAP_TSO6) {
2451 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2452 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2453 	}
2454 
2455 	/* Enable all available capabilities by default. */
2456 	if_setcapenable(ifp, if_getcapabilities(ifp));
2457 
2458 	/*
2459 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2460 	 * be enabled through SIOCSIFCAP.
2461 	 */
2462 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2463 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2464 
2465 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2466 		/*
2467 		 * Lock hn_set_tso_maxsize() to simplify its
2468 		 * internal logic.
2469 		 */
2470 		HN_LOCK(sc);
2471 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2472 		HN_UNLOCK(sc);
2473 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2474 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2475 	}
2476 
2477 	ether_ifattach(ifp, eaddr);
2478 
2479 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2480 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2481 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2482 	}
2483 	if (mtu < ETHERMTU) {
2484 
2485 		if_setmtu(ifp, mtu);
2486 	}
2487 
2488 	/* Inform the upper layer about the long frame support. */
2489 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2490 
2491 	/*
2492 	 * Kick off link status check.
2493 	 */
2494 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2495 	hn_update_link_status(sc);
2496 
2497 	if (!hn_xpnt_vf) {
2498 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2499 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2500 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2501 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2502 	} else {
2503 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2504 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2505 	}
2506 
2507 	/*
2508 	 * NOTE:
2509 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2510 	 * since interface's LLADDR is needed; interface LLADDR is not
2511 	 * available when ifnet_arrival event is triggered.
2512 	 */
2513 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2514 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2515 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2516 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2517 
2518 	return (0);
2519 failed:
2520 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2521 		hn_synth_detach(sc);
2522 	hn_detach(dev);
2523 	return (error);
2524 }
2525 
2526 static int
2527 hn_detach(device_t dev)
2528 {
2529 	struct hn_softc *sc = device_get_softc(dev);
2530 	if_t ifp = sc->hn_ifp, vf_ifp;
2531 
2532 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2533 		/*
2534 		 * In case that the vmbus missed the orphan handler
2535 		 * installation.
2536 		 */
2537 		vmbus_xact_ctx_orphan(sc->hn_xact);
2538 	}
2539 
2540 	if (sc->hn_ifaddr_evthand != NULL)
2541 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2542 	if (sc->hn_ifnet_evthand != NULL)
2543 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2544 	if (sc->hn_ifnet_atthand != NULL) {
2545 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2546 		    sc->hn_ifnet_atthand);
2547 	}
2548 	if (sc->hn_ifnet_dethand != NULL) {
2549 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2550 		    sc->hn_ifnet_dethand);
2551 	}
2552 	if (sc->hn_ifnet_lnkhand != NULL)
2553 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2554 
2555 	vf_ifp = sc->hn_vf_ifp;
2556 	__compiler_membar();
2557 	if (vf_ifp != NULL)
2558 		hn_ifnet_detevent(sc, vf_ifp);
2559 
2560 	if (device_is_attached(dev)) {
2561 		HN_LOCK(sc);
2562 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2563 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2564 				hn_stop(sc, true);
2565 			/*
2566 			 * NOTE:
2567 			 * hn_stop() only suspends data, so managment
2568 			 * stuffs have to be suspended manually here.
2569 			 */
2570 			hn_suspend_mgmt(sc);
2571 			hn_synth_detach(sc);
2572 		}
2573 		HN_UNLOCK(sc);
2574 		ether_ifdetach(ifp);
2575 	}
2576 
2577 	ifmedia_removeall(&sc->hn_media);
2578 	hn_destroy_rx_data(sc);
2579 	hn_destroy_tx_data(sc);
2580 
2581 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2582 		int i;
2583 
2584 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2585 			taskqueue_free(sc->hn_tx_taskqs[i]);
2586 		free(sc->hn_tx_taskqs, M_DEVBUF);
2587 	}
2588 	taskqueue_free(sc->hn_mgmt_taskq0);
2589 	if (sc->hn_vf_taskq != NULL)
2590 		taskqueue_free(sc->hn_vf_taskq);
2591 
2592 	if (sc->hn_xact != NULL) {
2593 		/*
2594 		 * Uninstall the orphan handler _before_ the xact is
2595 		 * destructed.
2596 		 */
2597 		vmbus_chan_unset_orphan(sc->hn_prichan);
2598 		vmbus_xact_ctx_destroy(sc->hn_xact);
2599 	}
2600 
2601 	if_free(ifp);
2602 
2603 	HN_LOCK_DESTROY(sc);
2604 	rm_destroy(&sc->hn_vf_lock);
2605 	return (0);
2606 }
2607 
2608 static int
2609 hn_shutdown(device_t dev)
2610 {
2611 
2612 	return (0);
2613 }
2614 
2615 static void
2616 hn_link_status(struct hn_softc *sc)
2617 {
2618 	uint32_t link_status;
2619 	int error;
2620 
2621 	error = hn_rndis_get_linkstatus(sc, &link_status);
2622 	if (error) {
2623 		/* XXX what to do? */
2624 		return;
2625 	}
2626 
2627 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2628 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2629 	else
2630 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2631 	if_link_state_change(sc->hn_ifp,
2632 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2633 	    LINK_STATE_UP : LINK_STATE_DOWN);
2634 }
2635 
2636 static void
2637 hn_link_taskfunc(void *xsc, int pending __unused)
2638 {
2639 	struct hn_softc *sc = xsc;
2640 
2641 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2642 		return;
2643 	hn_link_status(sc);
2644 }
2645 
2646 static void
2647 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2648 {
2649 	struct hn_softc *sc = xsc;
2650 
2651 	/* Prevent any link status checks from running. */
2652 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2653 
2654 	/*
2655 	 * Fake up a [link down --> link up] state change; 5 seconds
2656 	 * delay is used, which closely simulates miibus reaction
2657 	 * upon link down event.
2658 	 */
2659 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2660 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2661 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2662 	    &sc->hn_netchg_status, 5 * hz);
2663 }
2664 
2665 static void
2666 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2667 {
2668 	struct hn_softc *sc = xsc;
2669 
2670 	/* Re-allow link status checks. */
2671 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2672 	hn_link_status(sc);
2673 }
2674 
2675 static void
2676 hn_update_link_status(struct hn_softc *sc)
2677 {
2678 
2679 	if (sc->hn_mgmt_taskq != NULL)
2680 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2681 }
2682 
2683 static void
2684 hn_change_network(struct hn_softc *sc)
2685 {
2686 
2687 	if (sc->hn_mgmt_taskq != NULL)
2688 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2689 }
2690 
2691 static __inline int
2692 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2693     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2694 {
2695 	struct mbuf *m = *m_head;
2696 	int error;
2697 
2698 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2699 
2700 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2701 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2702 	if (error == EFBIG) {
2703 		struct mbuf *m_new;
2704 
2705 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2706 		if (m_new == NULL)
2707 			return ENOBUFS;
2708 		else
2709 			*m_head = m = m_new;
2710 		txr->hn_tx_collapsed++;
2711 
2712 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2713 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2714 	}
2715 	if (!error) {
2716 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2717 		    BUS_DMASYNC_PREWRITE);
2718 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2719 	}
2720 	return error;
2721 }
2722 
2723 static __inline int
2724 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2725 {
2726 
2727 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2728 	    ("put an onlist txd %#x", txd->flags));
2729 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2730 	    ("put an onagg txd %#x", txd->flags));
2731 
2732 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2733 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2734 		return 0;
2735 
2736 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2737 		struct hn_txdesc *tmp_txd;
2738 
2739 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2740 			int freed __diagused;
2741 
2742 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2743 			    ("resursive aggregation on aggregated txdesc"));
2744 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2745 			    ("not aggregated txdesc"));
2746 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2747 			    ("aggregated txdesc uses dmamap"));
2748 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2749 			    ("aggregated txdesc consumes "
2750 			     "chimney sending buffer"));
2751 			KASSERT(tmp_txd->chim_size == 0,
2752 			    ("aggregated txdesc has non-zero "
2753 			     "chimney sending size"));
2754 
2755 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2756 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2757 			freed = hn_txdesc_put(txr, tmp_txd);
2758 			KASSERT(freed, ("failed to free aggregated txdesc"));
2759 		}
2760 	}
2761 
2762 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2763 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2764 		    ("chim txd uses dmamap"));
2765 		hn_chim_free(txr->hn_sc, txd->chim_index);
2766 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2767 		txd->chim_size = 0;
2768 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2769 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2770 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2771 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2772 		    txd->data_dmap);
2773 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2774 	}
2775 
2776 	if (txd->m != NULL) {
2777 		m_freem(txd->m);
2778 		txd->m = NULL;
2779 	}
2780 
2781 	txd->flags |= HN_TXD_FLAG_ONLIST;
2782 #ifndef HN_USE_TXDESC_BUFRING
2783 	mtx_lock_spin(&txr->hn_txlist_spin);
2784 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2785 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2786 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2787 	txr->hn_txdesc_avail++;
2788 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2789 	mtx_unlock_spin(&txr->hn_txlist_spin);
2790 #else	/* HN_USE_TXDESC_BUFRING */
2791 #ifdef HN_DEBUG
2792 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2793 #endif
2794 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2795 #endif	/* !HN_USE_TXDESC_BUFRING */
2796 
2797 	return 1;
2798 }
2799 
2800 static __inline struct hn_txdesc *
2801 hn_txdesc_get(struct hn_tx_ring *txr)
2802 {
2803 	struct hn_txdesc *txd;
2804 
2805 #ifndef HN_USE_TXDESC_BUFRING
2806 	mtx_lock_spin(&txr->hn_txlist_spin);
2807 	txd = SLIST_FIRST(&txr->hn_txlist);
2808 	if (txd != NULL) {
2809 		KASSERT(txr->hn_txdesc_avail > 0,
2810 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2811 		txr->hn_txdesc_avail--;
2812 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2813 	}
2814 	mtx_unlock_spin(&txr->hn_txlist_spin);
2815 #else
2816 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2817 #endif
2818 
2819 	if (txd != NULL) {
2820 #ifdef HN_USE_TXDESC_BUFRING
2821 #ifdef HN_DEBUG
2822 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2823 #endif
2824 #endif	/* HN_USE_TXDESC_BUFRING */
2825 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2826 		    STAILQ_EMPTY(&txd->agg_list) &&
2827 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2828 		    txd->chim_size == 0 &&
2829 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2830 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2831 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2832 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2833 		txd->refs = 1;
2834 	}
2835 	return txd;
2836 }
2837 
2838 static __inline void
2839 hn_txdesc_hold(struct hn_txdesc *txd)
2840 {
2841 
2842 	/* 0->1 transition will never work */
2843 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2844 	atomic_add_int(&txd->refs, 1);
2845 }
2846 
2847 static __inline void
2848 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2849 {
2850 
2851 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2852 	    ("recursive aggregation on aggregating txdesc"));
2853 
2854 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2855 	    ("already aggregated"));
2856 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2857 	    ("recursive aggregation on to-be-aggregated txdesc"));
2858 
2859 	txd->flags |= HN_TXD_FLAG_ONAGG;
2860 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2861 }
2862 
2863 static bool
2864 hn_tx_ring_pending(struct hn_tx_ring *txr)
2865 {
2866 	bool pending = false;
2867 
2868 #ifndef HN_USE_TXDESC_BUFRING
2869 	mtx_lock_spin(&txr->hn_txlist_spin);
2870 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2871 		pending = true;
2872 	mtx_unlock_spin(&txr->hn_txlist_spin);
2873 #else
2874 	if (!buf_ring_full(txr->hn_txdesc_br))
2875 		pending = true;
2876 #endif
2877 	return (pending);
2878 }
2879 
2880 static __inline void
2881 hn_txeof(struct hn_tx_ring *txr)
2882 {
2883 	txr->hn_has_txeof = 0;
2884 	txr->hn_txeof(txr);
2885 }
2886 
2887 static void
2888 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2889     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2890 {
2891 	struct hn_txdesc *txd = sndc->hn_cbarg;
2892 	struct hn_tx_ring *txr;
2893 
2894 	txr = txd->txr;
2895 	KASSERT(txr->hn_chan == chan,
2896 	    ("channel mismatch, on chan%u, should be chan%u",
2897 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2898 
2899 	txr->hn_has_txeof = 1;
2900 	hn_txdesc_put(txr, txd);
2901 
2902 	++txr->hn_txdone_cnt;
2903 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2904 		txr->hn_txdone_cnt = 0;
2905 		if (txr->hn_oactive)
2906 			hn_txeof(txr);
2907 	}
2908 }
2909 
2910 static void
2911 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2912 {
2913 #if defined(INET) || defined(INET6)
2914 	struct epoch_tracker et;
2915 
2916 	NET_EPOCH_ENTER(et);
2917 	tcp_lro_flush_all(&rxr->hn_lro);
2918 	NET_EPOCH_EXIT(et);
2919 #endif
2920 
2921 	/*
2922 	 * NOTE:
2923 	 * 'txr' could be NULL, if multiple channels and
2924 	 * ifnet.if_start method are enabled.
2925 	 */
2926 	if (txr == NULL || !txr->hn_has_txeof)
2927 		return;
2928 
2929 	txr->hn_txdone_cnt = 0;
2930 	hn_txeof(txr);
2931 }
2932 
2933 static __inline uint32_t
2934 hn_rndis_pktmsg_offset(uint32_t ofs)
2935 {
2936 
2937 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2938 	    ("invalid RNDIS packet msg offset %u", ofs));
2939 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2940 }
2941 
2942 static __inline void *
2943 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2944     size_t pi_dlen, uint32_t pi_type)
2945 {
2946 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2947 	struct rndis_pktinfo *pi;
2948 
2949 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2950 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2951 
2952 	/*
2953 	 * Per-packet-info does not move; it only grows.
2954 	 *
2955 	 * NOTE:
2956 	 * rm_pktinfooffset in this phase counts from the beginning
2957 	 * of rndis_packet_msg.
2958 	 */
2959 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2960 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2961 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2962 	    pkt->rm_pktinfolen);
2963 	pkt->rm_pktinfolen += pi_size;
2964 
2965 	pi->rm_size = pi_size;
2966 	pi->rm_type = pi_type;
2967 	pi->rm_internal = 0;
2968 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2969 
2970 	return (pi->rm_data);
2971 }
2972 
2973 static __inline int
2974 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2975 {
2976 	struct hn_txdesc *txd;
2977 	struct mbuf *m;
2978 	int error, pkts;
2979 
2980 	txd = txr->hn_agg_txd;
2981 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2982 
2983 	/*
2984 	 * Since hn_txpkt() will reset this temporary stat, save
2985 	 * it now, so that oerrors can be updated properly, if
2986 	 * hn_txpkt() ever fails.
2987 	 */
2988 	pkts = txr->hn_stat_pkts;
2989 
2990 	/*
2991 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2992 	 * failure, save it for later freeing, if hn_txpkt() ever
2993 	 * fails.
2994 	 */
2995 	m = txd->m;
2996 	error = hn_txpkt(ifp, txr, txd);
2997 	if (__predict_false(error)) {
2998 		/* txd is freed, but m is not. */
2999 		m_freem(m);
3000 
3001 		txr->hn_flush_failed++;
3002 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
3003 	}
3004 
3005 	/* Reset all aggregation states. */
3006 	txr->hn_agg_txd = NULL;
3007 	txr->hn_agg_szleft = 0;
3008 	txr->hn_agg_pktleft = 0;
3009 	txr->hn_agg_prevpkt = NULL;
3010 
3011 	return (error);
3012 }
3013 
3014 static void *
3015 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3016     int pktsize)
3017 {
3018 	void *chim;
3019 
3020 	if (txr->hn_agg_txd != NULL) {
3021 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
3022 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
3023 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
3024 			int olen;
3025 
3026 			/*
3027 			 * Update the previous RNDIS packet's total length,
3028 			 * it can be increased due to the mandatory alignment
3029 			 * padding for this RNDIS packet.  And update the
3030 			 * aggregating txdesc's chimney sending buffer size
3031 			 * accordingly.
3032 			 *
3033 			 * XXX
3034 			 * Zero-out the padding, as required by the RNDIS spec.
3035 			 */
3036 			olen = pkt->rm_len;
3037 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3038 			agg_txd->chim_size += pkt->rm_len - olen;
3039 
3040 			/* Link this txdesc to the parent. */
3041 			hn_txdesc_agg(agg_txd, txd);
3042 
3043 			chim = (uint8_t *)pkt + pkt->rm_len;
3044 			/* Save the current packet for later fixup. */
3045 			txr->hn_agg_prevpkt = chim;
3046 
3047 			txr->hn_agg_pktleft--;
3048 			txr->hn_agg_szleft -= pktsize;
3049 			if (txr->hn_agg_szleft <=
3050 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3051 				/*
3052 				 * Probably can't aggregate more packets,
3053 				 * flush this aggregating txdesc proactively.
3054 				 */
3055 				txr->hn_agg_pktleft = 0;
3056 			}
3057 			/* Done! */
3058 			return (chim);
3059 		}
3060 		hn_flush_txagg(ifp, txr);
3061 	}
3062 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3063 
3064 	txr->hn_tx_chimney_tried++;
3065 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3066 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3067 		return (NULL);
3068 	txr->hn_tx_chimney++;
3069 
3070 	chim = txr->hn_sc->hn_chim +
3071 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3072 
3073 	if (txr->hn_agg_pktmax > 1 &&
3074 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3075 		txr->hn_agg_txd = txd;
3076 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3077 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3078 		txr->hn_agg_prevpkt = chim;
3079 	}
3080 	return (chim);
3081 }
3082 
3083 /*
3084  * NOTE:
3085  * If this function fails, then both txd and m_head0 will be freed.
3086  */
3087 static int
3088 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3089     struct mbuf **m_head0)
3090 {
3091 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3092 	int error, nsegs, i;
3093 	struct mbuf *m_head = *m_head0;
3094 	struct rndis_packet_msg *pkt;
3095 	uint32_t *pi_data;
3096 	void *chim = NULL;
3097 	int pkt_hlen, pkt_size;
3098 
3099 	pkt = txd->rndis_pkt;
3100 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3101 	if (pkt_size < txr->hn_chim_size) {
3102 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3103 		if (chim != NULL)
3104 			pkt = chim;
3105 	} else {
3106 		if (txr->hn_agg_txd != NULL)
3107 			hn_flush_txagg(ifp, txr);
3108 	}
3109 
3110 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3111 	pkt->rm_len = m_head->m_pkthdr.len;
3112 	pkt->rm_dataoffset = 0;
3113 	pkt->rm_datalen = m_head->m_pkthdr.len;
3114 	pkt->rm_oobdataoffset = 0;
3115 	pkt->rm_oobdatalen = 0;
3116 	pkt->rm_oobdataelements = 0;
3117 	pkt->rm_pktinfooffset = sizeof(*pkt);
3118 	pkt->rm_pktinfolen = 0;
3119 	pkt->rm_vchandle = 0;
3120 	pkt->rm_reserved = 0;
3121 
3122 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3123 		/*
3124 		 * Set the hash value for this packet.
3125 		 */
3126 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3127 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3128 
3129 		if (M_HASHTYPE_ISHASH(m_head))
3130 			/*
3131 			 * The flowid field contains the hash value host
3132 			 * set in the rx queue if it is a ip forwarding pkt.
3133 			 * Set the same hash value so host can send on the
3134 			 * cpu it was received.
3135 			 */
3136 			*pi_data = m_head->m_pkthdr.flowid;
3137 		else
3138 			/*
3139 			 * Otherwise just put the tx queue index.
3140 			 */
3141 			*pi_data = txr->hn_tx_idx;
3142 	}
3143 
3144 	if (m_head->m_flags & M_VLANTAG) {
3145 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3146 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3147 		*pi_data = NDIS_VLAN_INFO_MAKE(
3148 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3149 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3150 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3151 	}
3152 
3153 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3154 #if defined(INET6) || defined(INET)
3155 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3156 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3157 #ifdef INET
3158 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3159 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3160 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3161 			    m_head->m_pkthdr.tso_segsz);
3162 		}
3163 #endif
3164 #if defined(INET6) && defined(INET)
3165 		else
3166 #endif
3167 #ifdef INET6
3168 		{
3169 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3170 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3171 			    m_head->m_pkthdr.tso_segsz);
3172 		}
3173 #endif
3174 #endif	/* INET6 || INET */
3175 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3176 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3177 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3178 		if (m_head->m_pkthdr.csum_flags &
3179 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3180 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3181 		} else {
3182 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3183 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3184 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3185 		}
3186 
3187 		if (m_head->m_pkthdr.csum_flags &
3188 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3189 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3190 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3191 		} else if (m_head->m_pkthdr.csum_flags &
3192 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3193 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3194 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3195 		}
3196 	}
3197 
3198 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3199 	/* Fixup RNDIS packet message total length */
3200 	pkt->rm_len += pkt_hlen;
3201 	/* Convert RNDIS packet message offsets */
3202 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3203 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3204 
3205 	/*
3206 	 * Fast path: Chimney sending.
3207 	 */
3208 	if (chim != NULL) {
3209 		struct hn_txdesc *tgt_txd = txd;
3210 
3211 		if (txr->hn_agg_txd != NULL) {
3212 			tgt_txd = txr->hn_agg_txd;
3213 #ifdef INVARIANTS
3214 			*m_head0 = NULL;
3215 #endif
3216 		}
3217 
3218 		KASSERT(pkt == chim,
3219 		    ("RNDIS pkt not in chimney sending buffer"));
3220 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3221 		    ("chimney sending buffer is not used"));
3222 		tgt_txd->chim_size += pkt->rm_len;
3223 
3224 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3225 		    ((uint8_t *)chim) + pkt_hlen);
3226 
3227 		txr->hn_gpa_cnt = 0;
3228 		txr->hn_sendpkt = hn_txpkt_chim;
3229 		goto done;
3230 	}
3231 
3232 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3233 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3234 	    ("chimney buffer is used"));
3235 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3236 
3237 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3238 	if (__predict_false(error)) {
3239 		int freed __diagused;
3240 
3241 		/*
3242 		 * This mbuf is not linked w/ the txd yet, so free it now.
3243 		 */
3244 		m_freem(m_head);
3245 		*m_head0 = NULL;
3246 
3247 		freed = hn_txdesc_put(txr, txd);
3248 		KASSERT(freed != 0,
3249 		    ("fail to free txd upon txdma error"));
3250 
3251 		txr->hn_txdma_failed++;
3252 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3253 		return error;
3254 	}
3255 	*m_head0 = m_head;
3256 
3257 	/* +1 RNDIS packet message */
3258 	txr->hn_gpa_cnt = nsegs + 1;
3259 
3260 	/* send packet with page buffer */
3261 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3262 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3263 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3264 
3265 	/*
3266 	 * Fill the page buffers with mbuf info after the page
3267 	 * buffer for RNDIS packet message.
3268 	 */
3269 	for (i = 0; i < nsegs; ++i) {
3270 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3271 
3272 		gpa->gpa_page = atop(segs[i].ds_addr);
3273 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3274 		gpa->gpa_len = segs[i].ds_len;
3275 	}
3276 
3277 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3278 	txd->chim_size = 0;
3279 	txr->hn_sendpkt = hn_txpkt_sglist;
3280 done:
3281 	txd->m = m_head;
3282 
3283 	/* Set the completion routine */
3284 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3285 
3286 	/* Update temporary stats for later use. */
3287 	txr->hn_stat_pkts++;
3288 	txr->hn_stat_size += m_head->m_pkthdr.len;
3289 	if (m_head->m_flags & M_MCAST)
3290 		txr->hn_stat_mcasts++;
3291 
3292 	return 0;
3293 }
3294 
3295 /*
3296  * NOTE:
3297  * If this function fails, then txd will be freed, but the mbuf
3298  * associated w/ the txd will _not_ be freed.
3299  */
3300 static int
3301 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3302 {
3303 	int error, send_failed = 0, has_bpf;
3304 
3305 again:
3306 	has_bpf = bpf_peers_present(if_getbpf(ifp));
3307 	if (has_bpf) {
3308 		/*
3309 		 * Make sure that this txd and any aggregated txds are not
3310 		 * freed before ETHER_BPF_MTAP.
3311 		 */
3312 		hn_txdesc_hold(txd);
3313 	}
3314 	error = txr->hn_sendpkt(txr, txd);
3315 	if (!error) {
3316 		if (has_bpf) {
3317 			const struct hn_txdesc *tmp_txd;
3318 
3319 			ETHER_BPF_MTAP(ifp, txd->m);
3320 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3321 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3322 		}
3323 
3324 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3325 #ifdef HN_IFSTART_SUPPORT
3326 		if (!hn_use_if_start)
3327 #endif
3328 		{
3329 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3330 			    txr->hn_stat_size);
3331 			if (txr->hn_stat_mcasts != 0) {
3332 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3333 				    txr->hn_stat_mcasts);
3334 			}
3335 		}
3336 		txr->hn_pkts += txr->hn_stat_pkts;
3337 		txr->hn_sends++;
3338 	}
3339 	if (has_bpf)
3340 		hn_txdesc_put(txr, txd);
3341 
3342 	if (__predict_false(error)) {
3343 		int freed __diagused;
3344 
3345 		/*
3346 		 * This should "really rarely" happen.
3347 		 *
3348 		 * XXX Too many RX to be acked or too many sideband
3349 		 * commands to run?  Ask netvsc_channel_rollup()
3350 		 * to kick start later.
3351 		 */
3352 		txr->hn_has_txeof = 1;
3353 		if (!send_failed) {
3354 			txr->hn_send_failed++;
3355 			send_failed = 1;
3356 			/*
3357 			 * Try sending again after set hn_has_txeof;
3358 			 * in case that we missed the last
3359 			 * netvsc_channel_rollup().
3360 			 */
3361 			goto again;
3362 		}
3363 		if_printf(ifp, "send failed\n");
3364 
3365 		/*
3366 		 * Caller will perform further processing on the
3367 		 * associated mbuf, so don't free it in hn_txdesc_put();
3368 		 * only unload it from the DMA map in hn_txdesc_put(),
3369 		 * if it was loaded.
3370 		 */
3371 		txd->m = NULL;
3372 		freed = hn_txdesc_put(txr, txd);
3373 		KASSERT(freed != 0,
3374 		    ("fail to free txd upon send error"));
3375 
3376 		txr->hn_send_failed++;
3377 	}
3378 
3379 	/* Reset temporary stats, after this sending is done. */
3380 	txr->hn_stat_size = 0;
3381 	txr->hn_stat_pkts = 0;
3382 	txr->hn_stat_mcasts = 0;
3383 
3384 	return (error);
3385 }
3386 
3387 /*
3388  * Append the specified data to the indicated mbuf chain,
3389  * Extend the mbuf chain if the new data does not fit in
3390  * existing space.
3391  *
3392  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3393  * There should be an equivalent in the kernel mbuf code,
3394  * but there does not appear to be one yet.
3395  *
3396  * Differs from m_append() in that additional mbufs are
3397  * allocated with cluster size MJUMPAGESIZE, and filled
3398  * accordingly.
3399  *
3400  * Return the last mbuf in the chain or NULL if failed to
3401  * allocate new mbuf.
3402  */
3403 static struct mbuf *
3404 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3405 {
3406 	struct mbuf *m, *n;
3407 	int remainder, space;
3408 
3409 	for (m = m0; m->m_next != NULL; m = m->m_next)
3410 		;
3411 	remainder = len;
3412 	space = M_TRAILINGSPACE(m);
3413 	if (space > 0) {
3414 		/*
3415 		 * Copy into available space.
3416 		 */
3417 		if (space > remainder)
3418 			space = remainder;
3419 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3420 		m->m_len += space;
3421 		cp += space;
3422 		remainder -= space;
3423 	}
3424 	while (remainder > 0) {
3425 		/*
3426 		 * Allocate a new mbuf; could check space
3427 		 * and allocate a cluster instead.
3428 		 */
3429 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3430 		if (n == NULL)
3431 			return NULL;
3432 		n->m_len = min(MJUMPAGESIZE, remainder);
3433 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3434 		cp += n->m_len;
3435 		remainder -= n->m_len;
3436 		m->m_next = n;
3437 		m = n;
3438 	}
3439 
3440 	return m;
3441 }
3442 
3443 #if defined(INET) || defined(INET6)
3444 static __inline int
3445 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3446 {
3447 	if (hn_lro_mbufq_depth) {
3448 		tcp_lro_queue_mbuf(lc, m);
3449 		return 0;
3450 	}
3451 	return tcp_lro_rx(lc, m, 0);
3452 }
3453 #endif
3454 
3455 static int
3456 hn_rxpkt(struct hn_rx_ring *rxr)
3457 {
3458 	if_t ifp, hn_ifp = rxr->hn_ifp;
3459 	struct mbuf *m_new, *n;
3460 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3461 	int hash_type = M_HASHTYPE_NONE;
3462 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3463 	int i;
3464 
3465 	ifp = hn_ifp;
3466 	if (rxr->hn_rxvf_ifp != NULL) {
3467 		/*
3468 		 * Non-transparent mode VF; pretend this packet is from
3469 		 * the VF.
3470 		 */
3471 		ifp = rxr->hn_rxvf_ifp;
3472 		is_vf = 1;
3473 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3474 		/* Transparent mode VF. */
3475 		is_vf = 1;
3476 	}
3477 
3478 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3479 		/*
3480 		 * NOTE:
3481 		 * See the NOTE of hn_rndis_init_fixat().  This
3482 		 * function can be reached, immediately after the
3483 		 * RNDIS is initialized but before the ifnet is
3484 		 * setup on the hn_attach() path; drop the unexpected
3485 		 * packets.
3486 		 */
3487 		return (0);
3488 	}
3489 
3490 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3491 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3492 		return (0);
3493 	}
3494 
3495 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3496 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3497 		if (m_new == NULL) {
3498 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3499 			return (0);
3500 		}
3501 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3502 		    rxr->rsc.frag_len[0]);
3503 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3504 	} else {
3505 		/*
3506 		 * Get an mbuf with a cluster.  For packets 2K or less,
3507 		 * get a standard 2K cluster.  For anything larger, get a
3508 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3509 		 * if looped around to the Hyper-V TX channel, so avoid them.
3510 		 */
3511 		size = MCLBYTES;
3512 		if (rxr->rsc.pktlen > MCLBYTES) {
3513 			/* 4096 */
3514 			size = MJUMPAGESIZE;
3515 		}
3516 
3517 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3518 		if (m_new == NULL) {
3519 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3520 			return (0);
3521 		}
3522 
3523 		n = m_new;
3524 		for (i = 0; i < rxr->rsc.cnt; i++) {
3525 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3526 			    rxr->rsc.frag_data[i]);
3527 			if (n == NULL) {
3528 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3529 				return (0);
3530 			} else {
3531 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3532 			}
3533 		}
3534 	}
3535 	if (rxr->rsc.pktlen <= MHLEN)
3536 		rxr->hn_small_pkts++;
3537 
3538 	m_new->m_pkthdr.rcvif = ifp;
3539 
3540 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3541 		do_csum = 0;
3542 
3543 	/* receive side checksum offload */
3544 	if (rxr->rsc.csum_info != NULL) {
3545 		/* IP csum offload */
3546 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3547 			m_new->m_pkthdr.csum_flags |=
3548 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3549 			rxr->hn_csum_ip++;
3550 		}
3551 
3552 		/* TCP/UDP csum offload */
3553 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3554 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3555 			m_new->m_pkthdr.csum_flags |=
3556 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3557 			m_new->m_pkthdr.csum_data = 0xffff;
3558 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3559 				rxr->hn_csum_tcp++;
3560 			else
3561 				rxr->hn_csum_udp++;
3562 		}
3563 
3564 		/*
3565 		 * XXX
3566 		 * As of this write (Oct 28th, 2016), host side will turn
3567 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3568 		 * the do_lro setting here is actually _not_ accurate.  We
3569 		 * depend on the RSS hash type check to reset do_lro.
3570 		 */
3571 		if ((*(rxr->rsc.csum_info) &
3572 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3573 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3574 			do_lro = 1;
3575 	} else {
3576 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3577 		if (l3proto == ETHERTYPE_IP) {
3578 			if (l4proto == IPPROTO_TCP) {
3579 				if (do_csum &&
3580 				    (rxr->hn_trust_hcsum &
3581 				     HN_TRUST_HCSUM_TCP)) {
3582 					rxr->hn_csum_trusted++;
3583 					m_new->m_pkthdr.csum_flags |=
3584 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3585 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3586 					m_new->m_pkthdr.csum_data = 0xffff;
3587 				}
3588 				do_lro = 1;
3589 			} else if (l4proto == IPPROTO_UDP) {
3590 				if (do_csum &&
3591 				    (rxr->hn_trust_hcsum &
3592 				     HN_TRUST_HCSUM_UDP)) {
3593 					rxr->hn_csum_trusted++;
3594 					m_new->m_pkthdr.csum_flags |=
3595 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3596 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3597 					m_new->m_pkthdr.csum_data = 0xffff;
3598 				}
3599 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3600 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3601 				rxr->hn_csum_trusted++;
3602 				m_new->m_pkthdr.csum_flags |=
3603 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3604 			}
3605 		}
3606 	}
3607 
3608 	if (rxr->rsc.vlan_info != NULL) {
3609 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3610 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3611 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3612 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3613 		m_new->m_flags |= M_VLANTAG;
3614 	}
3615 
3616 	/*
3617 	 * If VF is activated (tranparent/non-transparent mode does not
3618 	 * matter here).
3619 	 *
3620 	 * - Disable LRO
3621 	 *
3622 	 *   hn(4) will only receive broadcast packets, multicast packets,
3623 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3624 	 *   packet types.
3625 	 *
3626 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3627 	 *   all, since the LRO flush will use hn(4) as the receiving
3628 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3629 	 */
3630 	if (is_vf)
3631 		do_lro = 0;
3632 
3633 	/*
3634 	 * If VF is activated (tranparent/non-transparent mode does not
3635 	 * matter here), do _not_ mess with unsupported hash types or
3636 	 * functions.
3637 	 */
3638 	if (rxr->rsc.hash_info != NULL) {
3639 		rxr->hn_rss_pkts++;
3640 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3641 		if (!is_vf)
3642 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3643 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3644 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3645 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3646 			    rxr->hn_mbuf_hash);
3647 
3648 			/*
3649 			 * NOTE:
3650 			 * do_lro is resetted, if the hash types are not TCP
3651 			 * related.  See the comment in the above csum_flags
3652 			 * setup section.
3653 			 */
3654 			switch (type) {
3655 			case NDIS_HASH_IPV4:
3656 				hash_type = M_HASHTYPE_RSS_IPV4;
3657 				do_lro = 0;
3658 				break;
3659 
3660 			case NDIS_HASH_TCP_IPV4:
3661 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3662 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3663 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3664 
3665 					if (is_vf)
3666 						def_htype = M_HASHTYPE_NONE;
3667 
3668 					/*
3669 					 * UDP 4-tuple hash is delivered as
3670 					 * TCP 4-tuple hash.
3671 					 */
3672 					if (l3proto == ETHERTYPE_MAX) {
3673 						hn_rxpkt_proto(m_new,
3674 						    &l3proto, &l4proto);
3675 					}
3676 					if (l3proto == ETHERTYPE_IP) {
3677 						if (l4proto == IPPROTO_UDP &&
3678 						    (rxr->hn_mbuf_hash &
3679 						     NDIS_HASH_UDP_IPV4_X)) {
3680 							hash_type =
3681 							M_HASHTYPE_RSS_UDP_IPV4;
3682 							do_lro = 0;
3683 						} else if (l4proto !=
3684 						    IPPROTO_TCP) {
3685 							hash_type = def_htype;
3686 							do_lro = 0;
3687 						}
3688 					} else {
3689 						hash_type = def_htype;
3690 						do_lro = 0;
3691 					}
3692 				}
3693 				break;
3694 
3695 			case NDIS_HASH_IPV6:
3696 				hash_type = M_HASHTYPE_RSS_IPV6;
3697 				do_lro = 0;
3698 				break;
3699 
3700 			case NDIS_HASH_IPV6_EX:
3701 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3702 				do_lro = 0;
3703 				break;
3704 
3705 			case NDIS_HASH_TCP_IPV6:
3706 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3707 				break;
3708 
3709 			case NDIS_HASH_TCP_IPV6_EX:
3710 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3711 				break;
3712 			}
3713 		}
3714 	} else if (!is_vf) {
3715 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3716 		hash_type = M_HASHTYPE_OPAQUE;
3717 	}
3718 	M_HASHTYPE_SET(m_new, hash_type);
3719 
3720 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3721 	if (hn_ifp != ifp) {
3722 		const struct ether_header *eh;
3723 
3724 		/*
3725 		 * Non-transparent mode VF is activated.
3726 		 */
3727 
3728 		/*
3729 		 * Allow tapping on hn(4).
3730 		 */
3731 		ETHER_BPF_MTAP(hn_ifp, m_new);
3732 
3733 		/*
3734 		 * Update hn(4)'s stats.
3735 		 */
3736 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3737 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3738 		/* Checked at the beginning of this function. */
3739 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3740 		eh = mtod(m_new, struct ether_header *);
3741 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3742 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3743 	}
3744 	rxr->hn_pkts++;
3745 
3746 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3747 #if defined(INET) || defined(INET6)
3748 		struct lro_ctrl *lro = &rxr->hn_lro;
3749 
3750 		if (lro->lro_cnt) {
3751 			rxr->hn_lro_tried++;
3752 			if (hn_lro_rx(lro, m_new) == 0) {
3753 				/* DONE! */
3754 				return 0;
3755 			}
3756 		}
3757 #endif
3758 	}
3759 	if_input(ifp, m_new);
3760 
3761 	return (0);
3762 }
3763 
3764 static int
3765 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3766 {
3767 	struct hn_softc *sc = if_getsoftc(ifp);
3768 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3769 	if_t vf_ifp;
3770 	int mask, error = 0;
3771 	struct ifrsskey *ifrk;
3772 	struct ifrsshash *ifrh;
3773 	uint32_t mtu;
3774 
3775 	switch (cmd) {
3776 	case SIOCSIFMTU:
3777 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3778 			error = EINVAL;
3779 			break;
3780 		}
3781 
3782 		HN_LOCK(sc);
3783 
3784 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3785 			HN_UNLOCK(sc);
3786 			break;
3787 		}
3788 
3789 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3790 			/* Can't change MTU */
3791 			HN_UNLOCK(sc);
3792 			error = EOPNOTSUPP;
3793 			break;
3794 		}
3795 
3796 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3797 			HN_UNLOCK(sc);
3798 			break;
3799 		}
3800 
3801 		if (hn_xpnt_vf_isready(sc)) {
3802 			vf_ifp = sc->hn_vf_ifp;
3803 			ifr_vf = *ifr;
3804 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3805 			    sizeof(ifr_vf.ifr_name));
3806 			error = ifhwioctl(SIOCSIFMTU,vf_ifp,
3807 			    (caddr_t)&ifr_vf, curthread);
3808 			if (error) {
3809 				HN_UNLOCK(sc);
3810 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3811 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3812 				break;
3813 			}
3814 		}
3815 
3816 		/*
3817 		 * Suspend this interface before the synthetic parts
3818 		 * are ripped.
3819 		 */
3820 		hn_suspend(sc);
3821 
3822 		/*
3823 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3824 		 */
3825 		hn_synth_detach(sc);
3826 
3827 		/*
3828 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3829 		 * with the new MTU setting.
3830 		 */
3831 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3832 		if (error) {
3833 			HN_UNLOCK(sc);
3834 			break;
3835 		}
3836 
3837 		error = hn_rndis_get_mtu(sc, &mtu);
3838 		if (error)
3839 			mtu = ifr->ifr_mtu;
3840 		else if (bootverbose)
3841 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3842 
3843 		/*
3844 		 * Commit the requested MTU, after the synthetic parts
3845 		 * have been successfully attached.
3846 		 */
3847 		if (mtu >= ifr->ifr_mtu) {
3848 			mtu = ifr->ifr_mtu;
3849 		} else {
3850 			if_printf(ifp, "fixup mtu %d -> %u\n",
3851 			    ifr->ifr_mtu, mtu);
3852 		}
3853 		if_setmtu(ifp, mtu);
3854 
3855 		/*
3856 		 * Synthetic parts' reattach may change the chimney
3857 		 * sending size; update it.
3858 		 */
3859 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3860 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3861 
3862 		/*
3863 		 * Make sure that various parameters based on MTU are
3864 		 * still valid, after the MTU change.
3865 		 */
3866 		hn_mtu_change_fixup(sc);
3867 
3868 		/*
3869 		 * All done!  Resume the interface now.
3870 		 */
3871 		hn_resume(sc);
3872 
3873 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3874 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3875 			/*
3876 			 * Since we have reattached the NVS part,
3877 			 * change the datapath to VF again; in case
3878 			 * that it is lost, after the NVS was detached.
3879 			 */
3880 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3881 		}
3882 
3883 		HN_UNLOCK(sc);
3884 		break;
3885 
3886 	case SIOCSIFFLAGS:
3887 		HN_LOCK(sc);
3888 
3889 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3890 			HN_UNLOCK(sc);
3891 			break;
3892 		}
3893 
3894 		if (hn_xpnt_vf_isready(sc))
3895 			hn_xpnt_vf_saveifflags(sc);
3896 
3897 		if (if_getflags(ifp) & IFF_UP) {
3898 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3899 				/*
3900 				 * Caller meight hold mutex, e.g.
3901 				 * bpf; use busy-wait for the RNDIS
3902 				 * reply.
3903 				 */
3904 				HN_NO_SLEEPING(sc);
3905 				hn_rxfilter_config(sc);
3906 				HN_SLEEPING_OK(sc);
3907 
3908 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3909 					error = hn_xpnt_vf_iocsetflags(sc);
3910 			} else {
3911 				hn_init_locked(sc);
3912 			}
3913 		} else {
3914 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3915 				hn_stop(sc, false);
3916 		}
3917 		sc->hn_if_flags = if_getflags(ifp);
3918 
3919 		HN_UNLOCK(sc);
3920 		break;
3921 
3922 	case SIOCSIFCAP:
3923 		HN_LOCK(sc);
3924 
3925 		if (hn_xpnt_vf_isready(sc)) {
3926 			ifr_vf = *ifr;
3927 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3928 			    sizeof(ifr_vf.ifr_name));
3929 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3930 			HN_UNLOCK(sc);
3931 			break;
3932 		}
3933 
3934 		/*
3935 		 * Fix up requested capabilities w/ supported capabilities,
3936 		 * since the supported capabilities could have been changed.
3937 		 */
3938 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3939 		    if_getcapenable(ifp);
3940 
3941 		if (mask & IFCAP_TXCSUM) {
3942 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3943 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3944 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3945 			else
3946 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3947 		}
3948 		if (mask & IFCAP_TXCSUM_IPV6) {
3949 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3950 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3951 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3952 			else
3953 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3954 		}
3955 
3956 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3957 		if (mask & IFCAP_RXCSUM)
3958 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3959 #ifdef foo
3960 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3961 		if (mask & IFCAP_RXCSUM_IPV6)
3962 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3963 #endif
3964 
3965 		if (mask & IFCAP_LRO)
3966 			if_togglecapenable(ifp, IFCAP_LRO);
3967 
3968 		if (mask & IFCAP_TSO4) {
3969 			if_togglecapenable(ifp, IFCAP_TSO4);
3970 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3971 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3972 			else
3973 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3974 		}
3975 		if (mask & IFCAP_TSO6) {
3976 			if_togglecapenable(ifp, IFCAP_TSO6);
3977 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3978 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3979 			else
3980 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3981 		}
3982 
3983 		HN_UNLOCK(sc);
3984 		break;
3985 
3986 	case SIOCADDMULTI:
3987 	case SIOCDELMULTI:
3988 		HN_LOCK(sc);
3989 
3990 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3991 			HN_UNLOCK(sc);
3992 			break;
3993 		}
3994 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3995 			/*
3996 			 * Multicast uses mutex; use busy-wait for
3997 			 * the RNDIS reply.
3998 			 */
3999 			HN_NO_SLEEPING(sc);
4000 			hn_rxfilter_config(sc);
4001 			HN_SLEEPING_OK(sc);
4002 		}
4003 
4004 		/* XXX vlan(4) style mcast addr maintenance */
4005 		if (hn_xpnt_vf_isready(sc)) {
4006 			int old_if_flags;
4007 
4008 			old_if_flags = if_getflags(sc->hn_vf_ifp);
4009 			hn_xpnt_vf_saveifflags(sc);
4010 
4011 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
4012 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
4013 			     IFF_ALLMULTI))
4014 				error = hn_xpnt_vf_iocsetflags(sc);
4015 		}
4016 
4017 		HN_UNLOCK(sc);
4018 		break;
4019 
4020 	case SIOCSIFMEDIA:
4021 	case SIOCGIFMEDIA:
4022 		HN_LOCK(sc);
4023 		if (hn_xpnt_vf_isready(sc)) {
4024 			/*
4025 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4026 			 * create and pass ifr_vf to the VF here; just
4027 			 * replace the ifr_name.
4028 			 */
4029 			vf_ifp = sc->hn_vf_ifp;
4030 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
4031 			    sizeof(ifr->ifr_name));
4032 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
4033 			/* Restore the ifr_name. */
4034 			strlcpy(ifr->ifr_name, if_name(ifp),
4035 			    sizeof(ifr->ifr_name));
4036 			HN_UNLOCK(sc);
4037 			break;
4038 		}
4039 		HN_UNLOCK(sc);
4040 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4041 		break;
4042 
4043 	case SIOCGIFRSSHASH:
4044 		ifrh = (struct ifrsshash *)data;
4045 		HN_LOCK(sc);
4046 		if (sc->hn_rx_ring_inuse == 1) {
4047 			HN_UNLOCK(sc);
4048 			ifrh->ifrh_func = RSS_FUNC_NONE;
4049 			ifrh->ifrh_types = 0;
4050 			break;
4051 		}
4052 
4053 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4054 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4055 		else
4056 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4057 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4058 		HN_UNLOCK(sc);
4059 		break;
4060 
4061 	case SIOCGIFRSSKEY:
4062 		ifrk = (struct ifrsskey *)data;
4063 		HN_LOCK(sc);
4064 		if (sc->hn_rx_ring_inuse == 1) {
4065 			HN_UNLOCK(sc);
4066 			ifrk->ifrk_func = RSS_FUNC_NONE;
4067 			ifrk->ifrk_keylen = 0;
4068 			break;
4069 		}
4070 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4071 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4072 		else
4073 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4074 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4075 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4076 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4077 		HN_UNLOCK(sc);
4078 		break;
4079 
4080 	default:
4081 		error = ether_ioctl(ifp, cmd, data);
4082 		break;
4083 	}
4084 	return (error);
4085 }
4086 
4087 static void
4088 hn_stop(struct hn_softc *sc, bool detaching)
4089 {
4090 	if_t ifp = sc->hn_ifp;
4091 	int i;
4092 
4093 	HN_LOCK_ASSERT(sc);
4094 
4095 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4096 	    ("synthetic parts were not attached"));
4097 
4098 	/* Clear RUNNING bit ASAP. */
4099 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4100 
4101 	/* Disable polling. */
4102 	hn_polling(sc, 0);
4103 
4104 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4105 		KASSERT(sc->hn_vf_ifp != NULL,
4106 		    ("%s: VF is not attached", if_name(ifp)));
4107 
4108 		/* Mark transparent mode VF as disabled. */
4109 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4110 
4111 		/*
4112 		 * NOTE:
4113 		 * Datapath setting must happen _before_ bringing
4114 		 * the VF down.
4115 		 */
4116 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4117 
4118 		/*
4119 		 * Bring the VF down.
4120 		 */
4121 		hn_xpnt_vf_saveifflags(sc);
4122 		if_setflagbits(ifp, 0, IFF_UP);
4123 		hn_xpnt_vf_iocsetflags(sc);
4124 	}
4125 
4126 	/* Suspend data transfers. */
4127 	hn_suspend_data(sc);
4128 
4129 	/* Clear OACTIVE bit. */
4130 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4131 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4132 		sc->hn_tx_ring[i].hn_oactive = 0;
4133 
4134 	/*
4135 	 * If the non-transparent mode VF is active, make sure
4136 	 * that the RX filter still allows packet reception.
4137 	 */
4138 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4139 		hn_rxfilter_config(sc);
4140 }
4141 
4142 static void
4143 hn_init_locked(struct hn_softc *sc)
4144 {
4145 	if_t ifp = sc->hn_ifp;
4146 	int i;
4147 
4148 	HN_LOCK_ASSERT(sc);
4149 
4150 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4151 		return;
4152 
4153 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4154 		return;
4155 
4156 	/* Configure RX filter */
4157 	hn_rxfilter_config(sc);
4158 
4159 	/* Clear OACTIVE bit. */
4160 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4161 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4162 		sc->hn_tx_ring[i].hn_oactive = 0;
4163 
4164 	/* Clear TX 'suspended' bit. */
4165 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4166 
4167 	if (hn_xpnt_vf_isready(sc)) {
4168 		/* Initialize transparent VF. */
4169 		hn_xpnt_vf_init(sc);
4170 	}
4171 
4172 	/* Everything is ready; unleash! */
4173 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4174 
4175 	/* Re-enable polling if requested. */
4176 	if (sc->hn_pollhz > 0)
4177 		hn_polling(sc, sc->hn_pollhz);
4178 }
4179 
4180 static void
4181 hn_init(void *xsc)
4182 {
4183 	struct hn_softc *sc = xsc;
4184 
4185 	HN_LOCK(sc);
4186 	hn_init_locked(sc);
4187 	HN_UNLOCK(sc);
4188 }
4189 
4190 static int
4191 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4192 {
4193 	struct hn_softc *sc = arg1;
4194 	unsigned int lenlim;
4195 	int error;
4196 
4197 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4198 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4199 	if (error || req->newptr == NULL)
4200 		return error;
4201 
4202 	HN_LOCK(sc);
4203 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4204 	    lenlim > TCP_LRO_LENGTH_MAX) {
4205 		HN_UNLOCK(sc);
4206 		return EINVAL;
4207 	}
4208 	hn_set_lro_lenlim(sc, lenlim);
4209 	HN_UNLOCK(sc);
4210 
4211 	return 0;
4212 }
4213 
4214 static int
4215 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4216 {
4217 	struct hn_softc *sc = arg1;
4218 	int ackcnt, error, i;
4219 
4220 	/*
4221 	 * lro_ackcnt_lim is append count limit,
4222 	 * +1 to turn it into aggregation limit.
4223 	 */
4224 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4225 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4226 	if (error || req->newptr == NULL)
4227 		return error;
4228 
4229 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4230 		return EINVAL;
4231 
4232 	/*
4233 	 * Convert aggregation limit back to append
4234 	 * count limit.
4235 	 */
4236 	--ackcnt;
4237 	HN_LOCK(sc);
4238 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4239 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4240 	HN_UNLOCK(sc);
4241 	return 0;
4242 }
4243 
4244 static int
4245 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4246 {
4247 	struct hn_softc *sc = arg1;
4248 	int hcsum = arg2;
4249 	int on, error, i;
4250 
4251 	on = 0;
4252 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4253 		on = 1;
4254 
4255 	error = sysctl_handle_int(oidp, &on, 0, req);
4256 	if (error || req->newptr == NULL)
4257 		return error;
4258 
4259 	HN_LOCK(sc);
4260 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4261 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4262 
4263 		if (on)
4264 			rxr->hn_trust_hcsum |= hcsum;
4265 		else
4266 			rxr->hn_trust_hcsum &= ~hcsum;
4267 	}
4268 	HN_UNLOCK(sc);
4269 	return 0;
4270 }
4271 
4272 static int
4273 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4274 {
4275 	struct hn_softc *sc = arg1;
4276 	int chim_size, error;
4277 
4278 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4279 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4280 	if (error || req->newptr == NULL)
4281 		return error;
4282 
4283 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4284 		return EINVAL;
4285 
4286 	HN_LOCK(sc);
4287 	hn_set_chim_size(sc, chim_size);
4288 	HN_UNLOCK(sc);
4289 	return 0;
4290 }
4291 
4292 static int
4293 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4294 {
4295 	struct hn_softc *sc = arg1;
4296 	int ofs = arg2, i, error;
4297 	struct hn_rx_ring *rxr;
4298 	uint64_t stat;
4299 
4300 	stat = 0;
4301 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4302 		rxr = &sc->hn_rx_ring[i];
4303 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4304 	}
4305 
4306 	error = sysctl_handle_64(oidp, &stat, 0, req);
4307 	if (error || req->newptr == NULL)
4308 		return error;
4309 
4310 	/* Zero out this stat. */
4311 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4312 		rxr = &sc->hn_rx_ring[i];
4313 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4314 	}
4315 	return 0;
4316 }
4317 
4318 static int
4319 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4320 {
4321 	struct hn_softc *sc = arg1;
4322 	int ofs = arg2, i, error;
4323 	struct hn_rx_ring *rxr;
4324 	u_long stat;
4325 
4326 	stat = 0;
4327 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4328 		rxr = &sc->hn_rx_ring[i];
4329 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4330 	}
4331 
4332 	error = sysctl_handle_long(oidp, &stat, 0, req);
4333 	if (error || req->newptr == NULL)
4334 		return error;
4335 
4336 	/* Zero out this stat. */
4337 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4338 		rxr = &sc->hn_rx_ring[i];
4339 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4340 	}
4341 	return 0;
4342 }
4343 
4344 static int
4345 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4346 {
4347 	struct hn_softc *sc = arg1;
4348 	int ofs = arg2, i, error;
4349 	struct hn_tx_ring *txr;
4350 	u_long stat;
4351 
4352 	stat = 0;
4353 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4354 		txr = &sc->hn_tx_ring[i];
4355 		stat += *((u_long *)((uint8_t *)txr + ofs));
4356 	}
4357 
4358 	error = sysctl_handle_long(oidp, &stat, 0, req);
4359 	if (error || req->newptr == NULL)
4360 		return error;
4361 
4362 	/* Zero out this stat. */
4363 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4364 		txr = &sc->hn_tx_ring[i];
4365 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4366 	}
4367 	return 0;
4368 }
4369 
4370 static int
4371 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4372 {
4373 	struct hn_softc *sc = arg1;
4374 	int ofs = arg2, i, error, conf;
4375 	struct hn_tx_ring *txr;
4376 
4377 	txr = &sc->hn_tx_ring[0];
4378 	conf = *((int *)((uint8_t *)txr + ofs));
4379 
4380 	error = sysctl_handle_int(oidp, &conf, 0, req);
4381 	if (error || req->newptr == NULL)
4382 		return error;
4383 
4384 	HN_LOCK(sc);
4385 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4386 		txr = &sc->hn_tx_ring[i];
4387 		*((int *)((uint8_t *)txr + ofs)) = conf;
4388 	}
4389 	HN_UNLOCK(sc);
4390 
4391 	return 0;
4392 }
4393 
4394 static int
4395 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4396 {
4397 	struct hn_softc *sc = arg1;
4398 	int error, size;
4399 
4400 	size = sc->hn_agg_size;
4401 	error = sysctl_handle_int(oidp, &size, 0, req);
4402 	if (error || req->newptr == NULL)
4403 		return (error);
4404 
4405 	HN_LOCK(sc);
4406 	sc->hn_agg_size = size;
4407 	hn_set_txagg(sc);
4408 	HN_UNLOCK(sc);
4409 
4410 	return (0);
4411 }
4412 
4413 static int
4414 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4415 {
4416 	struct hn_softc *sc = arg1;
4417 	int error, pkts;
4418 
4419 	pkts = sc->hn_agg_pkts;
4420 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4421 	if (error || req->newptr == NULL)
4422 		return (error);
4423 
4424 	HN_LOCK(sc);
4425 	sc->hn_agg_pkts = pkts;
4426 	hn_set_txagg(sc);
4427 	HN_UNLOCK(sc);
4428 
4429 	return (0);
4430 }
4431 
4432 static int
4433 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4434 {
4435 	struct hn_softc *sc = arg1;
4436 	int pkts;
4437 
4438 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4439 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4440 }
4441 
4442 static int
4443 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4444 {
4445 	struct hn_softc *sc = arg1;
4446 	int align;
4447 
4448 	align = sc->hn_tx_ring[0].hn_agg_align;
4449 	return (sysctl_handle_int(oidp, &align, 0, req));
4450 }
4451 
4452 static void
4453 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4454 {
4455 	if (pollhz == 0)
4456 		vmbus_chan_poll_disable(chan);
4457 	else
4458 		vmbus_chan_poll_enable(chan, pollhz);
4459 }
4460 
4461 static void
4462 hn_polling(struct hn_softc *sc, u_int pollhz)
4463 {
4464 	int nsubch = sc->hn_rx_ring_inuse - 1;
4465 
4466 	HN_LOCK_ASSERT(sc);
4467 
4468 	if (nsubch > 0) {
4469 		struct vmbus_channel **subch;
4470 		int i;
4471 
4472 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4473 		for (i = 0; i < nsubch; ++i)
4474 			hn_chan_polling(subch[i], pollhz);
4475 		vmbus_subchan_rel(subch, nsubch);
4476 	}
4477 	hn_chan_polling(sc->hn_prichan, pollhz);
4478 }
4479 
4480 static int
4481 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4482 {
4483 	struct hn_softc *sc = arg1;
4484 	int pollhz, error;
4485 
4486 	pollhz = sc->hn_pollhz;
4487 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4488 	if (error || req->newptr == NULL)
4489 		return (error);
4490 
4491 	if (pollhz != 0 &&
4492 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4493 		return (EINVAL);
4494 
4495 	HN_LOCK(sc);
4496 	if (sc->hn_pollhz != pollhz) {
4497 		sc->hn_pollhz = pollhz;
4498 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4499 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4500 			hn_polling(sc, sc->hn_pollhz);
4501 	}
4502 	HN_UNLOCK(sc);
4503 
4504 	return (0);
4505 }
4506 
4507 static int
4508 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4509 {
4510 	struct hn_softc *sc = arg1;
4511 	char verstr[16];
4512 
4513 	snprintf(verstr, sizeof(verstr), "%u.%u",
4514 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4515 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4516 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4517 }
4518 
4519 static int
4520 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4521 {
4522 	struct hn_softc *sc = arg1;
4523 	char caps_str[128];
4524 	uint32_t caps;
4525 
4526 	HN_LOCK(sc);
4527 	caps = sc->hn_caps;
4528 	HN_UNLOCK(sc);
4529 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4530 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4531 }
4532 
4533 static int
4534 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4535 {
4536 	struct hn_softc *sc = arg1;
4537 	char assist_str[128];
4538 	uint32_t hwassist;
4539 
4540 	HN_LOCK(sc);
4541 	hwassist = if_gethwassist(sc->hn_ifp);
4542 	HN_UNLOCK(sc);
4543 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4544 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4545 }
4546 
4547 static int
4548 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4549 {
4550 	struct hn_softc *sc = arg1;
4551 	char filter_str[128];
4552 	uint32_t filter;
4553 
4554 	HN_LOCK(sc);
4555 	filter = sc->hn_rx_filter;
4556 	HN_UNLOCK(sc);
4557 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4558 	    NDIS_PACKET_TYPES);
4559 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4560 }
4561 
4562 static int
4563 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4564 {
4565 	struct hn_softc *sc = arg1;
4566 	uint32_t mtu;
4567 	int error;
4568 	HN_LOCK(sc);
4569 	error = hn_rndis_get_mtu(sc, &mtu);
4570 	if (error) {
4571 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4572 		goto back;
4573 	}
4574 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4575 	if (error || req->newptr == NULL)
4576 		goto back;
4577 
4578 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4579 	if (error)
4580 		goto back;
4581 	error = hn_rndis_reconf_offload(sc, mtu);
4582 back:
4583 	HN_UNLOCK(sc);
4584 	return (error);
4585 }
4586 #ifndef RSS
4587 
4588 static int
4589 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4590 {
4591 	struct hn_softc *sc = arg1;
4592 	int error;
4593 
4594 	HN_LOCK(sc);
4595 
4596 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4597 	if (error || req->newptr == NULL)
4598 		goto back;
4599 
4600 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4601 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4602 		/*
4603 		 * RSS key is synchronized w/ VF's, don't allow users
4604 		 * to change it.
4605 		 */
4606 		error = EBUSY;
4607 		goto back;
4608 	}
4609 
4610 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4611 	if (error)
4612 		goto back;
4613 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4614 
4615 	if (sc->hn_rx_ring_inuse > 1) {
4616 		error = hn_rss_reconfig(sc);
4617 	} else {
4618 		/* Not RSS capable, at least for now; just save the RSS key. */
4619 		error = 0;
4620 	}
4621 back:
4622 	HN_UNLOCK(sc);
4623 	return (error);
4624 }
4625 
4626 static int
4627 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4628 {
4629 	struct hn_softc *sc = arg1;
4630 	int error;
4631 
4632 	HN_LOCK(sc);
4633 
4634 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4635 	if (error || req->newptr == NULL)
4636 		goto back;
4637 
4638 	/*
4639 	 * Don't allow RSS indirect table change, if this interface is not
4640 	 * RSS capable currently.
4641 	 */
4642 	if (sc->hn_rx_ring_inuse == 1) {
4643 		error = EOPNOTSUPP;
4644 		goto back;
4645 	}
4646 
4647 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4648 	if (error)
4649 		goto back;
4650 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4651 
4652 	hn_rss_ind_fixup(sc);
4653 	error = hn_rss_reconfig(sc);
4654 back:
4655 	HN_UNLOCK(sc);
4656 	return (error);
4657 }
4658 
4659 #endif	/* !RSS */
4660 
4661 static int
4662 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4663 {
4664 	struct hn_softc *sc = arg1;
4665 	char hash_str[128];
4666 	uint32_t hash;
4667 
4668 	HN_LOCK(sc);
4669 	hash = sc->hn_rss_hash;
4670 	HN_UNLOCK(sc);
4671 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4672 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4673 }
4674 
4675 static int
4676 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4677 {
4678 	struct hn_softc *sc = arg1;
4679 	char hash_str[128];
4680 	uint32_t hash;
4681 
4682 	HN_LOCK(sc);
4683 	hash = sc->hn_rss_hcap;
4684 	HN_UNLOCK(sc);
4685 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4686 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4687 }
4688 
4689 static int
4690 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4691 {
4692 	struct hn_softc *sc = arg1;
4693 	char hash_str[128];
4694 	uint32_t hash;
4695 
4696 	HN_LOCK(sc);
4697 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4698 	HN_UNLOCK(sc);
4699 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4700 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4701 }
4702 
4703 static int
4704 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4705 {
4706 	struct hn_softc *sc = arg1;
4707 	char vf_name[IFNAMSIZ + 1];
4708 	if_t vf_ifp;
4709 
4710 	HN_LOCK(sc);
4711 	vf_name[0] = '\0';
4712 	vf_ifp = sc->hn_vf_ifp;
4713 	if (vf_ifp != NULL)
4714 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4715 	HN_UNLOCK(sc);
4716 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4717 }
4718 
4719 static int
4720 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4721 {
4722 	struct hn_softc *sc = arg1;
4723 	char vf_name[IFNAMSIZ + 1];
4724 	if_t vf_ifp;
4725 
4726 	HN_LOCK(sc);
4727 	vf_name[0] = '\0';
4728 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4729 	if (vf_ifp != NULL)
4730 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4731 	HN_UNLOCK(sc);
4732 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4733 }
4734 
4735 static int
4736 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4737 {
4738 	struct rm_priotracker pt;
4739 	struct sbuf *sb;
4740 	int error, i;
4741 	bool first;
4742 
4743 	error = sysctl_wire_old_buffer(req, 0);
4744 	if (error != 0)
4745 		return (error);
4746 
4747 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4748 	if (sb == NULL)
4749 		return (ENOMEM);
4750 
4751 	rm_rlock(&hn_vfmap_lock, &pt);
4752 
4753 	first = true;
4754 	for (i = 0; i < hn_vfmap_size; ++i) {
4755 		struct epoch_tracker et;
4756 		if_t ifp;
4757 
4758 		if (hn_vfmap[i] == NULL)
4759 			continue;
4760 
4761 		NET_EPOCH_ENTER(et);
4762 		ifp = ifnet_byindex(i);
4763 		if (ifp != NULL) {
4764 			if (first)
4765 				sbuf_printf(sb, "%s", if_name(ifp));
4766 			else
4767 				sbuf_printf(sb, " %s", if_name(ifp));
4768 			first = false;
4769 		}
4770 		NET_EPOCH_EXIT(et);
4771 	}
4772 
4773 	rm_runlock(&hn_vfmap_lock, &pt);
4774 
4775 	error = sbuf_finish(sb);
4776 	sbuf_delete(sb);
4777 	return (error);
4778 }
4779 
4780 static int
4781 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4782 {
4783 	struct rm_priotracker pt;
4784 	struct sbuf *sb;
4785 	int error, i;
4786 	bool first;
4787 
4788 	error = sysctl_wire_old_buffer(req, 0);
4789 	if (error != 0)
4790 		return (error);
4791 
4792 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4793 	if (sb == NULL)
4794 		return (ENOMEM);
4795 
4796 	rm_rlock(&hn_vfmap_lock, &pt);
4797 
4798 	first = true;
4799 	for (i = 0; i < hn_vfmap_size; ++i) {
4800 		struct epoch_tracker et;
4801 		if_t ifp, hn_ifp;
4802 
4803 		hn_ifp = hn_vfmap[i];
4804 		if (hn_ifp == NULL)
4805 			continue;
4806 
4807 		NET_EPOCH_ENTER(et);
4808 		ifp = ifnet_byindex(i);
4809 		if (ifp != NULL) {
4810 			if (first) {
4811 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4812 				    if_name(hn_ifp));
4813 			} else {
4814 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4815 				    if_name(hn_ifp));
4816 			}
4817 			first = false;
4818 		}
4819 		NET_EPOCH_EXIT(et);
4820 	}
4821 
4822 	rm_runlock(&hn_vfmap_lock, &pt);
4823 
4824 	error = sbuf_finish(sb);
4825 	sbuf_delete(sb);
4826 	return (error);
4827 }
4828 
4829 static int
4830 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4831 {
4832 	struct hn_softc *sc = arg1;
4833 	int error, onoff = 0;
4834 
4835 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4836 		onoff = 1;
4837 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4838 	if (error || req->newptr == NULL)
4839 		return (error);
4840 
4841 	HN_LOCK(sc);
4842 	/* NOTE: hn_vf_lock for hn_transmit() */
4843 	rm_wlock(&sc->hn_vf_lock);
4844 	if (onoff)
4845 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4846 	else
4847 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4848 	rm_wunlock(&sc->hn_vf_lock);
4849 	HN_UNLOCK(sc);
4850 
4851 	return (0);
4852 }
4853 
4854 static int
4855 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4856 {
4857 	struct hn_softc *sc = arg1;
4858 	int enabled = 0;
4859 
4860 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4861 		enabled = 1;
4862 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4863 }
4864 
4865 static int
4866 hn_check_iplen(const struct mbuf *m, int hoff)
4867 {
4868 	const struct ip *ip;
4869 	int len, iphlen, iplen;
4870 	const struct tcphdr *th;
4871 	int thoff;				/* TCP data offset */
4872 
4873 	len = hoff + sizeof(struct ip);
4874 
4875 	/* The packet must be at least the size of an IP header. */
4876 	if (m->m_pkthdr.len < len)
4877 		return IPPROTO_DONE;
4878 
4879 	/* The fixed IP header must reside completely in the first mbuf. */
4880 	if (m->m_len < len)
4881 		return IPPROTO_DONE;
4882 
4883 	ip = mtodo(m, hoff);
4884 
4885 	/* Bound check the packet's stated IP header length. */
4886 	iphlen = ip->ip_hl << 2;
4887 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4888 		return IPPROTO_DONE;
4889 
4890 	/* The full IP header must reside completely in the one mbuf. */
4891 	if (m->m_len < hoff + iphlen)
4892 		return IPPROTO_DONE;
4893 
4894 	iplen = ntohs(ip->ip_len);
4895 
4896 	/*
4897 	 * Check that the amount of data in the buffers is as
4898 	 * at least much as the IP header would have us expect.
4899 	 */
4900 	if (m->m_pkthdr.len < hoff + iplen)
4901 		return IPPROTO_DONE;
4902 
4903 	/*
4904 	 * Ignore IP fragments.
4905 	 */
4906 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4907 		return IPPROTO_DONE;
4908 
4909 	/*
4910 	 * The TCP/IP or UDP/IP header must be entirely contained within
4911 	 * the first fragment of a packet.
4912 	 */
4913 	switch (ip->ip_p) {
4914 	case IPPROTO_TCP:
4915 		if (iplen < iphlen + sizeof(struct tcphdr))
4916 			return IPPROTO_DONE;
4917 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4918 			return IPPROTO_DONE;
4919 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4920 		thoff = th->th_off << 2;
4921 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4922 			return IPPROTO_DONE;
4923 		if (m->m_len < hoff + iphlen + thoff)
4924 			return IPPROTO_DONE;
4925 		break;
4926 	case IPPROTO_UDP:
4927 		if (iplen < iphlen + sizeof(struct udphdr))
4928 			return IPPROTO_DONE;
4929 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4930 			return IPPROTO_DONE;
4931 		break;
4932 	default:
4933 		if (iplen < iphlen)
4934 			return IPPROTO_DONE;
4935 		break;
4936 	}
4937 	return ip->ip_p;
4938 }
4939 
4940 static void
4941 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4942 {
4943 	const struct ether_header *eh;
4944 	uint16_t etype;
4945 	int hoff;
4946 
4947 	hoff = sizeof(*eh);
4948 	/* Checked at the beginning of this function. */
4949 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4950 
4951 	eh = mtod(m_new, const struct ether_header *);
4952 	etype = ntohs(eh->ether_type);
4953 	if (etype == ETHERTYPE_VLAN) {
4954 		const struct ether_vlan_header *evl;
4955 
4956 		hoff = sizeof(*evl);
4957 		if (m_new->m_len < hoff)
4958 			return;
4959 		evl = mtod(m_new, const struct ether_vlan_header *);
4960 		etype = ntohs(evl->evl_proto);
4961 	}
4962 	*l3proto = etype;
4963 
4964 	if (etype == ETHERTYPE_IP)
4965 		*l4proto = hn_check_iplen(m_new, hoff);
4966 	else
4967 		*l4proto = IPPROTO_DONE;
4968 }
4969 
4970 static int
4971 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4972 {
4973 	struct sysctl_oid_list *child;
4974 	struct sysctl_ctx_list *ctx;
4975 	device_t dev = sc->hn_dev;
4976 #if defined(INET) || defined(INET6)
4977 	int lroent_cnt;
4978 #endif
4979 	int i;
4980 
4981 	/*
4982 	 * Create RXBUF for reception.
4983 	 *
4984 	 * NOTE:
4985 	 * - It is shared by all channels.
4986 	 * - A large enough buffer is allocated, certain version of NVSes
4987 	 *   may further limit the usable space.
4988 	 */
4989 	sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4990 	    0ul, ~0ul, PAGE_SIZE, 0);
4991 	if (sc->hn_rxbuf == NULL) {
4992 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4993 		return (ENOMEM);
4994 	}
4995 
4996 	sc->hn_rx_ring_cnt = ring_cnt;
4997 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4998 
4999 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
5000 	    M_DEVBUF, M_WAITOK | M_ZERO);
5001 
5002 #if defined(INET) || defined(INET6)
5003 	lroent_cnt = hn_lro_entry_count;
5004 	if (lroent_cnt < TCP_LRO_ENTRIES)
5005 		lroent_cnt = TCP_LRO_ENTRIES;
5006 	if (bootverbose)
5007 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
5008 #endif	/* INET || INET6 */
5009 
5010 	ctx = device_get_sysctl_ctx(dev);
5011 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5012 
5013 	/* Create dev.hn.UNIT.rx sysctl tree */
5014 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5015 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5016 
5017 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5018 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5019 
5020 		rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
5021 		    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
5022 		if (rxr->hn_br == NULL) {
5023 			device_printf(dev, "allocate bufring failed\n");
5024 			return (ENOMEM);
5025 		}
5026 
5027 		if (hn_trust_hosttcp)
5028 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5029 		if (hn_trust_hostudp)
5030 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5031 		if (hn_trust_hostip)
5032 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5033 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5034 		rxr->hn_ifp = sc->hn_ifp;
5035 		if (i < sc->hn_tx_ring_cnt)
5036 			rxr->hn_txr = &sc->hn_tx_ring[i];
5037 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5038 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5039 		rxr->hn_rx_idx = i;
5040 		rxr->hn_rxbuf = sc->hn_rxbuf;
5041 
5042 		/*
5043 		 * Initialize LRO.
5044 		 */
5045 #if defined(INET) || defined(INET6)
5046 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5047 		    hn_lro_mbufq_depth);
5048 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5049 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5050 #endif	/* INET || INET6 */
5051 
5052 		if (sc->hn_rx_sysctl_tree != NULL) {
5053 			char name[16];
5054 
5055 			/*
5056 			 * Create per RX ring sysctl tree:
5057 			 * dev.hn.UNIT.rx.RINGID
5058 			 */
5059 			snprintf(name, sizeof(name), "%d", i);
5060 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5061 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5062 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5063 
5064 			if (rxr->hn_rx_sysctl_tree != NULL) {
5065 				SYSCTL_ADD_ULONG(ctx,
5066 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5067 				    OID_AUTO, "packets",
5068 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5069 				    "# of packets received");
5070 				SYSCTL_ADD_ULONG(ctx,
5071 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5072 				    OID_AUTO, "rss_pkts",
5073 				    CTLFLAG_RW | CTLFLAG_STATS,
5074 				    &rxr->hn_rss_pkts,
5075 				    "# of packets w/ RSS info received");
5076 				SYSCTL_ADD_ULONG(ctx,
5077 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5078 				    OID_AUTO, "rsc_pkts",
5079 				    CTLFLAG_RW | CTLFLAG_STATS,
5080 				    &rxr->hn_rsc_pkts,
5081 				    "# of RSC packets received");
5082 				SYSCTL_ADD_ULONG(ctx,
5083 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5084 				    OID_AUTO, "rsc_drop",
5085 				    CTLFLAG_RW | CTLFLAG_STATS,
5086 				    &rxr->hn_rsc_drop,
5087 				    "# of RSC fragments dropped");
5088 				SYSCTL_ADD_INT(ctx,
5089 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5090 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5091 				    &rxr->hn_pktbuf_len, 0,
5092 				    "Temporary channel packet buffer length");
5093 			}
5094 		}
5095 	}
5096 
5097 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5098 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5099 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5100 	    hn_rx_stat_u64_sysctl,
5101 	    "LU", "LRO queued");
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5103 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5104 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5105 	    hn_rx_stat_u64_sysctl,
5106 	    "LU", "LRO flushed");
5107 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5108 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5109 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5110 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5111 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5112 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5113 	    hn_lro_lenlim_sysctl, "IU",
5114 	    "Max # of data bytes to be aggregated by LRO");
5115 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5116 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5117 	    hn_lro_ackcnt_sysctl, "I",
5118 	    "Max # of ACKs to be aggregated by LRO");
5119 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5120 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5121 	    hn_trust_hcsum_sysctl, "I",
5122 	    "Trust tcp segment verification on host side, "
5123 	    "when csum info is missing");
5124 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5125 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5126 	    hn_trust_hcsum_sysctl, "I",
5127 	    "Trust udp datagram verification on host side, "
5128 	    "when csum info is missing");
5129 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5130 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5131 	    hn_trust_hcsum_sysctl, "I",
5132 	    "Trust ip packet verification on host side, "
5133 	    "when csum info is missing");
5134 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5135 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5136 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5137 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5138 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5139 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5140 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5141 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5142 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5143 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5144 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5145 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5146 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5147 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5148 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5149 	    hn_rx_stat_ulong_sysctl, "LU",
5150 	    "# of packets that we trust host's csum verification");
5151 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5152 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5153 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5154 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5155 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5156 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5157 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5158 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5159 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5160 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5161 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5162 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5163 
5164 	return (0);
5165 }
5166 
5167 static void
5168 hn_destroy_rx_data(struct hn_softc *sc)
5169 {
5170 	int i;
5171 
5172 	if (sc->hn_rxbuf != NULL) {
5173 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5174 			contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF);
5175 		else
5176 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5177 		sc->hn_rxbuf = NULL;
5178 	}
5179 
5180 	if (sc->hn_rx_ring_cnt == 0)
5181 		return;
5182 
5183 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5184 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5185 
5186 		if (rxr->hn_br == NULL)
5187 			continue;
5188 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5189 			contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE,
5190 			    M_DEVBUF);
5191 		} else {
5192 			device_printf(sc->hn_dev,
5193 			    "%dth channel bufring is referenced", i);
5194 		}
5195 		rxr->hn_br = NULL;
5196 
5197 #if defined(INET) || defined(INET6)
5198 		tcp_lro_free(&rxr->hn_lro);
5199 #endif
5200 		free(rxr->hn_pktbuf, M_DEVBUF);
5201 	}
5202 	free(sc->hn_rx_ring, M_DEVBUF);
5203 	sc->hn_rx_ring = NULL;
5204 
5205 	sc->hn_rx_ring_cnt = 0;
5206 	sc->hn_rx_ring_inuse = 0;
5207 }
5208 
5209 static int
5210 hn_tx_ring_create(struct hn_softc *sc, int id)
5211 {
5212 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5213 	device_t dev = sc->hn_dev;
5214 	bus_dma_tag_t parent_dtag;
5215 	int error, i;
5216 
5217 	txr->hn_sc = sc;
5218 	txr->hn_tx_idx = id;
5219 
5220 #ifndef HN_USE_TXDESC_BUFRING
5221 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5222 #endif
5223 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5224 
5225 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5226 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5227 	    M_DEVBUF, M_WAITOK | M_ZERO);
5228 #ifndef HN_USE_TXDESC_BUFRING
5229 	SLIST_INIT(&txr->hn_txlist);
5230 #else
5231 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5232 	    M_WAITOK, &txr->hn_tx_lock);
5233 #endif
5234 
5235 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5236 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5237 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5238 	} else {
5239 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5240 	}
5241 
5242 #ifdef HN_IFSTART_SUPPORT
5243 	if (hn_use_if_start) {
5244 		txr->hn_txeof = hn_start_txeof;
5245 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5246 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5247 	} else
5248 #endif
5249 	{
5250 		int br_depth;
5251 
5252 		txr->hn_txeof = hn_xmit_txeof;
5253 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5254 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5255 
5256 		br_depth = hn_get_txswq_depth(txr);
5257 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5258 		    M_WAITOK, &txr->hn_tx_lock);
5259 	}
5260 
5261 	txr->hn_direct_tx_size = hn_direct_tx_size;
5262 
5263 	/*
5264 	 * Always schedule transmission instead of trying to do direct
5265 	 * transmission.  This one gives the best performance so far.
5266 	 */
5267 	txr->hn_sched_tx = 1;
5268 
5269 	parent_dtag = bus_get_dma_tag(dev);
5270 
5271 	/* DMA tag for RNDIS packet messages. */
5272 	error = bus_dma_tag_create(parent_dtag, /* parent */
5273 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5274 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5275 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5276 	    BUS_SPACE_MAXADDR,		/* highaddr */
5277 	    NULL, NULL,			/* filter, filterarg */
5278 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5279 	    1,				/* nsegments */
5280 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5281 	    0,				/* flags */
5282 	    NULL,			/* lockfunc */
5283 	    NULL,			/* lockfuncarg */
5284 	    &txr->hn_tx_rndis_dtag);
5285 	if (error) {
5286 		device_printf(dev, "failed to create rndis dmatag\n");
5287 		return error;
5288 	}
5289 
5290 	/* DMA tag for data. */
5291 	error = bus_dma_tag_create(parent_dtag, /* parent */
5292 	    1,				/* alignment */
5293 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5294 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5295 	    BUS_SPACE_MAXADDR,		/* highaddr */
5296 	    NULL, NULL,			/* filter, filterarg */
5297 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5298 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5299 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5300 	    0,				/* flags */
5301 	    NULL,			/* lockfunc */
5302 	    NULL,			/* lockfuncarg */
5303 	    &txr->hn_tx_data_dtag);
5304 	if (error) {
5305 		device_printf(dev, "failed to create data dmatag\n");
5306 		return error;
5307 	}
5308 
5309 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5310 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5311 
5312 		txd->txr = txr;
5313 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5314 		STAILQ_INIT(&txd->agg_list);
5315 
5316 		/*
5317 		 * Allocate and load RNDIS packet message.
5318 		 */
5319         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5320 		    (void **)&txd->rndis_pkt,
5321 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5322 		    &txd->rndis_pkt_dmap);
5323 		if (error) {
5324 			device_printf(dev,
5325 			    "failed to allocate rndis_packet_msg, %d\n", i);
5326 			return error;
5327 		}
5328 
5329 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5330 		    txd->rndis_pkt_dmap,
5331 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5332 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5333 		    BUS_DMA_NOWAIT);
5334 		if (error) {
5335 			device_printf(dev,
5336 			    "failed to load rndis_packet_msg, %d\n", i);
5337 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5338 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5339 			return error;
5340 		}
5341 
5342 		/* DMA map for TX data. */
5343 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5344 		    &txd->data_dmap);
5345 		if (error) {
5346 			device_printf(dev,
5347 			    "failed to allocate tx data dmamap\n");
5348 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5349 			    txd->rndis_pkt_dmap);
5350 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5351 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5352 			return error;
5353 		}
5354 
5355 		/* All set, put it to list */
5356 		txd->flags |= HN_TXD_FLAG_ONLIST;
5357 #ifndef HN_USE_TXDESC_BUFRING
5358 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5359 #else
5360 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5361 #endif
5362 	}
5363 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5364 
5365 	if (sc->hn_tx_sysctl_tree != NULL) {
5366 		struct sysctl_oid_list *child;
5367 		struct sysctl_ctx_list *ctx;
5368 		char name[16];
5369 
5370 		/*
5371 		 * Create per TX ring sysctl tree:
5372 		 * dev.hn.UNIT.tx.RINGID
5373 		 */
5374 		ctx = device_get_sysctl_ctx(dev);
5375 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5376 
5377 		snprintf(name, sizeof(name), "%d", id);
5378 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5379 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5380 
5381 		if (txr->hn_tx_sysctl_tree != NULL) {
5382 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5383 
5384 #ifdef HN_DEBUG
5385 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5386 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5387 			    "# of available TX descs");
5388 #endif
5389 #ifdef HN_IFSTART_SUPPORT
5390 			if (!hn_use_if_start)
5391 #endif
5392 			{
5393 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5394 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5395 				    "over active");
5396 			}
5397 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5398 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5399 			    "# of packets transmitted");
5400 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5401 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5402 			    "# of sends");
5403 		}
5404 	}
5405 
5406 	return 0;
5407 }
5408 
5409 static void
5410 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5411 {
5412 	struct hn_tx_ring *txr = txd->txr;
5413 
5414 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5415 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5416 
5417 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5418 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5419 	    txd->rndis_pkt_dmap);
5420 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5421 }
5422 
5423 static void
5424 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5425 {
5426 
5427 	KASSERT(txd->refs == 0 || txd->refs == 1,
5428 	    ("invalid txd refs %d", txd->refs));
5429 
5430 	/* Aggregated txds will be freed by their aggregating txd. */
5431 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5432 		int freed __diagused;
5433 
5434 		freed = hn_txdesc_put(txr, txd);
5435 		KASSERT(freed, ("can't free txdesc"));
5436 	}
5437 }
5438 
5439 static void
5440 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5441 {
5442 	int i;
5443 
5444 	if (txr->hn_txdesc == NULL)
5445 		return;
5446 
5447 	/*
5448 	 * NOTE:
5449 	 * Because the freeing of aggregated txds will be deferred
5450 	 * to the aggregating txd, two passes are used here:
5451 	 * - The first pass GCes any pending txds.  This GC is necessary,
5452 	 *   since if the channels are revoked, hypervisor will not
5453 	 *   deliver send-done for all pending txds.
5454 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5455 	 *   were freed.
5456 	 */
5457 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5458 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5459 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5460 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5461 
5462 	if (txr->hn_tx_data_dtag != NULL)
5463 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5464 	if (txr->hn_tx_rndis_dtag != NULL)
5465 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5466 
5467 #ifdef HN_USE_TXDESC_BUFRING
5468 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5469 #endif
5470 
5471 	free(txr->hn_txdesc, M_DEVBUF);
5472 	txr->hn_txdesc = NULL;
5473 
5474 	if (txr->hn_mbuf_br != NULL)
5475 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5476 
5477 #ifndef HN_USE_TXDESC_BUFRING
5478 	mtx_destroy(&txr->hn_txlist_spin);
5479 #endif
5480 	mtx_destroy(&txr->hn_tx_lock);
5481 }
5482 
5483 static int
5484 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5485 {
5486 	struct sysctl_oid_list *child;
5487 	struct sysctl_ctx_list *ctx;
5488 	int i;
5489 
5490 	/*
5491 	 * Create TXBUF for chimney sending.
5492 	 *
5493 	 * NOTE: It is shared by all channels.
5494 	 */
5495 	sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5496 	    0ul, ~0ul, PAGE_SIZE, 0);
5497 	if (sc->hn_chim == NULL) {
5498 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5499 		return (ENOMEM);
5500 	}
5501 
5502 	sc->hn_tx_ring_cnt = ring_cnt;
5503 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5504 
5505 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5506 	    M_DEVBUF, M_WAITOK | M_ZERO);
5507 
5508 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5509 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5510 
5511 	/* Create dev.hn.UNIT.tx sysctl tree */
5512 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5513 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5514 
5515 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5516 		int error;
5517 
5518 		error = hn_tx_ring_create(sc, i);
5519 		if (error)
5520 			return error;
5521 	}
5522 
5523 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5524 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5525 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5526 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5527 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5528 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5529 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5530 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5531 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5532 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5533 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5534 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5535 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5536 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5537 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5538 	    hn_tx_stat_ulong_sysctl, "LU",
5539 	    "# of packet transmission aggregation flush failure");
5540 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5541 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5542 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5543 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5544 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5545 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5546 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5547 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5548 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5549 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5550 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5551 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5552 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5553 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5554 	    "# of total TX descs");
5555 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5556 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5557 	    "Chimney send packet size upper boundary");
5558 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5559 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5560 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5561 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5562 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5563 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5564 	    hn_tx_conf_int_sysctl, "I",
5565 	    "Size of the packet for direct transmission");
5566 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5567 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5568 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5569 	    hn_tx_conf_int_sysctl, "I",
5570 	    "Always schedule transmission "
5571 	    "instead of doing direct transmission");
5572 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5573 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5574 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5575 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5576 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5577 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5578 	    "Applied packet transmission aggregation size");
5579 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5580 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5581 	    hn_txagg_pktmax_sysctl, "I",
5582 	    "Applied packet transmission aggregation packets");
5583 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5584 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5585 	    hn_txagg_align_sysctl, "I",
5586 	    "Applied packet transmission aggregation alignment");
5587 
5588 	return 0;
5589 }
5590 
5591 static void
5592 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5593 {
5594 	int i;
5595 
5596 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5597 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5598 }
5599 
5600 static void
5601 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5602 {
5603 	if_t ifp = sc->hn_ifp;
5604 	u_int hw_tsomax;
5605 	int tso_minlen;
5606 
5607 	HN_LOCK_ASSERT(sc);
5608 
5609 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5610 		return;
5611 
5612 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5613 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5614 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5615 
5616 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5617 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5618 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5619 
5620 	if (tso_maxlen < tso_minlen)
5621 		tso_maxlen = tso_minlen;
5622 	else if (tso_maxlen > IP_MAXPACKET)
5623 		tso_maxlen = IP_MAXPACKET;
5624 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5625 		tso_maxlen = sc->hn_ndis_tso_szmax;
5626 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5627 
5628 	if (hn_xpnt_vf_isready(sc)) {
5629 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5630 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5631 	}
5632 	if_sethwtsomax(ifp, hw_tsomax);
5633 	if (bootverbose)
5634 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5635 }
5636 
5637 static void
5638 hn_fixup_tx_data(struct hn_softc *sc)
5639 {
5640 	uint64_t csum_assist;
5641 	int i;
5642 
5643 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5644 	if (hn_tx_chimney_size > 0 &&
5645 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5646 		hn_set_chim_size(sc, hn_tx_chimney_size);
5647 
5648 	csum_assist = 0;
5649 	if (sc->hn_caps & HN_CAP_IPCS)
5650 		csum_assist |= CSUM_IP;
5651 	if (sc->hn_caps & HN_CAP_TCP4CS)
5652 		csum_assist |= CSUM_IP_TCP;
5653 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5654 		csum_assist |= CSUM_IP_UDP;
5655 	if (sc->hn_caps & HN_CAP_TCP6CS)
5656 		csum_assist |= CSUM_IP6_TCP;
5657 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5658 		csum_assist |= CSUM_IP6_UDP;
5659 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5660 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5661 
5662 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5663 		/*
5664 		 * Support HASHVAL pktinfo on TX path.
5665 		 */
5666 		if (bootverbose)
5667 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5668 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5669 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5670 	}
5671 }
5672 
5673 static void
5674 hn_fixup_rx_data(struct hn_softc *sc)
5675 {
5676 
5677 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5678 		int i;
5679 
5680 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5681 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5682 	}
5683 }
5684 
5685 static void
5686 hn_destroy_tx_data(struct hn_softc *sc)
5687 {
5688 	int i;
5689 
5690 	if (sc->hn_chim != NULL) {
5691 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5692 			contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF);
5693 		} else {
5694 			device_printf(sc->hn_dev,
5695 			    "chimney sending buffer is referenced");
5696 		}
5697 		sc->hn_chim = NULL;
5698 	}
5699 
5700 	if (sc->hn_tx_ring_cnt == 0)
5701 		return;
5702 
5703 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5704 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5705 
5706 	free(sc->hn_tx_ring, M_DEVBUF);
5707 	sc->hn_tx_ring = NULL;
5708 
5709 	sc->hn_tx_ring_cnt = 0;
5710 	sc->hn_tx_ring_inuse = 0;
5711 }
5712 
5713 #ifdef HN_IFSTART_SUPPORT
5714 
5715 static void
5716 hn_start_taskfunc(void *xtxr, int pending __unused)
5717 {
5718 	struct hn_tx_ring *txr = xtxr;
5719 
5720 	mtx_lock(&txr->hn_tx_lock);
5721 	hn_start_locked(txr, 0);
5722 	mtx_unlock(&txr->hn_tx_lock);
5723 }
5724 
5725 static int
5726 hn_start_locked(struct hn_tx_ring *txr, int len)
5727 {
5728 	struct hn_softc *sc = txr->hn_sc;
5729 	if_t ifp = sc->hn_ifp;
5730 	int sched = 0;
5731 
5732 	KASSERT(hn_use_if_start,
5733 	    ("hn_start_locked is called, when if_start is disabled"));
5734 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5735 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5736 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5737 
5738 	if (__predict_false(txr->hn_suspended))
5739 		return (0);
5740 
5741 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5742 	    IFF_DRV_RUNNING)
5743 		return (0);
5744 
5745 	while (!if_sendq_empty(ifp)) {
5746 		struct hn_txdesc *txd;
5747 		struct mbuf *m_head;
5748 		int error;
5749 
5750 		m_head = if_dequeue(ifp);
5751 		if (m_head == NULL)
5752 			break;
5753 
5754 		if (len > 0 && m_head->m_pkthdr.len > len) {
5755 			/*
5756 			 * This sending could be time consuming; let callers
5757 			 * dispatch this packet sending (and sending of any
5758 			 * following up packets) to tx taskqueue.
5759 			 */
5760 			if_sendq_prepend(ifp, m_head);
5761 			sched = 1;
5762 			break;
5763 		}
5764 
5765 #if defined(INET6) || defined(INET)
5766 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5767 			m_head = hn_tso_fixup(m_head);
5768 			if (__predict_false(m_head == NULL)) {
5769 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5770 				continue;
5771 			}
5772 		} else if (m_head->m_pkthdr.csum_flags &
5773 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5774 			m_head = hn_set_hlen(m_head);
5775 			if (__predict_false(m_head == NULL)) {
5776 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5777 				continue;
5778 			}
5779 		}
5780 #endif
5781 
5782 		txd = hn_txdesc_get(txr);
5783 		if (txd == NULL) {
5784 			txr->hn_no_txdescs++;
5785 			if_sendq_prepend(ifp, m_head);
5786 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5787 			break;
5788 		}
5789 
5790 		error = hn_encap(ifp, txr, txd, &m_head);
5791 		if (error) {
5792 			/* Both txd and m_head are freed */
5793 			KASSERT(txr->hn_agg_txd == NULL,
5794 			    ("encap failed w/ pending aggregating txdesc"));
5795 			continue;
5796 		}
5797 
5798 		if (txr->hn_agg_pktleft == 0) {
5799 			if (txr->hn_agg_txd != NULL) {
5800 				KASSERT(m_head == NULL,
5801 				    ("pending mbuf for aggregating txdesc"));
5802 				error = hn_flush_txagg(ifp, txr);
5803 				if (__predict_false(error)) {
5804 					if_setdrvflagbits(ifp,
5805 					    IFF_DRV_OACTIVE, 0);
5806 					break;
5807 				}
5808 			} else {
5809 				KASSERT(m_head != NULL, ("mbuf was freed"));
5810 				error = hn_txpkt(ifp, txr, txd);
5811 				if (__predict_false(error)) {
5812 					/* txd is freed, but m_head is not */
5813 					if_sendq_prepend(ifp, m_head);
5814 					if_setdrvflagbits(ifp,
5815 					    IFF_DRV_OACTIVE, 0);
5816 					break;
5817 				}
5818 			}
5819 		}
5820 #ifdef INVARIANTS
5821 		else {
5822 			KASSERT(txr->hn_agg_txd != NULL,
5823 			    ("no aggregating txdesc"));
5824 			KASSERT(m_head == NULL,
5825 			    ("pending mbuf for aggregating txdesc"));
5826 		}
5827 #endif
5828 	}
5829 
5830 	/* Flush pending aggerated transmission. */
5831 	if (txr->hn_agg_txd != NULL)
5832 		hn_flush_txagg(ifp, txr);
5833 	return (sched);
5834 }
5835 
5836 static void
5837 hn_start(if_t ifp)
5838 {
5839 	struct hn_softc *sc = if_getsoftc(ifp);
5840 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5841 
5842 	if (txr->hn_sched_tx)
5843 		goto do_sched;
5844 
5845 	if (mtx_trylock(&txr->hn_tx_lock)) {
5846 		int sched;
5847 
5848 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5849 		mtx_unlock(&txr->hn_tx_lock);
5850 		if (!sched)
5851 			return;
5852 	}
5853 do_sched:
5854 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5855 }
5856 
5857 static void
5858 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5859 {
5860 	struct hn_tx_ring *txr = xtxr;
5861 
5862 	mtx_lock(&txr->hn_tx_lock);
5863 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5864 	hn_start_locked(txr, 0);
5865 	mtx_unlock(&txr->hn_tx_lock);
5866 }
5867 
5868 static void
5869 hn_start_txeof(struct hn_tx_ring *txr)
5870 {
5871 	struct hn_softc *sc = txr->hn_sc;
5872 	if_t ifp = sc->hn_ifp;
5873 
5874 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5875 
5876 	if (txr->hn_sched_tx)
5877 		goto do_sched;
5878 
5879 	if (mtx_trylock(&txr->hn_tx_lock)) {
5880 		int sched;
5881 
5882 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5883 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5884 		mtx_unlock(&txr->hn_tx_lock);
5885 		if (sched) {
5886 			taskqueue_enqueue(txr->hn_tx_taskq,
5887 			    &txr->hn_tx_task);
5888 		}
5889 	} else {
5890 do_sched:
5891 		/*
5892 		 * Release the OACTIVE earlier, with the hope, that
5893 		 * others could catch up.  The task will clear the
5894 		 * flag again with the hn_tx_lock to avoid possible
5895 		 * races.
5896 		 */
5897 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5898 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5899 	}
5900 }
5901 
5902 #endif	/* HN_IFSTART_SUPPORT */
5903 
5904 static int
5905 hn_xmit(struct hn_tx_ring *txr, int len)
5906 {
5907 	struct hn_softc *sc = txr->hn_sc;
5908 	if_t ifp = sc->hn_ifp;
5909 	struct mbuf *m_head;
5910 	int sched = 0;
5911 
5912 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5913 #ifdef HN_IFSTART_SUPPORT
5914 	KASSERT(hn_use_if_start == 0,
5915 	    ("hn_xmit is called, when if_start is enabled"));
5916 #endif
5917 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5918 
5919 	if (__predict_false(txr->hn_suspended))
5920 		return (0);
5921 
5922 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5923 		return (0);
5924 
5925 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5926 		struct hn_txdesc *txd;
5927 		int error;
5928 
5929 		if (len > 0 && m_head->m_pkthdr.len > len) {
5930 			/*
5931 			 * This sending could be time consuming; let callers
5932 			 * dispatch this packet sending (and sending of any
5933 			 * following up packets) to tx taskqueue.
5934 			 */
5935 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5936 			sched = 1;
5937 			break;
5938 		}
5939 
5940 		txd = hn_txdesc_get(txr);
5941 		if (txd == NULL) {
5942 			txr->hn_no_txdescs++;
5943 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5944 			txr->hn_oactive = 1;
5945 			break;
5946 		}
5947 
5948 		error = hn_encap(ifp, txr, txd, &m_head);
5949 		if (error) {
5950 			/* Both txd and m_head are freed; discard */
5951 			KASSERT(txr->hn_agg_txd == NULL,
5952 			    ("encap failed w/ pending aggregating txdesc"));
5953 			drbr_advance(ifp, txr->hn_mbuf_br);
5954 			continue;
5955 		}
5956 
5957 		if (txr->hn_agg_pktleft == 0) {
5958 			if (txr->hn_agg_txd != NULL) {
5959 				KASSERT(m_head == NULL,
5960 				    ("pending mbuf for aggregating txdesc"));
5961 				error = hn_flush_txagg(ifp, txr);
5962 				if (__predict_false(error)) {
5963 					txr->hn_oactive = 1;
5964 					break;
5965 				}
5966 			} else {
5967 				KASSERT(m_head != NULL, ("mbuf was freed"));
5968 				error = hn_txpkt(ifp, txr, txd);
5969 				if (__predict_false(error)) {
5970 					/* txd is freed, but m_head is not */
5971 					drbr_putback(ifp, txr->hn_mbuf_br,
5972 					    m_head);
5973 					txr->hn_oactive = 1;
5974 					break;
5975 				}
5976 			}
5977 		}
5978 #ifdef INVARIANTS
5979 		else {
5980 			KASSERT(txr->hn_agg_txd != NULL,
5981 			    ("no aggregating txdesc"));
5982 			KASSERT(m_head == NULL,
5983 			    ("pending mbuf for aggregating txdesc"));
5984 		}
5985 #endif
5986 
5987 		/* Sent */
5988 		drbr_advance(ifp, txr->hn_mbuf_br);
5989 	}
5990 
5991 	/* Flush pending aggerated transmission. */
5992 	if (txr->hn_agg_txd != NULL)
5993 		hn_flush_txagg(ifp, txr);
5994 	return (sched);
5995 }
5996 
5997 static int
5998 hn_transmit(if_t ifp, struct mbuf *m)
5999 {
6000 	struct hn_softc *sc = if_getsoftc(ifp);
6001 	struct hn_tx_ring *txr;
6002 	int error, idx = 0;
6003 
6004 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6005 		struct rm_priotracker pt;
6006 
6007 		rm_rlock(&sc->hn_vf_lock, &pt);
6008 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6009 			struct mbuf *m_bpf = NULL;
6010 			int obytes, omcast;
6011 
6012 			obytes = m->m_pkthdr.len;
6013 			omcast = (m->m_flags & M_MCAST) != 0;
6014 
6015 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6016 				if (bpf_peers_present(if_getbpf(ifp))) {
6017 					m_bpf = m_copypacket(m, M_NOWAIT);
6018 					if (m_bpf == NULL) {
6019 						/*
6020 						 * Failed to grab a shallow
6021 						 * copy; tap now.
6022 						 */
6023 						ETHER_BPF_MTAP(ifp, m);
6024 					}
6025 				}
6026 			} else {
6027 				ETHER_BPF_MTAP(ifp, m);
6028 			}
6029 
6030 			error = if_transmit(sc->hn_vf_ifp, m);
6031 			rm_runlock(&sc->hn_vf_lock, &pt);
6032 
6033 			if (m_bpf != NULL) {
6034 				if (!error)
6035 					ETHER_BPF_MTAP(ifp, m_bpf);
6036 				m_freem(m_bpf);
6037 			}
6038 
6039 			if (error == ENOBUFS) {
6040 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6041 			} else if (error) {
6042 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6043 			} else {
6044 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6045 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6046 				if (omcast) {
6047 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6048 					    omcast);
6049 				}
6050 			}
6051 			return (error);
6052 		}
6053 		rm_runlock(&sc->hn_vf_lock, &pt);
6054 	}
6055 
6056 #if defined(INET6) || defined(INET)
6057 	/*
6058 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6059 	 * since packet headers should be cache-hot.
6060 	 */
6061 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6062 		m = hn_tso_fixup(m);
6063 		if (__predict_false(m == NULL)) {
6064 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6065 			return EIO;
6066 		}
6067 	} else if (m->m_pkthdr.csum_flags &
6068 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6069 		m = hn_set_hlen(m);
6070 		if (__predict_false(m == NULL)) {
6071 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6072 			return EIO;
6073 		}
6074 	}
6075 #endif
6076 
6077 	/*
6078 	 * Select the TX ring based on flowid
6079 	 */
6080 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6081 #ifdef RSS
6082 		uint32_t bid;
6083 
6084 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6085 		    &bid) == 0)
6086 			idx = bid % sc->hn_tx_ring_inuse;
6087 		else
6088 #endif
6089 		{
6090 #if defined(INET6) || defined(INET)
6091 			int tcpsyn = 0;
6092 
6093 			if (m->m_pkthdr.len < 128 &&
6094 			    (m->m_pkthdr.csum_flags &
6095 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6096 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6097 				m = hn_check_tcpsyn(m, &tcpsyn);
6098 				if (__predict_false(m == NULL)) {
6099 					if_inc_counter(ifp,
6100 					    IFCOUNTER_OERRORS, 1);
6101 					return (EIO);
6102 				}
6103 			}
6104 #else
6105 			const int tcpsyn = 0;
6106 #endif
6107 			if (tcpsyn)
6108 				idx = 0;
6109 			else
6110 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6111 		}
6112 	}
6113 	txr = &sc->hn_tx_ring[idx];
6114 
6115 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6116 	if (error) {
6117 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6118 		return error;
6119 	}
6120 
6121 	if (txr->hn_oactive)
6122 		return 0;
6123 
6124 	if (txr->hn_sched_tx)
6125 		goto do_sched;
6126 
6127 	if (mtx_trylock(&txr->hn_tx_lock)) {
6128 		int sched;
6129 
6130 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6131 		mtx_unlock(&txr->hn_tx_lock);
6132 		if (!sched)
6133 			return 0;
6134 	}
6135 do_sched:
6136 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6137 	return 0;
6138 }
6139 
6140 static void
6141 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6142 {
6143 	struct mbuf *m;
6144 
6145 	mtx_lock(&txr->hn_tx_lock);
6146 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6147 		m_freem(m);
6148 	mtx_unlock(&txr->hn_tx_lock);
6149 }
6150 
6151 static void
6152 hn_xmit_qflush(if_t ifp)
6153 {
6154 	struct hn_softc *sc = if_getsoftc(ifp);
6155 	struct rm_priotracker pt;
6156 	int i;
6157 
6158 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6159 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6160 	if_qflush(ifp);
6161 
6162 	rm_rlock(&sc->hn_vf_lock, &pt);
6163 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6164 		if_qflush(sc->hn_vf_ifp);
6165 	rm_runlock(&sc->hn_vf_lock, &pt);
6166 }
6167 
6168 static void
6169 hn_xmit_txeof(struct hn_tx_ring *txr)
6170 {
6171 
6172 	if (txr->hn_sched_tx)
6173 		goto do_sched;
6174 
6175 	if (mtx_trylock(&txr->hn_tx_lock)) {
6176 		int sched;
6177 
6178 		txr->hn_oactive = 0;
6179 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6180 		mtx_unlock(&txr->hn_tx_lock);
6181 		if (sched) {
6182 			taskqueue_enqueue(txr->hn_tx_taskq,
6183 			    &txr->hn_tx_task);
6184 		}
6185 	} else {
6186 do_sched:
6187 		/*
6188 		 * Release the oactive earlier, with the hope, that
6189 		 * others could catch up.  The task will clear the
6190 		 * oactive again with the hn_tx_lock to avoid possible
6191 		 * races.
6192 		 */
6193 		txr->hn_oactive = 0;
6194 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6195 	}
6196 }
6197 
6198 static void
6199 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6200 {
6201 	struct hn_tx_ring *txr = xtxr;
6202 
6203 	mtx_lock(&txr->hn_tx_lock);
6204 	hn_xmit(txr, 0);
6205 	mtx_unlock(&txr->hn_tx_lock);
6206 }
6207 
6208 static void
6209 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6210 {
6211 	struct hn_tx_ring *txr = xtxr;
6212 
6213 	mtx_lock(&txr->hn_tx_lock);
6214 	txr->hn_oactive = 0;
6215 	hn_xmit(txr, 0);
6216 	mtx_unlock(&txr->hn_tx_lock);
6217 }
6218 
6219 static int
6220 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6221 {
6222 	struct vmbus_chan_br cbr;
6223 	struct hn_rx_ring *rxr;
6224 	struct hn_tx_ring *txr = NULL;
6225 	int idx, error;
6226 
6227 	idx = vmbus_chan_subidx(chan);
6228 
6229 	/*
6230 	 * Link this channel to RX/TX ring.
6231 	 */
6232 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6233 	    ("invalid channel index %d, should > 0 && < %d",
6234 	     idx, sc->hn_rx_ring_inuse));
6235 	rxr = &sc->hn_rx_ring[idx];
6236 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6237 	    ("RX ring %d already attached", idx));
6238 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6239 	rxr->hn_chan = chan;
6240 
6241 	if (bootverbose) {
6242 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6243 		    idx, vmbus_chan_id(chan));
6244 	}
6245 
6246 	if (idx < sc->hn_tx_ring_inuse) {
6247 		txr = &sc->hn_tx_ring[idx];
6248 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6249 		    ("TX ring %d already attached", idx));
6250 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6251 
6252 		txr->hn_chan = chan;
6253 		if (bootverbose) {
6254 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6255 			    idx, vmbus_chan_id(chan));
6256 		}
6257 	}
6258 
6259 	/* Bind this channel to a proper CPU. */
6260 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6261 
6262 	/*
6263 	 * Open this channel
6264 	 */
6265 	cbr.cbr = rxr->hn_br;
6266 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6267 	cbr.cbr_txsz = HN_TXBR_SIZE;
6268 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6269 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6270 	if (error) {
6271 		if (error == EISCONN) {
6272 			if_printf(sc->hn_ifp, "bufring is connected after "
6273 			    "chan%u open failure\n", vmbus_chan_id(chan));
6274 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6275 		} else {
6276 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6277 			    vmbus_chan_id(chan), error);
6278 		}
6279 	}
6280 	return (error);
6281 }
6282 
6283 static void
6284 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6285 {
6286 	struct hn_rx_ring *rxr;
6287 	int idx, error;
6288 
6289 	idx = vmbus_chan_subidx(chan);
6290 
6291 	/*
6292 	 * Link this channel to RX/TX ring.
6293 	 */
6294 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6295 	    ("invalid channel index %d, should > 0 && < %d",
6296 	     idx, sc->hn_rx_ring_inuse));
6297 	rxr = &sc->hn_rx_ring[idx];
6298 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6299 	    ("RX ring %d is not attached", idx));
6300 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6301 
6302 	if (idx < sc->hn_tx_ring_inuse) {
6303 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6304 
6305 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6306 		    ("TX ring %d is not attached attached", idx));
6307 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6308 	}
6309 
6310 	/*
6311 	 * Close this channel.
6312 	 *
6313 	 * NOTE:
6314 	 * Channel closing does _not_ destroy the target channel.
6315 	 */
6316 	error = vmbus_chan_close_direct(chan);
6317 	if (error == EISCONN) {
6318 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6319 		    "after being closed\n", vmbus_chan_id(chan));
6320 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6321 	} else if (error) {
6322 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6323 		    vmbus_chan_id(chan), error);
6324 	}
6325 }
6326 
6327 static int
6328 hn_attach_subchans(struct hn_softc *sc)
6329 {
6330 	struct vmbus_channel **subchans;
6331 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6332 	int i, error = 0;
6333 
6334 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6335 
6336 	/* Attach the sub-channels. */
6337 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6338 	for (i = 0; i < subchan_cnt; ++i) {
6339 		int error1;
6340 
6341 		error1 = hn_chan_attach(sc, subchans[i]);
6342 		if (error1) {
6343 			error = error1;
6344 			/* Move on; all channels will be detached later. */
6345 		}
6346 	}
6347 	vmbus_subchan_rel(subchans, subchan_cnt);
6348 
6349 	if (error) {
6350 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6351 	} else {
6352 		if (bootverbose) {
6353 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6354 			    subchan_cnt);
6355 		}
6356 	}
6357 	return (error);
6358 }
6359 
6360 static void
6361 hn_detach_allchans(struct hn_softc *sc)
6362 {
6363 	struct vmbus_channel **subchans;
6364 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6365 	int i;
6366 
6367 	if (subchan_cnt == 0)
6368 		goto back;
6369 
6370 	/* Detach the sub-channels. */
6371 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6372 	for (i = 0; i < subchan_cnt; ++i)
6373 		hn_chan_detach(sc, subchans[i]);
6374 	vmbus_subchan_rel(subchans, subchan_cnt);
6375 
6376 back:
6377 	/*
6378 	 * Detach the primary channel, _after_ all sub-channels
6379 	 * are detached.
6380 	 */
6381 	hn_chan_detach(sc, sc->hn_prichan);
6382 
6383 	/* Wait for sub-channels to be destroyed, if any. */
6384 	vmbus_subchan_drain(sc->hn_prichan);
6385 
6386 #ifdef INVARIANTS
6387 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6388 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6389 		    HN_RX_FLAG_ATTACHED) == 0,
6390 		    ("%dth RX ring is still attached", i));
6391 	}
6392 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6393 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6394 		    HN_TX_FLAG_ATTACHED) == 0,
6395 		    ("%dth TX ring is still attached", i));
6396 	}
6397 #endif
6398 }
6399 
6400 static int
6401 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6402 {
6403 	struct vmbus_channel **subchans;
6404 	int nchan, rxr_cnt, error;
6405 
6406 	nchan = *nsubch + 1;
6407 	if (nchan == 1) {
6408 		/*
6409 		 * Multiple RX/TX rings are not requested.
6410 		 */
6411 		*nsubch = 0;
6412 		return (0);
6413 	}
6414 
6415 	/*
6416 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6417 	 * table entries.
6418 	 */
6419 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6420 	if (error) {
6421 		/* No RSS; this is benign. */
6422 		*nsubch = 0;
6423 		return (0);
6424 	}
6425 	if (bootverbose) {
6426 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6427 		    rxr_cnt, nchan);
6428 	}
6429 
6430 	if (nchan > rxr_cnt)
6431 		nchan = rxr_cnt;
6432 	if (nchan == 1) {
6433 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6434 		*nsubch = 0;
6435 		return (0);
6436 	}
6437 
6438 	/*
6439 	 * Allocate sub-channels from NVS.
6440 	 */
6441 	*nsubch = nchan - 1;
6442 	error = hn_nvs_alloc_subchans(sc, nsubch);
6443 	if (error || *nsubch == 0) {
6444 		/* Failed to allocate sub-channels. */
6445 		*nsubch = 0;
6446 		return (0);
6447 	}
6448 
6449 	/*
6450 	 * Wait for all sub-channels to become ready before moving on.
6451 	 */
6452 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6453 	vmbus_subchan_rel(subchans, *nsubch);
6454 	return (0);
6455 }
6456 
6457 static bool
6458 hn_synth_attachable(const struct hn_softc *sc)
6459 {
6460 	int i;
6461 
6462 	if (sc->hn_flags & HN_FLAG_ERRORS)
6463 		return (false);
6464 
6465 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6466 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6467 
6468 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6469 			return (false);
6470 	}
6471 	return (true);
6472 }
6473 
6474 /*
6475  * Make sure that the RX filter is zero after the successful
6476  * RNDIS initialization.
6477  *
6478  * NOTE:
6479  * Under certain conditions on certain versions of Hyper-V,
6480  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6481  * after the successful RNDIS initialization, which breaks
6482  * the assumption of any following code (well, it breaks the
6483  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6484  * explicitly, drain packets sneaking through, and drain the
6485  * interrupt taskqueues scheduled due to the stealth packets.
6486  */
6487 static void
6488 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6489 {
6490 
6491 	hn_disable_rx(sc);
6492 	hn_drain_rxtx(sc, nchan);
6493 }
6494 
6495 static int
6496 hn_synth_attach(struct hn_softc *sc, int mtu)
6497 {
6498 #define ATTACHED_NVS		0x0002
6499 #define ATTACHED_RNDIS		0x0004
6500 
6501 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6502 	int error, nsubch, nchan = 1, i, rndis_inited;
6503 	uint32_t old_caps, attached = 0;
6504 
6505 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6506 	    ("synthetic parts were attached"));
6507 
6508 	if (!hn_synth_attachable(sc))
6509 		return (ENXIO);
6510 
6511 	/* Save capabilities for later verification. */
6512 	old_caps = sc->hn_caps;
6513 	sc->hn_caps = 0;
6514 
6515 	/* Clear RSS stuffs. */
6516 	sc->hn_rss_ind_size = 0;
6517 	sc->hn_rss_hash = 0;
6518 	sc->hn_rss_hcap = 0;
6519 
6520 	/*
6521 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6522 	 */
6523 	error = hn_chan_attach(sc, sc->hn_prichan);
6524 	if (error)
6525 		goto failed;
6526 
6527 	/*
6528 	 * Attach NVS.
6529 	 */
6530 	error = hn_nvs_attach(sc, mtu);
6531 	if (error)
6532 		goto failed;
6533 	attached |= ATTACHED_NVS;
6534 
6535 	/*
6536 	 * Attach RNDIS _after_ NVS is attached.
6537 	 */
6538 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6539 	if (rndis_inited)
6540 		attached |= ATTACHED_RNDIS;
6541 	if (error)
6542 		goto failed;
6543 
6544 	/*
6545 	 * Make sure capabilities are not changed.
6546 	 */
6547 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6548 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6549 		    old_caps, sc->hn_caps);
6550 		error = ENXIO;
6551 		goto failed;
6552 	}
6553 
6554 	/*
6555 	 * Allocate sub-channels for multi-TX/RX rings.
6556 	 *
6557 	 * NOTE:
6558 	 * The # of RX rings that can be used is equivalent to the # of
6559 	 * channels to be requested.
6560 	 */
6561 	nsubch = sc->hn_rx_ring_cnt - 1;
6562 	error = hn_synth_alloc_subchans(sc, &nsubch);
6563 	if (error)
6564 		goto failed;
6565 	/* NOTE: _Full_ synthetic parts detach is required now. */
6566 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6567 
6568 	/*
6569 	 * Set the # of TX/RX rings that could be used according to
6570 	 * the # of channels that NVS offered.
6571 	 */
6572 	nchan = nsubch + 1;
6573 	hn_set_ring_inuse(sc, nchan);
6574 	if (nchan == 1) {
6575 		/* Only the primary channel can be used; done */
6576 		goto back;
6577 	}
6578 
6579 	/*
6580 	 * Attach the sub-channels.
6581 	 *
6582 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6583 	 */
6584 	error = hn_attach_subchans(sc);
6585 	if (error)
6586 		goto failed;
6587 
6588 	/*
6589 	 * Configure RSS key and indirect table _after_ all sub-channels
6590 	 * are attached.
6591 	 */
6592 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6593 		/*
6594 		 * RSS key is not set yet; set it to the default RSS key.
6595 		 */
6596 		if (bootverbose)
6597 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6598 #ifdef RSS
6599 		rss_getkey(rss->rss_key);
6600 #else
6601 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6602 #endif
6603 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6604 	}
6605 
6606 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6607 		/*
6608 		 * RSS indirect table is not set yet; set it up in round-
6609 		 * robin fashion.
6610 		 */
6611 		if (bootverbose) {
6612 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6613 			    "table\n");
6614 		}
6615 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6616 			uint32_t subidx;
6617 
6618 #ifdef RSS
6619 			subidx = rss_get_indirection_to_bucket(i);
6620 #else
6621 			subidx = i;
6622 #endif
6623 			rss->rss_ind[i] = subidx % nchan;
6624 		}
6625 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6626 	} else {
6627 		/*
6628 		 * # of usable channels may be changed, so we have to
6629 		 * make sure that all entries in RSS indirect table
6630 		 * are valid.
6631 		 *
6632 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6633 		 */
6634 		hn_rss_ind_fixup(sc);
6635 	}
6636 
6637 	sc->hn_rss_hash = sc->hn_rss_hcap;
6638 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6639 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6640 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6641 		hn_vf_rss_fixup(sc, false);
6642 	}
6643 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6644 	if (error)
6645 		goto failed;
6646 back:
6647 	/*
6648 	 * Fixup transmission aggregation setup.
6649 	 */
6650 	hn_set_txagg(sc);
6651 	hn_rndis_init_fixat(sc, nchan);
6652 	return (0);
6653 
6654 failed:
6655 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6656 		hn_rndis_init_fixat(sc, nchan);
6657 		hn_synth_detach(sc);
6658 	} else {
6659 		if (attached & ATTACHED_RNDIS) {
6660 			hn_rndis_init_fixat(sc, nchan);
6661 			hn_rndis_detach(sc);
6662 		}
6663 		if (attached & ATTACHED_NVS)
6664 			hn_nvs_detach(sc);
6665 		hn_chan_detach(sc, sc->hn_prichan);
6666 		/* Restore old capabilities. */
6667 		sc->hn_caps = old_caps;
6668 	}
6669 	return (error);
6670 
6671 #undef ATTACHED_RNDIS
6672 #undef ATTACHED_NVS
6673 }
6674 
6675 /*
6676  * NOTE:
6677  * The interface must have been suspended though hn_suspend(), before
6678  * this function get called.
6679  */
6680 static void
6681 hn_synth_detach(struct hn_softc *sc)
6682 {
6683 
6684 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6685 	    ("synthetic parts were not attached"));
6686 
6687 	/* Detach the RNDIS first. */
6688 	hn_rndis_detach(sc);
6689 
6690 	/* Detach NVS. */
6691 	hn_nvs_detach(sc);
6692 
6693 	/* Detach all of the channels. */
6694 	hn_detach_allchans(sc);
6695 
6696 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6697 		/*
6698 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6699 		 */
6700 		int error;
6701 
6702 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6703 		    sc->hn_rxbuf_gpadl);
6704 		if (error) {
6705 			if_printf(sc->hn_ifp,
6706 			    "rxbuf gpadl disconn failed: %d\n", error);
6707 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6708 		}
6709 		sc->hn_rxbuf_gpadl = 0;
6710 	}
6711 
6712 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6713 		/*
6714 		 * Host is post-Win2016, disconnect chimney sending buffer from
6715 		 * primary channel here.
6716 		 */
6717 		int error;
6718 
6719 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6720 		    sc->hn_chim_gpadl);
6721 		if (error) {
6722 			if_printf(sc->hn_ifp,
6723 			    "chim gpadl disconn failed: %d\n", error);
6724 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6725 		}
6726 		sc->hn_chim_gpadl = 0;
6727 	}
6728 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6729 }
6730 
6731 static void
6732 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6733 {
6734 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6735 	    ("invalid ring count %d", ring_cnt));
6736 
6737 	if (sc->hn_tx_ring_cnt > ring_cnt)
6738 		sc->hn_tx_ring_inuse = ring_cnt;
6739 	else
6740 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6741 	sc->hn_rx_ring_inuse = ring_cnt;
6742 
6743 #ifdef RSS
6744 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6745 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6746 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6747 		    rss_getnumbuckets());
6748 	}
6749 #endif
6750 
6751 	if (bootverbose) {
6752 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6753 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6754 	}
6755 }
6756 
6757 static void
6758 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6759 {
6760 
6761 	/*
6762 	 * NOTE:
6763 	 * The TX bufring will not be drained by the hypervisor,
6764 	 * if the primary channel is revoked.
6765 	 */
6766 	while (!vmbus_chan_rx_empty(chan) ||
6767 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6768 	     !vmbus_chan_tx_empty(chan)))
6769 		pause("waitch", 1);
6770 	vmbus_chan_intr_drain(chan);
6771 }
6772 
6773 static void
6774 hn_disable_rx(struct hn_softc *sc)
6775 {
6776 
6777 	/*
6778 	 * Disable RX by clearing RX filter forcefully.
6779 	 */
6780 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6781 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6782 
6783 	/*
6784 	 * Give RNDIS enough time to flush all pending data packets.
6785 	 */
6786 	pause("waitrx", (200 * hz) / 1000);
6787 }
6788 
6789 /*
6790  * NOTE:
6791  * RX/TX _must_ have been suspended/disabled, before this function
6792  * is called.
6793  */
6794 static void
6795 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6796 {
6797 	struct vmbus_channel **subch = NULL;
6798 	int nsubch;
6799 
6800 	/*
6801 	 * Drain RX/TX bufrings and interrupts.
6802 	 */
6803 	nsubch = nchan - 1;
6804 	if (nsubch > 0)
6805 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6806 
6807 	if (subch != NULL) {
6808 		int i;
6809 
6810 		for (i = 0; i < nsubch; ++i)
6811 			hn_chan_drain(sc, subch[i]);
6812 	}
6813 	hn_chan_drain(sc, sc->hn_prichan);
6814 
6815 	if (subch != NULL)
6816 		vmbus_subchan_rel(subch, nsubch);
6817 }
6818 
6819 static void
6820 hn_suspend_data(struct hn_softc *sc)
6821 {
6822 	struct hn_tx_ring *txr;
6823 	int i;
6824 
6825 	HN_LOCK_ASSERT(sc);
6826 
6827 	/*
6828 	 * Suspend TX.
6829 	 */
6830 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6831 		txr = &sc->hn_tx_ring[i];
6832 
6833 		mtx_lock(&txr->hn_tx_lock);
6834 		txr->hn_suspended = 1;
6835 		mtx_unlock(&txr->hn_tx_lock);
6836 		/* No one is able send more packets now. */
6837 
6838 		/*
6839 		 * Wait for all pending sends to finish.
6840 		 *
6841 		 * NOTE:
6842 		 * We will _not_ receive all pending send-done, if the
6843 		 * primary channel is revoked.
6844 		 */
6845 		while (hn_tx_ring_pending(txr) &&
6846 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6847 			pause("hnwtx", 1 /* 1 tick */);
6848 	}
6849 
6850 	/*
6851 	 * Disable RX.
6852 	 */
6853 	hn_disable_rx(sc);
6854 
6855 	/*
6856 	 * Drain RX/TX.
6857 	 */
6858 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6859 
6860 	/*
6861 	 * Drain any pending TX tasks.
6862 	 *
6863 	 * NOTE:
6864 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6865 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6866 	 */
6867 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6868 		txr = &sc->hn_tx_ring[i];
6869 
6870 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6871 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6872 	}
6873 }
6874 
6875 static void
6876 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6877 {
6878 
6879 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6880 }
6881 
6882 static void
6883 hn_suspend_mgmt(struct hn_softc *sc)
6884 {
6885 	struct task task;
6886 
6887 	HN_LOCK_ASSERT(sc);
6888 
6889 	/*
6890 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6891 	 * through hn_mgmt_taskq.
6892 	 */
6893 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6894 	vmbus_chan_run_task(sc->hn_prichan, &task);
6895 
6896 	/*
6897 	 * Make sure that all pending management tasks are completed.
6898 	 */
6899 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6900 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6901 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6902 }
6903 
6904 static void
6905 hn_suspend(struct hn_softc *sc)
6906 {
6907 
6908 	/* Disable polling. */
6909 	hn_polling(sc, 0);
6910 
6911 	/*
6912 	 * If the non-transparent mode VF is activated, the synthetic
6913 	 * device is receiving packets, so the data path of the
6914 	 * synthetic device must be suspended.
6915 	 */
6916 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6917 	    (sc->hn_flags & HN_FLAG_RXVF))
6918 		hn_suspend_data(sc);
6919 	hn_suspend_mgmt(sc);
6920 }
6921 
6922 static void
6923 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6924 {
6925 	int i;
6926 
6927 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6928 	    ("invalid TX ring count %d", tx_ring_cnt));
6929 
6930 	for (i = 0; i < tx_ring_cnt; ++i) {
6931 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6932 
6933 		mtx_lock(&txr->hn_tx_lock);
6934 		txr->hn_suspended = 0;
6935 		mtx_unlock(&txr->hn_tx_lock);
6936 	}
6937 }
6938 
6939 static void
6940 hn_resume_data(struct hn_softc *sc)
6941 {
6942 	int i;
6943 
6944 	HN_LOCK_ASSERT(sc);
6945 
6946 	/*
6947 	 * Re-enable RX.
6948 	 */
6949 	hn_rxfilter_config(sc);
6950 
6951 	/*
6952 	 * Make sure to clear suspend status on "all" TX rings,
6953 	 * since hn_tx_ring_inuse can be changed after
6954 	 * hn_suspend_data().
6955 	 */
6956 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6957 
6958 #ifdef HN_IFSTART_SUPPORT
6959 	if (!hn_use_if_start)
6960 #endif
6961 	{
6962 		/*
6963 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6964 		 * reduced.
6965 		 */
6966 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6967 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6968 	}
6969 
6970 	/*
6971 	 * Kick start TX.
6972 	 */
6973 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6974 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6975 
6976 		/*
6977 		 * Use txeof task, so that any pending oactive can be
6978 		 * cleared properly.
6979 		 */
6980 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6981 	}
6982 }
6983 
6984 static void
6985 hn_resume_mgmt(struct hn_softc *sc)
6986 {
6987 
6988 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6989 
6990 	/*
6991 	 * Kick off network change detection, if it was pending.
6992 	 * If no network change was pending, start link status
6993 	 * checks, which is more lightweight than network change
6994 	 * detection.
6995 	 */
6996 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6997 		hn_change_network(sc);
6998 	else
6999 		hn_update_link_status(sc);
7000 }
7001 
7002 static void
7003 hn_resume(struct hn_softc *sc)
7004 {
7005 
7006 	/*
7007 	 * If the non-transparent mode VF is activated, the synthetic
7008 	 * device have to receive packets, so the data path of the
7009 	 * synthetic device must be resumed.
7010 	 */
7011 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
7012 	    (sc->hn_flags & HN_FLAG_RXVF))
7013 		hn_resume_data(sc);
7014 
7015 	/*
7016 	 * Don't resume link status change if VF is attached/activated.
7017 	 * - In the non-transparent VF mode, the synthetic device marks
7018 	 *   link down until the VF is deactivated; i.e. VF is down.
7019 	 * - In transparent VF mode, VF's media status is used until
7020 	 *   the VF is detached.
7021 	 */
7022 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7023 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7024 		hn_resume_mgmt(sc);
7025 
7026 	/*
7027 	 * Re-enable polling if this interface is running and
7028 	 * the polling is requested.
7029 	 */
7030 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7031 		hn_polling(sc, sc->hn_pollhz);
7032 }
7033 
7034 static void
7035 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7036 {
7037 	const struct rndis_status_msg *msg;
7038 	int ofs;
7039 
7040 	if (dlen < sizeof(*msg)) {
7041 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7042 		return;
7043 	}
7044 	msg = data;
7045 
7046 	switch (msg->rm_status) {
7047 	case RNDIS_STATUS_MEDIA_CONNECT:
7048 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7049 		hn_update_link_status(sc);
7050 		break;
7051 
7052 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7053 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7054 		/* Not really useful; ignore. */
7055 		break;
7056 
7057 	case RNDIS_STATUS_NETWORK_CHANGE:
7058 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7059 		if (dlen < ofs + msg->rm_stbuflen ||
7060 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7061 			if_printf(sc->hn_ifp, "network changed\n");
7062 		} else {
7063 			uint32_t change;
7064 
7065 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7066 			    sizeof(change));
7067 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7068 			    change);
7069 		}
7070 		hn_change_network(sc);
7071 		break;
7072 
7073 	default:
7074 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7075 		    msg->rm_status);
7076 		break;
7077 	}
7078 }
7079 
7080 static int
7081 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7082 {
7083 	const struct rndis_pktinfo *pi = info_data;
7084 	uint32_t mask = 0;
7085 
7086 	while (info_dlen != 0) {
7087 		const void *data;
7088 		uint32_t dlen;
7089 
7090 		if (__predict_false(info_dlen < sizeof(*pi)))
7091 			return (EINVAL);
7092 		if (__predict_false(info_dlen < pi->rm_size))
7093 			return (EINVAL);
7094 		info_dlen -= pi->rm_size;
7095 
7096 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7097 			return (EINVAL);
7098 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7099 			return (EINVAL);
7100 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7101 		data = pi->rm_data;
7102 
7103 		if (pi->rm_internal == 1) {
7104 			switch (pi->rm_type) {
7105 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7106 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7107 					return (EINVAL);
7108 				info->pktinfo_id =
7109 				    (const struct packet_info_id *)data;
7110 				mask |= HN_RXINFO_PKTINFO_ID;
7111 				break;
7112 
7113 			default:
7114 				goto next;
7115 			}
7116 		} else {
7117 			switch (pi->rm_type) {
7118 			case NDIS_PKTINFO_TYPE_VLAN:
7119 				if (__predict_false(dlen
7120 				    < NDIS_VLAN_INFO_SIZE))
7121 					return (EINVAL);
7122 				info->vlan_info = (const uint32_t *)data;
7123 				mask |= HN_RXINFO_VLAN;
7124 				break;
7125 
7126 			case NDIS_PKTINFO_TYPE_CSUM:
7127 				if (__predict_false(dlen
7128 				    < NDIS_RXCSUM_INFO_SIZE))
7129 					return (EINVAL);
7130 				info->csum_info = (const uint32_t *)data;
7131 				mask |= HN_RXINFO_CSUM;
7132 				break;
7133 
7134 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7135 				if (__predict_false(dlen
7136 				    < HN_NDIS_HASH_VALUE_SIZE))
7137 					return (EINVAL);
7138 				info->hash_value = (const uint32_t *)data;
7139 				mask |= HN_RXINFO_HASHVAL;
7140 				break;
7141 
7142 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7143 				if (__predict_false(dlen
7144 				    < HN_NDIS_HASH_INFO_SIZE))
7145 					return (EINVAL);
7146 				info->hash_info = (const uint32_t *)data;
7147 				mask |= HN_RXINFO_HASHINF;
7148 				break;
7149 
7150 			default:
7151 				goto next;
7152 			}
7153 		}
7154 
7155 		if (mask == HN_RXINFO_ALL) {
7156 			/* All found; done */
7157 			break;
7158 		}
7159 next:
7160 		pi = (const struct rndis_pktinfo *)
7161 		    ((const uint8_t *)pi + pi->rm_size);
7162 	}
7163 
7164 	/*
7165 	 * Final fixup.
7166 	 * - If there is no hash value, invalidate the hash info.
7167 	 */
7168 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7169 		info->hash_info = NULL;
7170 	return (0);
7171 }
7172 
7173 static __inline bool
7174 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7175 {
7176 
7177 	if (off < check_off) {
7178 		if (__predict_true(off + len <= check_off))
7179 			return (false);
7180 	} else if (off > check_off) {
7181 		if (__predict_true(check_off + check_len <= off))
7182 			return (false);
7183 	}
7184 	return (true);
7185 }
7186 
7187 static __inline void
7188 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7189 		uint32_t len, struct hn_rxinfo *info)
7190 {
7191 	uint32_t cnt = rxr->rsc.cnt;
7192 
7193 	if (cnt) {
7194 		rxr->rsc.pktlen += len;
7195 	} else {
7196 		rxr->rsc.vlan_info = info->vlan_info;
7197 		rxr->rsc.csum_info = info->csum_info;
7198 		rxr->rsc.hash_info = info->hash_info;
7199 		rxr->rsc.hash_value = info->hash_value;
7200 		rxr->rsc.pktlen = len;
7201 	}
7202 
7203 	rxr->rsc.frag_data[cnt] = data;
7204 	rxr->rsc.frag_len[cnt] = len;
7205 	rxr->rsc.cnt++;
7206 }
7207 
7208 static void
7209 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7210 {
7211 	const struct rndis_packet_msg *pkt;
7212 	struct hn_rxinfo info;
7213 	int data_off, pktinfo_off, data_len, pktinfo_len;
7214 	bool rsc_more= false;
7215 
7216 	/*
7217 	 * Check length.
7218 	 */
7219 	if (__predict_false(dlen < sizeof(*pkt))) {
7220 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7221 		return;
7222 	}
7223 	pkt = data;
7224 
7225 	if (__predict_false(dlen < pkt->rm_len)) {
7226 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7227 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7228 		return;
7229 	}
7230 	if (__predict_false(pkt->rm_len <
7231 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7232 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7233 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7234 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7235 		    pkt->rm_pktinfolen);
7236 		return;
7237 	}
7238 	if (__predict_false(pkt->rm_datalen == 0)) {
7239 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7240 		return;
7241 	}
7242 
7243 	/*
7244 	 * Check offests.
7245 	 */
7246 #define IS_OFFSET_INVALID(ofs)			\
7247 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7248 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7249 
7250 	/* XXX Hyper-V does not meet data offset alignment requirement */
7251 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7252 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7253 		    "data offset %u\n", pkt->rm_dataoffset);
7254 		return;
7255 	}
7256 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7257 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7258 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7259 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7260 		return;
7261 	}
7262 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7263 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7264 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7265 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7266 		return;
7267 	}
7268 
7269 #undef IS_OFFSET_INVALID
7270 
7271 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7272 	data_len = pkt->rm_datalen;
7273 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7274 	pktinfo_len = pkt->rm_pktinfolen;
7275 
7276 	/*
7277 	 * Check OOB coverage.
7278 	 */
7279 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7280 		int oob_off, oob_len;
7281 
7282 		if_printf(rxr->hn_ifp, "got oobdata\n");
7283 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7284 		oob_len = pkt->rm_oobdatalen;
7285 
7286 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7287 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7288 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7289 			    pkt->rm_len, oob_off, oob_len);
7290 			return;
7291 		}
7292 
7293 		/*
7294 		 * Check against data.
7295 		 */
7296 		if (hn_rndis_check_overlap(oob_off, oob_len,
7297 		    data_off, data_len)) {
7298 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7299 			    "oob overlaps data, oob abs %d len %d, "
7300 			    "data abs %d len %d\n",
7301 			    oob_off, oob_len, data_off, data_len);
7302 			return;
7303 		}
7304 
7305 		/*
7306 		 * Check against pktinfo.
7307 		 */
7308 		if (pktinfo_len != 0 &&
7309 		    hn_rndis_check_overlap(oob_off, oob_len,
7310 		    pktinfo_off, pktinfo_len)) {
7311 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7312 			    "oob overlaps pktinfo, oob abs %d len %d, "
7313 			    "pktinfo abs %d len %d\n",
7314 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7315 			return;
7316 		}
7317 	}
7318 
7319 	/*
7320 	 * Check per-packet-info coverage and find useful per-packet-info.
7321 	 */
7322 	info.vlan_info = NULL;
7323 	info.csum_info = NULL;
7324 	info.hash_info = NULL;
7325 	info.pktinfo_id = NULL;
7326 
7327 	if (__predict_true(pktinfo_len != 0)) {
7328 		bool overlap;
7329 		int error;
7330 
7331 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7332 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7333 			    "pktinfo overflow, msglen %u, "
7334 			    "pktinfo abs %d len %d\n",
7335 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7336 			return;
7337 		}
7338 
7339 		/*
7340 		 * Check packet info coverage.
7341 		 */
7342 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7343 		    data_off, data_len);
7344 		if (__predict_false(overlap)) {
7345 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7346 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7347 			    "data abs %d len %d\n",
7348 			    pktinfo_off, pktinfo_len, data_off, data_len);
7349 			return;
7350 		}
7351 
7352 		/*
7353 		 * Find useful per-packet-info.
7354 		 */
7355 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7356 		    pktinfo_len, &info);
7357 		if (__predict_false(error)) {
7358 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7359 			    "pktinfo\n");
7360 			return;
7361 		}
7362 	}
7363 
7364 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7365 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7366 		    "data overflow, msglen %u, data abs %d len %d\n",
7367 		    pkt->rm_len, data_off, data_len);
7368 		return;
7369 	}
7370 
7371 	/* Identify RSC fragments, drop invalid packets */
7372 	if ((info.pktinfo_id != NULL) &&
7373 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7374 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7375 			rxr->rsc.cnt = 0;
7376 			rxr->hn_rsc_pkts++;
7377 		} else if (rxr->rsc.cnt == 0)
7378 			goto drop;
7379 
7380 		rsc_more = true;
7381 
7382 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7383 			rsc_more = false;
7384 
7385 		if (rsc_more && rxr->rsc.is_last)
7386 			goto drop;
7387 	} else {
7388 		rxr->rsc.cnt = 0;
7389 	}
7390 
7391 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7392 		goto drop;
7393 
7394 	/* Store data in per rx ring structure */
7395 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7396 	    data_len, &info);
7397 
7398 	if (rsc_more)
7399 		return;
7400 
7401 	hn_rxpkt(rxr);
7402 	rxr->rsc.cnt = 0;
7403 	return;
7404 drop:
7405 	rxr->hn_rsc_drop++;
7406 	return;
7407 }
7408 
7409 static __inline void
7410 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7411 {
7412 	const struct rndis_msghdr *hdr;
7413 
7414 	if (__predict_false(dlen < sizeof(*hdr))) {
7415 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7416 		return;
7417 	}
7418 	hdr = data;
7419 
7420 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7421 		/* Hot data path. */
7422 		hn_rndis_rx_data(rxr, data, dlen);
7423 		/* Done! */
7424 		return;
7425 	}
7426 
7427 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7428 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7429 	else
7430 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7431 }
7432 
7433 static void
7434 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7435 {
7436 	const struct hn_nvs_hdr *hdr;
7437 
7438 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7439 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7440 		return;
7441 	}
7442 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7443 
7444 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7445 		/* Useless; ignore */
7446 		return;
7447 	}
7448 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7449 }
7450 
7451 static void
7452 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7453     const struct vmbus_chanpkt_hdr *pkt)
7454 {
7455 	struct hn_nvs_sendctx *sndc;
7456 
7457 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7458 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7459 	    VMBUS_CHANPKT_DATALEN(pkt));
7460 	/*
7461 	 * NOTE:
7462 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7463 	 * its callback.
7464 	 */
7465 }
7466 
7467 static void
7468 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7469     const struct vmbus_chanpkt_hdr *pkthdr)
7470 {
7471 	struct epoch_tracker et;
7472 	const struct vmbus_chanpkt_rxbuf *pkt;
7473 	const struct hn_nvs_hdr *nvs_hdr;
7474 	int count, i, hlen;
7475 
7476 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7477 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7478 		return;
7479 	}
7480 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7481 
7482 	/* Make sure that this is a RNDIS message. */
7483 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7484 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7485 		    nvs_hdr->nvs_type);
7486 		return;
7487 	}
7488 
7489 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7490 	if (__predict_false(hlen < sizeof(*pkt))) {
7491 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7492 		return;
7493 	}
7494 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7495 
7496 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7497 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7498 		    pkt->cp_rxbuf_id);
7499 		return;
7500 	}
7501 
7502 	count = pkt->cp_rxbuf_cnt;
7503 	if (__predict_false(hlen <
7504 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7505 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7506 		return;
7507 	}
7508 
7509 	NET_EPOCH_ENTER(et);
7510 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7511 	for (i = 0; i < count; ++i) {
7512 		int ofs, len;
7513 
7514 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7515 		len = pkt->cp_rxbuf[i].rb_len;
7516 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7517 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7518 			    "ofs %d, len %d\n", i, ofs, len);
7519 			continue;
7520 		}
7521 
7522 		rxr->rsc.is_last = (i == (count - 1));
7523 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7524 	}
7525 	NET_EPOCH_EXIT(et);
7526 
7527 	/*
7528 	 * Ack the consumed RXBUF associated w/ this channel packet,
7529 	 * so that this RXBUF can be recycled by the hypervisor.
7530 	 */
7531 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7532 }
7533 
7534 static void
7535 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7536     uint64_t tid)
7537 {
7538 	struct hn_nvs_rndis_ack ack;
7539 	int retries, error;
7540 
7541 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7542 	ack.nvs_status = HN_NVS_STATUS_OK;
7543 
7544 	retries = 0;
7545 again:
7546 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7547 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7548 	if (__predict_false(error == EAGAIN)) {
7549 		/*
7550 		 * NOTE:
7551 		 * This should _not_ happen in real world, since the
7552 		 * consumption of the TX bufring from the TX path is
7553 		 * controlled.
7554 		 */
7555 		if (rxr->hn_ack_failed == 0)
7556 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7557 		rxr->hn_ack_failed++;
7558 		retries++;
7559 		if (retries < 10) {
7560 			DELAY(100);
7561 			goto again;
7562 		}
7563 		/* RXBUF leaks! */
7564 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7565 	}
7566 }
7567 
7568 static void
7569 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7570 {
7571 	struct hn_rx_ring *rxr = xrxr;
7572 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7573 
7574 	for (;;) {
7575 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7576 		int error, pktlen;
7577 
7578 		pktlen = rxr->hn_pktbuf_len;
7579 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7580 		if (__predict_false(error == ENOBUFS)) {
7581 			void *nbuf;
7582 			int nlen;
7583 
7584 			/*
7585 			 * Expand channel packet buffer.
7586 			 *
7587 			 * XXX
7588 			 * Use M_WAITOK here, since allocation failure
7589 			 * is fatal.
7590 			 */
7591 			nlen = rxr->hn_pktbuf_len * 2;
7592 			while (nlen < pktlen)
7593 				nlen *= 2;
7594 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7595 
7596 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7597 			    rxr->hn_pktbuf_len, nlen);
7598 
7599 			free(rxr->hn_pktbuf, M_DEVBUF);
7600 			rxr->hn_pktbuf = nbuf;
7601 			rxr->hn_pktbuf_len = nlen;
7602 			/* Retry! */
7603 			continue;
7604 		} else if (__predict_false(error == EAGAIN)) {
7605 			/* No more channel packets; done! */
7606 			break;
7607 		}
7608 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7609 
7610 		switch (pkt->cph_type) {
7611 		case VMBUS_CHANPKT_TYPE_COMP:
7612 			hn_nvs_handle_comp(sc, chan, pkt);
7613 			break;
7614 
7615 		case VMBUS_CHANPKT_TYPE_RXBUF:
7616 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7617 			break;
7618 
7619 		case VMBUS_CHANPKT_TYPE_INBAND:
7620 			hn_nvs_handle_notify(sc, pkt);
7621 			break;
7622 
7623 		default:
7624 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7625 			    pkt->cph_type);
7626 			break;
7627 		}
7628 	}
7629 	hn_chan_rollup(rxr, rxr->hn_txr);
7630 }
7631 
7632 static void
7633 hn_sysinit(void *arg __unused)
7634 {
7635 	int i;
7636 
7637 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7638 
7639 #ifdef HN_IFSTART_SUPPORT
7640 	/*
7641 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7642 	 * mainly due to the IFF_DRV_OACTIVE flag.
7643 	 */
7644 	if (hn_xpnt_vf && hn_use_if_start) {
7645 		hn_use_if_start = 0;
7646 		printf("hn: tranparent VF mode, if_transmit will be used, "
7647 		    "instead of if_start\n");
7648 	}
7649 #endif
7650 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7651 		printf("hn: invalid transparent VF attach routing "
7652 		    "wait timeout %d, reset to %d\n",
7653 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7654 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7655 	}
7656 
7657 	/*
7658 	 * Initialize VF map.
7659 	 */
7660 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7661 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7662 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7663 	    M_WAITOK | M_ZERO);
7664 
7665 	/*
7666 	 * Fix the # of TX taskqueues.
7667 	 */
7668 	if (hn_tx_taskq_cnt <= 0)
7669 		hn_tx_taskq_cnt = 1;
7670 	else if (hn_tx_taskq_cnt > mp_ncpus)
7671 		hn_tx_taskq_cnt = mp_ncpus;
7672 
7673 	/*
7674 	 * Fix the TX taskqueue mode.
7675 	 */
7676 	switch (hn_tx_taskq_mode) {
7677 	case HN_TX_TASKQ_M_INDEP:
7678 	case HN_TX_TASKQ_M_GLOBAL:
7679 	case HN_TX_TASKQ_M_EVTTQ:
7680 		break;
7681 	default:
7682 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7683 		break;
7684 	}
7685 
7686 	if (vm_guest != VM_GUEST_HV)
7687 		return;
7688 
7689 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7690 		return;
7691 
7692 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7693 	    M_DEVBUF, M_WAITOK);
7694 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7695 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7696 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7697 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7698 		    "hn tx%d", i);
7699 	}
7700 }
7701 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7702 
7703 static void
7704 hn_sysuninit(void *arg __unused)
7705 {
7706 
7707 	if (hn_tx_taskque != NULL) {
7708 		int i;
7709 
7710 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7711 			taskqueue_free(hn_tx_taskque[i]);
7712 		free(hn_tx_taskque, M_DEVBUF);
7713 	}
7714 
7715 	if (hn_vfmap != NULL)
7716 		free(hn_vfmap, M_DEVBUF);
7717 	rm_destroy(&hn_vfmap_lock);
7718 
7719 	counter_u64_free(hn_udpcs_fixup);
7720 }
7721 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7722