xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision b214fcceacad6b842545150664bd2695c1c2b34f)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90 
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
124 #define HN_IFSTART_SUPPORT
125 
126 #define HN_RING_CNT_DEF_MAX		8
127 
128 #define HN_VFMAP_SIZE_DEF		8
129 
130 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
131 
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT			512
134 
135 #define HN_RNDIS_PKT_LEN					\
136 	(sizeof(struct rndis_packet_msg) +			\
137 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
143 
144 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
149 
150 #define HN_DIRECT_TX_SIZE_DEF		128
151 
152 #define HN_EARLY_TXEOF_THRESH		8
153 
154 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
155 
156 #define HN_LROENT_CNT_DEF		128
157 
158 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
162 
163 #define HN_LRO_ACKCNT_DEF		1
164 
165 #define HN_LOCK_INIT(sc)		\
166 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc)					\
170 do {							\
171 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
172 		/* Relinquish cpu to avoid deadlock */	\
173 		sched_relinquish(curthread);		\
174 		DELAY(1000);				\
175 	}						\
176 } while (0)
177 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
179 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc)		\
182 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc)	\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185 
186 #define HN_PKTSIZE_MIN(align)		\
187 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 	    HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align)		\
190 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
191 
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
197 
/*
 * TX descriptor: the per-send state tracked for a packet handed to the
 * NVS send routines (see hn_txpkt_sglist() and hn_txpkt_chim()).
 */
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	/* List linkage; compiled out when HN_USE_TXDESC_BUFRING is defined. */
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	/* Linkage for an agg_list (presumably the aggregating txdesc's). */
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	/* Send context passed to the NVS send routines. */
	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index and size.  For scatter/gather
	 * sends chim_index is HN_NVS_CHIM_IDX_INVALID and chim_size is 0;
	 * for chimney sends both are valid (asserted by
	 * hn_txpkt_sglist() and hn_txpkt_chim() respectively).
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	/* NOTE(review): presumably the bus address of rndis_pkt -- confirm. */
	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};
222 
223 #define HN_TXD_FLAG_ONLIST		0x0001
224 #define HN_TXD_FLAG_DMAMAP		0x0002
225 #define HN_TXD_FLAG_ONAGG		0x0004
226 
227 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
228 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
229 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
/*
 * Packet-info identifier record; its size is exported as
 * NDIS_PKTINFOID_SZ.
 */
struct packet_info_id {
	uint8_t				ver;
	/* NOTE(review): presumably HN_NDIS_PKTINFO_* bits -- confirm. */
	uint8_t				flag;
	uint16_t			pkt_id;
};
236 
237 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
/*
 * Read-only pointers to the per-packet info items of a received packet.
 * NOTE(review): presumably filled in by hn_rndis_rxinfo(), one member
 * per HN_RXINFO_* flag -- confirm against that function.
 */
struct hn_rxinfo {
	const uint32_t			*vlan_info;
	const uint32_t			*csum_info;
	const uint32_t			*hash_info;
	const uint32_t			*hash_value;
	const struct packet_info_id	*pktinfo_id;
};
247 
/*
 * Argument bundle pairing an RX ring with a VF ifnet.
 * NOTE(review): presumably consumed by hn_rxvf_set_task() -- confirm.
 */
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};
252 
253 #define HN_RXINFO_VLAN			0x0001
254 #define HN_RXINFO_CSUM			0x0002
255 #define HN_RXINFO_HASHINF		0x0004
256 #define HN_RXINFO_HASHVAL		0x0008
257 #define HN_RXINFO_PKTINFO_ID		0x0010
258 #define HN_RXINFO_ALL			\
259 	(HN_RXINFO_VLAN |		\
260 	 HN_RXINFO_CSUM |		\
261 	 HN_RXINFO_HASHINF |		\
262 	 HN_RXINFO_HASHVAL |		\
263 	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(struct ifnet *);
276 #endif
277 static int			hn_transmit(struct ifnet *, struct mbuf *);
278 static void			hn_xmit_qflush(struct ifnet *);
279 static int			hn_ifmedia_upd(struct ifnet *);
280 static void			hn_ifmedia_sts(struct ifnet *,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, struct ifnet *, int);
284 static void			hn_ifaddr_event(void *, struct ifnet *);
285 static void			hn_ifnet_attevent(void *, struct ifnet *);
286 static void			hn_ifnet_detevent(void *, struct ifnet *);
287 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const struct ifnet *);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    struct ifnet *, bool);
293 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 #if __FreeBSD_version >= 1100099
329 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 #if __FreeBSD_version < 1100095
335 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
336 #else
337 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
338 #endif
339 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
346 #ifndef RSS
347 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
349 #endif
350 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
360 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
361 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
362 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
363 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
364 
365 static void			hn_stop(struct hn_softc *, bool);
366 static void			hn_init_locked(struct hn_softc *);
367 static int			hn_chan_attach(struct hn_softc *,
368 				    struct vmbus_channel *);
369 static void			hn_chan_detach(struct hn_softc *,
370 				    struct vmbus_channel *);
371 static int			hn_attach_subchans(struct hn_softc *);
372 static void			hn_detach_allchans(struct hn_softc *);
373 static void			hn_chan_rollup(struct hn_rx_ring *,
374 				    struct hn_tx_ring *);
375 static void			hn_set_ring_inuse(struct hn_softc *, int);
376 static int			hn_synth_attach(struct hn_softc *, int);
377 static void			hn_synth_detach(struct hn_softc *);
378 static int			hn_synth_alloc_subchans(struct hn_softc *,
379 				    int *);
380 static bool			hn_synth_attachable(const struct hn_softc *);
381 static void			hn_suspend(struct hn_softc *);
382 static void			hn_suspend_data(struct hn_softc *);
383 static void			hn_suspend_mgmt(struct hn_softc *);
384 static void			hn_resume(struct hn_softc *);
385 static void			hn_resume_data(struct hn_softc *);
386 static void			hn_resume_mgmt(struct hn_softc *);
387 static void			hn_suspend_mgmt_taskfunc(void *, int);
388 static void			hn_chan_drain(struct hn_softc *,
389 				    struct vmbus_channel *);
390 static void			hn_disable_rx(struct hn_softc *);
391 static void			hn_drain_rxtx(struct hn_softc *, int);
392 static void			hn_polling(struct hn_softc *, u_int);
393 static void			hn_chan_polling(struct vmbus_channel *, u_int);
394 static void			hn_mtu_change_fixup(struct hn_softc *);
395 
396 static void			hn_update_link_status(struct hn_softc *);
397 static void			hn_change_network(struct hn_softc *);
398 static void			hn_link_taskfunc(void *, int);
399 static void			hn_netchg_init_taskfunc(void *, int);
400 static void			hn_netchg_status_taskfunc(void *, int);
401 static void			hn_link_status(struct hn_softc *);
402 
403 static int			hn_create_rx_data(struct hn_softc *, int);
404 static void			hn_destroy_rx_data(struct hn_softc *);
405 static int			hn_check_iplen(const struct mbuf *, int);
406 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
407 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
408 static int			hn_rxfilter_config(struct hn_softc *);
409 static int			hn_rss_reconfig(struct hn_softc *);
410 static void			hn_rss_ind_fixup(struct hn_softc *);
411 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
412 static int			hn_rxpkt(struct hn_rx_ring *);
413 static uint32_t			hn_rss_type_fromndis(uint32_t);
414 static uint32_t			hn_rss_type_tondis(uint32_t);
415 
416 static int			hn_tx_ring_create(struct hn_softc *, int);
417 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
418 static int			hn_create_tx_data(struct hn_softc *, int);
419 static void			hn_fixup_tx_data(struct hn_softc *);
420 static void			hn_fixup_rx_data(struct hn_softc *);
421 static void			hn_destroy_tx_data(struct hn_softc *);
422 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
423 static void			hn_txdesc_gc(struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
426 				    struct hn_txdesc *, struct mbuf **);
427 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static void			hn_set_chim_size(struct hn_softc *, int);
430 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
431 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
432 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
433 static void			hn_resume_tx(struct hn_softc *, int);
434 static void			hn_set_txagg(struct hn_softc *);
435 static void			*hn_try_txagg(struct ifnet *,
436 				    struct hn_tx_ring *, struct hn_txdesc *,
437 				    int);
438 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
439 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
440 				    struct hn_softc *, struct vmbus_channel *,
441 				    const void *, int);
442 static int			hn_txpkt_sglist(struct hn_tx_ring *,
443 				    struct hn_txdesc *);
444 static int			hn_txpkt_chim(struct hn_tx_ring *,
445 				    struct hn_txdesc *);
446 static int			hn_xmit(struct hn_tx_ring *, int);
447 static void			hn_xmit_taskfunc(void *, int);
448 static void			hn_xmit_txeof(struct hn_tx_ring *);
449 static void			hn_xmit_txeof_taskfunc(void *, int);
450 #ifdef HN_IFSTART_SUPPORT
451 static int			hn_start_locked(struct hn_tx_ring *, int);
452 static void			hn_start_taskfunc(void *, int);
453 static void			hn_start_txeof(struct hn_tx_ring *);
454 static void			hn_start_txeof_taskfunc(void *, int);
455 #endif
456 
457 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
458     "Hyper-V network interface");
459 
460 /* Trust tcp segment verification on host side. */
461 static int			hn_trust_hosttcp = 1;
462 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
463     &hn_trust_hosttcp, 0,
464     "Trust tcp segment verification on host side, "
465     "when csum info is missing (global setting)");
466 
467 /* Trust udp datagram verification on host side. */
468 static int			hn_trust_hostudp = 1;
469 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
470     &hn_trust_hostudp, 0,
471     "Trust udp datagram verification on host side, "
472     "when csum info is missing (global setting)");
473 
474 /* Trust ip packet verification on host side. */
475 static int			hn_trust_hostip = 1;
476 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
477     &hn_trust_hostip, 0,
478     "Trust ip packet verification on host side, "
479     "when csum info is missing (global setting)");
480 
481 /*
482  * Offload UDP/IPv4 checksum.
483  */
484 static int			hn_enable_udp4cs = 1;
485 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
486     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
487 
488 /*
489  * Offload UDP/IPv6 checksum.
490  */
491 static int			hn_enable_udp6cs = 1;
492 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
493     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
494 
495 /* Stats. */
496 static counter_u64_t		hn_udpcs_fixup;
497 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
498     &hn_udpcs_fixup, "# of UDP checksum fixup");
499 
500 /*
501  * See hn_set_hlen().
502  *
503  * This value is for Azure.  For Hyper-V, set this above
504  * 65536 to disable UDP datagram checksum fixup.
505  */
506 static int			hn_udpcs_fixup_mtu = 1420;
507 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
508     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
509 
510 /* Limit TSO burst size */
511 static int			hn_tso_maxlen = IP_MAXPACKET;
512 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
513     &hn_tso_maxlen, 0, "TSO burst limit");
514 
515 /* Limit chimney send size */
516 static int			hn_tx_chimney_size = 0;
517 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
518     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
519 
520 /* Limit the size of packet for direct transmission */
521 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
522 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
523     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
524 
525 /* # of LRO entries per RX ring */
526 #if defined(INET) || defined(INET6)
527 #if __FreeBSD_version >= 1100095
528 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
529 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
530     &hn_lro_entry_count, 0, "LRO entry count");
531 #endif
532 #endif
533 
534 static int			hn_tx_taskq_cnt = 1;
535 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
536     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
537 
538 #define HN_TX_TASKQ_M_INDEP	0
539 #define HN_TX_TASKQ_M_GLOBAL	1
540 #define HN_TX_TASKQ_M_EVTTQ	2
541 
542 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
543 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
544     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
545     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
546 
547 #ifndef HN_USE_TXDESC_BUFRING
548 static int			hn_use_txdesc_bufring = 0;
549 #else
550 static int			hn_use_txdesc_bufring = 1;
551 #endif
552 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
553     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
554 
555 #ifdef HN_IFSTART_SUPPORT
556 /* Use ifnet.if_start instead of ifnet.if_transmit */
557 static int			hn_use_if_start = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
559     &hn_use_if_start, 0, "Use if_start TX method");
560 #endif
561 
562 /* # of channels to use */
563 static int			hn_chan_cnt = 0;
564 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
565     &hn_chan_cnt, 0,
566     "# of channels to use; each channel has one RX ring and one TX ring");
567 
568 /* # of transmit rings to use */
569 static int			hn_tx_ring_cnt = 0;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
571     &hn_tx_ring_cnt, 0, "# of TX rings to use");
572 
573 /* Software TX ring depth */
574 static int			hn_tx_swq_depth = 0;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
576     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
577 
578 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
579 #if __FreeBSD_version >= 1100095
580 static u_int			hn_lro_mbufq_depth = 0;
581 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
582     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
583 #endif
584 
585 /* Packet transmission aggregation size limit */
586 static int			hn_tx_agg_size = -1;
587 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
588     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
589 
590 /* Packet transmission aggregation count limit */
591 static int			hn_tx_agg_pkts = -1;
592 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
593     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
594 
595 /* VF list */
596 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
597     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
598     hn_vflist_sysctl, "A",
599     "VF list");
600 
601 /* VF mapping */
602 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
603     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
604     hn_vfmap_sysctl, "A",
605     "VF mapping");
606 
607 /* Transparent VF */
608 static int			hn_xpnt_vf = 1;
609 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
610     &hn_xpnt_vf, 0, "Transparent VF mod");
611 
612 /* Accurate BPF support for Transparent VF */
613 static int			hn_xpnt_vf_accbpf = 0;
614 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
615     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
616 
617 /* Extra wait for transparent VF attach routing; unit seconds. */
618 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
619 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
620     &hn_xpnt_vf_attwait, 0,
621     "Extra wait for transparent VF attach routing; unit: seconds");
622 
623 static u_int			hn_cpu_index;	/* next CPU for channel */
624 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
625 
626 static struct rmlock		hn_vfmap_lock;
627 static int			hn_vfmap_size;
628 static struct ifnet		**hn_vfmap;
629 
630 #ifndef RSS
631 static const uint8_t
632 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
633 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
634 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
635 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
636 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
637 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
638 };
639 #endif	/* !RSS */
640 
641 static const struct hyperv_guid	hn_guid = {
642 	.hv_guid = {
643 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
644 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
645 };
646 
647 static device_method_t hn_methods[] = {
648 	/* Device interface */
649 	DEVMETHOD(device_probe,		hn_probe),
650 	DEVMETHOD(device_attach,	hn_attach),
651 	DEVMETHOD(device_detach,	hn_detach),
652 	DEVMETHOD(device_shutdown,	hn_shutdown),
653 	DEVMETHOD_END
654 };
655 
656 static driver_t hn_driver = {
657 	"hn",
658 	hn_methods,
659 	sizeof(struct hn_softc)
660 };
661 
662 static devclass_t hn_devclass;
663 
664 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
665 MODULE_VERSION(hn, 1);
666 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
667 
668 #if __FreeBSD_version >= 1100099
669 static void
670 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
671 {
672 	int i;
673 
674 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
675 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
676 }
677 #endif
678 
679 static int
680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 
683 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
684 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
685 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
686 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
687 }
688 
689 static int
690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
691 {
692 	struct hn_nvs_rndis rndis;
693 
694 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
695 	    txd->chim_size > 0, ("invalid rndis chim txd"));
696 
697 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
698 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
699 	rndis.nvs_chim_idx = txd->chim_index;
700 	rndis.nvs_chim_sz = txd->chim_size;
701 
702 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
703 	    &rndis, sizeof(rndis), &txd->send_ctx));
704 }
705 
706 static __inline uint32_t
707 hn_chim_alloc(struct hn_softc *sc)
708 {
709 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
710 	u_long *bmap = sc->hn_chim_bmap;
711 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
712 
713 	for (i = 0; i < bmap_cnt; ++i) {
714 		int idx;
715 
716 		idx = ffsl(~bmap[i]);
717 		if (idx == 0)
718 			continue;
719 
720 		--idx; /* ffsl is 1-based */
721 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
722 		    ("invalid i %d and idx %d", i, idx));
723 
724 		if (atomic_testandset_long(&bmap[i], idx))
725 			continue;
726 
727 		ret = i * LONG_BIT + idx;
728 		break;
729 	}
730 	return (ret);
731 }
732 
733 static __inline void
734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
735 {
736 	u_long mask;
737 	uint32_t idx;
738 
739 	idx = chim_idx / LONG_BIT;
740 	KASSERT(idx < sc->hn_chim_bmap_cnt,
741 	    ("invalid chimney index 0x%x", chim_idx));
742 
743 	mask = 1UL << (chim_idx % LONG_BIT);
744 	KASSERT(sc->hn_chim_bmap[idx] & mask,
745 	    ("index bitmap 0x%lx, chimney index %u, "
746 	     "bitmap idx %d, bitmask 0x%lx",
747 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
748 
749 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
750 }
751 
752 #if defined(INET6) || defined(INET)
753 
/*
 * Make sure the first 'len' bytes of mbuf chain 'm' are contiguous in
 * its leading mbuf, calling m_pullup() only when they are not.
 *
 * NOTE: 'm' is reassigned, and on m_pullup() failure this macro makes
 * the *enclosing function* return NULL.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len)) &&	\
	    ((m) = m_pullup((m), (len))) == NULL)	\
		return (NULL);				\
} while (0)
762 
763 /*
764  * NOTE: If this function failed, the m_head would be freed.
765  */
766 static __inline struct mbuf *
767 hn_tso_fixup(struct mbuf *m_head)
768 {
769 	struct ether_vlan_header *evl;
770 	struct tcphdr *th;
771 	int ehlen;
772 
773 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
774 
775 	PULLUP_HDR(m_head, sizeof(*evl));
776 	evl = mtod(m_head, struct ether_vlan_header *);
777 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
778 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
779 	else
780 		ehlen = ETHER_HDR_LEN;
781 	m_head->m_pkthdr.l2hlen = ehlen;
782 
783 #ifdef INET
784 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
785 		struct ip *ip;
786 		int iphlen;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
789 		ip = mtodo(m_head, ehlen);
790 		iphlen = ip->ip_hl << 2;
791 		m_head->m_pkthdr.l3hlen = iphlen;
792 
793 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
794 		th = mtodo(m_head, ehlen + iphlen);
795 
796 		ip->ip_len = 0;
797 		ip->ip_sum = 0;
798 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
799 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
800 	}
801 #endif
802 #if defined(INET6) && defined(INET)
803 	else
804 #endif
805 #ifdef INET6
806 	{
807 		struct ip6_hdr *ip6;
808 
809 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
810 		ip6 = mtodo(m_head, ehlen);
811 		if (ip6->ip6_nxt != IPPROTO_TCP) {
812 			m_freem(m_head);
813 			return (NULL);
814 		}
815 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
816 
817 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
818 		th = mtodo(m_head, ehlen + sizeof(*ip6));
819 
820 		ip6->ip6_plen = 0;
821 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
822 	}
823 #endif
824 	return (m_head);
825 }
826 
827 /*
828  * NOTE: If this function failed, the m_head would be freed.
829  */
830 static __inline struct mbuf *
831 hn_set_hlen(struct mbuf *m_head)
832 {
833 	const struct ether_vlan_header *evl;
834 	int ehlen;
835 
836 	PULLUP_HDR(m_head, sizeof(*evl));
837 	evl = mtod(m_head, const struct ether_vlan_header *);
838 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
839 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
840 	else
841 		ehlen = ETHER_HDR_LEN;
842 	m_head->m_pkthdr.l2hlen = ehlen;
843 
844 #ifdef INET
845 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
846 		const struct ip *ip;
847 		int iphlen;
848 
849 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
850 		ip = mtodo(m_head, ehlen);
851 		iphlen = ip->ip_hl << 2;
852 		m_head->m_pkthdr.l3hlen = iphlen;
853 
854 		/*
855 		 * UDP checksum offload does not work in Azure, if the
856 		 * following conditions meet:
857 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
858 		 * - IP_DF is not set in the IP hdr.
859 		 *
860 		 * Fallback to software checksum for these UDP datagrams.
861 		 */
862 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
863 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
864 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
865 			uint16_t off = ehlen + iphlen;
866 
867 			counter_u64_add(hn_udpcs_fixup, 1);
868 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
869 			*(uint16_t *)(m_head->m_data + off +
870                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
871 			    m_head, m_head->m_pkthdr.len, off);
872 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
873 		}
874 	}
875 #endif
876 #if defined(INET6) && defined(INET)
877 	else
878 #endif
879 #ifdef INET6
880 	{
881 		const struct ip6_hdr *ip6;
882 
883 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
884 		ip6 = mtodo(m_head, ehlen);
885 		if (ip6->ip6_nxt != IPPROTO_TCP &&
886 		    ip6->ip6_nxt != IPPROTO_UDP) {
887 			m_freem(m_head);
888 			return (NULL);
889 		}
890 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
891 	}
892 #endif
893 	return (m_head);
894 }
895 
896 /*
897  * NOTE: If this function failed, the m_head would be freed.
898  */
899 static __inline struct mbuf *
900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
901 {
902 	const struct tcphdr *th;
903 	int ehlen, iphlen;
904 
905 	*tcpsyn = 0;
906 	ehlen = m_head->m_pkthdr.l2hlen;
907 	iphlen = m_head->m_pkthdr.l3hlen;
908 
909 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
910 	th = mtodo(m_head, ehlen + iphlen);
911 	if (th->th_flags & TH_SYN)
912 		*tcpsyn = 1;
913 	return (m_head);
914 }
915 
916 #undef PULLUP_HDR
917 
918 #endif	/* INET6 || INET */
919 
920 static int
921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
922 {
923 	int error = 0;
924 
925 	HN_LOCK_ASSERT(sc);
926 
927 	if (sc->hn_rx_filter != filter) {
928 		error = hn_rndis_set_rxfilter(sc, filter);
929 		if (!error)
930 			sc->hn_rx_filter = filter;
931 	}
932 	return (error);
933 }
934 
935 static int
936 hn_rxfilter_config(struct hn_softc *sc)
937 {
938 	struct ifnet *ifp = sc->hn_ifp;
939 	uint32_t filter;
940 
941 	HN_LOCK_ASSERT(sc);
942 
943 	/*
944 	 * If the non-transparent mode VF is activated, we don't know how
945 	 * its RX filter is configured, so stick the synthetic device in
946 	 * the promiscous mode.
947 	 */
948 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
949 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
950 	} else {
951 		filter = NDIS_PACKET_TYPE_DIRECTED;
952 		if (ifp->if_flags & IFF_BROADCAST)
953 			filter |= NDIS_PACKET_TYPE_BROADCAST;
954 		/* TODO: support multicast list */
955 		if ((ifp->if_flags & IFF_ALLMULTI) ||
956 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
957 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
958 	}
959 	return (hn_set_rxfilter(sc, filter));
960 }
961 
962 static void
963 hn_set_txagg(struct hn_softc *sc)
964 {
965 	uint32_t size, pkts;
966 	int i;
967 
968 	/*
969 	 * Setup aggregation size.
970 	 */
971 	if (sc->hn_agg_size < 0)
972 		size = UINT32_MAX;
973 	else
974 		size = sc->hn_agg_size;
975 
976 	if (sc->hn_rndis_agg_size < size)
977 		size = sc->hn_rndis_agg_size;
978 
979 	/* NOTE: We only aggregate packets using chimney sending buffers. */
980 	if (size > (uint32_t)sc->hn_chim_szmax)
981 		size = sc->hn_chim_szmax;
982 
983 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
984 		/* Disable */
985 		size = 0;
986 		pkts = 0;
987 		goto done;
988 	}
989 
990 	/* NOTE: Type of the per TX ring setting is 'int'. */
991 	if (size > INT_MAX)
992 		size = INT_MAX;
993 
994 	/*
995 	 * Setup aggregation packet count.
996 	 */
997 	if (sc->hn_agg_pkts < 0)
998 		pkts = UINT32_MAX;
999 	else
1000 		pkts = sc->hn_agg_pkts;
1001 
1002 	if (sc->hn_rndis_agg_pkts < pkts)
1003 		pkts = sc->hn_rndis_agg_pkts;
1004 
1005 	if (pkts <= 1) {
1006 		/* Disable */
1007 		size = 0;
1008 		pkts = 0;
1009 		goto done;
1010 	}
1011 
1012 	/* NOTE: Type of the per TX ring setting is 'short'. */
1013 	if (pkts > SHRT_MAX)
1014 		pkts = SHRT_MAX;
1015 
1016 done:
1017 	/* NOTE: Type of the per TX ring setting is 'short'. */
1018 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1019 		/* Disable */
1020 		size = 0;
1021 		pkts = 0;
1022 	}
1023 
1024 	if (bootverbose) {
1025 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1026 		    size, pkts, sc->hn_rndis_agg_align);
1027 	}
1028 
1029 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1030 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1031 
1032 		mtx_lock(&txr->hn_tx_lock);
1033 		txr->hn_agg_szmax = size;
1034 		txr->hn_agg_pktmax = pkts;
1035 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1036 		mtx_unlock(&txr->hn_tx_lock);
1037 	}
1038 }
1039 
1040 static int
1041 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1042 {
1043 
1044 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1045 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1046 		return txr->hn_txdesc_cnt;
1047 	return hn_tx_swq_depth;
1048 }
1049 
1050 static int
1051 hn_rss_reconfig(struct hn_softc *sc)
1052 {
1053 	int error;
1054 
1055 	HN_LOCK_ASSERT(sc);
1056 
1057 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1058 		return (ENXIO);
1059 
1060 	/*
1061 	 * Disable RSS first.
1062 	 *
1063 	 * NOTE:
1064 	 * Direct reconfiguration by setting the UNCHG flags does
1065 	 * _not_ work properly.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "disable RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1072 		return (error);
1073 	}
1074 
1075 	/*
1076 	 * Reenable the RSS w/ the updated RSS key or indirect
1077 	 * table.
1078 	 */
1079 	if (bootverbose)
1080 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1081 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1082 	if (error) {
1083 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1084 		return (error);
1085 	}
1086 	return (0);
1087 }
1088 
1089 static void
1090 hn_rss_ind_fixup(struct hn_softc *sc)
1091 {
1092 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1093 	int i, nchan;
1094 
1095 	nchan = sc->hn_rx_ring_inuse;
1096 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1097 
1098 	/*
1099 	 * Check indirect table to make sure that all channels in it
1100 	 * can be used.
1101 	 */
1102 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1103 		if (rss->rss_ind[i] >= nchan) {
1104 			if_printf(sc->hn_ifp,
1105 			    "RSS indirect table %d fixup: %u -> %d\n",
1106 			    i, rss->rss_ind[i], nchan - 1);
1107 			rss->rss_ind[i] = nchan - 1;
1108 		}
1109 	}
1110 }
1111 
1112 static int
1113 hn_ifmedia_upd(struct ifnet *ifp __unused)
1114 {
1115 
1116 	return EOPNOTSUPP;
1117 }
1118 
1119 static void
1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1121 {
1122 	struct hn_softc *sc = ifp->if_softc;
1123 
1124 	ifmr->ifm_status = IFM_AVALID;
1125 	ifmr->ifm_active = IFM_ETHER;
1126 
1127 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1128 		ifmr->ifm_active |= IFM_NONE;
1129 		return;
1130 	}
1131 	ifmr->ifm_status |= IFM_ACTIVE;
1132 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1133 }
1134 
1135 static void
1136 hn_rxvf_set_task(void *xarg, int pending __unused)
1137 {
1138 	struct hn_rxvf_setarg *arg = xarg;
1139 
1140 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1141 }
1142 
1143 static void
1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1145 {
1146 	struct hn_rx_ring *rxr;
1147 	struct hn_rxvf_setarg arg;
1148 	struct task task;
1149 	int i;
1150 
1151 	HN_LOCK_ASSERT(sc);
1152 
1153 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1154 
1155 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1156 		rxr = &sc->hn_rx_ring[i];
1157 
1158 		if (i < sc->hn_rx_ring_inuse) {
1159 			arg.rxr = rxr;
1160 			arg.vf_ifp = vf_ifp;
1161 			vmbus_chan_run_task(rxr->hn_chan, &task);
1162 		} else {
1163 			rxr->hn_rxvf_ifp = vf_ifp;
1164 		}
1165 	}
1166 }
1167 
1168 static bool
1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1170 {
1171 	const struct ifnet *hn_ifp;
1172 
1173 	hn_ifp = sc->hn_ifp;
1174 
1175 	if (ifp == hn_ifp)
1176 		return (false);
1177 
1178 	if (ifp->if_alloctype != IFT_ETHER)
1179 		return (false);
1180 
1181 	/* Ignore lagg/vlan interfaces */
1182 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1183 	    strcmp(ifp->if_dname, "vlan") == 0)
1184 		return (false);
1185 
1186 	/*
1187 	 * During detach events ifp->if_addr might be NULL.
1188 	 * Make sure the bcmp() below doesn't panic on that:
1189 	 */
1190 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1191 		return (false);
1192 
1193 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1194 		return (false);
1195 
1196 	return (true);
1197 }
1198 
1199 static void
1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1201 {
1202 	struct ifnet *hn_ifp;
1203 
1204 	HN_LOCK(sc);
1205 
1206 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1207 		goto out;
1208 
1209 	if (!hn_ismyvf(sc, ifp))
1210 		goto out;
1211 	hn_ifp = sc->hn_ifp;
1212 
1213 	if (rxvf) {
1214 		if (sc->hn_flags & HN_FLAG_RXVF)
1215 			goto out;
1216 
1217 		sc->hn_flags |= HN_FLAG_RXVF;
1218 		hn_rxfilter_config(sc);
1219 	} else {
1220 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1221 			goto out;
1222 
1223 		sc->hn_flags &= ~HN_FLAG_RXVF;
1224 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1225 			hn_rxfilter_config(sc);
1226 		else
1227 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1228 	}
1229 
1230 	hn_nvs_set_datapath(sc,
1231 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1232 
1233 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1234 
1235 	if (rxvf) {
1236 		hn_vf_rss_fixup(sc, true);
1237 		hn_suspend_mgmt(sc);
1238 		sc->hn_link_flags &=
1239 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1240 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1241 	} else {
1242 		hn_vf_rss_restore(sc);
1243 		hn_resume_mgmt(sc);
1244 	}
1245 
1246 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1247 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1248 
1249 	if (bootverbose) {
1250 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1251 		    rxvf ? "to" : "from", ifp->if_xname);
1252 	}
1253 out:
1254 	HN_UNLOCK(sc);
1255 }
1256 
1257 static void
1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1259 {
1260 
1261 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1262 		return;
1263 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1264 }
1265 
1266 static void
1267 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1268 {
1269 
1270 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1271 }
1272 
1273 static int
1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1275 {
1276 	struct ifnet *ifp, *vf_ifp;
1277 	uint64_t tmp;
1278 	int error;
1279 
1280 	HN_LOCK_ASSERT(sc);
1281 	ifp = sc->hn_ifp;
1282 	vf_ifp = sc->hn_vf_ifp;
1283 
1284 	/*
1285 	 * Fix up requested capabilities w/ supported capabilities,
1286 	 * since the supported capabilities could have been changed.
1287 	 */
1288 	ifr->ifr_reqcap &= ifp->if_capabilities;
1289 	/* Pass SIOCSIFCAP to VF. */
1290 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1291 
1292 	/*
1293 	 * NOTE:
1294 	 * The error will be propagated to the callers, however, it
1295 	 * is _not_ useful here.
1296 	 */
1297 
1298 	/*
1299 	 * Merge VF's enabled capabilities.
1300 	 */
1301 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1302 
1303 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1304 	if (ifp->if_capenable & IFCAP_TXCSUM)
1305 		ifp->if_hwassist |= tmp;
1306 	else
1307 		ifp->if_hwassist &= ~tmp;
1308 
1309 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1310 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1311 		ifp->if_hwassist |= tmp;
1312 	else
1313 		ifp->if_hwassist &= ~tmp;
1314 
1315 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1316 	if (ifp->if_capenable & IFCAP_TSO4)
1317 		ifp->if_hwassist |= tmp;
1318 	else
1319 		ifp->if_hwassist &= ~tmp;
1320 
1321 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1322 	if (ifp->if_capenable & IFCAP_TSO6)
1323 		ifp->if_hwassist |= tmp;
1324 	else
1325 		ifp->if_hwassist &= ~tmp;
1326 
1327 	return (error);
1328 }
1329 
1330 static int
1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1332 {
1333 	struct ifnet *vf_ifp;
1334 	struct ifreq ifr;
1335 
1336 	HN_LOCK_ASSERT(sc);
1337 	vf_ifp = sc->hn_vf_ifp;
1338 
1339 	memset(&ifr, 0, sizeof(ifr));
1340 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1341 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1342 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1343 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1344 }
1345 
1346 static void
1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1348 {
1349 	struct ifnet *ifp = sc->hn_ifp;
1350 	int allmulti = 0;
1351 
1352 	HN_LOCK_ASSERT(sc);
1353 
1354 	/* XXX vlan(4) style mcast addr maintenance */
1355 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1356 		allmulti = IFF_ALLMULTI;
1357 
1358 	/* Always set the VF's if_flags */
1359 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1360 }
1361 
1362 static void
1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1364 {
1365 	struct rm_priotracker pt;
1366 	struct ifnet *hn_ifp = NULL;
1367 	struct mbuf *mn;
1368 
1369 	/*
1370 	 * XXX racy, if hn(4) ever detached.
1371 	 */
1372 	rm_rlock(&hn_vfmap_lock, &pt);
1373 	if (vf_ifp->if_index < hn_vfmap_size)
1374 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1375 	rm_runlock(&hn_vfmap_lock, &pt);
1376 
1377 	if (hn_ifp != NULL) {
1378 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1379 			/*
1380 			 * Allow tapping on the VF.
1381 			 */
1382 			ETHER_BPF_MTAP(vf_ifp, mn);
1383 
1384 			/*
1385 			 * Update VF stats.
1386 			 */
1387 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1388 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1389 				    mn->m_pkthdr.len);
1390 			}
1391 			/*
1392 			 * XXX IFCOUNTER_IMCAST
1393 			 * This stat updating is kinda invasive, since it
1394 			 * requires two checks on the mbuf: the length check
1395 			 * and the ethernet header check.  As of this write,
1396 			 * all multicast packets go directly to hn(4), which
1397 			 * makes imcast stat updating in the VF a try in vian.
1398 			 */
1399 
1400 			/*
1401 			 * Fix up rcvif and increase hn(4)'s ipackets.
1402 			 */
1403 			mn->m_pkthdr.rcvif = hn_ifp;
1404 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1405 		}
1406 		/*
1407 		 * Go through hn(4)'s if_input.
1408 		 */
1409 		hn_ifp->if_input(hn_ifp, m);
1410 	} else {
1411 		/*
1412 		 * In the middle of the transition; free this
1413 		 * mbuf chain.
1414 		 */
1415 		while (m != NULL) {
1416 			mn = m->m_nextpkt;
1417 			m->m_nextpkt = NULL;
1418 			m_freem(m);
1419 			m = mn;
1420 		}
1421 	}
1422 }
1423 
1424 static void
1425 hn_mtu_change_fixup(struct hn_softc *sc)
1426 {
1427 	struct ifnet *ifp;
1428 
1429 	HN_LOCK_ASSERT(sc);
1430 	ifp = sc->hn_ifp;
1431 
1432 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1433 #if __FreeBSD_version >= 1100099
1434 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1435 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1436 #endif
1437 }
1438 
1439 static uint32_t
1440 hn_rss_type_fromndis(uint32_t rss_hash)
1441 {
1442 	uint32_t types = 0;
1443 
1444 	if (rss_hash & NDIS_HASH_IPV4)
1445 		types |= RSS_TYPE_IPV4;
1446 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1447 		types |= RSS_TYPE_TCP_IPV4;
1448 	if (rss_hash & NDIS_HASH_IPV6)
1449 		types |= RSS_TYPE_IPV6;
1450 	if (rss_hash & NDIS_HASH_IPV6_EX)
1451 		types |= RSS_TYPE_IPV6_EX;
1452 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1453 		types |= RSS_TYPE_TCP_IPV6;
1454 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1455 		types |= RSS_TYPE_TCP_IPV6_EX;
1456 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1457 		types |= RSS_TYPE_UDP_IPV4;
1458 	return (types);
1459 }
1460 
1461 static uint32_t
1462 hn_rss_type_tondis(uint32_t types)
1463 {
1464 	uint32_t rss_hash = 0;
1465 
1466 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1467 	    ("UDP6 and UDP6EX are not supported"));
1468 
1469 	if (types & RSS_TYPE_IPV4)
1470 		rss_hash |= NDIS_HASH_IPV4;
1471 	if (types & RSS_TYPE_TCP_IPV4)
1472 		rss_hash |= NDIS_HASH_TCP_IPV4;
1473 	if (types & RSS_TYPE_IPV6)
1474 		rss_hash |= NDIS_HASH_IPV6;
1475 	if (types & RSS_TYPE_IPV6_EX)
1476 		rss_hash |= NDIS_HASH_IPV6_EX;
1477 	if (types & RSS_TYPE_TCP_IPV6)
1478 		rss_hash |= NDIS_HASH_TCP_IPV6;
1479 	if (types & RSS_TYPE_TCP_IPV6_EX)
1480 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1481 	if (types & RSS_TYPE_UDP_IPV4)
1482 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1483 	return (rss_hash);
1484 }
1485 
1486 static void
1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1488 {
1489 	int i;
1490 
1491 	HN_LOCK_ASSERT(sc);
1492 
1493 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1494 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1495 }
1496 
1497 static void
1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1499 {
1500 	struct ifnet *ifp, *vf_ifp;
1501 	struct ifrsshash ifrh;
1502 	struct ifrsskey ifrk;
1503 	int error;
1504 	uint32_t my_types, diff_types, mbuf_types = 0;
1505 
1506 	HN_LOCK_ASSERT(sc);
1507 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1508 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1509 
1510 	if (sc->hn_rx_ring_inuse == 1) {
1511 		/* No RSS on synthetic parts; done. */
1512 		return;
1513 	}
1514 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1515 		/* Synthetic parts do not support Toeplitz; done. */
1516 		return;
1517 	}
1518 
1519 	ifp = sc->hn_ifp;
1520 	vf_ifp = sc->hn_vf_ifp;
1521 
1522 	/*
1523 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1524 	 * supported.
1525 	 */
1526 	memset(&ifrk, 0, sizeof(ifrk));
1527 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1528 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1529 	if (error) {
1530 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1531 		    vf_ifp->if_xname, error);
1532 		goto done;
1533 	}
1534 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1535 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1536 		    vf_ifp->if_xname, ifrk.ifrk_func);
1537 		goto done;
1538 	}
1539 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1540 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1541 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1542 		goto done;
1543 	}
1544 
1545 	/*
1546 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1547 	 */
1548 	memset(&ifrh, 0, sizeof(ifrh));
1549 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1550 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1551 	if (error) {
1552 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1553 		    vf_ifp->if_xname, error);
1554 		goto done;
1555 	}
1556 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1557 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1558 		    vf_ifp->if_xname, ifrh.ifrh_func);
1559 		goto done;
1560 	}
1561 
1562 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1563 	if ((ifrh.ifrh_types & my_types) == 0) {
1564 		/* This disables RSS; ignore it then */
1565 		if_printf(ifp, "%s intersection of RSS types failed.  "
1566 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1567 		    ifrh.ifrh_types, my_types);
1568 		goto done;
1569 	}
1570 
1571 	diff_types = my_types ^ ifrh.ifrh_types;
1572 	my_types &= ifrh.ifrh_types;
1573 	mbuf_types = my_types;
1574 
1575 	/*
1576 	 * Detect RSS hash value/type confliction.
1577 	 *
1578 	 * NOTE:
1579 	 * We don't disable the hash type, but stop delivery the hash
1580 	 * value/type through mbufs on RX path.
1581 	 *
1582 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1583 	 * hash is delivered with type of TCP_IPV4.  This means if
1584 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1585 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1586 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1587 	 * here.
1588 	 */
1589 	if ((my_types & RSS_TYPE_IPV4) &&
1590 	    (diff_types & ifrh.ifrh_types &
1591 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1592 		/* Conflict; disable IPV4 hash type/value delivery. */
1593 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1594 		mbuf_types &= ~RSS_TYPE_IPV4;
1595 	}
1596 	if ((my_types & RSS_TYPE_IPV6) &&
1597 	    (diff_types & ifrh.ifrh_types &
1598 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1599 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1600 	      RSS_TYPE_IPV6_EX))) {
1601 		/* Conflict; disable IPV6 hash type/value delivery. */
1602 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1603 		mbuf_types &= ~RSS_TYPE_IPV6;
1604 	}
1605 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1606 	    (diff_types & ifrh.ifrh_types &
1607 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1608 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1609 	      RSS_TYPE_IPV6))) {
1610 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1611 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1612 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1613 	}
1614 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1615 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1616 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1617 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1618 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1619 	}
1620 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1621 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1622 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1623 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1624 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1625 	}
1626 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1627 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1628 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1629 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1630 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1631 	}
1632 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1633 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1634 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1635 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1636 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1637 	}
1638 
1639 	/*
1640 	 * Indirect table does not matter.
1641 	 */
1642 
1643 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1644 	    hn_rss_type_tondis(my_types);
1645 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1646 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1647 
1648 	if (reconf) {
1649 		error = hn_rss_reconfig(sc);
1650 		if (error) {
1651 			/* XXX roll-back? */
1652 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1653 			/* XXX keep going. */
1654 		}
1655 	}
1656 done:
1657 	/* Hash deliverability for mbufs. */
1658 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1659 }
1660 
1661 static void
1662 hn_vf_rss_restore(struct hn_softc *sc)
1663 {
1664 
1665 	HN_LOCK_ASSERT(sc);
1666 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1667 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1668 
1669 	if (sc->hn_rx_ring_inuse == 1)
1670 		goto done;
1671 
1672 	/*
1673 	 * Restore hash types.  Key does _not_ matter.
1674 	 */
1675 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1676 		int error;
1677 
1678 		sc->hn_rss_hash = sc->hn_rss_hcap;
1679 		error = hn_rss_reconfig(sc);
1680 		if (error) {
1681 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1682 			    error);
1683 			/* XXX keep going. */
1684 		}
1685 	}
1686 done:
1687 	/* Hash deliverability for mbufs. */
1688 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1689 }
1690 
1691 static void
1692 hn_xpnt_vf_setready(struct hn_softc *sc)
1693 {
1694 	struct ifnet *ifp, *vf_ifp;
1695 	struct ifreq ifr;
1696 
1697 	HN_LOCK_ASSERT(sc);
1698 	ifp = sc->hn_ifp;
1699 	vf_ifp = sc->hn_vf_ifp;
1700 
1701 	/*
1702 	 * Mark the VF ready.
1703 	 */
1704 	sc->hn_vf_rdytick = 0;
1705 
1706 	/*
1707 	 * Save information for restoration.
1708 	 */
1709 	sc->hn_saved_caps = ifp->if_capabilities;
1710 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1711 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1712 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1713 
1714 	/*
1715 	 * Intersect supported/enabled capabilities.
1716 	 *
1717 	 * NOTE:
1718 	 * if_hwassist is not changed here.
1719 	 */
1720 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1721 	ifp->if_capenable &= ifp->if_capabilities;
1722 
1723 	/*
1724 	 * Fix TSO settings.
1725 	 */
1726 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1727 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1728 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1729 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1730 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1731 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1732 
1733 	/*
1734 	 * Change VF's enabled capabilities.
1735 	 */
1736 	memset(&ifr, 0, sizeof(ifr));
1737 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1738 	ifr.ifr_reqcap = ifp->if_capenable;
1739 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1740 
1741 	if (ifp->if_mtu != ETHERMTU) {
1742 		int error;
1743 
1744 		/*
1745 		 * Change VF's MTU.
1746 		 */
1747 		memset(&ifr, 0, sizeof(ifr));
1748 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1749 		ifr.ifr_mtu = ifp->if_mtu;
1750 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1751 		if (error) {
1752 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1753 			    vf_ifp->if_xname, ifp->if_mtu);
1754 			if (ifp->if_mtu > ETHERMTU) {
1755 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1756 
1757 				/*
1758 				 * XXX
1759 				 * No need to adjust the synthetic parts' MTU;
1760 				 * failure of the adjustment will cause us
1761 				 * infinite headache.
1762 				 */
1763 				ifp->if_mtu = ETHERMTU;
1764 				hn_mtu_change_fixup(sc);
1765 			}
1766 		}
1767 	}
1768 }
1769 
1770 static bool
1771 hn_xpnt_vf_isready(struct hn_softc *sc)
1772 {
1773 
1774 	HN_LOCK_ASSERT(sc);
1775 
1776 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1777 		return (false);
1778 
1779 	if (sc->hn_vf_rdytick == 0)
1780 		return (true);
1781 
1782 	if (sc->hn_vf_rdytick > ticks)
1783 		return (false);
1784 
1785 	/* Mark VF as ready. */
1786 	hn_xpnt_vf_setready(sc);
1787 	return (true);
1788 }
1789 
1790 static void
1791 hn_xpnt_vf_setenable(struct hn_softc *sc)
1792 {
1793 	int i;
1794 
1795 	HN_LOCK_ASSERT(sc);
1796 
1797 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1798 	rm_wlock(&sc->hn_vf_lock);
1799 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1800 	rm_wunlock(&sc->hn_vf_lock);
1801 
1802 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1803 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1804 }
1805 
1806 static void
1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1808 {
1809 	int i;
1810 
1811 	HN_LOCK_ASSERT(sc);
1812 
1813 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1814 	rm_wlock(&sc->hn_vf_lock);
1815 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1816 	if (clear_vf)
1817 		sc->hn_vf_ifp = NULL;
1818 	rm_wunlock(&sc->hn_vf_lock);
1819 
1820 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1821 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1822 }
1823 
1824 static void
1825 hn_xpnt_vf_init(struct hn_softc *sc)
1826 {
1827 	int error;
1828 
1829 	HN_LOCK_ASSERT(sc);
1830 
1831 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1832 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1833 
1834 	if (bootverbose) {
1835 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1836 		    sc->hn_vf_ifp->if_xname);
1837 	}
1838 
1839 	/*
1840 	 * Bring the VF up.
1841 	 */
1842 	hn_xpnt_vf_saveifflags(sc);
1843 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1844 	error = hn_xpnt_vf_iocsetflags(sc);
1845 	if (error) {
1846 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1847 		    sc->hn_vf_ifp->if_xname, error);
1848 		return;
1849 	}
1850 
1851 	/*
1852 	 * NOTE:
1853 	 * Datapath setting must happen _after_ bringing the VF up.
1854 	 */
1855 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1856 
1857 	/*
1858 	 * NOTE:
1859 	 * Fixup RSS related bits _after_ the VF is brought up, since
1860 	 * many VFs generate RSS key during it's initialization.
1861 	 */
1862 	hn_vf_rss_fixup(sc, true);
1863 
1864 	/* Mark transparent mode VF as enabled. */
1865 	hn_xpnt_vf_setenable(sc);
1866 }
1867 
1868 static void
1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1870 {
1871 	struct hn_softc *sc = xsc;
1872 
1873 	HN_LOCK(sc);
1874 
1875 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1876 		goto done;
1877 	if (sc->hn_vf_ifp == NULL)
1878 		goto done;
1879 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1880 		goto done;
1881 
1882 	if (sc->hn_vf_rdytick != 0) {
1883 		/* Mark VF as ready. */
1884 		hn_xpnt_vf_setready(sc);
1885 	}
1886 
1887 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1888 		/*
1889 		 * Delayed VF initialization.
1890 		 */
1891 		if (bootverbose) {
1892 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1893 			    sc->hn_vf_ifp->if_xname);
1894 		}
1895 		hn_xpnt_vf_init(sc);
1896 	}
1897 done:
1898 	HN_UNLOCK(sc);
1899 }
1900 
1901 static void
1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1903 {
1904 	struct hn_softc *sc = xsc;
1905 
1906 	HN_LOCK(sc);
1907 
1908 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1909 		goto done;
1910 
1911 	if (!hn_ismyvf(sc, ifp))
1912 		goto done;
1913 
1914 	if (sc->hn_vf_ifp != NULL) {
1915 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1916 		    sc->hn_vf_ifp->if_xname);
1917 		goto done;
1918 	}
1919 
1920 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1921 		/*
1922 		 * ifnet.if_start is _not_ supported by transparent
1923 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1924 		 */
1925 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1926 		    "in transparent VF mode.\n", ifp->if_xname);
1927 		goto done;
1928 	}
1929 
1930 	rm_wlock(&hn_vfmap_lock);
1931 
1932 	if (ifp->if_index >= hn_vfmap_size) {
1933 		struct ifnet **newmap;
1934 		int newsize;
1935 
1936 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1937 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1938 		    M_WAITOK | M_ZERO);
1939 
1940 		memcpy(newmap, hn_vfmap,
1941 		    sizeof(struct ifnet *) * hn_vfmap_size);
1942 		free(hn_vfmap, M_DEVBUF);
1943 		hn_vfmap = newmap;
1944 		hn_vfmap_size = newsize;
1945 	}
1946 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1947 	    ("%s: ifindex %d was mapped to %s",
1948 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1949 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1950 
1951 	rm_wunlock(&hn_vfmap_lock);
1952 
1953 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1954 	rm_wlock(&sc->hn_vf_lock);
1955 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1956 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1957 	sc->hn_vf_ifp = ifp;
1958 	rm_wunlock(&sc->hn_vf_lock);
1959 
1960 	if (hn_xpnt_vf) {
1961 		int wait_ticks;
1962 
1963 		/*
1964 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1965 		 * Save vf_ifp's current if_input for later restoration.
1966 		 */
1967 		sc->hn_vf_input = ifp->if_input;
1968 		ifp->if_input = hn_xpnt_vf_input;
1969 
1970 		/*
1971 		 * Stop link status management; use the VF's.
1972 		 */
1973 		hn_suspend_mgmt(sc);
1974 
1975 		/*
1976 		 * Give VF sometime to complete its attach routing.
1977 		 */
1978 		wait_ticks = hn_xpnt_vf_attwait * hz;
1979 		sc->hn_vf_rdytick = ticks + wait_ticks;
1980 
1981 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1982 		    wait_ticks);
1983 	}
1984 done:
1985 	HN_UNLOCK(sc);
1986 }
1987 
/*
 * Interface departure handler; registered for ifnet_departure_event by
 * hn_attach() and also called directly by hn_detach().
 *
 * If 'ifp' is the VF currently attached to this hn(4) instance, undo the
 * VF attachment: restore the VF's if_input, switch the datapath back to
 * the synthetic device if needed, restore the saved capabilities/TSO
 * limits and RSS settings, resume link status management, clear
 * hn_vf_ifp and drop the VF's hn_vfmap[] entry.
 *
 * xsc - the hn_softc registered with the event handler.
 * ifp - the departing interface.
 */
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	/* No VF is attached to this hn(4); nothing to undo. */
	if (sc->hn_vf_ifp == NULL)
		goto done;

	/* The departing interface is not this hn(4)'s VF. */
	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		/* Give the VF back the if_input saved by hn_ifnet_attevent(). */
		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		/*
		 * Switch the datapath back to the synthetic device, if the
		 * transparent VF was enabled and the synthetic parts are
		 * still attached.
		 */
		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilites and the synthetic device's
			 * if_capabilites.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	/* Drop the VF ifindex -> hn(4) ifnet mapping. */
	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
2078 
2079 static void
2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2081 {
2082 	struct hn_softc *sc = xsc;
2083 
2084 	if (sc->hn_vf_ifp == ifp)
2085 		if_link_state_change(sc->hn_ifp, link_state);
2086 }
2087 
2088 static int
2089 hn_probe(device_t dev)
2090 {
2091 
2092 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2093 		device_set_desc(dev, "Hyper-V Network Interface");
2094 		return BUS_PROBE_DEFAULT;
2095 	}
2096 	return ENXIO;
2097 }
2098 
/*
 * Device attach method.
 *
 * Sets up the softc locks and taskqueues, allocates the ifnet, creates
 * the TX/RX rings and the NVS/RNDIS transaction context, attaches the
 * synthetic parts, publishes the per-device sysctl nodes, configures the
 * ifnet capabilities and finally attaches the ethernet interface and
 * registers the ifnet event handlers.
 *
 * Returns 0 on success.  On failure, everything set up so far is torn
 * down through hn_detach() and the error code is returned.
 */
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;
	uint32_t mtu;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		/* Per-device TX taskqueues; freed by hn_detach(). */
		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		/* Share the driver-wide TX taskqueues. */
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	/* The # of TX rings never exceeds the # of RX rings. */
	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

	/* Failing to query the MTU is not fatal; fall back to ETHERMTU. */
	error = hn_rndis_get_mtu(sc, &mtu);
	if (error)
		mtu = ETHERMTU;
	else if (bootverbose)
		device_printf(dev, "RNDIS mtu %u\n", mtu);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX/RX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	hn_fixup_rx_data(sc);

	/*
	 * Publish the per-device sysctl nodes.
	 */
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	if (!hn_xpnt_vf) {
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	} else {
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
		    hn_xpnt_vf_enabled_sysctl, "I",
		    "Transparent VF enabled");
		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
		    hn_xpnt_vf_accbpf_sysctl, "I",
		    "Accurate BPF for transparent VF");
	}

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	/* Derive the TX checksum/TSO capabilities from the first TX ring. */
	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		/*
		 * Lock hn_set_tso_maxsize() to simplify its
		 * internal logic.
		 */
		HN_LOCK(sc);
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		HN_UNLOCK(sc);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}
	/* Shrink the MTU, if RNDIS reported one below ETHERMTU. */
	if (mtu < ETHERMTU) {
		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
		ifp->if_mtu = mtu;
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	/*
	 * Non-transparent VF mode tracks ifnet/ifaddr events; transparent
	 * VF mode tracks link events instead.
	 */
	if (!hn_xpnt_vf) {
		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
	} else {
		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
	}

	/*
	 * NOTE:
	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
	 * since interface's LLADDR is needed; interface LLADDR is not
	 * available when ifnet_arrival event is triggered.
	 */
	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	/*
	 * Detach the synthetic parts here, if they were attached, then
	 * let hn_detach() release everything else.
	 */
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
2498 
/*
 * Device detach method; also used by hn_attach() to clean up after an
 * attach failure.
 *
 * Deregisters the event handlers, detaches any attached VF, stops and
 * detaches the synthetic parts and the ethernet interface (only when the
 * device is fully attached), then releases the rings, taskqueues,
 * transaction context, ifnet and locks.
 *
 * Always returns 0.
 */
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	/* Deregister whichever event handlers hn_attach() registered. */
	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
	if (sc->hn_ifnet_atthand != NULL) {
		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
		    sc->hn_ifnet_atthand);
	}
	if (sc->hn_ifnet_dethand != NULL) {
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    sc->hn_ifnet_dethand);
	}
	if (sc->hn_ifnet_lnkhand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);

	/*
	 * Detach the VF, if one is still attached.
	 *
	 * NOTE(review): the compiler barrier presumably forces a single
	 * read of hn_vf_ifp into the local before it is tested and used;
	 * confirm.
	 */
	vf_ifp = sc->hn_vf_ifp;
	__compiler_membar();
	if (vf_ifp != NULL)
		hn_ifnet_detevent(sc, vf_ifp);

	/* Skipped when called from a failed hn_attach(). */
	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuffs have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	/* Only free TX taskqueues owned by this device, not the shared ones. */
	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);
	if (sc->hn_vf_taskq != NULL)
		taskqueue_free(sc->hn_vf_taskq);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	rm_destroy(&sc->hn_vf_lock);
	return (0);
}
2580 
2581 static int
2582 hn_shutdown(device_t dev)
2583 {
2584 
2585 	return (0);
2586 }
2587 
2588 static void
2589 hn_link_status(struct hn_softc *sc)
2590 {
2591 	uint32_t link_status;
2592 	int error;
2593 
2594 	error = hn_rndis_get_linkstatus(sc, &link_status);
2595 	if (error) {
2596 		/* XXX what to do? */
2597 		return;
2598 	}
2599 
2600 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2601 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2602 	else
2603 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2604 	if_link_state_change(sc->hn_ifp,
2605 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2606 	    LINK_STATE_UP : LINK_STATE_DOWN);
2607 }
2608 
2609 static void
2610 hn_link_taskfunc(void *xsc, int pending __unused)
2611 {
2612 	struct hn_softc *sc = xsc;
2613 
2614 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2615 		return;
2616 	hn_link_status(sc);
2617 }
2618 
2619 static void
2620 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2621 {
2622 	struct hn_softc *sc = xsc;
2623 
2624 	/* Prevent any link status checks from running. */
2625 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2626 
2627 	/*
2628 	 * Fake up a [link down --> link up] state change; 5 seconds
2629 	 * delay is used, which closely simulates miibus reaction
2630 	 * upon link down event.
2631 	 */
2632 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2633 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2634 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2635 	    &sc->hn_netchg_status, 5 * hz);
2636 }
2637 
2638 static void
2639 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2640 {
2641 	struct hn_softc *sc = xsc;
2642 
2643 	/* Re-allow link status checks. */
2644 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2645 	hn_link_status(sc);
2646 }
2647 
2648 static void
2649 hn_update_link_status(struct hn_softc *sc)
2650 {
2651 
2652 	if (sc->hn_mgmt_taskq != NULL)
2653 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2654 }
2655 
2656 static void
2657 hn_change_network(struct hn_softc *sc)
2658 {
2659 
2660 	if (sc->hn_mgmt_taskq != NULL)
2661 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2662 }
2663 
2664 static __inline int
2665 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2666     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2667 {
2668 	struct mbuf *m = *m_head;
2669 	int error;
2670 
2671 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2672 
2673 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2674 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2675 	if (error == EFBIG) {
2676 		struct mbuf *m_new;
2677 
2678 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2679 		if (m_new == NULL)
2680 			return ENOBUFS;
2681 		else
2682 			*m_head = m = m_new;
2683 		txr->hn_tx_collapsed++;
2684 
2685 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2686 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2687 	}
2688 	if (!error) {
2689 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2690 		    BUS_DMASYNC_PREWRITE);
2691 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2692 	}
2693 	return error;
2694 }
2695 
2696 static __inline int
2697 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2698 {
2699 
2700 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2701 	    ("put an onlist txd %#x", txd->flags));
2702 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2703 	    ("put an onagg txd %#x", txd->flags));
2704 
2705 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2706 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2707 		return 0;
2708 
2709 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2710 		struct hn_txdesc *tmp_txd;
2711 
2712 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2713 			int freed __diagused;
2714 
2715 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2716 			    ("resursive aggregation on aggregated txdesc"));
2717 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2718 			    ("not aggregated txdesc"));
2719 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2720 			    ("aggregated txdesc uses dmamap"));
2721 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2722 			    ("aggregated txdesc consumes "
2723 			     "chimney sending buffer"));
2724 			KASSERT(tmp_txd->chim_size == 0,
2725 			    ("aggregated txdesc has non-zero "
2726 			     "chimney sending size"));
2727 
2728 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2729 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2730 			freed = hn_txdesc_put(txr, tmp_txd);
2731 			KASSERT(freed, ("failed to free aggregated txdesc"));
2732 		}
2733 	}
2734 
2735 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2736 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2737 		    ("chim txd uses dmamap"));
2738 		hn_chim_free(txr->hn_sc, txd->chim_index);
2739 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2740 		txd->chim_size = 0;
2741 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2742 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2743 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2744 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2745 		    txd->data_dmap);
2746 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2747 	}
2748 
2749 	if (txd->m != NULL) {
2750 		m_freem(txd->m);
2751 		txd->m = NULL;
2752 	}
2753 
2754 	txd->flags |= HN_TXD_FLAG_ONLIST;
2755 #ifndef HN_USE_TXDESC_BUFRING
2756 	mtx_lock_spin(&txr->hn_txlist_spin);
2757 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2758 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2759 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2760 	txr->hn_txdesc_avail++;
2761 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2762 	mtx_unlock_spin(&txr->hn_txlist_spin);
2763 #else	/* HN_USE_TXDESC_BUFRING */
2764 #ifdef HN_DEBUG
2765 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2766 #endif
2767 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2768 #endif	/* !HN_USE_TXDESC_BUFRING */
2769 
2770 	return 1;
2771 }
2772 
2773 static __inline struct hn_txdesc *
2774 hn_txdesc_get(struct hn_tx_ring *txr)
2775 {
2776 	struct hn_txdesc *txd;
2777 
2778 #ifndef HN_USE_TXDESC_BUFRING
2779 	mtx_lock_spin(&txr->hn_txlist_spin);
2780 	txd = SLIST_FIRST(&txr->hn_txlist);
2781 	if (txd != NULL) {
2782 		KASSERT(txr->hn_txdesc_avail > 0,
2783 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2784 		txr->hn_txdesc_avail--;
2785 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2786 	}
2787 	mtx_unlock_spin(&txr->hn_txlist_spin);
2788 #else
2789 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2790 #endif
2791 
2792 	if (txd != NULL) {
2793 #ifdef HN_USE_TXDESC_BUFRING
2794 #ifdef HN_DEBUG
2795 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2796 #endif
2797 #endif	/* HN_USE_TXDESC_BUFRING */
2798 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2799 		    STAILQ_EMPTY(&txd->agg_list) &&
2800 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2801 		    txd->chim_size == 0 &&
2802 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2803 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2804 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2805 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2806 		txd->refs = 1;
2807 	}
2808 	return txd;
2809 }
2810 
2811 static __inline void
2812 hn_txdesc_hold(struct hn_txdesc *txd)
2813 {
2814 
2815 	/* 0->1 transition will never work */
2816 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2817 	atomic_add_int(&txd->refs, 1);
2818 }
2819 
2820 static __inline void
2821 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2822 {
2823 
2824 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2825 	    ("recursive aggregation on aggregating txdesc"));
2826 
2827 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2828 	    ("already aggregated"));
2829 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2830 	    ("recursive aggregation on to-be-aggregated txdesc"));
2831 
2832 	txd->flags |= HN_TXD_FLAG_ONAGG;
2833 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2834 }
2835 
2836 static bool
2837 hn_tx_ring_pending(struct hn_tx_ring *txr)
2838 {
2839 	bool pending = false;
2840 
2841 #ifndef HN_USE_TXDESC_BUFRING
2842 	mtx_lock_spin(&txr->hn_txlist_spin);
2843 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2844 		pending = true;
2845 	mtx_unlock_spin(&txr->hn_txlist_spin);
2846 #else
2847 	if (!buf_ring_full(txr->hn_txdesc_br))
2848 		pending = true;
2849 #endif
2850 	return (pending);
2851 }
2852 
2853 static __inline void
2854 hn_txeof(struct hn_tx_ring *txr)
2855 {
2856 	txr->hn_has_txeof = 0;
2857 	txr->hn_txeof(txr);
2858 }
2859 
2860 static void
2861 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2862     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2863 {
2864 	struct hn_txdesc *txd = sndc->hn_cbarg;
2865 	struct hn_tx_ring *txr;
2866 
2867 	txr = txd->txr;
2868 	KASSERT(txr->hn_chan == chan,
2869 	    ("channel mismatch, on chan%u, should be chan%u",
2870 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2871 
2872 	txr->hn_has_txeof = 1;
2873 	hn_txdesc_put(txr, txd);
2874 
2875 	++txr->hn_txdone_cnt;
2876 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2877 		txr->hn_txdone_cnt = 0;
2878 		if (txr->hn_oactive)
2879 			hn_txeof(txr);
2880 	}
2881 }
2882 
2883 static void
2884 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2885 {
2886 #if defined(INET) || defined(INET6)
2887 	struct epoch_tracker et;
2888 
2889 	NET_EPOCH_ENTER(et);
2890 	tcp_lro_flush_all(&rxr->hn_lro);
2891 	NET_EPOCH_EXIT(et);
2892 #endif
2893 
2894 	/*
2895 	 * NOTE:
2896 	 * 'txr' could be NULL, if multiple channels and
2897 	 * ifnet.if_start method are enabled.
2898 	 */
2899 	if (txr == NULL || !txr->hn_has_txeof)
2900 		return;
2901 
2902 	txr->hn_txdone_cnt = 0;
2903 	hn_txeof(txr);
2904 }
2905 
2906 static __inline uint32_t
2907 hn_rndis_pktmsg_offset(uint32_t ofs)
2908 {
2909 
2910 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2911 	    ("invalid RNDIS packet msg offset %u", ofs));
2912 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2913 }
2914 
2915 static __inline void *
2916 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2917     size_t pi_dlen, uint32_t pi_type)
2918 {
2919 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2920 	struct rndis_pktinfo *pi;
2921 
2922 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2923 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2924 
2925 	/*
2926 	 * Per-packet-info does not move; it only grows.
2927 	 *
2928 	 * NOTE:
2929 	 * rm_pktinfooffset in this phase counts from the beginning
2930 	 * of rndis_packet_msg.
2931 	 */
2932 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2933 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2934 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2935 	    pkt->rm_pktinfolen);
2936 	pkt->rm_pktinfolen += pi_size;
2937 
2938 	pi->rm_size = pi_size;
2939 	pi->rm_type = pi_type;
2940 	pi->rm_internal = 0;
2941 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2942 
2943 	return (pi->rm_data);
2944 }
2945 
2946 static __inline int
2947 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2948 {
2949 	struct hn_txdesc *txd;
2950 	struct mbuf *m;
2951 	int error, pkts;
2952 
2953 	txd = txr->hn_agg_txd;
2954 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2955 
2956 	/*
2957 	 * Since hn_txpkt() will reset this temporary stat, save
2958 	 * it now, so that oerrors can be updated properly, if
2959 	 * hn_txpkt() ever fails.
2960 	 */
2961 	pkts = txr->hn_stat_pkts;
2962 
2963 	/*
2964 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2965 	 * failure, save it for later freeing, if hn_txpkt() ever
2966 	 * fails.
2967 	 */
2968 	m = txd->m;
2969 	error = hn_txpkt(ifp, txr, txd);
2970 	if (__predict_false(error)) {
2971 		/* txd is freed, but m is not. */
2972 		m_freem(m);
2973 
2974 		txr->hn_flush_failed++;
2975 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2976 	}
2977 
2978 	/* Reset all aggregation states. */
2979 	txr->hn_agg_txd = NULL;
2980 	txr->hn_agg_szleft = 0;
2981 	txr->hn_agg_pktleft = 0;
2982 	txr->hn_agg_prevpkt = NULL;
2983 
2984 	return (error);
2985 }
2986 
2987 static void *
2988 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2989     int pktsize)
2990 {
2991 	void *chim;
2992 
2993 	if (txr->hn_agg_txd != NULL) {
2994 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2995 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2996 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2997 			int olen;
2998 
2999 			/*
3000 			 * Update the previous RNDIS packet's total length,
3001 			 * it can be increased due to the mandatory alignment
3002 			 * padding for this RNDIS packet.  And update the
3003 			 * aggregating txdesc's chimney sending buffer size
3004 			 * accordingly.
3005 			 *
3006 			 * XXX
3007 			 * Zero-out the padding, as required by the RNDIS spec.
3008 			 */
3009 			olen = pkt->rm_len;
3010 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3011 			agg_txd->chim_size += pkt->rm_len - olen;
3012 
3013 			/* Link this txdesc to the parent. */
3014 			hn_txdesc_agg(agg_txd, txd);
3015 
3016 			chim = (uint8_t *)pkt + pkt->rm_len;
3017 			/* Save the current packet for later fixup. */
3018 			txr->hn_agg_prevpkt = chim;
3019 
3020 			txr->hn_agg_pktleft--;
3021 			txr->hn_agg_szleft -= pktsize;
3022 			if (txr->hn_agg_szleft <=
3023 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3024 				/*
3025 				 * Probably can't aggregate more packets,
3026 				 * flush this aggregating txdesc proactively.
3027 				 */
3028 				txr->hn_agg_pktleft = 0;
3029 			}
3030 			/* Done! */
3031 			return (chim);
3032 		}
3033 		hn_flush_txagg(ifp, txr);
3034 	}
3035 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3036 
3037 	txr->hn_tx_chimney_tried++;
3038 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3039 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3040 		return (NULL);
3041 	txr->hn_tx_chimney++;
3042 
3043 	chim = txr->hn_sc->hn_chim +
3044 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3045 
3046 	if (txr->hn_agg_pktmax > 1 &&
3047 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3048 		txr->hn_agg_txd = txd;
3049 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3050 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3051 		txr->hn_agg_prevpkt = chim;
3052 	}
3053 	return (chim);
3054 }
3055 
3056 /*
3057  * NOTE:
3058  * If this function fails, then both txd and m_head0 will be freed.
3059  */
3060 static int
3061 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3062     struct mbuf **m_head0)
3063 {
3064 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3065 	int error, nsegs, i;
3066 	struct mbuf *m_head = *m_head0;
3067 	struct rndis_packet_msg *pkt;
3068 	uint32_t *pi_data;
3069 	void *chim = NULL;
3070 	int pkt_hlen, pkt_size;
3071 
3072 	pkt = txd->rndis_pkt;
3073 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3074 	if (pkt_size < txr->hn_chim_size) {
3075 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3076 		if (chim != NULL)
3077 			pkt = chim;
3078 	} else {
3079 		if (txr->hn_agg_txd != NULL)
3080 			hn_flush_txagg(ifp, txr);
3081 	}
3082 
3083 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3084 	pkt->rm_len = m_head->m_pkthdr.len;
3085 	pkt->rm_dataoffset = 0;
3086 	pkt->rm_datalen = m_head->m_pkthdr.len;
3087 	pkt->rm_oobdataoffset = 0;
3088 	pkt->rm_oobdatalen = 0;
3089 	pkt->rm_oobdataelements = 0;
3090 	pkt->rm_pktinfooffset = sizeof(*pkt);
3091 	pkt->rm_pktinfolen = 0;
3092 	pkt->rm_vchandle = 0;
3093 	pkt->rm_reserved = 0;
3094 
3095 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3096 		/*
3097 		 * Set the hash value for this packet.
3098 		 */
3099 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3100 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3101 
3102 		if (M_HASHTYPE_ISHASH(m_head))
3103 			/*
3104 			 * The flowid field contains the hash value host
3105 			 * set in the rx queue if it is a ip forwarding pkt.
3106 			 * Set the same hash value so host can send on the
3107 			 * cpu it was received.
3108 			 */
3109 			*pi_data = m_head->m_pkthdr.flowid;
3110 		else
3111 			/*
3112 			 * Otherwise just put the tx queue index.
3113 			 */
3114 			*pi_data = txr->hn_tx_idx;
3115 	}
3116 
3117 	if (m_head->m_flags & M_VLANTAG) {
3118 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3119 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3120 		*pi_data = NDIS_VLAN_INFO_MAKE(
3121 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3122 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3123 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3124 	}
3125 
3126 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3127 #if defined(INET6) || defined(INET)
3128 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3129 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3130 #ifdef INET
3131 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3132 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3133 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3134 			    m_head->m_pkthdr.tso_segsz);
3135 		}
3136 #endif
3137 #if defined(INET6) && defined(INET)
3138 		else
3139 #endif
3140 #ifdef INET6
3141 		{
3142 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3143 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3144 			    m_head->m_pkthdr.tso_segsz);
3145 		}
3146 #endif
3147 #endif	/* INET6 || INET */
3148 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3149 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3150 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3151 		if (m_head->m_pkthdr.csum_flags &
3152 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3153 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3154 		} else {
3155 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3156 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3157 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3158 		}
3159 
3160 		if (m_head->m_pkthdr.csum_flags &
3161 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3162 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3163 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3164 		} else if (m_head->m_pkthdr.csum_flags &
3165 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3166 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3167 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3168 		}
3169 	}
3170 
3171 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3172 	/* Fixup RNDIS packet message total length */
3173 	pkt->rm_len += pkt_hlen;
3174 	/* Convert RNDIS packet message offsets */
3175 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3176 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3177 
3178 	/*
3179 	 * Fast path: Chimney sending.
3180 	 */
3181 	if (chim != NULL) {
3182 		struct hn_txdesc *tgt_txd = txd;
3183 
3184 		if (txr->hn_agg_txd != NULL) {
3185 			tgt_txd = txr->hn_agg_txd;
3186 #ifdef INVARIANTS
3187 			*m_head0 = NULL;
3188 #endif
3189 		}
3190 
3191 		KASSERT(pkt == chim,
3192 		    ("RNDIS pkt not in chimney sending buffer"));
3193 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3194 		    ("chimney sending buffer is not used"));
3195 		tgt_txd->chim_size += pkt->rm_len;
3196 
3197 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3198 		    ((uint8_t *)chim) + pkt_hlen);
3199 
3200 		txr->hn_gpa_cnt = 0;
3201 		txr->hn_sendpkt = hn_txpkt_chim;
3202 		goto done;
3203 	}
3204 
3205 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3206 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3207 	    ("chimney buffer is used"));
3208 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3209 
3210 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3211 	if (__predict_false(error)) {
3212 		int freed __diagused;
3213 
3214 		/*
3215 		 * This mbuf is not linked w/ the txd yet, so free it now.
3216 		 */
3217 		m_freem(m_head);
3218 		*m_head0 = NULL;
3219 
3220 		freed = hn_txdesc_put(txr, txd);
3221 		KASSERT(freed != 0,
3222 		    ("fail to free txd upon txdma error"));
3223 
3224 		txr->hn_txdma_failed++;
3225 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3226 		return error;
3227 	}
3228 	*m_head0 = m_head;
3229 
3230 	/* +1 RNDIS packet message */
3231 	txr->hn_gpa_cnt = nsegs + 1;
3232 
3233 	/* send packet with page buffer */
3234 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3235 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3236 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3237 
3238 	/*
3239 	 * Fill the page buffers with mbuf info after the page
3240 	 * buffer for RNDIS packet message.
3241 	 */
3242 	for (i = 0; i < nsegs; ++i) {
3243 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3244 
3245 		gpa->gpa_page = atop(segs[i].ds_addr);
3246 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3247 		gpa->gpa_len = segs[i].ds_len;
3248 	}
3249 
3250 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3251 	txd->chim_size = 0;
3252 	txr->hn_sendpkt = hn_txpkt_sglist;
3253 done:
3254 	txd->m = m_head;
3255 
3256 	/* Set the completion routine */
3257 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3258 
3259 	/* Update temporary stats for later use. */
3260 	txr->hn_stat_pkts++;
3261 	txr->hn_stat_size += m_head->m_pkthdr.len;
3262 	if (m_head->m_flags & M_MCAST)
3263 		txr->hn_stat_mcasts++;
3264 
3265 	return 0;
3266 }
3267 
3268 /*
3269  * NOTE:
3270  * If this function fails, then txd will be freed, but the mbuf
3271  * associated w/ the txd will _not_ be freed.
3272  */
3273 static int
3274 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3275 {
3276 	int error, send_failed = 0, has_bpf;
3277 
3278 again:
3279 	has_bpf = bpf_peers_present(ifp->if_bpf);
3280 	if (has_bpf) {
3281 		/*
3282 		 * Make sure that this txd and any aggregated txds are not
3283 		 * freed before ETHER_BPF_MTAP.
3284 		 */
3285 		hn_txdesc_hold(txd);
3286 	}
3287 	error = txr->hn_sendpkt(txr, txd);
3288 	if (!error) {
3289 		if (has_bpf) {
3290 			const struct hn_txdesc *tmp_txd;
3291 
3292 			ETHER_BPF_MTAP(ifp, txd->m);
3293 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3294 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3295 		}
3296 
3297 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3298 #ifdef HN_IFSTART_SUPPORT
3299 		if (!hn_use_if_start)
3300 #endif
3301 		{
3302 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3303 			    txr->hn_stat_size);
3304 			if (txr->hn_stat_mcasts != 0) {
3305 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3306 				    txr->hn_stat_mcasts);
3307 			}
3308 		}
3309 		txr->hn_pkts += txr->hn_stat_pkts;
3310 		txr->hn_sends++;
3311 	}
3312 	if (has_bpf)
3313 		hn_txdesc_put(txr, txd);
3314 
3315 	if (__predict_false(error)) {
3316 		int freed __diagused;
3317 
3318 		/*
3319 		 * This should "really rarely" happen.
3320 		 *
3321 		 * XXX Too many RX to be acked or too many sideband
3322 		 * commands to run?  Ask netvsc_channel_rollup()
3323 		 * to kick start later.
3324 		 */
3325 		txr->hn_has_txeof = 1;
3326 		if (!send_failed) {
3327 			txr->hn_send_failed++;
3328 			send_failed = 1;
3329 			/*
3330 			 * Try sending again after set hn_has_txeof;
3331 			 * in case that we missed the last
3332 			 * netvsc_channel_rollup().
3333 			 */
3334 			goto again;
3335 		}
3336 		if_printf(ifp, "send failed\n");
3337 
3338 		/*
3339 		 * Caller will perform further processing on the
3340 		 * associated mbuf, so don't free it in hn_txdesc_put();
3341 		 * only unload it from the DMA map in hn_txdesc_put(),
3342 		 * if it was loaded.
3343 		 */
3344 		txd->m = NULL;
3345 		freed = hn_txdesc_put(txr, txd);
3346 		KASSERT(freed != 0,
3347 		    ("fail to free txd upon send error"));
3348 
3349 		txr->hn_send_failed++;
3350 	}
3351 
3352 	/* Reset temporary stats, after this sending is done. */
3353 	txr->hn_stat_size = 0;
3354 	txr->hn_stat_pkts = 0;
3355 	txr->hn_stat_mcasts = 0;
3356 
3357 	return (error);
3358 }
3359 
3360 /*
3361  * Append the specified data to the indicated mbuf chain,
3362  * Extend the mbuf chain if the new data does not fit in
3363  * existing space.
3364  *
3365  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3366  * There should be an equivalent in the kernel mbuf code,
3367  * but there does not appear to be one yet.
3368  *
3369  * Differs from m_append() in that additional mbufs are
3370  * allocated with cluster size MJUMPAGESIZE, and filled
3371  * accordingly.
3372  *
3373  * Return the last mbuf in the chain or NULL if failed to
3374  * allocate new mbuf.
3375  */
3376 static struct mbuf *
3377 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3378 {
3379 	struct mbuf *m, *n;
3380 	int remainder, space;
3381 
3382 	for (m = m0; m->m_next != NULL; m = m->m_next)
3383 		;
3384 	remainder = len;
3385 	space = M_TRAILINGSPACE(m);
3386 	if (space > 0) {
3387 		/*
3388 		 * Copy into available space.
3389 		 */
3390 		if (space > remainder)
3391 			space = remainder;
3392 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3393 		m->m_len += space;
3394 		cp += space;
3395 		remainder -= space;
3396 	}
3397 	while (remainder > 0) {
3398 		/*
3399 		 * Allocate a new mbuf; could check space
3400 		 * and allocate a cluster instead.
3401 		 */
3402 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3403 		if (n == NULL)
3404 			return NULL;
3405 		n->m_len = min(MJUMPAGESIZE, remainder);
3406 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3407 		cp += n->m_len;
3408 		remainder -= n->m_len;
3409 		m->m_next = n;
3410 		m = n;
3411 	}
3412 
3413 	return m;
3414 }
3415 
#if defined(INET) || defined(INET6)
/*
 * Hand a received mbuf to LRO.
 *
 * On FreeBSD versions that provide it, the mbuf is queued through
 * tcp_lro_queue_mbuf() when hn_lro_mbufq_depth is non-zero, in which
 * case 0 is returned.  Otherwise the result of tcp_lro_rx() is
 * returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth == 0)
		return (tcp_lro_rx(lc, m, 0));

	tcp_lro_queue_mbuf(lc, m);
	return (0);
#else
	return (tcp_lro_rx(lc, m, 0));
#endif
}
#endif
3429 
3430 static int
3431 hn_rxpkt(struct hn_rx_ring *rxr)
3432 {
3433 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3434 	struct mbuf *m_new, *n;
3435 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3436 	int hash_type = M_HASHTYPE_NONE;
3437 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3438 	int i;
3439 
3440 	ifp = hn_ifp;
3441 	if (rxr->hn_rxvf_ifp != NULL) {
3442 		/*
3443 		 * Non-transparent mode VF; pretend this packet is from
3444 		 * the VF.
3445 		 */
3446 		ifp = rxr->hn_rxvf_ifp;
3447 		is_vf = 1;
3448 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3449 		/* Transparent mode VF. */
3450 		is_vf = 1;
3451 	}
3452 
3453 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3454 		/*
3455 		 * NOTE:
3456 		 * See the NOTE of hn_rndis_init_fixat().  This
3457 		 * function can be reached, immediately after the
3458 		 * RNDIS is initialized but before the ifnet is
3459 		 * setup on the hn_attach() path; drop the unexpected
3460 		 * packets.
3461 		 */
3462 		return (0);
3463 	}
3464 
3465 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3466 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3467 		return (0);
3468 	}
3469 
3470 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3471 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3472 		if (m_new == NULL) {
3473 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3474 			return (0);
3475 		}
3476 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3477 		    rxr->rsc.frag_len[0]);
3478 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3479 	} else {
3480 		/*
3481 		 * Get an mbuf with a cluster.  For packets 2K or less,
3482 		 * get a standard 2K cluster.  For anything larger, get a
3483 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3484 		 * if looped around to the Hyper-V TX channel, so avoid them.
3485 		 */
3486 		size = MCLBYTES;
3487 		if (rxr->rsc.pktlen > MCLBYTES) {
3488 			/* 4096 */
3489 			size = MJUMPAGESIZE;
3490 		}
3491 
3492 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3493 		if (m_new == NULL) {
3494 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3495 			return (0);
3496 		}
3497 
3498 		n = m_new;
3499 		for (i = 0; i < rxr->rsc.cnt; i++) {
3500 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3501 			    rxr->rsc.frag_data[i]);
3502 			if (n == NULL) {
3503 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3504 				return (0);
3505 			} else {
3506 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3507 			}
3508 		}
3509 	}
3510 	if (rxr->rsc.pktlen <= MHLEN)
3511 		rxr->hn_small_pkts++;
3512 
3513 	m_new->m_pkthdr.rcvif = ifp;
3514 
3515 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3516 		do_csum = 0;
3517 
3518 	/* receive side checksum offload */
3519 	if (rxr->rsc.csum_info != NULL) {
3520 		/* IP csum offload */
3521 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3522 			m_new->m_pkthdr.csum_flags |=
3523 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3524 			rxr->hn_csum_ip++;
3525 		}
3526 
3527 		/* TCP/UDP csum offload */
3528 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3529 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3530 			m_new->m_pkthdr.csum_flags |=
3531 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3532 			m_new->m_pkthdr.csum_data = 0xffff;
3533 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3534 				rxr->hn_csum_tcp++;
3535 			else
3536 				rxr->hn_csum_udp++;
3537 		}
3538 
3539 		/*
3540 		 * XXX
3541 		 * As of this write (Oct 28th, 2016), host side will turn
3542 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3543 		 * the do_lro setting here is actually _not_ accurate.  We
3544 		 * depend on the RSS hash type check to reset do_lro.
3545 		 */
3546 		if ((*(rxr->rsc.csum_info) &
3547 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3548 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3549 			do_lro = 1;
3550 	} else {
3551 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3552 		if (l3proto == ETHERTYPE_IP) {
3553 			if (l4proto == IPPROTO_TCP) {
3554 				if (do_csum &&
3555 				    (rxr->hn_trust_hcsum &
3556 				     HN_TRUST_HCSUM_TCP)) {
3557 					rxr->hn_csum_trusted++;
3558 					m_new->m_pkthdr.csum_flags |=
3559 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3560 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3561 					m_new->m_pkthdr.csum_data = 0xffff;
3562 				}
3563 				do_lro = 1;
3564 			} else if (l4proto == IPPROTO_UDP) {
3565 				if (do_csum &&
3566 				    (rxr->hn_trust_hcsum &
3567 				     HN_TRUST_HCSUM_UDP)) {
3568 					rxr->hn_csum_trusted++;
3569 					m_new->m_pkthdr.csum_flags |=
3570 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3571 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3572 					m_new->m_pkthdr.csum_data = 0xffff;
3573 				}
3574 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3575 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3576 				rxr->hn_csum_trusted++;
3577 				m_new->m_pkthdr.csum_flags |=
3578 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3579 			}
3580 		}
3581 	}
3582 
3583 	if (rxr->rsc.vlan_info != NULL) {
3584 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3585 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3586 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3587 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3588 		m_new->m_flags |= M_VLANTAG;
3589 	}
3590 
3591 	/*
3592 	 * If VF is activated (tranparent/non-transparent mode does not
3593 	 * matter here).
3594 	 *
3595 	 * - Disable LRO
3596 	 *
3597 	 *   hn(4) will only receive broadcast packets, multicast packets,
3598 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3599 	 *   packet types.
3600 	 *
3601 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3602 	 *   all, since the LRO flush will use hn(4) as the receiving
3603 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3604 	 */
3605 	if (is_vf)
3606 		do_lro = 0;
3607 
3608 	/*
3609 	 * If VF is activated (tranparent/non-transparent mode does not
3610 	 * matter here), do _not_ mess with unsupported hash types or
3611 	 * functions.
3612 	 */
3613 	if (rxr->rsc.hash_info != NULL) {
3614 		rxr->hn_rss_pkts++;
3615 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3616 		if (!is_vf)
3617 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3618 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3619 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3620 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3621 			    rxr->hn_mbuf_hash);
3622 
3623 			/*
3624 			 * NOTE:
3625 			 * do_lro is resetted, if the hash types are not TCP
3626 			 * related.  See the comment in the above csum_flags
3627 			 * setup section.
3628 			 */
3629 			switch (type) {
3630 			case NDIS_HASH_IPV4:
3631 				hash_type = M_HASHTYPE_RSS_IPV4;
3632 				do_lro = 0;
3633 				break;
3634 
3635 			case NDIS_HASH_TCP_IPV4:
3636 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3637 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3638 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3639 
3640 					if (is_vf)
3641 						def_htype = M_HASHTYPE_NONE;
3642 
3643 					/*
3644 					 * UDP 4-tuple hash is delivered as
3645 					 * TCP 4-tuple hash.
3646 					 */
3647 					if (l3proto == ETHERTYPE_MAX) {
3648 						hn_rxpkt_proto(m_new,
3649 						    &l3proto, &l4proto);
3650 					}
3651 					if (l3proto == ETHERTYPE_IP) {
3652 						if (l4proto == IPPROTO_UDP &&
3653 						    (rxr->hn_mbuf_hash &
3654 						     NDIS_HASH_UDP_IPV4_X)) {
3655 							hash_type =
3656 							M_HASHTYPE_RSS_UDP_IPV4;
3657 							do_lro = 0;
3658 						} else if (l4proto !=
3659 						    IPPROTO_TCP) {
3660 							hash_type = def_htype;
3661 							do_lro = 0;
3662 						}
3663 					} else {
3664 						hash_type = def_htype;
3665 						do_lro = 0;
3666 					}
3667 				}
3668 				break;
3669 
3670 			case NDIS_HASH_IPV6:
3671 				hash_type = M_HASHTYPE_RSS_IPV6;
3672 				do_lro = 0;
3673 				break;
3674 
3675 			case NDIS_HASH_IPV6_EX:
3676 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3677 				do_lro = 0;
3678 				break;
3679 
3680 			case NDIS_HASH_TCP_IPV6:
3681 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3682 				break;
3683 
3684 			case NDIS_HASH_TCP_IPV6_EX:
3685 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3686 				break;
3687 			}
3688 		}
3689 	} else if (!is_vf) {
3690 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3691 		hash_type = M_HASHTYPE_OPAQUE;
3692 	}
3693 	M_HASHTYPE_SET(m_new, hash_type);
3694 
3695 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3696 	if (hn_ifp != ifp) {
3697 		const struct ether_header *eh;
3698 
3699 		/*
3700 		 * Non-transparent mode VF is activated.
3701 		 */
3702 
3703 		/*
3704 		 * Allow tapping on hn(4).
3705 		 */
3706 		ETHER_BPF_MTAP(hn_ifp, m_new);
3707 
3708 		/*
3709 		 * Update hn(4)'s stats.
3710 		 */
3711 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3712 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3713 		/* Checked at the beginning of this function. */
3714 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3715 		eh = mtod(m_new, struct ether_header *);
3716 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3717 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3718 	}
3719 	rxr->hn_pkts++;
3720 
3721 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3722 #if defined(INET) || defined(INET6)
3723 		struct lro_ctrl *lro = &rxr->hn_lro;
3724 
3725 		if (lro->lro_cnt) {
3726 			rxr->hn_lro_tried++;
3727 			if (hn_lro_rx(lro, m_new) == 0) {
3728 				/* DONE! */
3729 				return 0;
3730 			}
3731 		}
3732 #endif
3733 	}
3734 	ifp->if_input(ifp, m_new);
3735 
3736 	return (0);
3737 }
3738 
3739 static int
3740 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3741 {
3742 	struct hn_softc *sc = ifp->if_softc;
3743 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3744 	struct ifnet *vf_ifp;
3745 	int mask, error = 0;
3746 	struct ifrsskey *ifrk;
3747 	struct ifrsshash *ifrh;
3748 	uint32_t mtu;
3749 
3750 	switch (cmd) {
3751 	case SIOCSIFMTU:
3752 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3753 			error = EINVAL;
3754 			break;
3755 		}
3756 
3757 		HN_LOCK(sc);
3758 
3759 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3760 			HN_UNLOCK(sc);
3761 			break;
3762 		}
3763 
3764 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3765 			/* Can't change MTU */
3766 			HN_UNLOCK(sc);
3767 			error = EOPNOTSUPP;
3768 			break;
3769 		}
3770 
3771 		if (ifp->if_mtu == ifr->ifr_mtu) {
3772 			HN_UNLOCK(sc);
3773 			break;
3774 		}
3775 
3776 		if (hn_xpnt_vf_isready(sc)) {
3777 			vf_ifp = sc->hn_vf_ifp;
3778 			ifr_vf = *ifr;
3779 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3780 			    sizeof(ifr_vf.ifr_name));
3781 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3782 			    (caddr_t)&ifr_vf);
3783 			if (error) {
3784 				HN_UNLOCK(sc);
3785 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3786 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3787 				break;
3788 			}
3789 		}
3790 
3791 		/*
3792 		 * Suspend this interface before the synthetic parts
3793 		 * are ripped.
3794 		 */
3795 		hn_suspend(sc);
3796 
3797 		/*
3798 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3799 		 */
3800 		hn_synth_detach(sc);
3801 
3802 		/*
3803 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3804 		 * with the new MTU setting.
3805 		 */
3806 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3807 		if (error) {
3808 			HN_UNLOCK(sc);
3809 			break;
3810 		}
3811 
3812 		error = hn_rndis_get_mtu(sc, &mtu);
3813 		if (error)
3814 			mtu = ifr->ifr_mtu;
3815 		else if (bootverbose)
3816 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3817 
3818 		/*
3819 		 * Commit the requested MTU, after the synthetic parts
3820 		 * have been successfully attached.
3821 		 */
3822 		if (mtu >= ifr->ifr_mtu) {
3823 			mtu = ifr->ifr_mtu;
3824 		} else {
3825 			if_printf(ifp, "fixup mtu %d -> %u\n",
3826 			    ifr->ifr_mtu, mtu);
3827 		}
3828 		ifp->if_mtu = mtu;
3829 
3830 		/*
3831 		 * Synthetic parts' reattach may change the chimney
3832 		 * sending size; update it.
3833 		 */
3834 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3835 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3836 
3837 		/*
3838 		 * Make sure that various parameters based on MTU are
3839 		 * still valid, after the MTU change.
3840 		 */
3841 		hn_mtu_change_fixup(sc);
3842 
3843 		/*
3844 		 * All done!  Resume the interface now.
3845 		 */
3846 		hn_resume(sc);
3847 
3848 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3849 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3850 			/*
3851 			 * Since we have reattached the NVS part,
3852 			 * change the datapath to VF again; in case
3853 			 * that it is lost, after the NVS was detached.
3854 			 */
3855 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3856 		}
3857 
3858 		HN_UNLOCK(sc);
3859 		break;
3860 
3861 	case SIOCSIFFLAGS:
3862 		HN_LOCK(sc);
3863 
3864 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3865 			HN_UNLOCK(sc);
3866 			break;
3867 		}
3868 
3869 		if (hn_xpnt_vf_isready(sc))
3870 			hn_xpnt_vf_saveifflags(sc);
3871 
3872 		if (ifp->if_flags & IFF_UP) {
3873 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3874 				/*
3875 				 * Caller meight hold mutex, e.g.
3876 				 * bpf; use busy-wait for the RNDIS
3877 				 * reply.
3878 				 */
3879 				HN_NO_SLEEPING(sc);
3880 				hn_rxfilter_config(sc);
3881 				HN_SLEEPING_OK(sc);
3882 
3883 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3884 					error = hn_xpnt_vf_iocsetflags(sc);
3885 			} else {
3886 				hn_init_locked(sc);
3887 			}
3888 		} else {
3889 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3890 				hn_stop(sc, false);
3891 		}
3892 		sc->hn_if_flags = ifp->if_flags;
3893 
3894 		HN_UNLOCK(sc);
3895 		break;
3896 
3897 	case SIOCSIFCAP:
3898 		HN_LOCK(sc);
3899 
3900 		if (hn_xpnt_vf_isready(sc)) {
3901 			ifr_vf = *ifr;
3902 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3903 			    sizeof(ifr_vf.ifr_name));
3904 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3905 			HN_UNLOCK(sc);
3906 			break;
3907 		}
3908 
3909 		/*
3910 		 * Fix up requested capabilities w/ supported capabilities,
3911 		 * since the supported capabilities could have been changed.
3912 		 */
3913 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3914 		    ifp->if_capenable;
3915 
3916 		if (mask & IFCAP_TXCSUM) {
3917 			ifp->if_capenable ^= IFCAP_TXCSUM;
3918 			if (ifp->if_capenable & IFCAP_TXCSUM)
3919 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3920 			else
3921 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3922 		}
3923 		if (mask & IFCAP_TXCSUM_IPV6) {
3924 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3925 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3926 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3927 			else
3928 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3929 		}
3930 
3931 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3932 		if (mask & IFCAP_RXCSUM)
3933 			ifp->if_capenable ^= IFCAP_RXCSUM;
3934 #ifdef foo
3935 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3936 		if (mask & IFCAP_RXCSUM_IPV6)
3937 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3938 #endif
3939 
3940 		if (mask & IFCAP_LRO)
3941 			ifp->if_capenable ^= IFCAP_LRO;
3942 
3943 		if (mask & IFCAP_TSO4) {
3944 			ifp->if_capenable ^= IFCAP_TSO4;
3945 			if (ifp->if_capenable & IFCAP_TSO4)
3946 				ifp->if_hwassist |= CSUM_IP_TSO;
3947 			else
3948 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3949 		}
3950 		if (mask & IFCAP_TSO6) {
3951 			ifp->if_capenable ^= IFCAP_TSO6;
3952 			if (ifp->if_capenable & IFCAP_TSO6)
3953 				ifp->if_hwassist |= CSUM_IP6_TSO;
3954 			else
3955 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3956 		}
3957 
3958 		HN_UNLOCK(sc);
3959 		break;
3960 
3961 	case SIOCADDMULTI:
3962 	case SIOCDELMULTI:
3963 		HN_LOCK(sc);
3964 
3965 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3966 			HN_UNLOCK(sc);
3967 			break;
3968 		}
3969 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3970 			/*
3971 			 * Multicast uses mutex; use busy-wait for
3972 			 * the RNDIS reply.
3973 			 */
3974 			HN_NO_SLEEPING(sc);
3975 			hn_rxfilter_config(sc);
3976 			HN_SLEEPING_OK(sc);
3977 		}
3978 
3979 		/* XXX vlan(4) style mcast addr maintenance */
3980 		if (hn_xpnt_vf_isready(sc)) {
3981 			int old_if_flags;
3982 
3983 			old_if_flags = sc->hn_vf_ifp->if_flags;
3984 			hn_xpnt_vf_saveifflags(sc);
3985 
3986 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3987 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3988 			     IFF_ALLMULTI))
3989 				error = hn_xpnt_vf_iocsetflags(sc);
3990 		}
3991 
3992 		HN_UNLOCK(sc);
3993 		break;
3994 
3995 	case SIOCSIFMEDIA:
3996 	case SIOCGIFMEDIA:
3997 		HN_LOCK(sc);
3998 		if (hn_xpnt_vf_isready(sc)) {
3999 			/*
4000 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4001 			 * create and pass ifr_vf to the VF here; just
4002 			 * replace the ifr_name.
4003 			 */
4004 			vf_ifp = sc->hn_vf_ifp;
4005 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4006 			    sizeof(ifr->ifr_name));
4007 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4008 			/* Restore the ifr_name. */
4009 			strlcpy(ifr->ifr_name, ifp->if_xname,
4010 			    sizeof(ifr->ifr_name));
4011 			HN_UNLOCK(sc);
4012 			break;
4013 		}
4014 		HN_UNLOCK(sc);
4015 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4016 		break;
4017 
4018 	case SIOCGIFRSSHASH:
4019 		ifrh = (struct ifrsshash *)data;
4020 		HN_LOCK(sc);
4021 		if (sc->hn_rx_ring_inuse == 1) {
4022 			HN_UNLOCK(sc);
4023 			ifrh->ifrh_func = RSS_FUNC_NONE;
4024 			ifrh->ifrh_types = 0;
4025 			break;
4026 		}
4027 
4028 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4029 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4030 		else
4031 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4032 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4033 		HN_UNLOCK(sc);
4034 		break;
4035 
4036 	case SIOCGIFRSSKEY:
4037 		ifrk = (struct ifrsskey *)data;
4038 		HN_LOCK(sc);
4039 		if (sc->hn_rx_ring_inuse == 1) {
4040 			HN_UNLOCK(sc);
4041 			ifrk->ifrk_func = RSS_FUNC_NONE;
4042 			ifrk->ifrk_keylen = 0;
4043 			break;
4044 		}
4045 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4046 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4047 		else
4048 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4049 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4050 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4051 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4052 		HN_UNLOCK(sc);
4053 		break;
4054 
4055 	default:
4056 		error = ether_ioctl(ifp, cmd, data);
4057 		break;
4058 	}
4059 	return (error);
4060 }
4061 
4062 static void
4063 hn_stop(struct hn_softc *sc, bool detaching)
4064 {
4065 	struct ifnet *ifp = sc->hn_ifp;
4066 	int i;
4067 
4068 	HN_LOCK_ASSERT(sc);
4069 
4070 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4071 	    ("synthetic parts were not attached"));
4072 
4073 	/* Clear RUNNING bit ASAP. */
4074 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4075 
4076 	/* Disable polling. */
4077 	hn_polling(sc, 0);
4078 
4079 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4080 		KASSERT(sc->hn_vf_ifp != NULL,
4081 		    ("%s: VF is not attached", ifp->if_xname));
4082 
4083 		/* Mark transparent mode VF as disabled. */
4084 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4085 
4086 		/*
4087 		 * NOTE:
4088 		 * Datapath setting must happen _before_ bringing
4089 		 * the VF down.
4090 		 */
4091 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4092 
4093 		/*
4094 		 * Bring the VF down.
4095 		 */
4096 		hn_xpnt_vf_saveifflags(sc);
4097 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4098 		hn_xpnt_vf_iocsetflags(sc);
4099 	}
4100 
4101 	/* Suspend data transfers. */
4102 	hn_suspend_data(sc);
4103 
4104 	/* Clear OACTIVE bit. */
4105 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4106 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4107 		sc->hn_tx_ring[i].hn_oactive = 0;
4108 
4109 	/*
4110 	 * If the non-transparent mode VF is active, make sure
4111 	 * that the RX filter still allows packet reception.
4112 	 */
4113 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4114 		hn_rxfilter_config(sc);
4115 }
4116 
4117 static void
4118 hn_init_locked(struct hn_softc *sc)
4119 {
4120 	struct ifnet *ifp = sc->hn_ifp;
4121 	int i;
4122 
4123 	HN_LOCK_ASSERT(sc);
4124 
4125 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4126 		return;
4127 
4128 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4129 		return;
4130 
4131 	/* Configure RX filter */
4132 	hn_rxfilter_config(sc);
4133 
4134 	/* Clear OACTIVE bit. */
4135 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4136 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4137 		sc->hn_tx_ring[i].hn_oactive = 0;
4138 
4139 	/* Clear TX 'suspended' bit. */
4140 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4141 
4142 	if (hn_xpnt_vf_isready(sc)) {
4143 		/* Initialize transparent VF. */
4144 		hn_xpnt_vf_init(sc);
4145 	}
4146 
4147 	/* Everything is ready; unleash! */
4148 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4149 
4150 	/* Re-enable polling if requested. */
4151 	if (sc->hn_pollhz > 0)
4152 		hn_polling(sc, sc->hn_pollhz);
4153 }
4154 
/*
 * Locked wrapper around hn_init_locked(); 'xsc' is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *softc;

	softc = xsc;
	HN_LOCK(softc);
	hn_init_locked(softc);
	HN_UNLOCK(softc);
}
4164 
4165 #if __FreeBSD_version >= 1100099
4166 
4167 static int
4168 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4169 {
4170 	struct hn_softc *sc = arg1;
4171 	unsigned int lenlim;
4172 	int error;
4173 
4174 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4175 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4176 	if (error || req->newptr == NULL)
4177 		return error;
4178 
4179 	HN_LOCK(sc);
4180 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4181 	    lenlim > TCP_LRO_LENGTH_MAX) {
4182 		HN_UNLOCK(sc);
4183 		return EINVAL;
4184 	}
4185 	hn_set_lro_lenlim(sc, lenlim);
4186 	HN_UNLOCK(sc);
4187 
4188 	return 0;
4189 }
4190 
4191 static int
4192 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4193 {
4194 	struct hn_softc *sc = arg1;
4195 	int ackcnt, error, i;
4196 
4197 	/*
4198 	 * lro_ackcnt_lim is append count limit,
4199 	 * +1 to turn it into aggregation limit.
4200 	 */
4201 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4202 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4203 	if (error || req->newptr == NULL)
4204 		return error;
4205 
4206 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4207 		return EINVAL;
4208 
4209 	/*
4210 	 * Convert aggregation limit back to append
4211 	 * count limit.
4212 	 */
4213 	--ackcnt;
4214 	HN_LOCK(sc);
4215 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4216 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4217 	HN_UNLOCK(sc);
4218 	return 0;
4219 }
4220 
4221 #endif
4222 
4223 static int
4224 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4225 {
4226 	struct hn_softc *sc = arg1;
4227 	int hcsum = arg2;
4228 	int on, error, i;
4229 
4230 	on = 0;
4231 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4232 		on = 1;
4233 
4234 	error = sysctl_handle_int(oidp, &on, 0, req);
4235 	if (error || req->newptr == NULL)
4236 		return error;
4237 
4238 	HN_LOCK(sc);
4239 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4240 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4241 
4242 		if (on)
4243 			rxr->hn_trust_hcsum |= hcsum;
4244 		else
4245 			rxr->hn_trust_hcsum &= ~hcsum;
4246 	}
4247 	HN_UNLOCK(sc);
4248 	return 0;
4249 }
4250 
4251 static int
4252 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4253 {
4254 	struct hn_softc *sc = arg1;
4255 	int chim_size, error;
4256 
4257 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4258 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4259 	if (error || req->newptr == NULL)
4260 		return error;
4261 
4262 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4263 		return EINVAL;
4264 
4265 	HN_LOCK(sc);
4266 	hn_set_chim_size(sc, chim_size);
4267 	HN_UNLOCK(sc);
4268 	return 0;
4269 }
4270 
4271 #if __FreeBSD_version < 1100095
4272 static int
4273 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4274 {
4275 	struct hn_softc *sc = arg1;
4276 	int ofs = arg2, i, error;
4277 	struct hn_rx_ring *rxr;
4278 	uint64_t stat;
4279 
4280 	stat = 0;
4281 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4282 		rxr = &sc->hn_rx_ring[i];
4283 		stat += *((int *)((uint8_t *)rxr + ofs));
4284 	}
4285 
4286 	error = sysctl_handle_64(oidp, &stat, 0, req);
4287 	if (error || req->newptr == NULL)
4288 		return error;
4289 
4290 	/* Zero out this stat. */
4291 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4292 		rxr = &sc->hn_rx_ring[i];
4293 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4294 	}
4295 	return 0;
4296 }
4297 #else
4298 static int
4299 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4300 {
4301 	struct hn_softc *sc = arg1;
4302 	int ofs = arg2, i, error;
4303 	struct hn_rx_ring *rxr;
4304 	uint64_t stat;
4305 
4306 	stat = 0;
4307 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4308 		rxr = &sc->hn_rx_ring[i];
4309 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4310 	}
4311 
4312 	error = sysctl_handle_64(oidp, &stat, 0, req);
4313 	if (error || req->newptr == NULL)
4314 		return error;
4315 
4316 	/* Zero out this stat. */
4317 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4318 		rxr = &sc->hn_rx_ring[i];
4319 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4320 	}
4321 	return 0;
4322 }
4323 
4324 #endif
4325 
4326 static int
4327 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4328 {
4329 	struct hn_softc *sc = arg1;
4330 	int ofs = arg2, i, error;
4331 	struct hn_rx_ring *rxr;
4332 	u_long stat;
4333 
4334 	stat = 0;
4335 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4336 		rxr = &sc->hn_rx_ring[i];
4337 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4338 	}
4339 
4340 	error = sysctl_handle_long(oidp, &stat, 0, req);
4341 	if (error || req->newptr == NULL)
4342 		return error;
4343 
4344 	/* Zero out this stat. */
4345 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4346 		rxr = &sc->hn_rx_ring[i];
4347 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4348 	}
4349 	return 0;
4350 }
4351 
4352 static int
4353 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4354 {
4355 	struct hn_softc *sc = arg1;
4356 	int ofs = arg2, i, error;
4357 	struct hn_tx_ring *txr;
4358 	u_long stat;
4359 
4360 	stat = 0;
4361 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4362 		txr = &sc->hn_tx_ring[i];
4363 		stat += *((u_long *)((uint8_t *)txr + ofs));
4364 	}
4365 
4366 	error = sysctl_handle_long(oidp, &stat, 0, req);
4367 	if (error || req->newptr == NULL)
4368 		return error;
4369 
4370 	/* Zero out this stat. */
4371 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4372 		txr = &sc->hn_tx_ring[i];
4373 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4374 	}
4375 	return 0;
4376 }
4377 
4378 static int
4379 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4380 {
4381 	struct hn_softc *sc = arg1;
4382 	int ofs = arg2, i, error, conf;
4383 	struct hn_tx_ring *txr;
4384 
4385 	txr = &sc->hn_tx_ring[0];
4386 	conf = *((int *)((uint8_t *)txr + ofs));
4387 
4388 	error = sysctl_handle_int(oidp, &conf, 0, req);
4389 	if (error || req->newptr == NULL)
4390 		return error;
4391 
4392 	HN_LOCK(sc);
4393 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4394 		txr = &sc->hn_tx_ring[i];
4395 		*((int *)((uint8_t *)txr + ofs)) = conf;
4396 	}
4397 	HN_UNLOCK(sc);
4398 
4399 	return 0;
4400 }
4401 
4402 static int
4403 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4404 {
4405 	struct hn_softc *sc = arg1;
4406 	int error, size;
4407 
4408 	size = sc->hn_agg_size;
4409 	error = sysctl_handle_int(oidp, &size, 0, req);
4410 	if (error || req->newptr == NULL)
4411 		return (error);
4412 
4413 	HN_LOCK(sc);
4414 	sc->hn_agg_size = size;
4415 	hn_set_txagg(sc);
4416 	HN_UNLOCK(sc);
4417 
4418 	return (0);
4419 }
4420 
4421 static int
4422 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4423 {
4424 	struct hn_softc *sc = arg1;
4425 	int error, pkts;
4426 
4427 	pkts = sc->hn_agg_pkts;
4428 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4429 	if (error || req->newptr == NULL)
4430 		return (error);
4431 
4432 	HN_LOCK(sc);
4433 	sc->hn_agg_pkts = pkts;
4434 	hn_set_txagg(sc);
4435 	HN_UNLOCK(sc);
4436 
4437 	return (0);
4438 }
4439 
4440 static int
4441 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4442 {
4443 	struct hn_softc *sc = arg1;
4444 	int pkts;
4445 
4446 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4447 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4448 }
4449 
4450 static int
4451 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4452 {
4453 	struct hn_softc *sc = arg1;
4454 	int align;
4455 
4456 	align = sc->hn_tx_ring[0].hn_agg_align;
4457 	return (sysctl_handle_int(oidp, &align, 0, req));
4458 }
4459 
4460 static void
4461 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4462 {
4463 	if (pollhz == 0)
4464 		vmbus_chan_poll_disable(chan);
4465 	else
4466 		vmbus_chan_poll_enable(chan, pollhz);
4467 }
4468 
4469 static void
4470 hn_polling(struct hn_softc *sc, u_int pollhz)
4471 {
4472 	int nsubch = sc->hn_rx_ring_inuse - 1;
4473 
4474 	HN_LOCK_ASSERT(sc);
4475 
4476 	if (nsubch > 0) {
4477 		struct vmbus_channel **subch;
4478 		int i;
4479 
4480 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4481 		for (i = 0; i < nsubch; ++i)
4482 			hn_chan_polling(subch[i], pollhz);
4483 		vmbus_subchan_rel(subch, nsubch);
4484 	}
4485 	hn_chan_polling(sc->hn_prichan, pollhz);
4486 }
4487 
4488 static int
4489 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4490 {
4491 	struct hn_softc *sc = arg1;
4492 	int pollhz, error;
4493 
4494 	pollhz = sc->hn_pollhz;
4495 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4496 	if (error || req->newptr == NULL)
4497 		return (error);
4498 
4499 	if (pollhz != 0 &&
4500 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4501 		return (EINVAL);
4502 
4503 	HN_LOCK(sc);
4504 	if (sc->hn_pollhz != pollhz) {
4505 		sc->hn_pollhz = pollhz;
4506 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4507 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4508 			hn_polling(sc, sc->hn_pollhz);
4509 	}
4510 	HN_UNLOCK(sc);
4511 
4512 	return (0);
4513 }
4514 
4515 static int
4516 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4517 {
4518 	struct hn_softc *sc = arg1;
4519 	char verstr[16];
4520 
4521 	snprintf(verstr, sizeof(verstr), "%u.%u",
4522 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4523 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4524 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4525 }
4526 
4527 static int
4528 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4529 {
4530 	struct hn_softc *sc = arg1;
4531 	char caps_str[128];
4532 	uint32_t caps;
4533 
4534 	HN_LOCK(sc);
4535 	caps = sc->hn_caps;
4536 	HN_UNLOCK(sc);
4537 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4538 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4539 }
4540 
4541 static int
4542 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4543 {
4544 	struct hn_softc *sc = arg1;
4545 	char assist_str[128];
4546 	uint32_t hwassist;
4547 
4548 	HN_LOCK(sc);
4549 	hwassist = sc->hn_ifp->if_hwassist;
4550 	HN_UNLOCK(sc);
4551 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4552 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4553 }
4554 
4555 static int
4556 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4557 {
4558 	struct hn_softc *sc = arg1;
4559 	char filter_str[128];
4560 	uint32_t filter;
4561 
4562 	HN_LOCK(sc);
4563 	filter = sc->hn_rx_filter;
4564 	HN_UNLOCK(sc);
4565 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4566 	    NDIS_PACKET_TYPES);
4567 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4568 }
4569 
4570 #ifndef RSS
4571 
4572 static int
4573 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4574 {
4575 	struct hn_softc *sc = arg1;
4576 	int error;
4577 
4578 	HN_LOCK(sc);
4579 
4580 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4581 	if (error || req->newptr == NULL)
4582 		goto back;
4583 
4584 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4585 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4586 		/*
4587 		 * RSS key is synchronized w/ VF's, don't allow users
4588 		 * to change it.
4589 		 */
4590 		error = EBUSY;
4591 		goto back;
4592 	}
4593 
4594 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4595 	if (error)
4596 		goto back;
4597 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4598 
4599 	if (sc->hn_rx_ring_inuse > 1) {
4600 		error = hn_rss_reconfig(sc);
4601 	} else {
4602 		/* Not RSS capable, at least for now; just save the RSS key. */
4603 		error = 0;
4604 	}
4605 back:
4606 	HN_UNLOCK(sc);
4607 	return (error);
4608 }
4609 
4610 static int
4611 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4612 {
4613 	struct hn_softc *sc = arg1;
4614 	int error;
4615 
4616 	HN_LOCK(sc);
4617 
4618 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4619 	if (error || req->newptr == NULL)
4620 		goto back;
4621 
4622 	/*
4623 	 * Don't allow RSS indirect table change, if this interface is not
4624 	 * RSS capable currently.
4625 	 */
4626 	if (sc->hn_rx_ring_inuse == 1) {
4627 		error = EOPNOTSUPP;
4628 		goto back;
4629 	}
4630 
4631 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4632 	if (error)
4633 		goto back;
4634 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4635 
4636 	hn_rss_ind_fixup(sc);
4637 	error = hn_rss_reconfig(sc);
4638 back:
4639 	HN_UNLOCK(sc);
4640 	return (error);
4641 }
4642 
4643 #endif	/* !RSS */
4644 
4645 static int
4646 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4647 {
4648 	struct hn_softc *sc = arg1;
4649 	char hash_str[128];
4650 	uint32_t hash;
4651 
4652 	HN_LOCK(sc);
4653 	hash = sc->hn_rss_hash;
4654 	HN_UNLOCK(sc);
4655 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4656 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4657 }
4658 
4659 static int
4660 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4661 {
4662 	struct hn_softc *sc = arg1;
4663 	char hash_str[128];
4664 	uint32_t hash;
4665 
4666 	HN_LOCK(sc);
4667 	hash = sc->hn_rss_hcap;
4668 	HN_UNLOCK(sc);
4669 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4670 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4671 }
4672 
4673 static int
4674 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4675 {
4676 	struct hn_softc *sc = arg1;
4677 	char hash_str[128];
4678 	uint32_t hash;
4679 
4680 	HN_LOCK(sc);
4681 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4682 	HN_UNLOCK(sc);
4683 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4684 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4685 }
4686 
4687 static int
4688 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4689 {
4690 	struct hn_softc *sc = arg1;
4691 	char vf_name[IFNAMSIZ + 1];
4692 	struct ifnet *vf_ifp;
4693 
4694 	HN_LOCK(sc);
4695 	vf_name[0] = '\0';
4696 	vf_ifp = sc->hn_vf_ifp;
4697 	if (vf_ifp != NULL)
4698 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4699 	HN_UNLOCK(sc);
4700 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4701 }
4702 
4703 static int
4704 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4705 {
4706 	struct hn_softc *sc = arg1;
4707 	char vf_name[IFNAMSIZ + 1];
4708 	struct ifnet *vf_ifp;
4709 
4710 	HN_LOCK(sc);
4711 	vf_name[0] = '\0';
4712 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4713 	if (vf_ifp != NULL)
4714 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4715 	HN_UNLOCK(sc);
4716 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4717 }
4718 
4719 static int
4720 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4721 {
4722 	struct rm_priotracker pt;
4723 	struct sbuf *sb;
4724 	int error, i;
4725 	bool first;
4726 
4727 	error = sysctl_wire_old_buffer(req, 0);
4728 	if (error != 0)
4729 		return (error);
4730 
4731 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4732 	if (sb == NULL)
4733 		return (ENOMEM);
4734 
4735 	rm_rlock(&hn_vfmap_lock, &pt);
4736 
4737 	first = true;
4738 	for (i = 0; i < hn_vfmap_size; ++i) {
4739 		struct epoch_tracker et;
4740 		struct ifnet *ifp;
4741 
4742 		if (hn_vfmap[i] == NULL)
4743 			continue;
4744 
4745 		NET_EPOCH_ENTER(et);
4746 		ifp = ifnet_byindex(i);
4747 		if (ifp != NULL) {
4748 			if (first)
4749 				sbuf_printf(sb, "%s", ifp->if_xname);
4750 			else
4751 				sbuf_printf(sb, " %s", ifp->if_xname);
4752 			first = false;
4753 		}
4754 		NET_EPOCH_EXIT(et);
4755 	}
4756 
4757 	rm_runlock(&hn_vfmap_lock, &pt);
4758 
4759 	error = sbuf_finish(sb);
4760 	sbuf_delete(sb);
4761 	return (error);
4762 }
4763 
4764 static int
4765 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4766 {
4767 	struct rm_priotracker pt;
4768 	struct sbuf *sb;
4769 	int error, i;
4770 	bool first;
4771 
4772 	error = sysctl_wire_old_buffer(req, 0);
4773 	if (error != 0)
4774 		return (error);
4775 
4776 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4777 	if (sb == NULL)
4778 		return (ENOMEM);
4779 
4780 	rm_rlock(&hn_vfmap_lock, &pt);
4781 
4782 	first = true;
4783 	for (i = 0; i < hn_vfmap_size; ++i) {
4784 		struct epoch_tracker et;
4785 		struct ifnet *ifp, *hn_ifp;
4786 
4787 		hn_ifp = hn_vfmap[i];
4788 		if (hn_ifp == NULL)
4789 			continue;
4790 
4791 		NET_EPOCH_ENTER(et);
4792 		ifp = ifnet_byindex(i);
4793 		if (ifp != NULL) {
4794 			if (first) {
4795 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4796 				    hn_ifp->if_xname);
4797 			} else {
4798 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4799 				    hn_ifp->if_xname);
4800 			}
4801 			first = false;
4802 		}
4803 		NET_EPOCH_EXIT(et);
4804 	}
4805 
4806 	rm_runlock(&hn_vfmap_lock, &pt);
4807 
4808 	error = sbuf_finish(sb);
4809 	sbuf_delete(sb);
4810 	return (error);
4811 }
4812 
4813 static int
4814 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4815 {
4816 	struct hn_softc *sc = arg1;
4817 	int error, onoff = 0;
4818 
4819 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4820 		onoff = 1;
4821 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4822 	if (error || req->newptr == NULL)
4823 		return (error);
4824 
4825 	HN_LOCK(sc);
4826 	/* NOTE: hn_vf_lock for hn_transmit() */
4827 	rm_wlock(&sc->hn_vf_lock);
4828 	if (onoff)
4829 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4830 	else
4831 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4832 	rm_wunlock(&sc->hn_vf_lock);
4833 	HN_UNLOCK(sc);
4834 
4835 	return (0);
4836 }
4837 
4838 static int
4839 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4840 {
4841 	struct hn_softc *sc = arg1;
4842 	int enabled = 0;
4843 
4844 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4845 		enabled = 1;
4846 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4847 }
4848 
4849 static int
4850 hn_check_iplen(const struct mbuf *m, int hoff)
4851 {
4852 	const struct ip *ip;
4853 	int len, iphlen, iplen;
4854 	const struct tcphdr *th;
4855 	int thoff;				/* TCP data offset */
4856 
4857 	len = hoff + sizeof(struct ip);
4858 
4859 	/* The packet must be at least the size of an IP header. */
4860 	if (m->m_pkthdr.len < len)
4861 		return IPPROTO_DONE;
4862 
4863 	/* The fixed IP header must reside completely in the first mbuf. */
4864 	if (m->m_len < len)
4865 		return IPPROTO_DONE;
4866 
4867 	ip = mtodo(m, hoff);
4868 
4869 	/* Bound check the packet's stated IP header length. */
4870 	iphlen = ip->ip_hl << 2;
4871 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4872 		return IPPROTO_DONE;
4873 
4874 	/* The full IP header must reside completely in the one mbuf. */
4875 	if (m->m_len < hoff + iphlen)
4876 		return IPPROTO_DONE;
4877 
4878 	iplen = ntohs(ip->ip_len);
4879 
4880 	/*
4881 	 * Check that the amount of data in the buffers is as
4882 	 * at least much as the IP header would have us expect.
4883 	 */
4884 	if (m->m_pkthdr.len < hoff + iplen)
4885 		return IPPROTO_DONE;
4886 
4887 	/*
4888 	 * Ignore IP fragments.
4889 	 */
4890 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4891 		return IPPROTO_DONE;
4892 
4893 	/*
4894 	 * The TCP/IP or UDP/IP header must be entirely contained within
4895 	 * the first fragment of a packet.
4896 	 */
4897 	switch (ip->ip_p) {
4898 	case IPPROTO_TCP:
4899 		if (iplen < iphlen + sizeof(struct tcphdr))
4900 			return IPPROTO_DONE;
4901 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4902 			return IPPROTO_DONE;
4903 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4904 		thoff = th->th_off << 2;
4905 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4906 			return IPPROTO_DONE;
4907 		if (m->m_len < hoff + iphlen + thoff)
4908 			return IPPROTO_DONE;
4909 		break;
4910 	case IPPROTO_UDP:
4911 		if (iplen < iphlen + sizeof(struct udphdr))
4912 			return IPPROTO_DONE;
4913 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4914 			return IPPROTO_DONE;
4915 		break;
4916 	default:
4917 		if (iplen < iphlen)
4918 			return IPPROTO_DONE;
4919 		break;
4920 	}
4921 	return ip->ip_p;
4922 }
4923 
4924 static void
4925 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4926 {
4927 	const struct ether_header *eh;
4928 	uint16_t etype;
4929 	int hoff;
4930 
4931 	hoff = sizeof(*eh);
4932 	/* Checked at the beginning of this function. */
4933 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4934 
4935 	eh = mtod(m_new, const struct ether_header *);
4936 	etype = ntohs(eh->ether_type);
4937 	if (etype == ETHERTYPE_VLAN) {
4938 		const struct ether_vlan_header *evl;
4939 
4940 		hoff = sizeof(*evl);
4941 		if (m_new->m_len < hoff)
4942 			return;
4943 		evl = mtod(m_new, const struct ether_vlan_header *);
4944 		etype = ntohs(evl->evl_proto);
4945 	}
4946 	*l3proto = etype;
4947 
4948 	if (etype == ETHERTYPE_IP)
4949 		*l4proto = hn_check_iplen(m_new, hoff);
4950 	else
4951 		*l4proto = IPPROTO_DONE;
4952 }
4953 
4954 static int
4955 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4956 {
4957 	struct sysctl_oid_list *child;
4958 	struct sysctl_ctx_list *ctx;
4959 	device_t dev = sc->hn_dev;
4960 #if defined(INET) || defined(INET6)
4961 #if __FreeBSD_version >= 1100095
4962 	int lroent_cnt;
4963 #endif
4964 #endif
4965 	int i;
4966 
4967 	/*
4968 	 * Create RXBUF for reception.
4969 	 *
4970 	 * NOTE:
4971 	 * - It is shared by all channels.
4972 	 * - A large enough buffer is allocated, certain version of NVSes
4973 	 *   may further limit the usable space.
4974 	 */
4975 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4976 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4977 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4978 	if (sc->hn_rxbuf == NULL) {
4979 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4980 		return (ENOMEM);
4981 	}
4982 
4983 	sc->hn_rx_ring_cnt = ring_cnt;
4984 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4985 
4986 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4987 	    M_DEVBUF, M_WAITOK | M_ZERO);
4988 
4989 #if defined(INET) || defined(INET6)
4990 #if __FreeBSD_version >= 1100095
4991 	lroent_cnt = hn_lro_entry_count;
4992 	if (lroent_cnt < TCP_LRO_ENTRIES)
4993 		lroent_cnt = TCP_LRO_ENTRIES;
4994 	if (bootverbose)
4995 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4996 #endif
4997 #endif	/* INET || INET6 */
4998 
4999 	ctx = device_get_sysctl_ctx(dev);
5000 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5001 
5002 	/* Create dev.hn.UNIT.rx sysctl tree */
5003 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5004 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5005 
5006 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5007 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5008 
5009 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5010 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5011 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5012 		if (rxr->hn_br == NULL) {
5013 			device_printf(dev, "allocate bufring failed\n");
5014 			return (ENOMEM);
5015 		}
5016 
5017 		if (hn_trust_hosttcp)
5018 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5019 		if (hn_trust_hostudp)
5020 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5021 		if (hn_trust_hostip)
5022 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5023 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5024 		rxr->hn_ifp = sc->hn_ifp;
5025 		if (i < sc->hn_tx_ring_cnt)
5026 			rxr->hn_txr = &sc->hn_tx_ring[i];
5027 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5028 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5029 		rxr->hn_rx_idx = i;
5030 		rxr->hn_rxbuf = sc->hn_rxbuf;
5031 
5032 		/*
5033 		 * Initialize LRO.
5034 		 */
5035 #if defined(INET) || defined(INET6)
5036 #if __FreeBSD_version >= 1100095
5037 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5038 		    hn_lro_mbufq_depth);
5039 #else
5040 		tcp_lro_init(&rxr->hn_lro);
5041 		rxr->hn_lro.ifp = sc->hn_ifp;
5042 #endif
5043 #if __FreeBSD_version >= 1100099
5044 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5045 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5046 #endif
5047 #endif	/* INET || INET6 */
5048 
5049 		if (sc->hn_rx_sysctl_tree != NULL) {
5050 			char name[16];
5051 
5052 			/*
5053 			 * Create per RX ring sysctl tree:
5054 			 * dev.hn.UNIT.rx.RINGID
5055 			 */
5056 			snprintf(name, sizeof(name), "%d", i);
5057 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5058 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5059 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5060 
5061 			if (rxr->hn_rx_sysctl_tree != NULL) {
5062 				SYSCTL_ADD_ULONG(ctx,
5063 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5064 				    OID_AUTO, "packets",
5065 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5066 				    "# of packets received");
5067 				SYSCTL_ADD_ULONG(ctx,
5068 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5069 				    OID_AUTO, "rss_pkts",
5070 				    CTLFLAG_RW | CTLFLAG_STATS,
5071 				    &rxr->hn_rss_pkts,
5072 				    "# of packets w/ RSS info received");
5073 				SYSCTL_ADD_ULONG(ctx,
5074 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5075 				    OID_AUTO, "rsc_pkts",
5076 				    CTLFLAG_RW | CTLFLAG_STATS,
5077 				    &rxr->hn_rsc_pkts,
5078 				    "# of RSC packets received");
5079 				SYSCTL_ADD_ULONG(ctx,
5080 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5081 				    OID_AUTO, "rsc_drop",
5082 				    CTLFLAG_RW | CTLFLAG_STATS,
5083 				    &rxr->hn_rsc_drop,
5084 				    "# of RSC fragments dropped");
5085 				SYSCTL_ADD_INT(ctx,
5086 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5087 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5088 				    &rxr->hn_pktbuf_len, 0,
5089 				    "Temporary channel packet buffer length");
5090 			}
5091 		}
5092 	}
5093 
5094 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5095 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5096 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5097 #if __FreeBSD_version < 1100095
5098 	    hn_rx_stat_int_sysctl,
5099 #else
5100 	    hn_rx_stat_u64_sysctl,
5101 #endif
5102 	    "LU", "LRO queued");
5103 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5104 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5105 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5106 #if __FreeBSD_version < 1100095
5107 	    hn_rx_stat_int_sysctl,
5108 #else
5109 	    hn_rx_stat_u64_sysctl,
5110 #endif
5111 	    "LU", "LRO flushed");
5112 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5113 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5114 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5115 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5116 #if __FreeBSD_version >= 1100099
5117 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5118 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5119 	    hn_lro_lenlim_sysctl, "IU",
5120 	    "Max # of data bytes to be aggregated by LRO");
5121 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5122 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5123 	    hn_lro_ackcnt_sysctl, "I",
5124 	    "Max # of ACKs to be aggregated by LRO");
5125 #endif
5126 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5127 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5128 	    hn_trust_hcsum_sysctl, "I",
5129 	    "Trust tcp segment verification on host side, "
5130 	    "when csum info is missing");
5131 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5132 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5133 	    hn_trust_hcsum_sysctl, "I",
5134 	    "Trust udp datagram verification on host side, "
5135 	    "when csum info is missing");
5136 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5137 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5138 	    hn_trust_hcsum_sysctl, "I",
5139 	    "Trust ip packet verification on host side, "
5140 	    "when csum info is missing");
5141 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5142 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5143 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5144 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5145 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5146 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5147 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5148 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5149 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5150 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5151 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5152 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5153 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5154 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5155 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5156 	    hn_rx_stat_ulong_sysctl, "LU",
5157 	    "# of packets that we trust host's csum verification");
5158 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5159 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5160 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5161 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5162 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5163 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5164 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5165 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5166 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5167 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5168 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5169 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5170 
5171 	return (0);
5172 }
5173 
5174 static void
5175 hn_destroy_rx_data(struct hn_softc *sc)
5176 {
5177 	int i;
5178 
5179 	if (sc->hn_rxbuf != NULL) {
5180 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5181 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5182 		else
5183 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5184 		sc->hn_rxbuf = NULL;
5185 	}
5186 
5187 	if (sc->hn_rx_ring_cnt == 0)
5188 		return;
5189 
5190 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5191 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5192 
5193 		if (rxr->hn_br == NULL)
5194 			continue;
5195 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5196 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5197 		} else {
5198 			device_printf(sc->hn_dev,
5199 			    "%dth channel bufring is referenced", i);
5200 		}
5201 		rxr->hn_br = NULL;
5202 
5203 #if defined(INET) || defined(INET6)
5204 		tcp_lro_free(&rxr->hn_lro);
5205 #endif
5206 		free(rxr->hn_pktbuf, M_DEVBUF);
5207 	}
5208 	free(sc->hn_rx_ring, M_DEVBUF);
5209 	sc->hn_rx_ring = NULL;
5210 
5211 	sc->hn_rx_ring_cnt = 0;
5212 	sc->hn_rx_ring_inuse = 0;
5213 }
5214 
5215 static int
5216 hn_tx_ring_create(struct hn_softc *sc, int id)
5217 {
5218 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5219 	device_t dev = sc->hn_dev;
5220 	bus_dma_tag_t parent_dtag;
5221 	int error, i;
5222 
5223 	txr->hn_sc = sc;
5224 	txr->hn_tx_idx = id;
5225 
5226 #ifndef HN_USE_TXDESC_BUFRING
5227 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5228 #endif
5229 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5230 
5231 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5232 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5233 	    M_DEVBUF, M_WAITOK | M_ZERO);
5234 #ifndef HN_USE_TXDESC_BUFRING
5235 	SLIST_INIT(&txr->hn_txlist);
5236 #else
5237 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5238 	    M_WAITOK, &txr->hn_tx_lock);
5239 #endif
5240 
5241 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5242 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5243 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5244 	} else {
5245 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5246 	}
5247 
5248 #ifdef HN_IFSTART_SUPPORT
5249 	if (hn_use_if_start) {
5250 		txr->hn_txeof = hn_start_txeof;
5251 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5252 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5253 	} else
5254 #endif
5255 	{
5256 		int br_depth;
5257 
5258 		txr->hn_txeof = hn_xmit_txeof;
5259 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5260 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5261 
5262 		br_depth = hn_get_txswq_depth(txr);
5263 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5264 		    M_WAITOK, &txr->hn_tx_lock);
5265 	}
5266 
5267 	txr->hn_direct_tx_size = hn_direct_tx_size;
5268 
5269 	/*
5270 	 * Always schedule transmission instead of trying to do direct
5271 	 * transmission.  This one gives the best performance so far.
5272 	 */
5273 	txr->hn_sched_tx = 1;
5274 
5275 	parent_dtag = bus_get_dma_tag(dev);
5276 
5277 	/* DMA tag for RNDIS packet messages. */
5278 	error = bus_dma_tag_create(parent_dtag, /* parent */
5279 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5280 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5281 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5282 	    BUS_SPACE_MAXADDR,		/* highaddr */
5283 	    NULL, NULL,			/* filter, filterarg */
5284 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5285 	    1,				/* nsegments */
5286 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5287 	    0,				/* flags */
5288 	    NULL,			/* lockfunc */
5289 	    NULL,			/* lockfuncarg */
5290 	    &txr->hn_tx_rndis_dtag);
5291 	if (error) {
5292 		device_printf(dev, "failed to create rndis dmatag\n");
5293 		return error;
5294 	}
5295 
5296 	/* DMA tag for data. */
5297 	error = bus_dma_tag_create(parent_dtag, /* parent */
5298 	    1,				/* alignment */
5299 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5300 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5301 	    BUS_SPACE_MAXADDR,		/* highaddr */
5302 	    NULL, NULL,			/* filter, filterarg */
5303 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5304 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5305 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5306 	    0,				/* flags */
5307 	    NULL,			/* lockfunc */
5308 	    NULL,			/* lockfuncarg */
5309 	    &txr->hn_tx_data_dtag);
5310 	if (error) {
5311 		device_printf(dev, "failed to create data dmatag\n");
5312 		return error;
5313 	}
5314 
5315 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5316 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5317 
5318 		txd->txr = txr;
5319 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5320 		STAILQ_INIT(&txd->agg_list);
5321 
5322 		/*
5323 		 * Allocate and load RNDIS packet message.
5324 		 */
5325         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5326 		    (void **)&txd->rndis_pkt,
5327 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5328 		    &txd->rndis_pkt_dmap);
5329 		if (error) {
5330 			device_printf(dev,
5331 			    "failed to allocate rndis_packet_msg, %d\n", i);
5332 			return error;
5333 		}
5334 
5335 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5336 		    txd->rndis_pkt_dmap,
5337 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5338 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5339 		    BUS_DMA_NOWAIT);
5340 		if (error) {
5341 			device_printf(dev,
5342 			    "failed to load rndis_packet_msg, %d\n", i);
5343 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5344 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5345 			return error;
5346 		}
5347 
5348 		/* DMA map for TX data. */
5349 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5350 		    &txd->data_dmap);
5351 		if (error) {
5352 			device_printf(dev,
5353 			    "failed to allocate tx data dmamap\n");
5354 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5355 			    txd->rndis_pkt_dmap);
5356 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5357 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5358 			return error;
5359 		}
5360 
5361 		/* All set, put it to list */
5362 		txd->flags |= HN_TXD_FLAG_ONLIST;
5363 #ifndef HN_USE_TXDESC_BUFRING
5364 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5365 #else
5366 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5367 #endif
5368 	}
5369 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5370 
5371 	if (sc->hn_tx_sysctl_tree != NULL) {
5372 		struct sysctl_oid_list *child;
5373 		struct sysctl_ctx_list *ctx;
5374 		char name[16];
5375 
5376 		/*
5377 		 * Create per TX ring sysctl tree:
5378 		 * dev.hn.UNIT.tx.RINGID
5379 		 */
5380 		ctx = device_get_sysctl_ctx(dev);
5381 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5382 
5383 		snprintf(name, sizeof(name), "%d", id);
5384 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5385 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5386 
5387 		if (txr->hn_tx_sysctl_tree != NULL) {
5388 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5389 
5390 #ifdef HN_DEBUG
5391 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5392 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5393 			    "# of available TX descs");
5394 #endif
5395 #ifdef HN_IFSTART_SUPPORT
5396 			if (!hn_use_if_start)
5397 #endif
5398 			{
5399 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5400 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5401 				    "over active");
5402 			}
5403 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5404 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5405 			    "# of packets transmitted");
5406 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5407 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5408 			    "# of sends");
5409 		}
5410 	}
5411 
5412 	return 0;
5413 }
5414 
5415 static void
5416 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5417 {
5418 	struct hn_tx_ring *txr = txd->txr;
5419 
5420 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5421 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5422 
5423 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5424 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5425 	    txd->rndis_pkt_dmap);
5426 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5427 }
5428 
5429 static void
5430 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5431 {
5432 
5433 	KASSERT(txd->refs == 0 || txd->refs == 1,
5434 	    ("invalid txd refs %d", txd->refs));
5435 
5436 	/* Aggregated txds will be freed by their aggregating txd. */
5437 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5438 		int freed __diagused;
5439 
5440 		freed = hn_txdesc_put(txr, txd);
5441 		KASSERT(freed, ("can't free txdesc"));
5442 	}
5443 }
5444 
5445 static void
5446 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5447 {
5448 	int i;
5449 
5450 	if (txr->hn_txdesc == NULL)
5451 		return;
5452 
5453 	/*
5454 	 * NOTE:
5455 	 * Because the freeing of aggregated txds will be deferred
5456 	 * to the aggregating txd, two passes are used here:
5457 	 * - The first pass GCes any pending txds.  This GC is necessary,
5458 	 *   since if the channels are revoked, hypervisor will not
5459 	 *   deliver send-done for all pending txds.
5460 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5461 	 *   were freed.
5462 	 */
5463 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5464 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5465 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5466 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5467 
5468 	if (txr->hn_tx_data_dtag != NULL)
5469 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5470 	if (txr->hn_tx_rndis_dtag != NULL)
5471 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5472 
5473 #ifdef HN_USE_TXDESC_BUFRING
5474 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5475 #endif
5476 
5477 	free(txr->hn_txdesc, M_DEVBUF);
5478 	txr->hn_txdesc = NULL;
5479 
5480 	if (txr->hn_mbuf_br != NULL)
5481 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5482 
5483 #ifndef HN_USE_TXDESC_BUFRING
5484 	mtx_destroy(&txr->hn_txlist_spin);
5485 #endif
5486 	mtx_destroy(&txr->hn_tx_lock);
5487 }
5488 
5489 static int
5490 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5491 {
5492 	struct sysctl_oid_list *child;
5493 	struct sysctl_ctx_list *ctx;
5494 	int i;
5495 
5496 	/*
5497 	 * Create TXBUF for chimney sending.
5498 	 *
5499 	 * NOTE: It is shared by all channels.
5500 	 */
5501 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5502 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5503 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5504 	if (sc->hn_chim == NULL) {
5505 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5506 		return (ENOMEM);
5507 	}
5508 
5509 	sc->hn_tx_ring_cnt = ring_cnt;
5510 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5511 
5512 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5513 	    M_DEVBUF, M_WAITOK | M_ZERO);
5514 
5515 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5516 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5517 
5518 	/* Create dev.hn.UNIT.tx sysctl tree */
5519 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5520 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5521 
5522 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5523 		int error;
5524 
5525 		error = hn_tx_ring_create(sc, i);
5526 		if (error)
5527 			return error;
5528 	}
5529 
5530 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5531 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5532 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5533 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5534 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5535 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5536 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5537 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5538 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5539 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5540 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5541 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5542 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5543 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5544 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5545 	    hn_tx_stat_ulong_sysctl, "LU",
5546 	    "# of packet transmission aggregation flush failure");
5547 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5548 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5549 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5550 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5551 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5552 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5553 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5554 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5555 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5556 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5557 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5558 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5559 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5560 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5561 	    "# of total TX descs");
5562 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5563 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5564 	    "Chimney send packet size upper boundary");
5565 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5566 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5567 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5568 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5569 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5570 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5571 	    hn_tx_conf_int_sysctl, "I",
5572 	    "Size of the packet for direct transmission");
5573 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5574 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5575 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5576 	    hn_tx_conf_int_sysctl, "I",
5577 	    "Always schedule transmission "
5578 	    "instead of doing direct transmission");
5579 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5580 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5581 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5582 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5583 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5584 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5585 	    "Applied packet transmission aggregation size");
5586 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5587 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5588 	    hn_txagg_pktmax_sysctl, "I",
5589 	    "Applied packet transmission aggregation packets");
5590 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5591 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5592 	    hn_txagg_align_sysctl, "I",
5593 	    "Applied packet transmission aggregation alignment");
5594 
5595 	return 0;
5596 }
5597 
5598 static void
5599 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5600 {
5601 	int i;
5602 
5603 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5604 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5605 }
5606 
5607 static void
5608 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5609 {
5610 	struct ifnet *ifp = sc->hn_ifp;
5611 	u_int hw_tsomax;
5612 	int tso_minlen;
5613 
5614 	HN_LOCK_ASSERT(sc);
5615 
5616 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5617 		return;
5618 
5619 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5620 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5621 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5622 
5623 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5624 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5625 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5626 
5627 	if (tso_maxlen < tso_minlen)
5628 		tso_maxlen = tso_minlen;
5629 	else if (tso_maxlen > IP_MAXPACKET)
5630 		tso_maxlen = IP_MAXPACKET;
5631 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5632 		tso_maxlen = sc->hn_ndis_tso_szmax;
5633 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5634 
5635 	if (hn_xpnt_vf_isready(sc)) {
5636 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5637 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5638 	}
5639 	ifp->if_hw_tsomax = hw_tsomax;
5640 	if (bootverbose)
5641 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5642 }
5643 
5644 static void
5645 hn_fixup_tx_data(struct hn_softc *sc)
5646 {
5647 	uint64_t csum_assist;
5648 	int i;
5649 
5650 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5651 	if (hn_tx_chimney_size > 0 &&
5652 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5653 		hn_set_chim_size(sc, hn_tx_chimney_size);
5654 
5655 	csum_assist = 0;
5656 	if (sc->hn_caps & HN_CAP_IPCS)
5657 		csum_assist |= CSUM_IP;
5658 	if (sc->hn_caps & HN_CAP_TCP4CS)
5659 		csum_assist |= CSUM_IP_TCP;
5660 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5661 		csum_assist |= CSUM_IP_UDP;
5662 	if (sc->hn_caps & HN_CAP_TCP6CS)
5663 		csum_assist |= CSUM_IP6_TCP;
5664 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5665 		csum_assist |= CSUM_IP6_UDP;
5666 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5667 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5668 
5669 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5670 		/*
5671 		 * Support HASHVAL pktinfo on TX path.
5672 		 */
5673 		if (bootverbose)
5674 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5675 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5676 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5677 	}
5678 }
5679 
5680 static void
5681 hn_fixup_rx_data(struct hn_softc *sc)
5682 {
5683 
5684 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5685 		int i;
5686 
5687 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5688 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5689 	}
5690 }
5691 
5692 static void
5693 hn_destroy_tx_data(struct hn_softc *sc)
5694 {
5695 	int i;
5696 
5697 	if (sc->hn_chim != NULL) {
5698 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5699 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5700 		} else {
5701 			device_printf(sc->hn_dev,
5702 			    "chimney sending buffer is referenced");
5703 		}
5704 		sc->hn_chim = NULL;
5705 	}
5706 
5707 	if (sc->hn_tx_ring_cnt == 0)
5708 		return;
5709 
5710 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5711 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5712 
5713 	free(sc->hn_tx_ring, M_DEVBUF);
5714 	sc->hn_tx_ring = NULL;
5715 
5716 	sc->hn_tx_ring_cnt = 0;
5717 	sc->hn_tx_ring_inuse = 0;
5718 }
5719 
5720 #ifdef HN_IFSTART_SUPPORT
5721 
5722 static void
5723 hn_start_taskfunc(void *xtxr, int pending __unused)
5724 {
5725 	struct hn_tx_ring *txr = xtxr;
5726 
5727 	mtx_lock(&txr->hn_tx_lock);
5728 	hn_start_locked(txr, 0);
5729 	mtx_unlock(&txr->hn_tx_lock);
5730 }
5731 
/*
 * Drain the interface send queue (ifp->if_snd) through the first TX ring.
 *
 * 'len' > 0 limits the size of packets this call transmits itself: when
 * a packet longer than 'len' is dequeued, it is put back at the head of
 * the queue and 1 is returned, so that the caller can dispatch the rest
 * of the work to the TX taskqueue.  In all other cases 0 is returned.
 *
 * Must be called with txr->hn_tx_lock held, with no aggregating txdesc
 * pending, and only when if_start support is in use.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Only proceed when the interface is RUNNING and not OACTIVE. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		/*
		 * Header fixups; a NULL return means the helper did not
		 * hand the mbuf back, so the packet is counted as an
		 * output error and skipped.
		 */
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
			m_head = hn_set_hlen(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/*
			 * Out of TX descriptors: requeue the packet and
			 * mark the interface OACTIVE.
			 */
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		/*
		 * No aggregation room left: either flush the aggregating
		 * txdesc or send this packet's own txdesc.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5842 
5843 static void
5844 hn_start(struct ifnet *ifp)
5845 {
5846 	struct hn_softc *sc = ifp->if_softc;
5847 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5848 
5849 	if (txr->hn_sched_tx)
5850 		goto do_sched;
5851 
5852 	if (mtx_trylock(&txr->hn_tx_lock)) {
5853 		int sched;
5854 
5855 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5856 		mtx_unlock(&txr->hn_tx_lock);
5857 		if (!sched)
5858 			return;
5859 	}
5860 do_sched:
5861 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5862 }
5863 
5864 static void
5865 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5866 {
5867 	struct hn_tx_ring *txr = xtxr;
5868 
5869 	mtx_lock(&txr->hn_tx_lock);
5870 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5871 	hn_start_locked(txr, 0);
5872 	mtx_unlock(&txr->hn_tx_lock);
5873 }
5874 
5875 static void
5876 hn_start_txeof(struct hn_tx_ring *txr)
5877 {
5878 	struct hn_softc *sc = txr->hn_sc;
5879 	struct ifnet *ifp = sc->hn_ifp;
5880 
5881 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5882 
5883 	if (txr->hn_sched_tx)
5884 		goto do_sched;
5885 
5886 	if (mtx_trylock(&txr->hn_tx_lock)) {
5887 		int sched;
5888 
5889 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5890 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5891 		mtx_unlock(&txr->hn_tx_lock);
5892 		if (sched) {
5893 			taskqueue_enqueue(txr->hn_tx_taskq,
5894 			    &txr->hn_tx_task);
5895 		}
5896 	} else {
5897 do_sched:
5898 		/*
5899 		 * Release the OACTIVE earlier, with the hope, that
5900 		 * others could catch up.  The task will clear the
5901 		 * flag again with the hn_tx_lock to avoid possible
5902 		 * races.
5903 		 */
5904 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5905 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5906 	}
5907 }
5908 
5909 #endif	/* HN_IFSTART_SUPPORT */
5910 
/*
 * Drain the TX ring's mbuf buf_ring (txr->hn_mbuf_br).
 *
 * 'len' > 0 limits the size of packets this call transmits itself: when
 * the packet at the head of the ring is longer than 'len', it is put
 * back and 1 is returned, so that the caller can dispatch the rest of
 * the work to the TX taskqueue.  In all other cases 0 is returned.
 *
 * Must be called with txr->hn_tx_lock held and with no aggregating
 * txdesc pending.
 */
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Nothing to do if the interface is down or the ring is oactive. */
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	/*
	 * Each peeked mbuf is resolved by exactly one of drbr_putback()
	 * (leave it queued and stop) or drbr_advance() (consume it).
	 */
	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/* Out of TX descriptors: keep the packet queued. */
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		/*
		 * No aggregation room left: either flush the aggregating
		 * txdesc or send this packet's own txdesc.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
6003 
/*
 * Transmit mbuf 'm' on interface 'ifp' (presumably installed as the
 * interface's if_transmit method -- the assignment is not visible here).
 *
 * When the transparent VF is enabled (HN_XVFFLAG_ENABLED), the mbuf is
 * handed to the VF interface's if_transmit and this interface's output
 * counters are updated according to the result.
 *
 * Otherwise the packet headers are fixed up if checksum/TSO offload is
 * requested, a TX ring is selected from the mbuf's flowid, and the mbuf
 * is enqueued on that ring's buf_ring.  Transmission then happens
 * directly, if the ring permits and the TX lock is uncontended, or via
 * the ring's TX task.
 *
 * Returns 0 on success; EIO if a header helper returned NULL; the
 * drbr_enqueue() error if the ring is full; or the VF's if_transmit
 * result on the VF path.
 */
static int
hn_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct hn_softc *sc = ifp->if_softc;
	struct hn_tx_ring *txr;
	int error, idx = 0;

	/* Unlocked test first; it is repeated under the VF read lock. */
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
		struct rm_priotracker pt;

		rm_rlock(&sc->hn_vf_lock, &pt);
		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
			struct mbuf *m_bpf = NULL;
			int obytes, omcast;

			/*
			 * Capture the statistics inputs before the mbuf is
			 * passed to the VF below.
			 */
			obytes = m->m_pkthdr.len;
			omcast = (m->m_flags & M_MCAST) != 0;

			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
				if (bpf_peers_present(ifp->if_bpf)) {
					m_bpf = m_copypacket(m, M_NOWAIT);
					if (m_bpf == NULL) {
						/*
						 * Failed to grab a shallow
						 * copy; tap now.
						 */
						ETHER_BPF_MTAP(ifp, m);
					}
				}
			} else {
				ETHER_BPF_MTAP(ifp, m);
			}

			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
			rm_runlock(&sc->hn_vf_lock, &pt);

			/*
			 * With a shallow copy in hand, tap only packets
			 * the VF accepted, then drop the copy.
			 */
			if (m_bpf != NULL) {
				if (!error)
					ETHER_BPF_MTAP(ifp, m_bpf);
				m_freem(m_bpf);
			}

			if (error == ENOBUFS) {
				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
			} else if (error) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			} else {
				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
				if (omcast) {
					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
					    omcast);
				}
			}
			return (error);
		}
		/* The flag was cleared in the meantime; use our own path. */
		rm_runlock(&sc->hn_vf_lock, &pt);
	}

#if defined(INET6) || defined(INET)
	/*
	 * Perform TSO packet header fixup or get l2/l3 header length now,
	 * since packet headers should be cache-hot.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
		m = hn_tso_fixup(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	} else if (m->m_pkthdr.csum_flags &
	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
		m = hn_set_hlen(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	}
#endif

	/*
	 * Select the TX ring based on flowid.  Packets without a hash
	 * type stay on ring 0 (idx's initial value).
	 */
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
#ifdef RSS
		uint32_t bid;

		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
		    &bid) == 0)
			idx = bid % sc->hn_tx_ring_inuse;
		else
#endif
		{
#if defined(INET6) || defined(INET)
			int tcpsyn = 0;

			/*
			 * Small, non-TSO packets with TCP checksum offload
			 * are inspected by hn_check_tcpsyn(), which may set
			 * tcpsyn; such packets are sent on ring 0.
			 */
			if (m->m_pkthdr.len < 128 &&
			    (m->m_pkthdr.csum_flags &
			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
				m = hn_check_tcpsyn(m, &tcpsyn);
				if (__predict_false(m == NULL)) {
					if_inc_counter(ifp,
					    IFCOUNTER_OERRORS, 1);
					return (EIO);
				}
			}
#else
			const int tcpsyn = 0;
#endif
			if (tcpsyn)
				idx = 0;
			else
				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
		}
	}
	txr = &sc->hn_tx_ring[idx];

	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
	if (error) {
		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
		return error;
	}

	/* The packet stays queued while the ring is marked oactive. */
	if (txr->hn_oactive)
		return 0;

	if (txr->hn_sched_tx)
		goto do_sched;

	/* Try a direct transmission if the TX lock is uncontended. */
	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return 0;
	}
do_sched:
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
	return 0;
}
6146 
6147 static void
6148 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6149 {
6150 	struct mbuf *m;
6151 
6152 	mtx_lock(&txr->hn_tx_lock);
6153 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6154 		m_freem(m);
6155 	mtx_unlock(&txr->hn_tx_lock);
6156 }
6157 
6158 static void
6159 hn_xmit_qflush(struct ifnet *ifp)
6160 {
6161 	struct hn_softc *sc = ifp->if_softc;
6162 	struct rm_priotracker pt;
6163 	int i;
6164 
6165 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6166 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6167 	if_qflush(ifp);
6168 
6169 	rm_rlock(&sc->hn_vf_lock, &pt);
6170 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6171 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6172 	rm_runlock(&sc->hn_vf_lock, &pt);
6173 }
6174 
6175 static void
6176 hn_xmit_txeof(struct hn_tx_ring *txr)
6177 {
6178 
6179 	if (txr->hn_sched_tx)
6180 		goto do_sched;
6181 
6182 	if (mtx_trylock(&txr->hn_tx_lock)) {
6183 		int sched;
6184 
6185 		txr->hn_oactive = 0;
6186 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6187 		mtx_unlock(&txr->hn_tx_lock);
6188 		if (sched) {
6189 			taskqueue_enqueue(txr->hn_tx_taskq,
6190 			    &txr->hn_tx_task);
6191 		}
6192 	} else {
6193 do_sched:
6194 		/*
6195 		 * Release the oactive earlier, with the hope, that
6196 		 * others could catch up.  The task will clear the
6197 		 * oactive again with the hn_tx_lock to avoid possible
6198 		 * races.
6199 		 */
6200 		txr->hn_oactive = 0;
6201 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6202 	}
6203 }
6204 
6205 static void
6206 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6207 {
6208 	struct hn_tx_ring *txr = xtxr;
6209 
6210 	mtx_lock(&txr->hn_tx_lock);
6211 	hn_xmit(txr, 0);
6212 	mtx_unlock(&txr->hn_tx_lock);
6213 }
6214 
6215 static void
6216 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6217 {
6218 	struct hn_tx_ring *txr = xtxr;
6219 
6220 	mtx_lock(&txr->hn_tx_lock);
6221 	txr->hn_oactive = 0;
6222 	hn_xmit(txr, 0);
6223 	mtx_unlock(&txr->hn_tx_lock);
6224 }
6225 
6226 static int
6227 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6228 {
6229 	struct vmbus_chan_br cbr;
6230 	struct hn_rx_ring *rxr;
6231 	struct hn_tx_ring *txr = NULL;
6232 	int idx, error;
6233 
6234 	idx = vmbus_chan_subidx(chan);
6235 
6236 	/*
6237 	 * Link this channel to RX/TX ring.
6238 	 */
6239 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6240 	    ("invalid channel index %d, should > 0 && < %d",
6241 	     idx, sc->hn_rx_ring_inuse));
6242 	rxr = &sc->hn_rx_ring[idx];
6243 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6244 	    ("RX ring %d already attached", idx));
6245 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6246 	rxr->hn_chan = chan;
6247 
6248 	if (bootverbose) {
6249 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6250 		    idx, vmbus_chan_id(chan));
6251 	}
6252 
6253 	if (idx < sc->hn_tx_ring_inuse) {
6254 		txr = &sc->hn_tx_ring[idx];
6255 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6256 		    ("TX ring %d already attached", idx));
6257 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6258 
6259 		txr->hn_chan = chan;
6260 		if (bootverbose) {
6261 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6262 			    idx, vmbus_chan_id(chan));
6263 		}
6264 	}
6265 
6266 	/* Bind this channel to a proper CPU. */
6267 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6268 
6269 	/*
6270 	 * Open this channel
6271 	 */
6272 	cbr.cbr = rxr->hn_br;
6273 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6274 	cbr.cbr_txsz = HN_TXBR_SIZE;
6275 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6276 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6277 	if (error) {
6278 		if (error == EISCONN) {
6279 			if_printf(sc->hn_ifp, "bufring is connected after "
6280 			    "chan%u open failure\n", vmbus_chan_id(chan));
6281 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6282 		} else {
6283 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6284 			    vmbus_chan_id(chan), error);
6285 		}
6286 	}
6287 	return (error);
6288 }
6289 
6290 static void
6291 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6292 {
6293 	struct hn_rx_ring *rxr;
6294 	int idx, error;
6295 
6296 	idx = vmbus_chan_subidx(chan);
6297 
6298 	/*
6299 	 * Link this channel to RX/TX ring.
6300 	 */
6301 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6302 	    ("invalid channel index %d, should > 0 && < %d",
6303 	     idx, sc->hn_rx_ring_inuse));
6304 	rxr = &sc->hn_rx_ring[idx];
6305 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6306 	    ("RX ring %d is not attached", idx));
6307 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6308 
6309 	if (idx < sc->hn_tx_ring_inuse) {
6310 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6311 
6312 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6313 		    ("TX ring %d is not attached attached", idx));
6314 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6315 	}
6316 
6317 	/*
6318 	 * Close this channel.
6319 	 *
6320 	 * NOTE:
6321 	 * Channel closing does _not_ destroy the target channel.
6322 	 */
6323 	error = vmbus_chan_close_direct(chan);
6324 	if (error == EISCONN) {
6325 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6326 		    "after being closed\n", vmbus_chan_id(chan));
6327 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6328 	} else if (error) {
6329 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6330 		    vmbus_chan_id(chan), error);
6331 	}
6332 }
6333 
6334 static int
6335 hn_attach_subchans(struct hn_softc *sc)
6336 {
6337 	struct vmbus_channel **subchans;
6338 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6339 	int i, error = 0;
6340 
6341 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6342 
6343 	/* Attach the sub-channels. */
6344 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6345 	for (i = 0; i < subchan_cnt; ++i) {
6346 		int error1;
6347 
6348 		error1 = hn_chan_attach(sc, subchans[i]);
6349 		if (error1) {
6350 			error = error1;
6351 			/* Move on; all channels will be detached later. */
6352 		}
6353 	}
6354 	vmbus_subchan_rel(subchans, subchan_cnt);
6355 
6356 	if (error) {
6357 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6358 	} else {
6359 		if (bootverbose) {
6360 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6361 			    subchan_cnt);
6362 		}
6363 	}
6364 	return (error);
6365 }
6366 
6367 static void
6368 hn_detach_allchans(struct hn_softc *sc)
6369 {
6370 	struct vmbus_channel **subchans;
6371 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6372 	int i;
6373 
6374 	if (subchan_cnt == 0)
6375 		goto back;
6376 
6377 	/* Detach the sub-channels. */
6378 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6379 	for (i = 0; i < subchan_cnt; ++i)
6380 		hn_chan_detach(sc, subchans[i]);
6381 	vmbus_subchan_rel(subchans, subchan_cnt);
6382 
6383 back:
6384 	/*
6385 	 * Detach the primary channel, _after_ all sub-channels
6386 	 * are detached.
6387 	 */
6388 	hn_chan_detach(sc, sc->hn_prichan);
6389 
6390 	/* Wait for sub-channels to be destroyed, if any. */
6391 	vmbus_subchan_drain(sc->hn_prichan);
6392 
6393 #ifdef INVARIANTS
6394 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6395 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6396 		    HN_RX_FLAG_ATTACHED) == 0,
6397 		    ("%dth RX ring is still attached", i));
6398 	}
6399 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6400 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6401 		    HN_TX_FLAG_ATTACHED) == 0,
6402 		    ("%dth TX ring is still attached", i));
6403 	}
6404 #endif
6405 }
6406 
6407 static int
6408 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6409 {
6410 	struct vmbus_channel **subchans;
6411 	int nchan, rxr_cnt, error;
6412 
6413 	nchan = *nsubch + 1;
6414 	if (nchan == 1) {
6415 		/*
6416 		 * Multiple RX/TX rings are not requested.
6417 		 */
6418 		*nsubch = 0;
6419 		return (0);
6420 	}
6421 
6422 	/*
6423 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6424 	 * table entries.
6425 	 */
6426 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6427 	if (error) {
6428 		/* No RSS; this is benign. */
6429 		*nsubch = 0;
6430 		return (0);
6431 	}
6432 	if (bootverbose) {
6433 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6434 		    rxr_cnt, nchan);
6435 	}
6436 
6437 	if (nchan > rxr_cnt)
6438 		nchan = rxr_cnt;
6439 	if (nchan == 1) {
6440 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6441 		*nsubch = 0;
6442 		return (0);
6443 	}
6444 
6445 	/*
6446 	 * Allocate sub-channels from NVS.
6447 	 */
6448 	*nsubch = nchan - 1;
6449 	error = hn_nvs_alloc_subchans(sc, nsubch);
6450 	if (error || *nsubch == 0) {
6451 		/* Failed to allocate sub-channels. */
6452 		*nsubch = 0;
6453 		return (0);
6454 	}
6455 
6456 	/*
6457 	 * Wait for all sub-channels to become ready before moving on.
6458 	 */
6459 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6460 	vmbus_subchan_rel(subchans, *nsubch);
6461 	return (0);
6462 }
6463 
6464 static bool
6465 hn_synth_attachable(const struct hn_softc *sc)
6466 {
6467 	int i;
6468 
6469 	if (sc->hn_flags & HN_FLAG_ERRORS)
6470 		return (false);
6471 
6472 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6473 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6474 
6475 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6476 			return (false);
6477 	}
6478 	return (true);
6479 }
6480 
/*
 * Force the RX filter to zero once RNDIS has been initialized
 * successfully.
 *
 * NOTE:
 * Some Hyper-V versions, under certain conditions, leave the RNDIS
 * rxfilter non-zero on the hypervisor side after a successful RNDIS
 * initialization.  That violates the RNDIS API contract and breaks
 * the assumptions of the code that runs afterwards.  So the rxfilter
 * is cleared explicitly, then the packets that sneaked through and the
 * interrupt taskqueues they scheduled are drained.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the filter first, then drain what got in before that. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
6501 
/*
 * Attach the synthetic parts: open the primary channel, attach NVS and
 * RNDIS, allocate and attach the sub-channels, and configure RSS when
 * more than one channel is usable.
 *
 * Returns 0 on success.  Returns ENXIO right away if
 * hn_synth_attachable() says no.  On any later failure, whatever has
 * been attached so far is torn down again and the error is returned.
 */
static int
hn_synth_attach(struct hn_softc *sc, int mtu)
{
/* Local bookkeeping of what must be undone before HN_FLAG_SYNTH_ATTACHED is set. */
#define ATTACHED_NVS		0x0002
#define ATTACHED_RNDIS		0x0004

	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int error, nsubch, nchan = 1, i, rndis_inited;
	uint32_t old_caps, attached = 0;

	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
	    ("synthetic parts were attached"));

	if (!hn_synth_attachable(sc))
		return (ENXIO);

	/* Save capabilities for later verification. */
	old_caps = sc->hn_caps;
	sc->hn_caps = 0;

	/* Clear RSS stuffs. */
	sc->hn_rss_ind_size = 0;
	sc->hn_rss_hash = 0;
	sc->hn_rss_hcap = 0;

	/*
	 * Attach the primary channel _before_ attaching NVS and RNDIS.
	 */
	error = hn_chan_attach(sc, sc->hn_prichan);
	if (error)
		goto failed;

	/*
	 * Attach NVS.
	 */
	error = hn_nvs_attach(sc, mtu);
	if (error)
		goto failed;
	attached |= ATTACHED_NVS;

	/*
	 * Attach RNDIS _after_ NVS is attached.
	 */
	error = hn_rndis_attach(sc, mtu, &rndis_inited);
	/*
	 * rndis_inited is consulted before the error check, so RNDIS is
	 * detached on the failure path whenever it got initialized.
	 * NOTE(review): assumes hn_rndis_attach() always stores to
	 * rndis_inited -- confirm.
	 */
	if (rndis_inited)
		attached |= ATTACHED_RNDIS;
	if (error)
		goto failed;

	/*
	 * Make sure capabilities are not changed.
	 */
	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
		    old_caps, sc->hn_caps);
		error = ENXIO;
		goto failed;
	}

	/*
	 * Allocate sub-channels for multi-TX/RX rings.
	 *
	 * NOTE:
	 * The # of RX rings that can be used is equivalent to the # of
	 * channels to be requested.
	 */
	nsubch = sc->hn_rx_ring_cnt - 1;
	error = hn_synth_alloc_subchans(sc, &nsubch);
	if (error)
		goto failed;
	/* NOTE: _Full_ synthetic parts detach is required now. */
	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;

	/*
	 * Set the # of TX/RX rings that could be used according to
	 * the # of channels that NVS offered.
	 */
	nchan = nsubch + 1;
	hn_set_ring_inuse(sc, nchan);
	if (nchan == 1) {
		/* Only the primary channel can be used; done */
		goto back;
	}

	/*
	 * Attach the sub-channels.
	 *
	 * NOTE: hn_set_ring_inuse() _must_ have been called.
	 */
	error = hn_attach_subchans(sc);
	if (error)
		goto failed;

	/*
	 * Configure RSS key and indirect table _after_ all sub-channels
	 * are attached.
	 */
	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
		/*
		 * RSS key is not set yet; set it to the default RSS key.
		 */
		if (bootverbose)
			if_printf(sc->hn_ifp, "setup default RSS key\n");
#ifdef RSS
		rss_getkey(rss->rss_key);
#else
		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
#endif
		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	}

	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
		/*
		 * RSS indirect table is not set yet; set it up in round-
		 * robin fashion.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
			uint32_t subidx;

#ifdef RSS
			subidx = rss_get_indirection_to_bucket(i);
#else
			subidx = i;
#endif
			/* Fold the entry into the range of usable channels. */
			rss->rss_ind[i] = subidx % nchan;
		}
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * # of usable channels may be changed, so we have to
		 * make sure that all entries in RSS indirect table
		 * are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	sc->hn_rss_hash = sc->hn_rss_hcap;
	if ((sc->hn_flags & HN_FLAG_RXVF) ||
	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
		/* NOTE: Don't reconfigure RSS; will do immediately. */
		hn_vf_rss_fixup(sc, false);
	}
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fixup transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	hn_rndis_init_fixat(sc, nchan);
	return (0);

failed:
	/*
	 * Two unwind paths: once HN_FLAG_SYNTH_ATTACHED is set, the full
	 * hn_synth_detach() is used; before that, only the pieces recorded
	 * in 'attached' plus the primary channel are undone, and the saved
	 * capabilities are restored.
	 */
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_rndis_init_fixat(sc, nchan);
		hn_synth_detach(sc);
	} else {
		if (attached & ATTACHED_RNDIS) {
			hn_rndis_init_fixat(sc, nchan);
			hn_rndis_detach(sc);
		}
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}
6681 
6682 /*
6683  * NOTE:
6684  * The interface must have been suspended though hn_suspend(), before
6685  * this function get called.
6686  */
6687 static void
6688 hn_synth_detach(struct hn_softc *sc)
6689 {
6690 
6691 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6692 	    ("synthetic parts were not attached"));
6693 
6694 	/* Detach the RNDIS first. */
6695 	hn_rndis_detach(sc);
6696 
6697 	/* Detach NVS. */
6698 	hn_nvs_detach(sc);
6699 
6700 	/* Detach all of the channels. */
6701 	hn_detach_allchans(sc);
6702 
6703 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6704 		/*
6705 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6706 		 */
6707 		int error;
6708 
6709 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6710 		    sc->hn_rxbuf_gpadl);
6711 		if (error) {
6712 			if_printf(sc->hn_ifp,
6713 			    "rxbuf gpadl disconn failed: %d\n", error);
6714 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6715 		}
6716 		sc->hn_rxbuf_gpadl = 0;
6717 	}
6718 
6719 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6720 		/*
6721 		 * Host is post-Win2016, disconnect chimney sending buffer from
6722 		 * primary channel here.
6723 		 */
6724 		int error;
6725 
6726 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6727 		    sc->hn_chim_gpadl);
6728 		if (error) {
6729 			if_printf(sc->hn_ifp,
6730 			    "chim gpadl disconn failed: %d\n", error);
6731 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6732 		}
6733 		sc->hn_chim_gpadl = 0;
6734 	}
6735 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6736 }
6737 
6738 static void
6739 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6740 {
6741 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6742 	    ("invalid ring count %d", ring_cnt));
6743 
6744 	if (sc->hn_tx_ring_cnt > ring_cnt)
6745 		sc->hn_tx_ring_inuse = ring_cnt;
6746 	else
6747 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6748 	sc->hn_rx_ring_inuse = ring_cnt;
6749 
6750 #ifdef RSS
6751 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6752 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6753 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6754 		    rss_getnumbuckets());
6755 	}
6756 #endif
6757 
6758 	if (bootverbose) {
6759 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6760 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6761 	}
6762 }
6763 
6764 static void
6765 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6766 {
6767 
6768 	/*
6769 	 * NOTE:
6770 	 * The TX bufring will not be drained by the hypervisor,
6771 	 * if the primary channel is revoked.
6772 	 */
6773 	while (!vmbus_chan_rx_empty(chan) ||
6774 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6775 	     !vmbus_chan_tx_empty(chan)))
6776 		pause("waitch", 1);
6777 	vmbus_chan_intr_drain(chan);
6778 }
6779 
6780 static void
6781 hn_disable_rx(struct hn_softc *sc)
6782 {
6783 
6784 	/*
6785 	 * Disable RX by clearing RX filter forcefully.
6786 	 */
6787 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6788 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6789 
6790 	/*
6791 	 * Give RNDIS enough time to flush all pending data packets.
6792 	 */
6793 	pause("waitrx", (200 * hz) / 1000);
6794 }
6795 
6796 /*
6797  * NOTE:
6798  * RX/TX _must_ have been suspended/disabled, before this function
6799  * is called.
6800  */
6801 static void
6802 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6803 {
6804 	struct vmbus_channel **subch = NULL;
6805 	int nsubch;
6806 
6807 	/*
6808 	 * Drain RX/TX bufrings and interrupts.
6809 	 */
6810 	nsubch = nchan - 1;
6811 	if (nsubch > 0)
6812 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6813 
6814 	if (subch != NULL) {
6815 		int i;
6816 
6817 		for (i = 0; i < nsubch; ++i)
6818 			hn_chan_drain(sc, subch[i]);
6819 	}
6820 	hn_chan_drain(sc, sc->hn_prichan);
6821 
6822 	if (subch != NULL)
6823 		vmbus_subchan_rel(subch, nsubch);
6824 }
6825 
6826 static void
6827 hn_suspend_data(struct hn_softc *sc)
6828 {
6829 	struct hn_tx_ring *txr;
6830 	int i;
6831 
6832 	HN_LOCK_ASSERT(sc);
6833 
6834 	/*
6835 	 * Suspend TX.
6836 	 */
6837 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6838 		txr = &sc->hn_tx_ring[i];
6839 
6840 		mtx_lock(&txr->hn_tx_lock);
6841 		txr->hn_suspended = 1;
6842 		mtx_unlock(&txr->hn_tx_lock);
6843 		/* No one is able send more packets now. */
6844 
6845 		/*
6846 		 * Wait for all pending sends to finish.
6847 		 *
6848 		 * NOTE:
6849 		 * We will _not_ receive all pending send-done, if the
6850 		 * primary channel is revoked.
6851 		 */
6852 		while (hn_tx_ring_pending(txr) &&
6853 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6854 			pause("hnwtx", 1 /* 1 tick */);
6855 	}
6856 
6857 	/*
6858 	 * Disable RX.
6859 	 */
6860 	hn_disable_rx(sc);
6861 
6862 	/*
6863 	 * Drain RX/TX.
6864 	 */
6865 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6866 
6867 	/*
6868 	 * Drain any pending TX tasks.
6869 	 *
6870 	 * NOTE:
6871 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6872 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6873 	 */
6874 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6875 		txr = &sc->hn_tx_ring[i];
6876 
6877 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6878 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6879 	}
6880 }
6881 
6882 static void
6883 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6884 {
6885 
6886 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6887 }
6888 
6889 static void
6890 hn_suspend_mgmt(struct hn_softc *sc)
6891 {
6892 	struct task task;
6893 
6894 	HN_LOCK_ASSERT(sc);
6895 
6896 	/*
6897 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6898 	 * through hn_mgmt_taskq.
6899 	 */
6900 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6901 	vmbus_chan_run_task(sc->hn_prichan, &task);
6902 
6903 	/*
6904 	 * Make sure that all pending management tasks are completed.
6905 	 */
6906 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6907 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6908 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6909 }
6910 
6911 static void
6912 hn_suspend(struct hn_softc *sc)
6913 {
6914 
6915 	/* Disable polling. */
6916 	hn_polling(sc, 0);
6917 
6918 	/*
6919 	 * If the non-transparent mode VF is activated, the synthetic
6920 	 * device is receiving packets, so the data path of the
6921 	 * synthetic device must be suspended.
6922 	 */
6923 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6924 	    (sc->hn_flags & HN_FLAG_RXVF))
6925 		hn_suspend_data(sc);
6926 	hn_suspend_mgmt(sc);
6927 }
6928 
6929 static void
6930 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6931 {
6932 	int i;
6933 
6934 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6935 	    ("invalid TX ring count %d", tx_ring_cnt));
6936 
6937 	for (i = 0; i < tx_ring_cnt; ++i) {
6938 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6939 
6940 		mtx_lock(&txr->hn_tx_lock);
6941 		txr->hn_suspended = 0;
6942 		mtx_unlock(&txr->hn_tx_lock);
6943 	}
6944 }
6945 
6946 static void
6947 hn_resume_data(struct hn_softc *sc)
6948 {
6949 	int i;
6950 
6951 	HN_LOCK_ASSERT(sc);
6952 
6953 	/*
6954 	 * Re-enable RX.
6955 	 */
6956 	hn_rxfilter_config(sc);
6957 
6958 	/*
6959 	 * Make sure to clear suspend status on "all" TX rings,
6960 	 * since hn_tx_ring_inuse can be changed after
6961 	 * hn_suspend_data().
6962 	 */
6963 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6964 
6965 #ifdef HN_IFSTART_SUPPORT
6966 	if (!hn_use_if_start)
6967 #endif
6968 	{
6969 		/*
6970 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6971 		 * reduced.
6972 		 */
6973 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6974 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6975 	}
6976 
6977 	/*
6978 	 * Kick start TX.
6979 	 */
6980 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6981 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6982 
6983 		/*
6984 		 * Use txeof task, so that any pending oactive can be
6985 		 * cleared properly.
6986 		 */
6987 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6988 	}
6989 }
6990 
6991 static void
6992 hn_resume_mgmt(struct hn_softc *sc)
6993 {
6994 
6995 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6996 
6997 	/*
6998 	 * Kick off network change detection, if it was pending.
6999 	 * If no network change was pending, start link status
7000 	 * checks, which is more lightweight than network change
7001 	 * detection.
7002 	 */
7003 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
7004 		hn_change_network(sc);
7005 	else
7006 		hn_update_link_status(sc);
7007 }
7008 
7009 static void
7010 hn_resume(struct hn_softc *sc)
7011 {
7012 
7013 	/*
7014 	 * If the non-transparent mode VF is activated, the synthetic
7015 	 * device have to receive packets, so the data path of the
7016 	 * synthetic device must be resumed.
7017 	 */
7018 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7019 	    (sc->hn_flags & HN_FLAG_RXVF))
7020 		hn_resume_data(sc);
7021 
7022 	/*
7023 	 * Don't resume link status change if VF is attached/activated.
7024 	 * - In the non-transparent VF mode, the synthetic device marks
7025 	 *   link down until the VF is deactivated; i.e. VF is down.
7026 	 * - In transparent VF mode, VF's media status is used until
7027 	 *   the VF is detached.
7028 	 */
7029 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7030 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7031 		hn_resume_mgmt(sc);
7032 
7033 	/*
7034 	 * Re-enable polling if this interface is running and
7035 	 * the polling is requested.
7036 	 */
7037 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7038 		hn_polling(sc, sc->hn_pollhz);
7039 }
7040 
7041 static void
7042 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7043 {
7044 	const struct rndis_status_msg *msg;
7045 	int ofs;
7046 
7047 	if (dlen < sizeof(*msg)) {
7048 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7049 		return;
7050 	}
7051 	msg = data;
7052 
7053 	switch (msg->rm_status) {
7054 	case RNDIS_STATUS_MEDIA_CONNECT:
7055 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7056 		hn_update_link_status(sc);
7057 		break;
7058 
7059 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7060 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7061 		/* Not really useful; ignore. */
7062 		break;
7063 
7064 	case RNDIS_STATUS_NETWORK_CHANGE:
7065 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7066 		if (dlen < ofs + msg->rm_stbuflen ||
7067 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7068 			if_printf(sc->hn_ifp, "network changed\n");
7069 		} else {
7070 			uint32_t change;
7071 
7072 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7073 			    sizeof(change));
7074 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7075 			    change);
7076 		}
7077 		hn_change_network(sc);
7078 		break;
7079 
7080 	default:
7081 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7082 		    msg->rm_status);
7083 		break;
7084 	}
7085 }
7086 
7087 static int
7088 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7089 {
7090 	const struct rndis_pktinfo *pi = info_data;
7091 	uint32_t mask = 0;
7092 
7093 	while (info_dlen != 0) {
7094 		const void *data;
7095 		uint32_t dlen;
7096 
7097 		if (__predict_false(info_dlen < sizeof(*pi)))
7098 			return (EINVAL);
7099 		if (__predict_false(info_dlen < pi->rm_size))
7100 			return (EINVAL);
7101 		info_dlen -= pi->rm_size;
7102 
7103 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7104 			return (EINVAL);
7105 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7106 			return (EINVAL);
7107 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7108 		data = pi->rm_data;
7109 
7110 		if (pi->rm_internal == 1) {
7111 			switch (pi->rm_type) {
7112 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7113 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7114 					return (EINVAL);
7115 				info->pktinfo_id =
7116 				    (const struct packet_info_id *)data;
7117 				mask |= HN_RXINFO_PKTINFO_ID;
7118 				break;
7119 
7120 			default:
7121 				goto next;
7122 			}
7123 		} else {
7124 			switch (pi->rm_type) {
7125 			case NDIS_PKTINFO_TYPE_VLAN:
7126 				if (__predict_false(dlen
7127 				    < NDIS_VLAN_INFO_SIZE))
7128 					return (EINVAL);
7129 				info->vlan_info = (const uint32_t *)data;
7130 				mask |= HN_RXINFO_VLAN;
7131 				break;
7132 
7133 			case NDIS_PKTINFO_TYPE_CSUM:
7134 				if (__predict_false(dlen
7135 				    < NDIS_RXCSUM_INFO_SIZE))
7136 					return (EINVAL);
7137 				info->csum_info = (const uint32_t *)data;
7138 				mask |= HN_RXINFO_CSUM;
7139 				break;
7140 
7141 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7142 				if (__predict_false(dlen
7143 				    < HN_NDIS_HASH_VALUE_SIZE))
7144 					return (EINVAL);
7145 				info->hash_value = (const uint32_t *)data;
7146 				mask |= HN_RXINFO_HASHVAL;
7147 				break;
7148 
7149 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7150 				if (__predict_false(dlen
7151 				    < HN_NDIS_HASH_INFO_SIZE))
7152 					return (EINVAL);
7153 				info->hash_info = (const uint32_t *)data;
7154 				mask |= HN_RXINFO_HASHINF;
7155 				break;
7156 
7157 			default:
7158 				goto next;
7159 			}
7160 		}
7161 
7162 		if (mask == HN_RXINFO_ALL) {
7163 			/* All found; done */
7164 			break;
7165 		}
7166 next:
7167 		pi = (const struct rndis_pktinfo *)
7168 		    ((const uint8_t *)pi + pi->rm_size);
7169 	}
7170 
7171 	/*
7172 	 * Final fixup.
7173 	 * - If there is no hash value, invalidate the hash info.
7174 	 */
7175 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7176 		info->hash_info = NULL;
7177 	return (0);
7178 }
7179 
/*
 * Tell whether the range [off, off + len) collides with the range
 * [check_off, check_off + check_len).  Two ranges that start at the
 * same offset are always reported as overlapping.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off == check_off)
		return (true);

	if (off < check_off) {
		/* The first range must end before the second one begins. */
		if (__predict_false(off + len > check_off))
			return (true);
		return (false);
	}

	/* off > check_off: the second range must end before the first. */
	if (__predict_false(check_off + check_len > off))
		return (true);
	return (false);
}
7193 
7194 static __inline void
7195 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7196 		uint32_t len, struct hn_rxinfo *info)
7197 {
7198 	uint32_t cnt = rxr->rsc.cnt;
7199 
7200 	if (cnt) {
7201 		rxr->rsc.pktlen += len;
7202 	} else {
7203 		rxr->rsc.vlan_info = info->vlan_info;
7204 		rxr->rsc.csum_info = info->csum_info;
7205 		rxr->rsc.hash_info = info->hash_info;
7206 		rxr->rsc.hash_value = info->hash_value;
7207 		rxr->rsc.pktlen = len;
7208 	}
7209 
7210 	rxr->rsc.frag_data[cnt] = data;
7211 	rxr->rsc.frag_len[cnt] = len;
7212 	rxr->rsc.cnt++;
7213 }
7214 
7215 static void
7216 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7217 {
7218 	const struct rndis_packet_msg *pkt;
7219 	struct hn_rxinfo info;
7220 	int data_off, pktinfo_off, data_len, pktinfo_len;
7221 	bool rsc_more= false;
7222 
7223 	/*
7224 	 * Check length.
7225 	 */
7226 	if (__predict_false(dlen < sizeof(*pkt))) {
7227 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7228 		return;
7229 	}
7230 	pkt = data;
7231 
7232 	if (__predict_false(dlen < pkt->rm_len)) {
7233 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7234 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7235 		return;
7236 	}
7237 	if (__predict_false(pkt->rm_len <
7238 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7239 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7240 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7241 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7242 		    pkt->rm_pktinfolen);
7243 		return;
7244 	}
7245 	if (__predict_false(pkt->rm_datalen == 0)) {
7246 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7247 		return;
7248 	}
7249 
7250 	/*
7251 	 * Check offests.
7252 	 */
7253 #define IS_OFFSET_INVALID(ofs)			\
7254 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7255 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7256 
7257 	/* XXX Hyper-V does not meet data offset alignment requirement */
7258 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7259 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7260 		    "data offset %u\n", pkt->rm_dataoffset);
7261 		return;
7262 	}
7263 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7264 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7265 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7266 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7267 		return;
7268 	}
7269 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7270 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7271 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7272 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7273 		return;
7274 	}
7275 
7276 #undef IS_OFFSET_INVALID
7277 
7278 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7279 	data_len = pkt->rm_datalen;
7280 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7281 	pktinfo_len = pkt->rm_pktinfolen;
7282 
7283 	/*
7284 	 * Check OOB coverage.
7285 	 */
7286 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7287 		int oob_off, oob_len;
7288 
7289 		if_printf(rxr->hn_ifp, "got oobdata\n");
7290 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7291 		oob_len = pkt->rm_oobdatalen;
7292 
7293 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7294 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7295 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7296 			    pkt->rm_len, oob_off, oob_len);
7297 			return;
7298 		}
7299 
7300 		/*
7301 		 * Check against data.
7302 		 */
7303 		if (hn_rndis_check_overlap(oob_off, oob_len,
7304 		    data_off, data_len)) {
7305 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7306 			    "oob overlaps data, oob abs %d len %d, "
7307 			    "data abs %d len %d\n",
7308 			    oob_off, oob_len, data_off, data_len);
7309 			return;
7310 		}
7311 
7312 		/*
7313 		 * Check against pktinfo.
7314 		 */
7315 		if (pktinfo_len != 0 &&
7316 		    hn_rndis_check_overlap(oob_off, oob_len,
7317 		    pktinfo_off, pktinfo_len)) {
7318 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7319 			    "oob overlaps pktinfo, oob abs %d len %d, "
7320 			    "pktinfo abs %d len %d\n",
7321 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7322 			return;
7323 		}
7324 	}
7325 
7326 	/*
7327 	 * Check per-packet-info coverage and find useful per-packet-info.
7328 	 */
7329 	info.vlan_info = NULL;
7330 	info.csum_info = NULL;
7331 	info.hash_info = NULL;
7332 	info.pktinfo_id = NULL;
7333 
7334 	if (__predict_true(pktinfo_len != 0)) {
7335 		bool overlap;
7336 		int error;
7337 
7338 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7339 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7340 			    "pktinfo overflow, msglen %u, "
7341 			    "pktinfo abs %d len %d\n",
7342 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7343 			return;
7344 		}
7345 
7346 		/*
7347 		 * Check packet info coverage.
7348 		 */
7349 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7350 		    data_off, data_len);
7351 		if (__predict_false(overlap)) {
7352 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7353 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7354 			    "data abs %d len %d\n",
7355 			    pktinfo_off, pktinfo_len, data_off, data_len);
7356 			return;
7357 		}
7358 
7359 		/*
7360 		 * Find useful per-packet-info.
7361 		 */
7362 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7363 		    pktinfo_len, &info);
7364 		if (__predict_false(error)) {
7365 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7366 			    "pktinfo\n");
7367 			return;
7368 		}
7369 	}
7370 
7371 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7372 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7373 		    "data overflow, msglen %u, data abs %d len %d\n",
7374 		    pkt->rm_len, data_off, data_len);
7375 		return;
7376 	}
7377 
7378 	/* Identify RSC fragments, drop invalid packets */
7379 	if ((info.pktinfo_id != NULL) &&
7380 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7381 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7382 			rxr->rsc.cnt = 0;
7383 			rxr->hn_rsc_pkts++;
7384 		} else if (rxr->rsc.cnt == 0)
7385 			goto drop;
7386 
7387 		rsc_more = true;
7388 
7389 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7390 			rsc_more = false;
7391 
7392 		if (rsc_more && rxr->rsc.is_last)
7393 			goto drop;
7394 	} else {
7395 		rxr->rsc.cnt = 0;
7396 	}
7397 
7398 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7399 		goto drop;
7400 
7401 	/* Store data in per rx ring structure */
7402 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7403 	    data_len, &info);
7404 
7405 	if (rsc_more)
7406 		return;
7407 
7408 	hn_rxpkt(rxr);
7409 	rxr->rsc.cnt = 0;
7410 	return;
7411 drop:
7412 	rxr->hn_rsc_drop++;
7413 	return;
7414 }
7415 
7416 static __inline void
7417 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7418 {
7419 	const struct rndis_msghdr *hdr;
7420 
7421 	if (__predict_false(dlen < sizeof(*hdr))) {
7422 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7423 		return;
7424 	}
7425 	hdr = data;
7426 
7427 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7428 		/* Hot data path. */
7429 		hn_rndis_rx_data(rxr, data, dlen);
7430 		/* Done! */
7431 		return;
7432 	}
7433 
7434 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7435 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7436 	else
7437 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7438 }
7439 
7440 static void
7441 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7442 {
7443 	const struct hn_nvs_hdr *hdr;
7444 
7445 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7446 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7447 		return;
7448 	}
7449 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7450 
7451 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7452 		/* Useless; ignore */
7453 		return;
7454 	}
7455 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7456 }
7457 
7458 static void
7459 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7460     const struct vmbus_chanpkt_hdr *pkt)
7461 {
7462 	struct hn_nvs_sendctx *sndc;
7463 
7464 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7465 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7466 	    VMBUS_CHANPKT_DATALEN(pkt));
7467 	/*
7468 	 * NOTE:
7469 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7470 	 * its callback.
7471 	 */
7472 }
7473 
7474 static void
7475 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7476     const struct vmbus_chanpkt_hdr *pkthdr)
7477 {
7478 	struct epoch_tracker et;
7479 	const struct vmbus_chanpkt_rxbuf *pkt;
7480 	const struct hn_nvs_hdr *nvs_hdr;
7481 	int count, i, hlen;
7482 
7483 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7484 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7485 		return;
7486 	}
7487 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7488 
7489 	/* Make sure that this is a RNDIS message. */
7490 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7491 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7492 		    nvs_hdr->nvs_type);
7493 		return;
7494 	}
7495 
7496 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7497 	if (__predict_false(hlen < sizeof(*pkt))) {
7498 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7499 		return;
7500 	}
7501 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7502 
7503 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7504 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7505 		    pkt->cp_rxbuf_id);
7506 		return;
7507 	}
7508 
7509 	count = pkt->cp_rxbuf_cnt;
7510 	if (__predict_false(hlen <
7511 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7512 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7513 		return;
7514 	}
7515 
7516 	NET_EPOCH_ENTER(et);
7517 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7518 	for (i = 0; i < count; ++i) {
7519 		int ofs, len;
7520 
7521 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7522 		len = pkt->cp_rxbuf[i].rb_len;
7523 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7524 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7525 			    "ofs %d, len %d\n", i, ofs, len);
7526 			continue;
7527 		}
7528 
7529 		rxr->rsc.is_last = (i == (count - 1));
7530 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7531 	}
7532 	NET_EPOCH_EXIT(et);
7533 
7534 	/*
7535 	 * Ack the consumed RXBUF associated w/ this channel packet,
7536 	 * so that this RXBUF can be recycled by the hypervisor.
7537 	 */
7538 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7539 }
7540 
7541 static void
7542 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7543     uint64_t tid)
7544 {
7545 	struct hn_nvs_rndis_ack ack;
7546 	int retries, error;
7547 
7548 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7549 	ack.nvs_status = HN_NVS_STATUS_OK;
7550 
7551 	retries = 0;
7552 again:
7553 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7554 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7555 	if (__predict_false(error == EAGAIN)) {
7556 		/*
7557 		 * NOTE:
7558 		 * This should _not_ happen in real world, since the
7559 		 * consumption of the TX bufring from the TX path is
7560 		 * controlled.
7561 		 */
7562 		if (rxr->hn_ack_failed == 0)
7563 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7564 		rxr->hn_ack_failed++;
7565 		retries++;
7566 		if (retries < 10) {
7567 			DELAY(100);
7568 			goto again;
7569 		}
7570 		/* RXBUF leaks! */
7571 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7572 	}
7573 }
7574 
7575 static void
7576 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7577 {
7578 	struct hn_rx_ring *rxr = xrxr;
7579 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7580 
7581 	for (;;) {
7582 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7583 		int error, pktlen;
7584 
7585 		pktlen = rxr->hn_pktbuf_len;
7586 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7587 		if (__predict_false(error == ENOBUFS)) {
7588 			void *nbuf;
7589 			int nlen;
7590 
7591 			/*
7592 			 * Expand channel packet buffer.
7593 			 *
7594 			 * XXX
7595 			 * Use M_WAITOK here, since allocation failure
7596 			 * is fatal.
7597 			 */
7598 			nlen = rxr->hn_pktbuf_len * 2;
7599 			while (nlen < pktlen)
7600 				nlen *= 2;
7601 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7602 
7603 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7604 			    rxr->hn_pktbuf_len, nlen);
7605 
7606 			free(rxr->hn_pktbuf, M_DEVBUF);
7607 			rxr->hn_pktbuf = nbuf;
7608 			rxr->hn_pktbuf_len = nlen;
7609 			/* Retry! */
7610 			continue;
7611 		} else if (__predict_false(error == EAGAIN)) {
7612 			/* No more channel packets; done! */
7613 			break;
7614 		}
7615 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7616 
7617 		switch (pkt->cph_type) {
7618 		case VMBUS_CHANPKT_TYPE_COMP:
7619 			hn_nvs_handle_comp(sc, chan, pkt);
7620 			break;
7621 
7622 		case VMBUS_CHANPKT_TYPE_RXBUF:
7623 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7624 			break;
7625 
7626 		case VMBUS_CHANPKT_TYPE_INBAND:
7627 			hn_nvs_handle_notify(sc, pkt);
7628 			break;
7629 
7630 		default:
7631 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7632 			    pkt->cph_type);
7633 			break;
7634 		}
7635 	}
7636 	hn_chan_rollup(rxr, rxr->hn_txr);
7637 }
7638 
7639 static void
7640 hn_sysinit(void *arg __unused)
7641 {
7642 	int i;
7643 
7644 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7645 
7646 #ifdef HN_IFSTART_SUPPORT
7647 	/*
7648 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7649 	 * mainly due to the IFF_DRV_OACTIVE flag.
7650 	 */
7651 	if (hn_xpnt_vf && hn_use_if_start) {
7652 		hn_use_if_start = 0;
7653 		printf("hn: tranparent VF mode, if_transmit will be used, "
7654 		    "instead of if_start\n");
7655 	}
7656 #endif
7657 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7658 		printf("hn: invalid transparent VF attach routing "
7659 		    "wait timeout %d, reset to %d\n",
7660 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7661 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7662 	}
7663 
7664 	/*
7665 	 * Initialize VF map.
7666 	 */
7667 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7668 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7669 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7670 	    M_WAITOK | M_ZERO);
7671 
7672 	/*
7673 	 * Fix the # of TX taskqueues.
7674 	 */
7675 	if (hn_tx_taskq_cnt <= 0)
7676 		hn_tx_taskq_cnt = 1;
7677 	else if (hn_tx_taskq_cnt > mp_ncpus)
7678 		hn_tx_taskq_cnt = mp_ncpus;
7679 
7680 	/*
7681 	 * Fix the TX taskqueue mode.
7682 	 */
7683 	switch (hn_tx_taskq_mode) {
7684 	case HN_TX_TASKQ_M_INDEP:
7685 	case HN_TX_TASKQ_M_GLOBAL:
7686 	case HN_TX_TASKQ_M_EVTTQ:
7687 		break;
7688 	default:
7689 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7690 		break;
7691 	}
7692 
7693 	if (vm_guest != VM_GUEST_HV)
7694 		return;
7695 
7696 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7697 		return;
7698 
7699 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7700 	    M_DEVBUF, M_WAITOK);
7701 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7702 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7703 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7704 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7705 		    "hn tx%d", i);
7706 	}
7707 }
7708 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7709 
7710 static void
7711 hn_sysuninit(void *arg __unused)
7712 {
7713 
7714 	if (hn_tx_taskque != NULL) {
7715 		int i;
7716 
7717 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7718 			taskqueue_free(hn_tx_taskque[i]);
7719 		free(hn_tx_taskque, M_DEVBUF);
7720 	}
7721 
7722 	if (hn_vfmap != NULL)
7723 		free(hn_vfmap, M_DEVBUF);
7724 	rm_destroy(&hn_vfmap_lock);
7725 
7726 	counter_u64_free(hn_udpcs_fixup);
7727 }
7728 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7729