xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 80f39bd95f22322152709ea5fae3a3c546044c9c)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 
87 #include <machine/atomic.h>
88 #include <machine/in_cksum.h>
89 
90 #include <net/bpf.h>
91 #include <net/ethernet.h>
92 #include <net/if.h>
93 #include <net/if_dl.h>
94 #include <net/if_media.h>
95 #include <net/if_types.h>
96 #include <net/if_var.h>
97 #include <net/rndis.h>
98 #ifdef RSS
99 #include <net/rss_config.h>
100 #endif
101 
102 #include <netinet/in_systm.h>
103 #include <netinet/in.h>
104 #include <netinet/ip.h>
105 #include <netinet/ip6.h>
106 #include <netinet/tcp.h>
107 #include <netinet/tcp_lro.h>
108 #include <netinet/udp.h>
109 
110 #include <dev/hyperv/include/hyperv.h>
111 #include <dev/hyperv/include/hyperv_busdma.h>
112 #include <dev/hyperv/include/vmbus.h>
113 #include <dev/hyperv/include/vmbus_xact.h>
114 
115 #include <dev/hyperv/netvsc/ndis.h>
116 #include <dev/hyperv/netvsc/if_hnreg.h>
117 #include <dev/hyperv/netvsc/if_hnvar.h>
118 #include <dev/hyperv/netvsc/hn_nvs.h>
119 #include <dev/hyperv/netvsc/hn_rndis.h>
120 
121 #include "vmbus_if.h"
122 
123 #define HN_IFSTART_SUPPORT
124 
125 #define HN_RING_CNT_DEF_MAX		8
126 
127 #define HN_VFMAP_SIZE_DEF		8
128 
129 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
130 
131 /* YYY should get it from the underlying channel */
132 #define HN_TX_DESC_CNT			512
133 
134 #define HN_RNDIS_PKT_LEN					\
135 	(sizeof(struct rndis_packet_msg) +			\
136 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
140 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
141 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
142 
143 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
144 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
145 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
146 /* -1 for RNDIS packet message */
147 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
148 
149 #define HN_DIRECT_TX_SIZE_DEF		128
150 
151 #define HN_EARLY_TXEOF_THRESH		8
152 
153 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
154 
155 #define HN_LROENT_CNT_DEF		128
156 
157 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
158 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
159 /* YYY 2*MTU is a bit rough, but should be good enough. */
160 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
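/*
 * NOTE (for illustration; not part of the original source): with the
 * standard ETHERMTU of 1500 bytes, the defaults above work out to
 * HN_LRO_LENLIM_MULTIRX_DEF = 12 * 1500 = 18000 bytes and
 * HN_LRO_LENLIM_DEF = 25 * 1500 = 37500 bytes, while the per-interface
 * floor HN_LRO_LENLIM_MIN() is 2 * if_mtu (3000 bytes at ETHERMTU).
 */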
161 
162 #define HN_LRO_ACKCNT_DEF		1
163 
164 #define HN_LOCK_INIT(sc)		\
165 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
166 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
167 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
168 #define HN_LOCK(sc)					\
169 do {							\
170 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
171 		/* Relinquish cpu to avoid deadlock */	\
172 		sched_relinquish(curthread);		\
173 		DELAY(1000);				\
174 	}						\
175 } while (0)
176 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
177 
178 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
179 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
180 #define HN_CSUM_IP_HWASSIST(sc)		\
181 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
182 #define HN_CSUM_IP6_HWASSIST(sc)	\
183 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
184 
185 #define HN_PKTSIZE_MIN(align)		\
186 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
187 	    HN_RNDIS_PKT_LEN, (align))
188 #define HN_PKTSIZE(m, align)		\
189 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
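/*
 * NOTE (for illustration; not part of the original source): HN_PKTSIZE()
 * is the chimney/aggregation space an mbuf chain consumes: the packet
 * length plus the reserved RNDIS message headroom (HN_RNDIS_PKT_LEN,
 * i.e. the RNDIS packet header and the hash, VLAN, LSOv2 and TX csum
 * per-packet-info blocks), rounded up to the given alignment.  As a
 * hypothetical example, assuming HN_RNDIS_PKT_LEN were 144 bytes and
 * align were 64, a 60-byte packet would consume
 * roundup2(60 + 144, 64) = 256 bytes.  HN_PKTSIZE_MIN() uses the
 * minimum VLAN-tagged frame length without CRC
 * (ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN = 64 bytes)
 * instead of the actual packet length.
 */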
190 
191 #ifdef RSS
192 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
193 #else
194 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
195 #endif
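/*
 * NOTE (for illustration; not part of the original source): without the
 * RSS option, channels are spread round-robin over the CPUs starting at
 * sc->hn_cpu.  E.g. assuming hn_cpu == 2 and mp_ncpus == 4, ring 0 maps
 * to CPU 2, ring 1 to CPU 3, ring 2 to CPU 0, and so on.  With the RSS
 * option, the mapping instead follows the RSS bucket-to-CPU assignment
 * via rss_getcpu().
 */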
196 
197 struct hn_txdesc {
198 #ifndef HN_USE_TXDESC_BUFRING
199 	SLIST_ENTRY(hn_txdesc)		link;
200 #endif
201 	STAILQ_ENTRY(hn_txdesc)		agg_link;
202 
203 	/* Aggregated txdescs, in sending order. */
204 	STAILQ_HEAD(, hn_txdesc)	agg_list;
205 
206 	/* The oldest packet, if transmission aggregation happens. */
207 	struct mbuf			*m;
208 	struct hn_tx_ring		*txr;
209 	int				refs;
210 	uint32_t			flags;	/* HN_TXD_FLAG_ */
211 	struct hn_nvs_sendctx		send_ctx;
212 	uint32_t			chim_index;
213 	int				chim_size;
214 
215 	bus_dmamap_t			data_dmap;
216 
217 	bus_addr_t			rndis_pkt_paddr;
218 	struct rndis_packet_msg		*rndis_pkt;
219 	bus_dmamap_t			rndis_pkt_dmap;
220 };
221 
222 #define HN_TXD_FLAG_ONLIST		0x0001
223 #define HN_TXD_FLAG_DMAMAP		0x0002
224 #define HN_TXD_FLAG_ONAGG		0x0004
225 
226 struct hn_rxinfo {
227 	uint32_t			vlan_info;
228 	uint32_t			csum_info;
229 	uint32_t			hash_info;
230 	uint32_t			hash_value;
231 };
232 
233 struct hn_rxvf_setarg {
234 	struct hn_rx_ring	*rxr;
235 	struct ifnet		*vf_ifp;
236 };
237 
238 #define HN_RXINFO_VLAN			0x0001
239 #define HN_RXINFO_CSUM			0x0002
240 #define HN_RXINFO_HASHINF		0x0004
241 #define HN_RXINFO_HASHVAL		0x0008
242 #define HN_RXINFO_ALL			\
243 	(HN_RXINFO_VLAN |		\
244 	 HN_RXINFO_CSUM |		\
245 	 HN_RXINFO_HASHINF |		\
246 	 HN_RXINFO_HASHVAL)
247 
248 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
249 #define HN_NDIS_RXCSUM_INFO_INVALID	0
250 #define HN_NDIS_HASH_INFO_INVALID	0
251 
252 static int			hn_probe(device_t);
253 static int			hn_attach(device_t);
254 static int			hn_detach(device_t);
255 static int			hn_shutdown(device_t);
256 static void			hn_chan_callback(struct vmbus_channel *,
257 				    void *);
258 
259 static void			hn_init(void *);
260 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
261 #ifdef HN_IFSTART_SUPPORT
262 static void			hn_start(struct ifnet *);
263 #endif
264 static int			hn_transmit(struct ifnet *, struct mbuf *);
265 static void			hn_xmit_qflush(struct ifnet *);
266 static int			hn_ifmedia_upd(struct ifnet *);
267 static void			hn_ifmedia_sts(struct ifnet *,
268 				    struct ifmediareq *);
269 
270 static void			hn_ifnet_event(void *, struct ifnet *, int);
271 static void			hn_ifaddr_event(void *, struct ifnet *);
272 static void			hn_ifnet_attevent(void *, struct ifnet *);
273 static void			hn_ifnet_detevent(void *, struct ifnet *);
274 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
275 
276 static bool			hn_ismyvf(const struct hn_softc *,
277 				    const struct ifnet *);
278 static void			hn_rxvf_change(struct hn_softc *,
279 				    struct ifnet *, bool);
280 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
281 static void			hn_rxvf_set_task(void *, int);
282 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
283 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
284 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
285 				    struct ifreq *);
286 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
287 static bool			hn_xpnt_vf_isready(struct hn_softc *);
288 static void			hn_xpnt_vf_setready(struct hn_softc *);
289 static void			hn_xpnt_vf_init_taskfunc(void *, int);
290 static void			hn_xpnt_vf_init(struct hn_softc *);
291 static void			hn_xpnt_vf_setenable(struct hn_softc *);
292 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
293 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
294 static void			hn_vf_rss_restore(struct hn_softc *);
295 
296 static int			hn_rndis_rxinfo(const void *, int,
297 				    struct hn_rxinfo *);
298 static void			hn_rndis_rx_data(struct hn_rx_ring *,
299 				    const void *, int);
300 static void			hn_rndis_rx_status(struct hn_softc *,
301 				    const void *, int);
302 static void			hn_rndis_init_fixat(struct hn_softc *, int);
303 
304 static void			hn_nvs_handle_notify(struct hn_softc *,
305 				    const struct vmbus_chanpkt_hdr *);
306 static void			hn_nvs_handle_comp(struct hn_softc *,
307 				    struct vmbus_channel *,
308 				    const struct vmbus_chanpkt_hdr *);
309 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
310 				    struct vmbus_channel *,
311 				    const struct vmbus_chanpkt_hdr *);
312 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
313 				    struct vmbus_channel *, uint64_t);
314 
315 #if __FreeBSD_version >= 1100099
316 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
317 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
318 #endif
319 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
320 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
321 #if __FreeBSD_version < 1100095
322 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
323 #else
324 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
325 #endif
326 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
328 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
333 #ifndef RSS
334 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
336 #endif
337 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
346 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
351 
352 static void			hn_stop(struct hn_softc *, bool);
353 static void			hn_init_locked(struct hn_softc *);
354 static int			hn_chan_attach(struct hn_softc *,
355 				    struct vmbus_channel *);
356 static void			hn_chan_detach(struct hn_softc *,
357 				    struct vmbus_channel *);
358 static int			hn_attach_subchans(struct hn_softc *);
359 static void			hn_detach_allchans(struct hn_softc *);
360 static void			hn_chan_rollup(struct hn_rx_ring *,
361 				    struct hn_tx_ring *);
362 static void			hn_set_ring_inuse(struct hn_softc *, int);
363 static int			hn_synth_attach(struct hn_softc *, int);
364 static void			hn_synth_detach(struct hn_softc *);
365 static int			hn_synth_alloc_subchans(struct hn_softc *,
366 				    int *);
367 static bool			hn_synth_attachable(const struct hn_softc *);
368 static void			hn_suspend(struct hn_softc *);
369 static void			hn_suspend_data(struct hn_softc *);
370 static void			hn_suspend_mgmt(struct hn_softc *);
371 static void			hn_resume(struct hn_softc *);
372 static void			hn_resume_data(struct hn_softc *);
373 static void			hn_resume_mgmt(struct hn_softc *);
374 static void			hn_suspend_mgmt_taskfunc(void *, int);
375 static void			hn_chan_drain(struct hn_softc *,
376 				    struct vmbus_channel *);
377 static void			hn_disable_rx(struct hn_softc *);
378 static void			hn_drain_rxtx(struct hn_softc *, int);
379 static void			hn_polling(struct hn_softc *, u_int);
380 static void			hn_chan_polling(struct vmbus_channel *, u_int);
381 static void			hn_mtu_change_fixup(struct hn_softc *);
382 
383 static void			hn_update_link_status(struct hn_softc *);
384 static void			hn_change_network(struct hn_softc *);
385 static void			hn_link_taskfunc(void *, int);
386 static void			hn_netchg_init_taskfunc(void *, int);
387 static void			hn_netchg_status_taskfunc(void *, int);
388 static void			hn_link_status(struct hn_softc *);
389 
390 static int			hn_create_rx_data(struct hn_softc *, int);
391 static void			hn_destroy_rx_data(struct hn_softc *);
392 static int			hn_check_iplen(const struct mbuf *, int);
393 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
394 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
395 static int			hn_rxfilter_config(struct hn_softc *);
396 static int			hn_rss_reconfig(struct hn_softc *);
397 static void			hn_rss_ind_fixup(struct hn_softc *);
398 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
399 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
400 				    int, const struct hn_rxinfo *);
401 static uint32_t			hn_rss_type_fromndis(uint32_t);
402 static uint32_t			hn_rss_type_tondis(uint32_t);
403 
404 static int			hn_tx_ring_create(struct hn_softc *, int);
405 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
406 static int			hn_create_tx_data(struct hn_softc *, int);
407 static void			hn_fixup_tx_data(struct hn_softc *);
408 static void			hn_fixup_rx_data(struct hn_softc *);
409 static void			hn_destroy_tx_data(struct hn_softc *);
410 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
411 static void			hn_txdesc_gc(struct hn_tx_ring *,
412 				    struct hn_txdesc *);
413 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
414 				    struct hn_txdesc *, struct mbuf **);
415 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
416 				    struct hn_txdesc *);
417 static void			hn_set_chim_size(struct hn_softc *, int);
418 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
419 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
420 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
421 static void			hn_resume_tx(struct hn_softc *, int);
422 static void			hn_set_txagg(struct hn_softc *);
423 static void			*hn_try_txagg(struct ifnet *,
424 				    struct hn_tx_ring *, struct hn_txdesc *,
425 				    int);
426 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
427 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
428 				    struct hn_softc *, struct vmbus_channel *,
429 				    const void *, int);
430 static int			hn_txpkt_sglist(struct hn_tx_ring *,
431 				    struct hn_txdesc *);
432 static int			hn_txpkt_chim(struct hn_tx_ring *,
433 				    struct hn_txdesc *);
434 static int			hn_xmit(struct hn_tx_ring *, int);
435 static void			hn_xmit_taskfunc(void *, int);
436 static void			hn_xmit_txeof(struct hn_tx_ring *);
437 static void			hn_xmit_txeof_taskfunc(void *, int);
438 #ifdef HN_IFSTART_SUPPORT
439 static int			hn_start_locked(struct hn_tx_ring *, int);
440 static void			hn_start_taskfunc(void *, int);
441 static void			hn_start_txeof(struct hn_tx_ring *);
442 static void			hn_start_txeof_taskfunc(void *, int);
443 #endif
444 
445 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
446     "Hyper-V network interface");
447 
448 /* Trust tcp segment verification on host side. */
449 static int			hn_trust_hosttcp = 1;
450 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
451     &hn_trust_hosttcp, 0,
452     "Trust tcp segement verification on host side, "
453     "when csum info is missing (global setting)");
454 
455 /* Trust udp datagram verification on host side. */
456 static int			hn_trust_hostudp = 1;
457 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
458     &hn_trust_hostudp, 0,
459     "Trust udp datagram verification on host side, "
460     "when csum info is missing (global setting)");
461 
462 /* Trust ip packet verification on host side. */
463 static int			hn_trust_hostip = 1;
464 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
465     &hn_trust_hostip, 0,
466     "Trust ip packet verification on host side, "
467     "when csum info is missing (global setting)");
468 
469 /*
470  * Offload UDP/IPv4 checksum.
471  */
472 static int			hn_enable_udp4cs = 1;
473 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
474     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
475 
476 /*
477  * Offload UDP/IPv6 checksum.
478  */
479 static int			hn_enable_udp6cs = 1;
480 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
481     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
482 
483 /* Stats. */
484 static counter_u64_t		hn_udpcs_fixup;
485 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
486     &hn_udpcs_fixup, "# of UDP checksum fixup");
487 
488 /*
489  * See hn_set_hlen().
490  *
491  * This value is for Azure.  For Hyper-V, set this above
492  * 65536 to disable UDP datagram checksum fixup.
493  */
494 static int			hn_udpcs_fixup_mtu = 1420;
495 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
496     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
497 
498 /* Limit TSO burst size */
499 static int			hn_tso_maxlen = IP_MAXPACKET;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
501     &hn_tso_maxlen, 0, "TSO burst limit");
502 
503 /* Limit chimney send size */
504 static int			hn_tx_chimney_size = 0;
505 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
506     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
507 
508 /* Limit the size of packet for direct transmission */
509 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
510 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
511     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
512 
513 /* # of LRO entries per RX ring */
514 #if defined(INET) || defined(INET6)
515 #if __FreeBSD_version >= 1100095
516 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
517 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
518     &hn_lro_entry_count, 0, "LRO entry count");
519 #endif
520 #endif
521 
522 static int			hn_tx_taskq_cnt = 1;
523 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
524     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
525 
526 #define HN_TX_TASKQ_M_INDEP	0
527 #define HN_TX_TASKQ_M_GLOBAL	1
528 #define HN_TX_TASKQ_M_EVTTQ	2
529 
530 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
532     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
533     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
534 
535 #ifndef HN_USE_TXDESC_BUFRING
536 static int			hn_use_txdesc_bufring = 0;
537 #else
538 static int			hn_use_txdesc_bufring = 1;
539 #endif
540 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
541     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
542 
543 #ifdef HN_IFSTART_SUPPORT
544 /* Use ifnet.if_start instead of ifnet.if_transmit */
545 static int			hn_use_if_start = 0;
546 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
547     &hn_use_if_start, 0, "Use if_start TX method");
548 #endif
549 
550 /* # of channels to use */
551 static int			hn_chan_cnt = 0;
552 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
553     &hn_chan_cnt, 0,
554     "# of channels to use; each channel has one RX ring and one TX ring");
555 
556 /* # of transmit rings to use */
557 static int			hn_tx_ring_cnt = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
559     &hn_tx_ring_cnt, 0, "# of TX rings to use");
560 
561 /* Software TX ring depth */
562 static int			hn_tx_swq_depth = 0;
563 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
564     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
565 
566 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
567 #if __FreeBSD_version >= 1100095
568 static u_int			hn_lro_mbufq_depth = 0;
569 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
570     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
571 #endif
572 
573 /* Packet transmission aggregation size limit */
574 static int			hn_tx_agg_size = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
576     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
577 
578 /* Packet transmission aggregation count limit */
579 static int			hn_tx_agg_pkts = -1;
580 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
581     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
582 
583 /* VF list */
584 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
585     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
586     hn_vflist_sysctl, "A",
587     "VF list");
588 
589 /* VF mapping */
590 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
591     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
592     hn_vfmap_sysctl, "A",
593     "VF mapping");
594 
595 /* Transparent VF */
596 static int			hn_xpnt_vf = 1;
597 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
598     &hn_xpnt_vf, 0, "Transparent VF mod");
599 
600 /* Accurate BPF support for Transparent VF */
601 static int			hn_xpnt_vf_accbpf = 0;
602 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
603     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
604 
605 /* Extra wait for transparent VF attach routine; unit: seconds. */
606 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
607 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
608     &hn_xpnt_vf_attwait, 0,
609     "Extra wait for transparent VF attach routing; unit: seconds");
610 
611 static u_int			hn_cpu_index;	/* next CPU for channel */
612 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
613 
614 static struct rmlock		hn_vfmap_lock;
615 static int			hn_vfmap_size;
616 static struct ifnet		**hn_vfmap;
617 
618 #ifndef RSS
619 static const uint8_t
620 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
621 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
622 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
623 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
624 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
625 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
626 };
627 #endif	/* !RSS */
628 
629 static const struct hyperv_guid	hn_guid = {
630 	.hv_guid = {
631 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
632 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
633 };
634 
635 static device_method_t hn_methods[] = {
636 	/* Device interface */
637 	DEVMETHOD(device_probe,		hn_probe),
638 	DEVMETHOD(device_attach,	hn_attach),
639 	DEVMETHOD(device_detach,	hn_detach),
640 	DEVMETHOD(device_shutdown,	hn_shutdown),
641 	DEVMETHOD_END
642 };
643 
644 static driver_t hn_driver = {
645 	"hn",
646 	hn_methods,
647 	sizeof(struct hn_softc)
648 };
649 
650 static devclass_t hn_devclass;
651 
652 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
653 MODULE_VERSION(hn, 1);
654 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
655 
656 #if __FreeBSD_version >= 1100099
657 static void
658 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
659 {
660 	int i;
661 
662 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
663 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
664 }
665 #endif
666 
667 static int
668 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
669 {
670 
671 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
673 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
674 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
675 }
676 
677 static int
678 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
679 {
680 	struct hn_nvs_rndis rndis;
681 
682 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
683 	    txd->chim_size > 0, ("invalid rndis chim txd"));
684 
685 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
686 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
687 	rndis.nvs_chim_idx = txd->chim_index;
688 	rndis.nvs_chim_sz = txd->chim_size;
689 
690 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
691 	    &rndis, sizeof(rndis), &txd->send_ctx));
692 }
693 
694 static __inline uint32_t
695 hn_chim_alloc(struct hn_softc *sc)
696 {
697 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
698 	u_long *bmap = sc->hn_chim_bmap;
699 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
700 
701 	for (i = 0; i < bmap_cnt; ++i) {
702 		int idx;
703 
704 		idx = ffsl(~bmap[i]);
705 		if (idx == 0)
706 			continue;
707 
708 		--idx; /* ffsl is 1-based */
709 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
710 		    ("invalid i %d and idx %d", i, idx));
711 
712 		if (atomic_testandset_long(&bmap[i], idx))
713 			continue;
714 
715 		ret = i * LONG_BIT + idx;
716 		break;
717 	}
718 	return (ret);
719 }
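/*
 * NOTE (for illustration; not part of the original source): the chimney
 * index returned above is simply the bit position within the whole
 * bitmap.  E.g. on LP64 (LONG_BIT == 64), if the first clear bit is
 * found in bmap[1] at bit 5, the allocated chimney index is
 * 1 * 64 + 5 = 69; hn_chim_free() reverses this by clearing bit
 * (chim_idx % LONG_BIT) of hn_chim_bmap[chim_idx / LONG_BIT].
 */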
720 
721 static __inline void
722 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
723 {
724 	u_long mask;
725 	uint32_t idx;
726 
727 	idx = chim_idx / LONG_BIT;
728 	KASSERT(idx < sc->hn_chim_bmap_cnt,
729 	    ("invalid chimney index 0x%x", chim_idx));
730 
731 	mask = 1UL << (chim_idx % LONG_BIT);
732 	KASSERT(sc->hn_chim_bmap[idx] & mask,
733 	    ("index bitmap 0x%lx, chimney index %u, "
734 	     "bitmap idx %d, bitmask 0x%lx",
735 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
736 
737 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
738 }
739 
740 #if defined(INET6) || defined(INET)
741 
742 #define PULLUP_HDR(m, len)				\
743 do {							\
744 	if (__predict_false((m)->m_len < (len))) {	\
745 		(m) = m_pullup((m), (len));		\
746 		if ((m) == NULL)			\
747 			return (NULL);			\
748 	}						\
749 } while (0)
750 
751 /*
752  * NOTE: If this function fails, m_head will be freed.
753  */
754 static __inline struct mbuf *
755 hn_tso_fixup(struct mbuf *m_head)
756 {
757 	struct ether_vlan_header *evl;
758 	struct tcphdr *th;
759 	int ehlen;
760 
761 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
762 
763 	PULLUP_HDR(m_head, sizeof(*evl));
764 	evl = mtod(m_head, struct ether_vlan_header *);
765 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
766 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
767 	else
768 		ehlen = ETHER_HDR_LEN;
769 	m_head->m_pkthdr.l2hlen = ehlen;
770 
771 #ifdef INET
772 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
773 		struct ip *ip;
774 		int iphlen;
775 
776 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
777 		ip = mtodo(m_head, ehlen);
778 		iphlen = ip->ip_hl << 2;
779 		m_head->m_pkthdr.l3hlen = iphlen;
780 
781 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
782 		th = mtodo(m_head, ehlen + iphlen);
783 
784 		ip->ip_len = 0;
785 		ip->ip_sum = 0;
786 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
787 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
788 	}
789 #endif
790 #if defined(INET6) && defined(INET)
791 	else
792 #endif
793 #ifdef INET6
794 	{
795 		struct ip6_hdr *ip6;
796 
797 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
798 		ip6 = mtodo(m_head, ehlen);
799 		if (ip6->ip6_nxt != IPPROTO_TCP) {
800 			m_freem(m_head);
801 			return (NULL);
802 		}
803 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
804 
805 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
806 		th = mtodo(m_head, ehlen + sizeof(*ip6));
807 
808 		ip6->ip6_plen = 0;
809 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
810 	}
811 #endif
812 	return (m_head);
813 }
814 
815 /*
816  * NOTE: If this function fails, m_head will be freed.
817  */
818 static __inline struct mbuf *
819 hn_set_hlen(struct mbuf *m_head)
820 {
821 	const struct ether_vlan_header *evl;
822 	int ehlen;
823 
824 	PULLUP_HDR(m_head, sizeof(*evl));
825 	evl = mtod(m_head, const struct ether_vlan_header *);
826 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
827 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
828 	else
829 		ehlen = ETHER_HDR_LEN;
830 	m_head->m_pkthdr.l2hlen = ehlen;
831 
832 #ifdef INET
833 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
834 		const struct ip *ip;
835 		int iphlen;
836 
837 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
838 		ip = mtodo(m_head, ehlen);
839 		iphlen = ip->ip_hl << 2;
840 		m_head->m_pkthdr.l3hlen = iphlen;
841 
842 		/*
843 		 * UDP checksum offload does not work in Azure if the
844 		 * following conditions are met:
845 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
846 		 * - IP_DF is not set in the IP hdr.
847 		 *
848 		 * Fall back to software checksum for these UDP datagrams.
849 		 */
850 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
851 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
852 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
853 			uint16_t off = ehlen + iphlen;
854 
855 			counter_u64_add(hn_udpcs_fixup, 1);
856 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
857 			*(uint16_t *)(m_head->m_data + off +
858                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
859 			    m_head, m_head->m_pkthdr.len, off);
860 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
861 		}
862 	}
863 #endif
864 #if defined(INET6) && defined(INET)
865 	else
866 #endif
867 #ifdef INET6
868 	{
869 		const struct ip6_hdr *ip6;
870 
871 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
872 		ip6 = mtodo(m_head, ehlen);
873 		if (ip6->ip6_nxt != IPPROTO_TCP &&
874 		    ip6->ip6_nxt != IPPROTO_UDP) {
875 			m_freem(m_head);
876 			return (NULL);
877 		}
878 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
879 	}
880 #endif
881 	return (m_head);
882 }
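/*
 * NOTE (for illustration; not part of the original source): after
 * hn_set_hlen(), m_pkthdr.l2hlen and m_pkthdr.l3hlen describe the parsed
 * headers.  For an untagged IPv4 TCP segment with no IP options (and TX
 * checksum offload requested) that would be l2hlen = ETHER_HDR_LEN (14)
 * and l3hlen = 20; for IPv6, l3hlen is always sizeof(struct ip6_hdr)
 * (40), since extension headers are not walked here.
 */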
883 
884 /*
885  * NOTE: If this function fails, m_head will be freed.
886  */
887 static __inline struct mbuf *
888 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
889 {
890 	const struct tcphdr *th;
891 	int ehlen, iphlen;
892 
893 	*tcpsyn = 0;
894 	ehlen = m_head->m_pkthdr.l2hlen;
895 	iphlen = m_head->m_pkthdr.l3hlen;
896 
897 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
898 	th = mtodo(m_head, ehlen + iphlen);
899 	if (th->th_flags & TH_SYN)
900 		*tcpsyn = 1;
901 	return (m_head);
902 }
903 
904 #undef PULLUP_HDR
905 
906 #endif	/* INET6 || INET */
907 
908 static int
909 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
910 {
911 	int error = 0;
912 
913 	HN_LOCK_ASSERT(sc);
914 
915 	if (sc->hn_rx_filter != filter) {
916 		error = hn_rndis_set_rxfilter(sc, filter);
917 		if (!error)
918 			sc->hn_rx_filter = filter;
919 	}
920 	return (error);
921 }
922 
923 static int
924 hn_rxfilter_config(struct hn_softc *sc)
925 {
926 	struct ifnet *ifp = sc->hn_ifp;
927 	uint32_t filter;
928 
929 	HN_LOCK_ASSERT(sc);
930 
931 	/*
932 	 * If the non-transparent mode VF is activated, we don't know how
933 	 * its RX filter is configured, so put the synthetic device in
934 	 * promiscuous mode.
935 	 */
936 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
937 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
938 	} else {
939 		filter = NDIS_PACKET_TYPE_DIRECTED;
940 		if (ifp->if_flags & IFF_BROADCAST)
941 			filter |= NDIS_PACKET_TYPE_BROADCAST;
942 		/* TODO: support multicast list */
943 		if ((ifp->if_flags & IFF_ALLMULTI) ||
944 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
945 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
946 	}
947 	return (hn_set_rxfilter(sc, filter));
948 }
949 
950 static void
951 hn_set_txagg(struct hn_softc *sc)
952 {
953 	uint32_t size, pkts;
954 	int i;
955 
956 	/*
957 	 * Setup aggregation size.
958 	 */
959 	if (sc->hn_agg_size < 0)
960 		size = UINT32_MAX;
961 	else
962 		size = sc->hn_agg_size;
963 
964 	if (sc->hn_rndis_agg_size < size)
965 		size = sc->hn_rndis_agg_size;
966 
967 	/* NOTE: We only aggregate packets using chimney sending buffers. */
968 	if (size > (uint32_t)sc->hn_chim_szmax)
969 		size = sc->hn_chim_szmax;
970 
971 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
972 		/* Disable */
973 		size = 0;
974 		pkts = 0;
975 		goto done;
976 	}
977 
978 	/* NOTE: Type of the per TX ring setting is 'int'. */
979 	if (size > INT_MAX)
980 		size = INT_MAX;
981 
982 	/*
983 	 * Setup aggregation packet count.
984 	 */
985 	if (sc->hn_agg_pkts < 0)
986 		pkts = UINT32_MAX;
987 	else
988 		pkts = sc->hn_agg_pkts;
989 
990 	if (sc->hn_rndis_agg_pkts < pkts)
991 		pkts = sc->hn_rndis_agg_pkts;
992 
993 	if (pkts <= 1) {
994 		/* Disable */
995 		size = 0;
996 		pkts = 0;
997 		goto done;
998 	}
999 
1000 	/* NOTE: Type of the per TX ring setting is 'short'. */
1001 	if (pkts > SHRT_MAX)
1002 		pkts = SHRT_MAX;
1003 
1004 done:
1005 	/* NOTE: Type of the per TX ring setting is 'short'. */
1006 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1007 		/* Disable */
1008 		size = 0;
1009 		pkts = 0;
1010 	}
1011 
1012 	if (bootverbose) {
1013 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1014 		    size, pkts, sc->hn_rndis_agg_align);
1015 	}
1016 
1017 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1018 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1019 
1020 		mtx_lock(&txr->hn_tx_lock);
1021 		txr->hn_agg_szmax = size;
1022 		txr->hn_agg_pktmax = pkts;
1023 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1024 		mtx_unlock(&txr->hn_tx_lock);
1025 	}
1026 }
1027 
1028 static int
1029 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1030 {
1031 
1032 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1033 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1034 		return txr->hn_txdesc_cnt;
1035 	return hn_tx_swq_depth;
1036 }
1037 
1038 static int
1039 hn_rss_reconfig(struct hn_softc *sc)
1040 {
1041 	int error;
1042 
1043 	HN_LOCK_ASSERT(sc);
1044 
1045 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1046 		return (ENXIO);
1047 
1048 	/*
1049 	 * Disable RSS first.
1050 	 *
1051 	 * NOTE:
1052 	 * Direct reconfiguration by setting the UNCHG flags does
1053 	 * _not_ work properly.
1054 	 */
1055 	if (bootverbose)
1056 		if_printf(sc->hn_ifp, "disable RSS\n");
1057 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1058 	if (error) {
1059 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1060 		return (error);
1061 	}
1062 
1063 	/*
1064 	 * Reenable the RSS w/ the updated RSS key or indirect
1065 	 * table.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1072 		return (error);
1073 	}
1074 	return (0);
1075 }
1076 
1077 static void
1078 hn_rss_ind_fixup(struct hn_softc *sc)
1079 {
1080 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1081 	int i, nchan;
1082 
1083 	nchan = sc->hn_rx_ring_inuse;
1084 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1085 
1086 	/*
1087 	 * Check indirect table to make sure that all channels in it
1088 	 * can be used.
1089 	 */
1090 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1091 		if (rss->rss_ind[i] >= nchan) {
1092 			if_printf(sc->hn_ifp,
1093 			    "RSS indirect table %d fixup: %u -> %d\n",
1094 			    i, rss->rss_ind[i], nchan - 1);
1095 			rss->rss_ind[i] = nchan - 1;
1096 		}
1097 	}
1098 }
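/*
 * NOTE (for illustration; not part of the original source): the fixup
 * above clamps stale indirect-table entries after the channel count
 * shrinks.  E.g. if only 4 channels remain in use and rss_ind[i] == 5,
 * the entry is rewritten to 3 (nchan - 1), so the host never hashes
 * packets to a ring that is no longer serviced.
 */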
1099 
1100 static int
1101 hn_ifmedia_upd(struct ifnet *ifp __unused)
1102 {
1103 
1104 	return EOPNOTSUPP;
1105 }
1106 
1107 static void
1108 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1109 {
1110 	struct hn_softc *sc = ifp->if_softc;
1111 
1112 	ifmr->ifm_status = IFM_AVALID;
1113 	ifmr->ifm_active = IFM_ETHER;
1114 
1115 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1116 		ifmr->ifm_active |= IFM_NONE;
1117 		return;
1118 	}
1119 	ifmr->ifm_status |= IFM_ACTIVE;
1120 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1121 }
1122 
1123 static void
1124 hn_rxvf_set_task(void *xarg, int pending __unused)
1125 {
1126 	struct hn_rxvf_setarg *arg = xarg;
1127 
1128 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1129 }
1130 
1131 static void
1132 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1133 {
1134 	struct hn_rx_ring *rxr;
1135 	struct hn_rxvf_setarg arg;
1136 	struct task task;
1137 	int i;
1138 
1139 	HN_LOCK_ASSERT(sc);
1140 
1141 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1142 
1143 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1144 		rxr = &sc->hn_rx_ring[i];
1145 
1146 		if (i < sc->hn_rx_ring_inuse) {
1147 			arg.rxr = rxr;
1148 			arg.vf_ifp = vf_ifp;
1149 			vmbus_chan_run_task(rxr->hn_chan, &task);
1150 		} else {
1151 			rxr->hn_rxvf_ifp = vf_ifp;
1152 		}
1153 	}
1154 }
1155 
1156 static bool
1157 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1158 {
1159 	const struct ifnet *hn_ifp;
1160 
1161 	hn_ifp = sc->hn_ifp;
1162 
1163 	if (ifp == hn_ifp)
1164 		return (false);
1165 
1166 	if (ifp->if_alloctype != IFT_ETHER)
1167 		return (false);
1168 
1169 	/* Ignore lagg/vlan interfaces */
1170 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1171 	    strcmp(ifp->if_dname, "vlan") == 0)
1172 		return (false);
1173 
1174 	/*
1175 	 * During detach events ifp->if_addr might be NULL.
1176 	 * Make sure the bcmp() below doesn't panic on that:
1177 	 */
1178 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1179 		return (false);
1180 
1181 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1182 		return (false);
1183 
1184 	return (true);
1185 }
1186 
1187 static void
1188 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1189 {
1190 	struct ifnet *hn_ifp;
1191 
1192 	HN_LOCK(sc);
1193 
1194 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1195 		goto out;
1196 
1197 	if (!hn_ismyvf(sc, ifp))
1198 		goto out;
1199 	hn_ifp = sc->hn_ifp;
1200 
1201 	if (rxvf) {
1202 		if (sc->hn_flags & HN_FLAG_RXVF)
1203 			goto out;
1204 
1205 		sc->hn_flags |= HN_FLAG_RXVF;
1206 		hn_rxfilter_config(sc);
1207 	} else {
1208 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1209 			goto out;
1210 
1211 		sc->hn_flags &= ~HN_FLAG_RXVF;
1212 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1213 			hn_rxfilter_config(sc);
1214 		else
1215 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1216 	}
1217 
1218 	hn_nvs_set_datapath(sc,
1219 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1220 
1221 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1222 
1223 	if (rxvf) {
1224 		hn_vf_rss_fixup(sc, true);
1225 		hn_suspend_mgmt(sc);
1226 		sc->hn_link_flags &=
1227 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1228 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1229 	} else {
1230 		hn_vf_rss_restore(sc);
1231 		hn_resume_mgmt(sc);
1232 	}
1233 
1234 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1235 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1236 
1237 	if (bootverbose) {
1238 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1239 		    rxvf ? "to" : "from", ifp->if_xname);
1240 	}
1241 out:
1242 	HN_UNLOCK(sc);
1243 }
1244 
1245 static void
1246 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1247 {
1248 
1249 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1250 		return;
1251 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1252 }
1253 
1254 static void
1255 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1256 {
1257 
1258 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1259 }
1260 
1261 static int
1262 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1263 {
1264 	struct ifnet *ifp, *vf_ifp;
1265 	uint64_t tmp;
1266 	int error;
1267 
1268 	HN_LOCK_ASSERT(sc);
1269 	ifp = sc->hn_ifp;
1270 	vf_ifp = sc->hn_vf_ifp;
1271 
1272 	/*
1273 	 * Fix up requested capabilities w/ supported capabilities,
1274 	 * since the supported capabilities could have been changed.
1275 	 */
1276 	ifr->ifr_reqcap &= ifp->if_capabilities;
1277 	/* Pass SIOCSIFCAP to VF. */
1278 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1279 
1280 	/*
1281 	 * NOTE:
1282 	 * The error will be propagated to the callers; however, it
1283 	 * is _not_ useful here.
1284 	 */
1285 
1286 	/*
1287 	 * Merge VF's enabled capabilities.
1288 	 */
1289 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1290 
1291 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1292 	if (ifp->if_capenable & IFCAP_TXCSUM)
1293 		ifp->if_hwassist |= tmp;
1294 	else
1295 		ifp->if_hwassist &= ~tmp;
1296 
1297 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1298 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1299 		ifp->if_hwassist |= tmp;
1300 	else
1301 		ifp->if_hwassist &= ~tmp;
1302 
1303 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1304 	if (ifp->if_capenable & IFCAP_TSO4)
1305 		ifp->if_hwassist |= tmp;
1306 	else
1307 		ifp->if_hwassist &= ~tmp;
1308 
1309 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1310 	if (ifp->if_capenable & IFCAP_TSO6)
1311 		ifp->if_hwassist |= tmp;
1312 	else
1313 		ifp->if_hwassist &= ~tmp;
1314 
1315 	return (error);
1316 }
1317 
1318 static int
1319 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1320 {
1321 	struct ifnet *vf_ifp;
1322 	struct ifreq ifr;
1323 
1324 	HN_LOCK_ASSERT(sc);
1325 	vf_ifp = sc->hn_vf_ifp;
1326 
1327 	memset(&ifr, 0, sizeof(ifr));
1328 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1329 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1330 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1331 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1332 }
1333 
1334 static void
1335 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1336 {
1337 	struct ifnet *ifp = sc->hn_ifp;
1338 	int allmulti = 0;
1339 
1340 	HN_LOCK_ASSERT(sc);
1341 
1342 	/* XXX vlan(4) style mcast addr maintenance */
1343 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1344 		allmulti = IFF_ALLMULTI;
1345 
1346 	/* Always set the VF's if_flags */
1347 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1348 }
1349 
1350 static void
1351 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1352 {
1353 	struct rm_priotracker pt;
1354 	struct ifnet *hn_ifp = NULL;
1355 	struct mbuf *mn;
1356 
1357 	/*
1358 	 * XXX racy, if hn(4) is ever detached.
1359 	 */
1360 	rm_rlock(&hn_vfmap_lock, &pt);
1361 	if (vf_ifp->if_index < hn_vfmap_size)
1362 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1363 	rm_runlock(&hn_vfmap_lock, &pt);
1364 
1365 	if (hn_ifp != NULL) {
1366 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1367 			/*
1368 			 * Allow tapping on the VF.
1369 			 */
1370 			ETHER_BPF_MTAP(vf_ifp, mn);
1371 
1372 			/*
1373 			 * Update VF stats.
1374 			 */
1375 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1376 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1377 				    mn->m_pkthdr.len);
1378 			}
1379 			/*
1380 			 * XXX IFCOUNTER_IMCAST
1381 			 * This stat updating is kinda invasive, since it
1382 			 * requires two checks on the mbuf: the length check
1383 			 * and the ethernet header check.  As of this writing,
1384 			 * all multicast packets go directly to hn(4), which
1385 			 * makes imcast stat updating in the VF a wasted effort.
1386 			 */
1387 
1388 			/*
1389 			 * Fix up rcvif and increase hn(4)'s ipackets.
1390 			 */
1391 			mn->m_pkthdr.rcvif = hn_ifp;
1392 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1393 		}
1394 		/*
1395 		 * Go through hn(4)'s if_input.
1396 		 */
1397 		hn_ifp->if_input(hn_ifp, m);
1398 	} else {
1399 		/*
1400 		 * In the middle of the transition; free this
1401 		 * mbuf chain.
1402 		 */
1403 		while (m != NULL) {
1404 			mn = m->m_nextpkt;
1405 			m->m_nextpkt = NULL;
1406 			m_freem(m);
1407 			m = mn;
1408 		}
1409 	}
1410 }
1411 
1412 static void
1413 hn_mtu_change_fixup(struct hn_softc *sc)
1414 {
1415 	struct ifnet *ifp;
1416 
1417 	HN_LOCK_ASSERT(sc);
1418 	ifp = sc->hn_ifp;
1419 
1420 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1421 #if __FreeBSD_version >= 1100099
1422 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1423 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1424 #endif
1425 }
1426 
1427 static uint32_t
1428 hn_rss_type_fromndis(uint32_t rss_hash)
1429 {
1430 	uint32_t types = 0;
1431 
1432 	if (rss_hash & NDIS_HASH_IPV4)
1433 		types |= RSS_TYPE_IPV4;
1434 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1435 		types |= RSS_TYPE_TCP_IPV4;
1436 	if (rss_hash & NDIS_HASH_IPV6)
1437 		types |= RSS_TYPE_IPV6;
1438 	if (rss_hash & NDIS_HASH_IPV6_EX)
1439 		types |= RSS_TYPE_IPV6_EX;
1440 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1441 		types |= RSS_TYPE_TCP_IPV6;
1442 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1443 		types |= RSS_TYPE_TCP_IPV6_EX;
1444 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1445 		types |= RSS_TYPE_UDP_IPV4;
1446 	return (types);
1447 }
1448 
1449 static uint32_t
1450 hn_rss_type_tondis(uint32_t types)
1451 {
1452 	uint32_t rss_hash = 0;
1453 
1454 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1455 	    ("UDP6 and UDP6EX are not supported"));
1456 
1457 	if (types & RSS_TYPE_IPV4)
1458 		rss_hash |= NDIS_HASH_IPV4;
1459 	if (types & RSS_TYPE_TCP_IPV4)
1460 		rss_hash |= NDIS_HASH_TCP_IPV4;
1461 	if (types & RSS_TYPE_IPV6)
1462 		rss_hash |= NDIS_HASH_IPV6;
1463 	if (types & RSS_TYPE_IPV6_EX)
1464 		rss_hash |= NDIS_HASH_IPV6_EX;
1465 	if (types & RSS_TYPE_TCP_IPV6)
1466 		rss_hash |= NDIS_HASH_TCP_IPV6;
1467 	if (types & RSS_TYPE_TCP_IPV6_EX)
1468 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1469 	if (types & RSS_TYPE_UDP_IPV4)
1470 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1471 	return (rss_hash);
1472 }
1473 
1474 static void
1475 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1476 {
1477 	int i;
1478 
1479 	HN_LOCK_ASSERT(sc);
1480 
1481 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1482 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1483 }
1484 
1485 static void
1486 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1487 {
1488 	struct ifnet *ifp, *vf_ifp;
1489 	struct ifrsshash ifrh;
1490 	struct ifrsskey ifrk;
1491 	int error;
1492 	uint32_t my_types, diff_types, mbuf_types = 0;
1493 
1494 	HN_LOCK_ASSERT(sc);
1495 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1496 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1497 
1498 	if (sc->hn_rx_ring_inuse == 1) {
1499 		/* No RSS on synthetic parts; done. */
1500 		return;
1501 	}
1502 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1503 		/* Synthetic parts do not support Toeplitz; done. */
1504 		return;
1505 	}
1506 
1507 	ifp = sc->hn_ifp;
1508 	vf_ifp = sc->hn_vf_ifp;
1509 
1510 	/*
1511 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1512 	 * supported.
1513 	 */
1514 	memset(&ifrk, 0, sizeof(ifrk));
1515 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1516 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1517 	if (error) {
1518 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1519 		    vf_ifp->if_xname, error);
1520 		goto done;
1521 	}
1522 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1523 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1524 		    vf_ifp->if_xname, ifrk.ifrk_func);
1525 		goto done;
1526 	}
1527 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1528 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1529 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1530 		goto done;
1531 	}
1532 
1533 	/*
1534 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1535 	 */
1536 	memset(&ifrh, 0, sizeof(ifrh));
1537 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1538 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1539 	if (error) {
1540 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1541 		    vf_ifp->if_xname, error);
1542 		goto done;
1543 	}
1544 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1545 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1546 		    vf_ifp->if_xname, ifrh.ifrh_func);
1547 		goto done;
1548 	}
1549 
1550 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1551 	if ((ifrh.ifrh_types & my_types) == 0) {
1552 		/* This disables RSS; ignore it then */
1553 		if_printf(ifp, "%s intersection of RSS types failed.  "
1554 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1555 		    ifrh.ifrh_types, my_types);
1556 		goto done;
1557 	}
1558 
1559 	diff_types = my_types ^ ifrh.ifrh_types;
1560 	my_types &= ifrh.ifrh_types;
1561 	mbuf_types = my_types;
1562 
1563 	/*
1564 	 * Detect RSS hash value/type conflicts.
1565 	 *
1566 	 * NOTE:
1567 	 * We don't disable the hash type, but stop delivering the hash
1568 	 * value/type through mbufs on the RX path.
1569 	 *
1570 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1571 	 * hash is delivered with type of TCP_IPV4.  This means if
1572 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1573 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1574 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1575 	 * here.
1576 	 */
1577 	if ((my_types & RSS_TYPE_IPV4) &&
1578 	    (diff_types & ifrh.ifrh_types &
1579 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1580 		/* Conflict; disable IPV4 hash type/value delivery. */
1581 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1582 		mbuf_types &= ~RSS_TYPE_IPV4;
1583 	}
1584 	if ((my_types & RSS_TYPE_IPV6) &&
1585 	    (diff_types & ifrh.ifrh_types &
1586 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1587 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1588 	      RSS_TYPE_IPV6_EX))) {
1589 		/* Conflict; disable IPV6 hash type/value delivery. */
1590 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1591 		mbuf_types &= ~RSS_TYPE_IPV6;
1592 	}
1593 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1594 	    (diff_types & ifrh.ifrh_types &
1595 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1596 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1597 	      RSS_TYPE_IPV6))) {
1598 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1599 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1600 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1601 	}
1602 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1603 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1604 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1605 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1606 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1607 	}
1608 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1609 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1610 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1611 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1612 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1613 	}
1614 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1615 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1616 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1617 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1618 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1619 	}
1620 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1621 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1622 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1623 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1624 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1625 	}
1626 
1627 	/*
1628 	 * Indirect table does not matter.
1629 	 */
1630 
1631 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1632 	    hn_rss_type_tondis(my_types);
1633 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1634 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1635 
1636 	if (reconf) {
1637 		error = hn_rss_reconfig(sc);
1638 		if (error) {
1639 			/* XXX roll-back? */
1640 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1641 			/* XXX keep going. */
1642 		}
1643 	}
1644 done:
1645 	/* Hash deliverability for mbufs. */
1646 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1647 }
1648 
1649 static void
1650 hn_vf_rss_restore(struct hn_softc *sc)
1651 {
1652 
1653 	HN_LOCK_ASSERT(sc);
1654 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1655 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1656 
1657 	if (sc->hn_rx_ring_inuse == 1)
1658 		goto done;
1659 
1660 	/*
1661 	 * Restore hash types.  Key does _not_ matter.
1662 	 */
1663 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1664 		int error;
1665 
1666 		sc->hn_rss_hash = sc->hn_rss_hcap;
1667 		error = hn_rss_reconfig(sc);
1668 		if (error) {
1669 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1670 			    error);
1671 			/* XXX keep going. */
1672 		}
1673 	}
1674 done:
1675 	/* Hash deliverability for mbufs. */
1676 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1677 }
1678 
1679 static void
1680 hn_xpnt_vf_setready(struct hn_softc *sc)
1681 {
1682 	struct ifnet *ifp, *vf_ifp;
1683 	struct ifreq ifr;
1684 
1685 	HN_LOCK_ASSERT(sc);
1686 	ifp = sc->hn_ifp;
1687 	vf_ifp = sc->hn_vf_ifp;
1688 
1689 	/*
1690 	 * Mark the VF ready.
1691 	 */
1692 	sc->hn_vf_rdytick = 0;
1693 
1694 	/*
1695 	 * Save information for restoration.
1696 	 */
1697 	sc->hn_saved_caps = ifp->if_capabilities;
1698 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1699 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1700 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1701 
1702 	/*
1703 	 * Intersect supported/enabled capabilities.
1704 	 *
1705 	 * NOTE:
1706 	 * if_hwassist is not changed here.
1707 	 */
1708 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1709 	ifp->if_capenable &= ifp->if_capabilities;
1710 
1711 	/*
1712 	 * Fix TSO settings.
1713 	 */
1714 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1715 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1716 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1717 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1718 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1719 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1720 
1721 	/*
1722 	 * Change VF's enabled capabilities.
1723 	 */
1724 	memset(&ifr, 0, sizeof(ifr));
1725 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1726 	ifr.ifr_reqcap = ifp->if_capenable;
1727 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1728 
1729 	if (ifp->if_mtu != ETHERMTU) {
1730 		int error;
1731 
1732 		/*
1733 		 * Change VF's MTU.
1734 		 */
1735 		memset(&ifr, 0, sizeof(ifr));
1736 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1737 		ifr.ifr_mtu = ifp->if_mtu;
1738 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1739 		if (error) {
1740 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1741 			    vf_ifp->if_xname, ifp->if_mtu);
1742 			if (ifp->if_mtu > ETHERMTU) {
1743 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1744 
1745 				/*
1746 				 * XXX
1747 				 * No need to adjust the synthetic parts' MTU;
1748 				 * failure of the adjustment would cause us
1749 				 * endless headaches.
1750 				 */
1751 				ifp->if_mtu = ETHERMTU;
1752 				hn_mtu_change_fixup(sc);
1753 			}
1754 		}
1755 	}
1756 }
1757 
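/*
 * Return true once the transparent VF can be used, i.e. it exists and
 * is either already marked ready or its attach-wait period has passed.
 */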
1758 static bool
1759 hn_xpnt_vf_isready(struct hn_softc *sc)
1760 {
1761 
1762 	HN_LOCK_ASSERT(sc);
1763 
1764 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1765 		return (false);
1766 
1767 	if (sc->hn_vf_rdytick == 0)
1768 		return (true);
1769 
1770 	if (sc->hn_vf_rdytick > ticks)
1771 		return (false);
1772 
1773 	/* Mark VF as ready. */
1774 	hn_xpnt_vf_setready(sc);
1775 	return (true);
1776 }
1777 
1778 static void
1779 hn_xpnt_vf_setenable(struct hn_softc *sc)
1780 {
1781 	int i;
1782 
1783 	HN_LOCK_ASSERT(sc);
1784 
1785 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1786 	rm_wlock(&sc->hn_vf_lock);
1787 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1788 	rm_wunlock(&sc->hn_vf_lock);
1789 
1790 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1791 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1792 }
1793 
1794 static void
1795 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1796 {
1797 	int i;
1798 
1799 	HN_LOCK_ASSERT(sc);
1800 
1801 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1802 	rm_wlock(&sc->hn_vf_lock);
1803 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1804 	if (clear_vf)
1805 		sc->hn_vf_ifp = NULL;
1806 	rm_wunlock(&sc->hn_vf_lock);
1807 
1808 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1809 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1810 }
1811 
1812 static void
1813 hn_xpnt_vf_init(struct hn_softc *sc)
1814 {
1815 	int error;
1816 
1817 	HN_LOCK_ASSERT(sc);
1818 
1819 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1820 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1821 
1822 	if (bootverbose) {
1823 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1824 		    sc->hn_vf_ifp->if_xname);
1825 	}
1826 
1827 	/*
1828 	 * Bring the VF up.
1829 	 */
1830 	hn_xpnt_vf_saveifflags(sc);
1831 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1832 	error = hn_xpnt_vf_iocsetflags(sc);
1833 	if (error) {
1834 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1835 		    sc->hn_vf_ifp->if_xname, error);
1836 		return;
1837 	}
1838 
1839 	/*
1840 	 * NOTE:
1841 	 * Datapath setting must happen _after_ bringing the VF up.
1842 	 */
1843 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1844 
1845 	/*
1846 	 * NOTE:
1847 	 * Fixup RSS related bits _after_ the VF is brought up, since
1848 	 * many VFs generate their RSS key during initialization.
1849 	 */
1850 	hn_vf_rss_fixup(sc, true);
1851 
1852 	/* Mark transparent mode VF as enabled. */
1853 	hn_xpnt_vf_setenable(sc);
1854 }
1855 
1856 static void
1857 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1858 {
1859 	struct hn_softc *sc = xsc;
1860 
1861 	HN_LOCK(sc);
1862 
1863 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1864 		goto done;
1865 	if (sc->hn_vf_ifp == NULL)
1866 		goto done;
1867 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1868 		goto done;
1869 
1870 	if (sc->hn_vf_rdytick != 0) {
1871 		/* Mark VF as ready. */
1872 		hn_xpnt_vf_setready(sc);
1873 	}
1874 
1875 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1876 		/*
1877 		 * Delayed VF initialization.
1878 		 */
1879 		if (bootverbose) {
1880 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1881 			    sc->hn_vf_ifp->if_xname);
1882 		}
1883 		hn_xpnt_vf_init(sc);
1884 	}
1885 done:
1886 	HN_UNLOCK(sc);
1887 }
1888 
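/*
 * ether_ifattach event handler.  If the newly attached ifnet is this
 * device's VF, record the ifindex -> hn_ifp mapping in hn_vfmap and,
 * in transparent VF mode, hook the VF's if_input, suspend link status
 * management and schedule the delayed VF initialization.
 */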
1889 static void
1890 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1891 {
1892 	struct hn_softc *sc = xsc;
1893 
1894 	HN_LOCK(sc);
1895 
1896 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1897 		goto done;
1898 
1899 	if (!hn_ismyvf(sc, ifp))
1900 		goto done;
1901 
1902 	if (sc->hn_vf_ifp != NULL) {
1903 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1904 		    sc->hn_vf_ifp->if_xname);
1905 		goto done;
1906 	}
1907 
1908 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1909 		/*
1910 		 * ifnet.if_start is _not_ supported by transparent
1911 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1912 		 */
1913 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1914 		    "in transparent VF mode.\n", ifp->if_xname);
1915 		goto done;
1916 	}
1917 
1918 	rm_wlock(&hn_vfmap_lock);
1919 
1920 	if (ifp->if_index >= hn_vfmap_size) {
1921 		struct ifnet **newmap;
1922 		int newsize;
1923 
1924 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1925 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1926 		    M_WAITOK | M_ZERO);
1927 
1928 		memcpy(newmap, hn_vfmap,
1929 		    sizeof(struct ifnet *) * hn_vfmap_size);
1930 		free(hn_vfmap, M_DEVBUF);
1931 		hn_vfmap = newmap;
1932 		hn_vfmap_size = newsize;
1933 	}
1934 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1935 	    ("%s: ifindex %d was mapped to %s",
1936 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1937 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1938 
1939 	rm_wunlock(&hn_vfmap_lock);
1940 
1941 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1942 	rm_wlock(&sc->hn_vf_lock);
1943 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1944 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1945 	sc->hn_vf_ifp = ifp;
1946 	rm_wunlock(&sc->hn_vf_lock);
1947 
1948 	if (hn_xpnt_vf) {
1949 		int wait_ticks;
1950 
1951 		/*
1952 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1953 		 * Save vf_ifp's current if_input for later restoration.
1954 		 */
1955 		sc->hn_vf_input = ifp->if_input;
1956 		ifp->if_input = hn_xpnt_vf_input;
1957 
1958 		/*
1959 		 * Stop link status management; use the VF's.
1960 		 */
1961 		hn_suspend_mgmt(sc);
1962 
1963 		/*
1964 		 * Give the VF some time to complete its attach routine.
1965 		 */
1966 		wait_ticks = hn_xpnt_vf_attwait * hz;
1967 		sc->hn_vf_rdytick = ticks + wait_ticks;
1968 
1969 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1970 		    wait_ticks);
1971 	}
1972 done:
1973 	HN_UNLOCK(sc);
1974 }
1975 
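/*
 * ifnet departure event handler.  Undo hn_ifnet_attevent(): restore
 * the VF's if_input, switch the datapath back to the synthetic device,
 * restore the saved capabilities/TSO/RSS settings as needed, resume
 * link status management and clear the hn_vfmap entry.
 */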
1976 static void
1977 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1978 {
1979 	struct hn_softc *sc = xsc;
1980 
1981 	HN_LOCK(sc);
1982 
1983 	if (sc->hn_vf_ifp == NULL)
1984 		goto done;
1985 
1986 	if (!hn_ismyvf(sc, ifp))
1987 		goto done;
1988 
1989 	if (hn_xpnt_vf) {
1990 		/*
1991 		 * Make sure that the delayed initialization is not running.
1992 		 *
1993 		 * NOTE:
1994 		 * - This lock _must_ be released, since the hn_vf_init task
1995 		 *   will try to hold this lock.
1996 		 * - It is safe to release this lock here, since the
1997 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1998 		 *
1999 		 * XXX racy, if hn(4) ever detached.
2000 		 */
2001 		HN_UNLOCK(sc);
2002 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2003 		HN_LOCK(sc);
2004 
2005 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2006 		    sc->hn_ifp->if_xname));
2007 		ifp->if_input = sc->hn_vf_input;
2008 		sc->hn_vf_input = NULL;
2009 
2010 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2011 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2012 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2013 
2014 		if (sc->hn_vf_rdytick == 0) {
2015 			/*
2016 			 * The VF was ready; restore some settings.
2017 			 */
2018 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2019 			/*
2020 			 * NOTE:
2021 			 * There is _no_ need to fixup if_capenable and
2022 			 * if_hwassist, since the if_capabilities before
2023 			 * restoration was an intersection of the VF's
2024 			 * if_capabilities and the synthetic device's
2025 			 * if_capabilities.
2026 			 */
2027 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2028 			sc->hn_ifp->if_hw_tsomaxsegcount =
2029 			    sc->hn_saved_tsosegcnt;
2030 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2031 		}
2032 
2033 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2034 			/*
2035 			 * Restore RSS settings.
2036 			 */
2037 			hn_vf_rss_restore(sc);
2038 
2039 			/*
2040 			 * Resume link status management, which was suspended
2041 			 * by hn_ifnet_attevent().
2042 			 */
2043 			hn_resume_mgmt(sc);
2044 		}
2045 	}
2046 
2047 	/* Mark transparent mode VF as disabled. */
2048 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2049 
2050 	rm_wlock(&hn_vfmap_lock);
2051 
2052 	KASSERT(ifp->if_index < hn_vfmap_size,
2053 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2054 	if (hn_vfmap[ifp->if_index] != NULL) {
2055 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2056 		    ("%s: ifindex %d was mapped to %s",
2057 		     ifp->if_xname, ifp->if_index,
2058 		     hn_vfmap[ifp->if_index]->if_xname));
2059 		hn_vfmap[ifp->if_index] = NULL;
2060 	}
2061 
2062 	rm_wunlock(&hn_vfmap_lock);
2063 done:
2064 	HN_UNLOCK(sc);
2065 }
2066 
2067 static void
2068 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2069 {
2070 	struct hn_softc *sc = xsc;
2071 
2072 	if (sc->hn_vf_ifp == ifp)
2073 		if_link_state_change(sc->hn_ifp, link_state);
2074 }
2075 
2076 static int
2077 hn_probe(device_t dev)
2078 {
2079 
2080 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2081 		device_set_desc(dev, "Hyper-V Network Interface");
2082 		return BUS_PROBE_DEFAULT;
2083 	}
2084 	return ENXIO;
2085 }
2086 
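/*
 * Device attach: set up the TX/management/VF taskqueues, allocate the
 * TX/RX rings, attach the synthetic parts (NVS/RNDIS), register the
 * sysctl nodes and event handlers, and ether_ifattach the ifnet.
 */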
2087 static int
2088 hn_attach(device_t dev)
2089 {
2090 	struct hn_softc *sc = device_get_softc(dev);
2091 	struct sysctl_oid_list *child;
2092 	struct sysctl_ctx_list *ctx;
2093 	uint8_t eaddr[ETHER_ADDR_LEN];
2094 	struct ifnet *ifp = NULL;
2095 	int error, ring_cnt, tx_ring_cnt;
2096 	uint32_t mtu;
2097 
2098 	sc->hn_dev = dev;
2099 	sc->hn_prichan = vmbus_get_channel(dev);
2100 	HN_LOCK_INIT(sc);
2101 	rm_init(&sc->hn_vf_lock, "hnvf");
2102 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2103 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2104 
2105 	/*
2106 	 * Initialize these tunables once.
2107 	 */
2108 	sc->hn_agg_size = hn_tx_agg_size;
2109 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2110 
2111 	/*
2112 	 * Setup taskqueue for transmission.
2113 	 */
2114 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2115 		int i;
2116 
2117 		sc->hn_tx_taskqs =
2118 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2119 		    M_DEVBUF, M_WAITOK);
2120 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2121 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2122 			    M_WAITOK, taskqueue_thread_enqueue,
2123 			    &sc->hn_tx_taskqs[i]);
2124 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2125 			    "%s tx%d", device_get_nameunit(dev), i);
2126 		}
2127 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2128 		sc->hn_tx_taskqs = hn_tx_taskque;
2129 	}
2130 
2131 	/*
2132 	 * Setup taskqueue for management tasks, e.g. link status.
2133 	 */
2134 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2135 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2136 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2137 	    device_get_nameunit(dev));
2138 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2139 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2140 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2141 	    hn_netchg_status_taskfunc, sc);
2142 
2143 	if (hn_xpnt_vf) {
2144 		/*
2145 		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2146 		 */
2147 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2148 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2149 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2150 		    device_get_nameunit(dev));
2151 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2152 		    hn_xpnt_vf_init_taskfunc, sc);
2153 	}
2154 
2155 	/*
2156 	 * Allocate ifnet and setup its name earlier, so that if_printf
2157 	 * can be used by functions that will be called after
2158 	 * ether_ifattach().
2159 	 */
2160 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2161 	ifp->if_softc = sc;
2162 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2163 
2164 	/*
2165 	 * Initialize ifmedia earlier so that it can be unconditionally
2166 	 * destroyed if an error happens later on.
2167 	 */
2168 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2169 
2170 	/*
2171 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2172 	 * to use (tx_ring_cnt).
2173 	 *
2174 	 * NOTE:
2175 	 * The # of RX rings to use is the same as the # of channels to use.
2176 	 */
2177 	ring_cnt = hn_chan_cnt;
2178 	if (ring_cnt <= 0) {
2179 		/* Default */
2180 		ring_cnt = mp_ncpus;
2181 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2182 			ring_cnt = HN_RING_CNT_DEF_MAX;
2183 	} else if (ring_cnt > mp_ncpus) {
2184 		ring_cnt = mp_ncpus;
2185 	}
2186 #ifdef RSS
2187 	if (ring_cnt > rss_getnumbuckets())
2188 		ring_cnt = rss_getnumbuckets();
2189 #endif
2190 
2191 	tx_ring_cnt = hn_tx_ring_cnt;
2192 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2193 		tx_ring_cnt = ring_cnt;
2194 #ifdef HN_IFSTART_SUPPORT
2195 	if (hn_use_if_start) {
2196 		/* ifnet.if_start only needs one TX ring. */
2197 		tx_ring_cnt = 1;
2198 	}
2199 #endif
2200 
2201 	/*
2202 	 * Set the leader CPU for channels.
2203 	 */
2204 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2205 
2206 	/*
2207 	 * Create enough TX/RX rings, even if only a limited number of
2208 	 * channels can be allocated.
2209 	 */
2210 	error = hn_create_tx_data(sc, tx_ring_cnt);
2211 	if (error)
2212 		goto failed;
2213 	error = hn_create_rx_data(sc, ring_cnt);
2214 	if (error)
2215 		goto failed;
2216 
2217 	/*
2218 	 * Create transaction context for NVS and RNDIS transactions.
2219 	 */
2220 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2221 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2222 	if (sc->hn_xact == NULL) {
2223 		error = ENXIO;
2224 		goto failed;
2225 	}
2226 
2227 	/*
2228 	 * Install orphan handler for the revocation of this device's
2229 	 * primary channel.
2230 	 *
2231 	 * NOTE:
2232 	 * The processing order is critical here:
2233 	 * Install the orphan handler _before_ testing whether this
2234 	 * device's primary channel has been revoked or not.
2235 	 */
2236 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2237 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2238 		error = ENXIO;
2239 		goto failed;
2240 	}
2241 
2242 	/*
2243 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2244 	 */
2245 	error = hn_synth_attach(sc, ETHERMTU);
2246 	if (error)
2247 		goto failed;
2248 
2249 	error = hn_rndis_get_eaddr(sc, eaddr);
2250 	if (error)
2251 		goto failed;
2252 
2253 	error = hn_rndis_get_mtu(sc, &mtu);
2254 	if (error)
2255 		mtu = ETHERMTU;
2256 	else if (bootverbose)
2257 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2258 
2259 #if __FreeBSD_version >= 1100099
2260 	if (sc->hn_rx_ring_inuse > 1) {
2261 		/*
2262 		 * Reduce TCP segment aggregation limit for multiple
2263 		 * RX rings to increase ACK timeliness.
2264 		 */
2265 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2266 	}
2267 #endif
2268 
2269 	/*
2270 	 * Fixup TX/RX settings after the synthetic parts are attached.
2271 	 */
2272 	hn_fixup_tx_data(sc);
2273 	hn_fixup_rx_data(sc);
2274 
2275 	ctx = device_get_sysctl_ctx(dev);
2276 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2277 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2278 	    &sc->hn_nvs_ver, 0, "NVS version");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_ndis_version_sysctl, "A", "NDIS version");
2282 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2283 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284 	    hn_caps_sysctl, "A", "capabilities");
2285 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2286 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287 	    hn_hwassist_sysctl, "A", "hwassist");
2288 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2289 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2290 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2291 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2292 	    "max # of TSO segments");
2293 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2294 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2295 	    "max size of TSO segment");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_rxfilter_sysctl, "A", "rxfilter");
2299 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2300 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2301 	    hn_rss_hash_sysctl, "A", "RSS hash");
2302 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2303 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2304 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2305 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2306 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2307 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2308 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2309 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2310 #ifndef RSS
2311 	/*
2312 	 * Don't allow RSS key/indirect table changes if RSS is defined.
2313 	 */
2314 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2315 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2316 	    hn_rss_key_sysctl, "IU", "RSS key");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2318 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2320 #endif
2321 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2322 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2323 	    "RNDIS offered packet transmission aggregation size limit");
2324 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2325 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2326 	    "RNDIS offered packet transmission aggregation count limit");
2327 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2328 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2329 	    "RNDIS packet transmission aggregation alignment");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2331 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_txagg_size_sysctl, "I",
2333 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2335 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_txagg_pkts_sysctl, "I",
2337 	    "Packet transmission aggregation packets, "
2338 	    "0 -- disable, -1 -- auto");
2339 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2340 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2341 	    hn_polling_sysctl, "I",
2342 	    "Polling frequency: [100,1000000], 0 disable polling");
2343 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2344 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2345 	    hn_vf_sysctl, "A", "Virtual Function's name");
2346 	if (!hn_xpnt_vf) {
2347 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2348 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2349 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2350 	} else {
2351 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2352 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2353 		    hn_xpnt_vf_enabled_sysctl, "I",
2354 		    "Transparent VF enabled");
2355 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2356 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2357 		    hn_xpnt_vf_accbpf_sysctl, "I",
2358 		    "Accurate BPF for transparent VF");
2359 	}
2360 
2361 	/*
2362 	 * Setup the ifmedia, which has been initialized earlier.
2363 	 */
2364 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2365 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2366 	/* XXX ifmedia_set really should do this for us */
2367 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2368 
2369 	/*
2370 	 * Setup the ifnet for this interface.
2371 	 */
2372 
2373 	ifp->if_baudrate = IF_Gbps(10);
2374 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2375 	ifp->if_ioctl = hn_ioctl;
2376 	ifp->if_init = hn_init;
2377 #ifdef HN_IFSTART_SUPPORT
2378 	if (hn_use_if_start) {
2379 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2380 
2381 		ifp->if_start = hn_start;
2382 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2383 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2384 		IFQ_SET_READY(&ifp->if_snd);
2385 	} else
2386 #endif
2387 	{
2388 		ifp->if_transmit = hn_transmit;
2389 		ifp->if_qflush = hn_xmit_qflush;
2390 	}
2391 
2392 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2393 #ifdef foo
2394 	/* We can't tell IPv6 packets from IPv4 packets on the RX path. */
2395 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2396 #endif
2397 	if (sc->hn_caps & HN_CAP_VLAN) {
2398 		/* XXX not sure about VLAN_MTU. */
2399 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2400 	}
2401 
2402 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2403 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2404 		ifp->if_capabilities |= IFCAP_TXCSUM;
2405 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2406 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2407 	if (sc->hn_caps & HN_CAP_TSO4) {
2408 		ifp->if_capabilities |= IFCAP_TSO4;
2409 		ifp->if_hwassist |= CSUM_IP_TSO;
2410 	}
2411 	if (sc->hn_caps & HN_CAP_TSO6) {
2412 		ifp->if_capabilities |= IFCAP_TSO6;
2413 		ifp->if_hwassist |= CSUM_IP6_TSO;
2414 	}
2415 
2416 	/* Enable all available capabilities by default. */
2417 	ifp->if_capenable = ifp->if_capabilities;
2418 
2419 	/*
2420 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2421 	 * be enabled through SIOCSIFCAP.
2422 	 */
2423 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2424 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2425 
2426 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2427 		/*
2428 		 * Lock hn_set_tso_maxsize() to simplify its
2429 		 * internal logic.
2430 		 */
2431 		HN_LOCK(sc);
2432 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2433 		HN_UNLOCK(sc);
2434 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2435 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2436 	}
2437 
2438 	ether_ifattach(ifp, eaddr);
2439 
2440 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2441 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2442 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2443 	}
2444 	if (mtu < ETHERMTU) {
2445 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2446 		ifp->if_mtu = mtu;
2447 	}
2448 
2449 	/* Inform the upper layer about the long frame support. */
2450 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2451 
2452 	/*
2453 	 * Kick off link status check.
2454 	 */
2455 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2456 	hn_update_link_status(sc);
2457 
2458 	if (!hn_xpnt_vf) {
2459 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2460 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2461 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2462 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2463 	} else {
2464 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2465 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2466 	}
2467 
2468 	/*
2469 	 * NOTE:
2470 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2471 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2472 	 * available when the ifnet_arrival event is triggered.
2473 	 */
2474 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2475 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2476 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2477 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2478 
2479 	return (0);
2480 failed:
2481 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2482 		hn_synth_detach(sc);
2483 	hn_detach(dev);
2484 	return (error);
2485 }
2486 
2487 static int
2488 hn_detach(device_t dev)
2489 {
2490 	struct hn_softc *sc = device_get_softc(dev);
2491 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2492 
2493 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2494 		/*
2495 		 * In case the vmbus missed the orphan handler
2496 		 * installation.
2497 		 */
2498 		vmbus_xact_ctx_orphan(sc->hn_xact);
2499 	}
2500 
2501 	if (sc->hn_ifaddr_evthand != NULL)
2502 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2503 	if (sc->hn_ifnet_evthand != NULL)
2504 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2505 	if (sc->hn_ifnet_atthand != NULL) {
2506 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2507 		    sc->hn_ifnet_atthand);
2508 	}
2509 	if (sc->hn_ifnet_dethand != NULL) {
2510 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2511 		    sc->hn_ifnet_dethand);
2512 	}
2513 	if (sc->hn_ifnet_lnkhand != NULL)
2514 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2515 
2516 	vf_ifp = sc->hn_vf_ifp;
2517 	__compiler_membar();
2518 	if (vf_ifp != NULL)
2519 		hn_ifnet_detevent(sc, vf_ifp);
2520 
2521 	if (device_is_attached(dev)) {
2522 		HN_LOCK(sc);
2523 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2524 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2525 				hn_stop(sc, true);
2526 			/*
2527 			 * NOTE:
2528 			 * hn_stop() only suspends the data path, so management
2529 			 * tasks have to be suspended manually here.
2530 			 */
2531 			hn_suspend_mgmt(sc);
2532 			hn_synth_detach(sc);
2533 		}
2534 		HN_UNLOCK(sc);
2535 		ether_ifdetach(ifp);
2536 	}
2537 
2538 	ifmedia_removeall(&sc->hn_media);
2539 	hn_destroy_rx_data(sc);
2540 	hn_destroy_tx_data(sc);
2541 
2542 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2543 		int i;
2544 
2545 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2546 			taskqueue_free(sc->hn_tx_taskqs[i]);
2547 		free(sc->hn_tx_taskqs, M_DEVBUF);
2548 	}
2549 	taskqueue_free(sc->hn_mgmt_taskq0);
2550 	if (sc->hn_vf_taskq != NULL)
2551 		taskqueue_free(sc->hn_vf_taskq);
2552 
2553 	if (sc->hn_xact != NULL) {
2554 		/*
2555 		 * Uninstall the orphan handler _before_ the xact is
2556 		 * destroyed.
2557 		 */
2558 		vmbus_chan_unset_orphan(sc->hn_prichan);
2559 		vmbus_xact_ctx_destroy(sc->hn_xact);
2560 	}
2561 
2562 	if_free(ifp);
2563 
2564 	HN_LOCK_DESTROY(sc);
2565 	rm_destroy(&sc->hn_vf_lock);
2566 	return (0);
2567 }
2568 
2569 static int
2570 hn_shutdown(device_t dev)
2571 {
2572 
2573 	return (0);
2574 }
2575 
2576 static void
2577 hn_link_status(struct hn_softc *sc)
2578 {
2579 	uint32_t link_status;
2580 	int error;
2581 
2582 	error = hn_rndis_get_linkstatus(sc, &link_status);
2583 	if (error) {
2584 		/* XXX what to do? */
2585 		return;
2586 	}
2587 
2588 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2589 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2590 	else
2591 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2592 	if_link_state_change(sc->hn_ifp,
2593 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2594 	    LINK_STATE_UP : LINK_STATE_DOWN);
2595 }
2596 
2597 static void
2598 hn_link_taskfunc(void *xsc, int pending __unused)
2599 {
2600 	struct hn_softc *sc = xsc;
2601 
2602 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2603 		return;
2604 	hn_link_status(sc);
2605 }
2606 
2607 static void
2608 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2609 {
2610 	struct hn_softc *sc = xsc;
2611 
2612 	/* Prevent any link status checks from running. */
2613 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2614 
2615 	/*
2616 	 * Fake up a [link down --> link up] state change; a 5 second
2617 	 * delay is used, which closely simulates the miibus reaction
2618 	 * to a link down event.
2619 	 */
2620 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2621 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2622 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2623 	    &sc->hn_netchg_status, 5 * hz);
2624 }
2625 
2626 static void
2627 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2628 {
2629 	struct hn_softc *sc = xsc;
2630 
2631 	/* Re-allow link status checks. */
2632 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2633 	hn_link_status(sc);
2634 }
2635 
2636 static void
2637 hn_update_link_status(struct hn_softc *sc)
2638 {
2639 
2640 	if (sc->hn_mgmt_taskq != NULL)
2641 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2642 }
2643 
2644 static void
2645 hn_change_network(struct hn_softc *sc)
2646 {
2647 
2648 	if (sc->hn_mgmt_taskq != NULL)
2649 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2650 }
2651 
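/*
 * Load the mbuf chain into the txdesc's DMA map, collapsing the chain
 * once if it has too many segments (EFBIG).  On success the map is
 * synced for writing and HN_TXD_FLAG_DMAMAP is set on the txdesc.
 */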
2652 static __inline int
2653 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2654     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2655 {
2656 	struct mbuf *m = *m_head;
2657 	int error;
2658 
2659 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2660 
2661 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2662 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2663 	if (error == EFBIG) {
2664 		struct mbuf *m_new;
2665 
2666 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2667 		if (m_new == NULL)
2668 			return ENOBUFS;
2669 		else
2670 			*m_head = m = m_new;
2671 		txr->hn_tx_collapsed++;
2672 
2673 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2674 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2675 	}
2676 	if (!error) {
2677 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2678 		    BUS_DMASYNC_PREWRITE);
2679 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2680 	}
2681 	return error;
2682 }
2683 
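/*
 * Drop a reference on the txdesc.  When the last reference is
 * released, free any txdescs aggregated onto it, release its chimney
 * buffer or unload its DMA map, free the mbuf and put the txdesc back
 * onto the free list.  Returns 1 if the txdesc was actually freed,
 * 0 otherwise.
 */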
2684 static __inline int
2685 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2686 {
2687 
2688 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2689 	    ("put an onlist txd %#x", txd->flags));
2690 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2691 	    ("put an onagg txd %#x", txd->flags));
2692 
2693 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2694 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2695 		return 0;
2696 
2697 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2698 		struct hn_txdesc *tmp_txd;
2699 
2700 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2701 			int freed;
2702 
2703 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2704 			    ("recursive aggregation on aggregated txdesc"));
2705 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2706 			    ("not aggregated txdesc"));
2707 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708 			    ("aggregated txdesc uses dmamap"));
2709 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2710 			    ("aggregated txdesc consumes "
2711 			     "chimney sending buffer"));
2712 			KASSERT(tmp_txd->chim_size == 0,
2713 			    ("aggregated txdesc has non-zero "
2714 			     "chimney sending size"));
2715 
2716 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2717 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2718 			freed = hn_txdesc_put(txr, tmp_txd);
2719 			KASSERT(freed, ("failed to free aggregated txdesc"));
2720 		}
2721 	}
2722 
2723 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2724 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2725 		    ("chim txd uses dmamap"));
2726 		hn_chim_free(txr->hn_sc, txd->chim_index);
2727 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2728 		txd->chim_size = 0;
2729 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2730 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2731 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2732 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2733 		    txd->data_dmap);
2734 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2735 	}
2736 
2737 	if (txd->m != NULL) {
2738 		m_freem(txd->m);
2739 		txd->m = NULL;
2740 	}
2741 
2742 	txd->flags |= HN_TXD_FLAG_ONLIST;
2743 #ifndef HN_USE_TXDESC_BUFRING
2744 	mtx_lock_spin(&txr->hn_txlist_spin);
2745 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2746 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2747 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2748 	txr->hn_txdesc_avail++;
2749 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2750 	mtx_unlock_spin(&txr->hn_txlist_spin);
2751 #else	/* HN_USE_TXDESC_BUFRING */
2752 #ifdef HN_DEBUG
2753 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2754 #endif
2755 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2756 #endif	/* !HN_USE_TXDESC_BUFRING */
2757 
2758 	return 1;
2759 }
2760 
2761 static __inline struct hn_txdesc *
2762 hn_txdesc_get(struct hn_tx_ring *txr)
2763 {
2764 	struct hn_txdesc *txd;
2765 
2766 #ifndef HN_USE_TXDESC_BUFRING
2767 	mtx_lock_spin(&txr->hn_txlist_spin);
2768 	txd = SLIST_FIRST(&txr->hn_txlist);
2769 	if (txd != NULL) {
2770 		KASSERT(txr->hn_txdesc_avail > 0,
2771 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2772 		txr->hn_txdesc_avail--;
2773 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2774 	}
2775 	mtx_unlock_spin(&txr->hn_txlist_spin);
2776 #else
2777 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2778 #endif
2779 
2780 	if (txd != NULL) {
2781 #ifdef HN_USE_TXDESC_BUFRING
2782 #ifdef HN_DEBUG
2783 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2784 #endif
2785 #endif	/* HN_USE_TXDESC_BUFRING */
2786 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2787 		    STAILQ_EMPTY(&txd->agg_list) &&
2788 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2789 		    txd->chim_size == 0 &&
2790 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2791 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2792 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2793 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2794 		txd->refs = 1;
2795 	}
2796 	return txd;
2797 }
2798 
2799 static __inline void
2800 hn_txdesc_hold(struct hn_txdesc *txd)
2801 {
2802 
2803 	/* 0->1 transition will never work */
2804 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2805 	atomic_add_int(&txd->refs, 1);
2806 }
2807 
2808 static __inline void
2809 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2810 {
2811 
2812 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2813 	    ("recursive aggregation on aggregating txdesc"));
2814 
2815 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2816 	    ("already aggregated"));
2817 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2818 	    ("recursive aggregation on to-be-aggregated txdesc"));
2819 
2820 	txd->flags |= HN_TXD_FLAG_ONAGG;
2821 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2822 }
2823 
2824 static bool
2825 hn_tx_ring_pending(struct hn_tx_ring *txr)
2826 {
2827 	bool pending = false;
2828 
2829 #ifndef HN_USE_TXDESC_BUFRING
2830 	mtx_lock_spin(&txr->hn_txlist_spin);
2831 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2832 		pending = true;
2833 	mtx_unlock_spin(&txr->hn_txlist_spin);
2834 #else
2835 	if (!buf_ring_full(txr->hn_txdesc_br))
2836 		pending = true;
2837 #endif
2838 	return (pending);
2839 }
2840 
2841 static __inline void
2842 hn_txeof(struct hn_tx_ring *txr)
2843 {
2844 	txr->hn_has_txeof = 0;
2845 	txr->hn_txeof(txr);
2846 }
2847 
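/*
 * NVS send-completion callback: release the txdesc and, once enough
 * completions have accumulated on an oactive TX ring, kick hn_txeof().
 */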
2848 static void
2849 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2850     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2851 {
2852 	struct hn_txdesc *txd = sndc->hn_cbarg;
2853 	struct hn_tx_ring *txr;
2854 
2855 	txr = txd->txr;
2856 	KASSERT(txr->hn_chan == chan,
2857 	    ("channel mismatch, on chan%u, should be chan%u",
2858 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2859 
2860 	txr->hn_has_txeof = 1;
2861 	hn_txdesc_put(txr, txd);
2862 
2863 	++txr->hn_txdone_cnt;
2864 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2865 		txr->hn_txdone_cnt = 0;
2866 		if (txr->hn_oactive)
2867 			hn_txeof(txr);
2868 	}
2869 }
2870 
2871 static void
2872 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2873 {
2874 #if defined(INET) || defined(INET6)
2875 	tcp_lro_flush_all(&rxr->hn_lro);
2876 #endif
2877 
2878 	/*
2879 	 * NOTE:
2880 	 * 'txr' could be NULL, if multiple channels are used
2881 	 * together with the ifnet.if_start method.
2882 	 */
2883 	if (txr == NULL || !txr->hn_has_txeof)
2884 		return;
2885 
2886 	txr->hn_txdone_cnt = 0;
2887 	hn_txeof(txr);
2888 }
2889 
2890 static __inline uint32_t
2891 hn_rndis_pktmsg_offset(uint32_t ofs)
2892 {
2893 
2894 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2895 	    ("invalid RNDIS packet msg offset %u", ofs));
2896 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2897 }
2898 
2899 static __inline void *
2900 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2901     size_t pi_dlen, uint32_t pi_type)
2902 {
2903 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2904 	struct rndis_pktinfo *pi;
2905 
2906 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2907 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2908 
2909 	/*
2910 	 * Per-packet-info does not move; it only grows.
2911 	 *
2912 	 * NOTE:
2913 	 * rm_pktinfooffset in this phase counts from the beginning
2914 	 * of rndis_packet_msg.
2915 	 */
2916 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2917 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2918 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2919 	    pkt->rm_pktinfolen);
2920 	pkt->rm_pktinfolen += pi_size;
2921 
2922 	pi->rm_size = pi_size;
2923 	pi->rm_type = pi_type;
2924 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2925 
2926 	return (pi->rm_data);
2927 }
2928 
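/*
 * Send the pending aggregated txdesc and reset the TX ring's
 * aggregation state.  If the send fails, the aggregated mbuf is freed
 * here and oerrors is bumped by the number of aggregated packets.
 */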
2929 static __inline int
2930 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2931 {
2932 	struct hn_txdesc *txd;
2933 	struct mbuf *m;
2934 	int error, pkts;
2935 
2936 	txd = txr->hn_agg_txd;
2937 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2938 
2939 	/*
2940 	 * Since hn_txpkt() will reset this temporary stat, save
2941 	 * it now, so that oerrors can be updated properly if
2942 	 * hn_txpkt() ever fails.
2943 	 */
2944 	pkts = txr->hn_stat_pkts;
2945 
2946 	/*
2947 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2948 	 * failure, save it for later freeing, if hn_txpkt() ever
2949 	 * fails.
2950 	 */
2951 	m = txd->m;
2952 	error = hn_txpkt(ifp, txr, txd);
2953 	if (__predict_false(error)) {
2954 		/* txd is freed, but m is not. */
2955 		m_freem(m);
2956 
2957 		txr->hn_flush_failed++;
2958 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2959 	}
2960 
2961 	/* Reset all aggregation states. */
2962 	txr->hn_agg_txd = NULL;
2963 	txr->hn_agg_szleft = 0;
2964 	txr->hn_agg_pktleft = 0;
2965 	txr->hn_agg_prevpkt = NULL;
2966 
2967 	return (error);
2968 }
2969 
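/*
 * Try to use chimney sending for this packet.  If an aggregating
 * txdesc is pending, append to its chimney buffer when the packet
 * still fits; otherwise flush it and try to allocate a new chimney
 * buffer, possibly starting a new aggregation.  Returns a pointer
 * into the chimney buffer, or NULL if chimney sending is unavailable.
 */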
2970 static void *
2971 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2972     int pktsize)
2973 {
2974 	void *chim;
2975 
2976 	if (txr->hn_agg_txd != NULL) {
2977 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2978 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2979 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2980 			int olen;
2981 
2982 			/*
2983 			 * Update the previous RNDIS packet's total length;
2984 			 * it may be increased due to the mandatory alignment
2985 			 * padding for this RNDIS packet.  Also update the
2986 			 * aggregating txdesc's chimney sending buffer size
2987 			 * accordingly.
2988 			 *
2989 			 * XXX
2990 			 * Zero-out the padding, as required by the RNDIS spec.
2991 			 */
2992 			olen = pkt->rm_len;
2993 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2994 			agg_txd->chim_size += pkt->rm_len - olen;
2995 
2996 			/* Link this txdesc to the parent. */
2997 			hn_txdesc_agg(agg_txd, txd);
2998 
2999 			chim = (uint8_t *)pkt + pkt->rm_len;
3000 			/* Save the current packet for later fixup. */
3001 			txr->hn_agg_prevpkt = chim;
3002 
3003 			txr->hn_agg_pktleft--;
3004 			txr->hn_agg_szleft -= pktsize;
3005 			if (txr->hn_agg_szleft <=
3006 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3007 				/*
3008 				 * Probably can't aggregate more packets;
3009 				 * flush this aggregating txdesc proactively.
3010 				 */
3011 				txr->hn_agg_pktleft = 0;
3012 			}
3013 			/* Done! */
3014 			return (chim);
3015 		}
3016 		hn_flush_txagg(ifp, txr);
3017 	}
3018 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3019 
3020 	txr->hn_tx_chimney_tried++;
3021 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3022 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3023 		return (NULL);
3024 	txr->hn_tx_chimney++;
3025 
3026 	chim = txr->hn_sc->hn_chim +
3027 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3028 
3029 	if (txr->hn_agg_pktmax > 1 &&
3030 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3031 		txr->hn_agg_txd = txd;
3032 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3033 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3034 		txr->hn_agg_prevpkt = chim;
3035 	}
3036 	return (chim);
3037 }
3038 
3039 /*
3040  * NOTE:
3041  * If this function fails, then both txd and m_head0 will be freed.
3042  */
3043 static int
3044 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3045     struct mbuf **m_head0)
3046 {
3047 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3048 	int error, nsegs, i;
3049 	struct mbuf *m_head = *m_head0;
3050 	struct rndis_packet_msg *pkt;
3051 	uint32_t *pi_data;
3052 	void *chim = NULL;
3053 	int pkt_hlen, pkt_size;
3054 
3055 	pkt = txd->rndis_pkt;
3056 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3057 	if (pkt_size < txr->hn_chim_size) {
3058 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3059 		if (chim != NULL)
3060 			pkt = chim;
3061 	} else {
3062 		if (txr->hn_agg_txd != NULL)
3063 			hn_flush_txagg(ifp, txr);
3064 	}
3065 
3066 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3067 	pkt->rm_len = m_head->m_pkthdr.len;
3068 	pkt->rm_dataoffset = 0;
3069 	pkt->rm_datalen = m_head->m_pkthdr.len;
3070 	pkt->rm_oobdataoffset = 0;
3071 	pkt->rm_oobdatalen = 0;
3072 	pkt->rm_oobdataelements = 0;
3073 	pkt->rm_pktinfooffset = sizeof(*pkt);
3074 	pkt->rm_pktinfolen = 0;
3075 	pkt->rm_vchandle = 0;
3076 	pkt->rm_reserved = 0;
3077 
3078 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3079 		/*
3080 		 * Set the hash value for this packet.
3081 		 */
3082 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3083 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3084 
3085 		if (M_HASHTYPE_ISHASH(m_head))
3086 			/*
3087 			 * The flowid field contains the hash value the host
3088 			 * set in the RX queue, if this is an IP forwarding
3089 			 * packet.  Set the same hash value so the host can
3090 			 * send it on the CPU where it was received.
3091 			 */
3092 			*pi_data = m_head->m_pkthdr.flowid;
3093 		else
3094 			/*
3095 			 * Otherwise just put the tx queue index.
3096 			 */
3097 			*pi_data = txr->hn_tx_idx;
3098 	}
3099 
3100 	if (m_head->m_flags & M_VLANTAG) {
3101 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3102 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3103 		*pi_data = NDIS_VLAN_INFO_MAKE(
3104 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3105 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3106 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3107 	}
3108 
3109 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3110 #if defined(INET6) || defined(INET)
3111 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3112 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3113 #ifdef INET
3114 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3115 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3116 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3117 			    m_head->m_pkthdr.tso_segsz);
3118 		}
3119 #endif
3120 #if defined(INET6) && defined(INET)
3121 		else
3122 #endif
3123 #ifdef INET6
3124 		{
3125 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3126 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3127 			    m_head->m_pkthdr.tso_segsz);
3128 		}
3129 #endif
3130 #endif	/* INET6 || INET */
3131 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3132 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3133 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3134 		if (m_head->m_pkthdr.csum_flags &
3135 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3136 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3137 		} else {
3138 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3139 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3140 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3141 		}
3142 
3143 		if (m_head->m_pkthdr.csum_flags &
3144 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3145 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3146 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3147 		} else if (m_head->m_pkthdr.csum_flags &
3148 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3149 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3150 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3151 		}
3152 	}
3153 
3154 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3155 	/* Fixup RNDIS packet message total length */
3156 	pkt->rm_len += pkt_hlen;
3157 	/* Convert RNDIS packet message offsets */
3158 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3159 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3160 
3161 	/*
3162 	 * Fast path: Chimney sending.
3163 	 */
3164 	if (chim != NULL) {
3165 		struct hn_txdesc *tgt_txd = txd;
3166 
3167 		if (txr->hn_agg_txd != NULL) {
3168 			tgt_txd = txr->hn_agg_txd;
3169 #ifdef INVARIANTS
3170 			*m_head0 = NULL;
3171 #endif
3172 		}
3173 
3174 		KASSERT(pkt == chim,
3175 		    ("RNDIS pkt not in chimney sending buffer"));
3176 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3177 		    ("chimney sending buffer is not used"));
3178 		tgt_txd->chim_size += pkt->rm_len;
3179 
3180 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3181 		    ((uint8_t *)chim) + pkt_hlen);
3182 
3183 		txr->hn_gpa_cnt = 0;
3184 		txr->hn_sendpkt = hn_txpkt_chim;
3185 		goto done;
3186 	}
3187 
3188 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3189 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3190 	    ("chimney buffer is used"));
3191 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3192 
3193 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3194 	if (__predict_false(error)) {
3195 		int freed;
3196 
3197 		/*
3198 		 * This mbuf is not linked w/ the txd yet, so free it now.
3199 		 */
3200 		m_freem(m_head);
3201 		*m_head0 = NULL;
3202 
3203 		freed = hn_txdesc_put(txr, txd);
3204 		KASSERT(freed != 0,
3205 		    ("fail to free txd upon txdma error"));
3206 
3207 		txr->hn_txdma_failed++;
3208 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3209 		return error;
3210 	}
3211 	*m_head0 = m_head;
3212 
3213 	/* +1 RNDIS packet message */
3214 	txr->hn_gpa_cnt = nsegs + 1;
3215 
3216 	/* send packet with page buffer */
3217 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3218 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3219 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3220 
3221 	/*
3222 	 * Fill the page buffers with mbuf info after the page
3223 	 * buffer for RNDIS packet message.
3224 	 */
3225 	for (i = 0; i < nsegs; ++i) {
3226 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3227 
3228 		gpa->gpa_page = atop(segs[i].ds_addr);
3229 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3230 		gpa->gpa_len = segs[i].ds_len;
3231 	}
3232 
3233 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3234 	txd->chim_size = 0;
3235 	txr->hn_sendpkt = hn_txpkt_sglist;
3236 done:
3237 	txd->m = m_head;
3238 
3239 	/* Set the completion routine */
3240 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3241 
3242 	/* Update temporary stats for later use. */
3243 	txr->hn_stat_pkts++;
3244 	txr->hn_stat_size += m_head->m_pkthdr.len;
3245 	if (m_head->m_flags & M_MCAST)
3246 		txr->hn_stat_mcasts++;
3247 
3248 	return 0;
3249 }
3250 
3251 /*
3252  * NOTE:
3253  * If this function fails, then txd will be freed, but the mbuf
3254  * associated w/ the txd will _not_ be freed.
3255  */
3256 static int
3257 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3258 {
3259 	int error, send_failed = 0, has_bpf;
3260 
3261 again:
3262 	has_bpf = bpf_peers_present(ifp->if_bpf);
3263 	if (has_bpf) {
3264 		/*
3265 		 * Make sure that this txd and any aggregated txds are not
3266 		 * freed before ETHER_BPF_MTAP.
3267 		 */
3268 		hn_txdesc_hold(txd);
3269 	}
3270 	error = txr->hn_sendpkt(txr, txd);
3271 	if (!error) {
3272 		if (has_bpf) {
3273 			const struct hn_txdesc *tmp_txd;
3274 
3275 			ETHER_BPF_MTAP(ifp, txd->m);
3276 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3277 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3278 		}
3279 
3280 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3281 #ifdef HN_IFSTART_SUPPORT
3282 		if (!hn_use_if_start)
3283 #endif
3284 		{
3285 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3286 			    txr->hn_stat_size);
3287 			if (txr->hn_stat_mcasts != 0) {
3288 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3289 				    txr->hn_stat_mcasts);
3290 			}
3291 		}
3292 		txr->hn_pkts += txr->hn_stat_pkts;
3293 		txr->hn_sends++;
3294 	}
3295 	if (has_bpf)
3296 		hn_txdesc_put(txr, txd);
3297 
3298 	if (__predict_false(error)) {
3299 		int freed;
3300 
3301 		/*
3302 		 * This should happen only very rarely.
3303 		 *
3304 		 * XXX Too many RX to be acked or too many sideband
3305 		 * commands to run?  Ask netvsc_channel_rollup()
3306 		 * to kick start later.
3307 		 */
3308 		txr->hn_has_txeof = 1;
3309 		if (!send_failed) {
3310 			txr->hn_send_failed++;
3311 			send_failed = 1;
3312 			/*
3313 			 * Try sending again after setting hn_has_txeof,
3314 			 * in case we missed the last
3315 			 * netvsc_channel_rollup().
3316 			 */
3317 			goto again;
3318 		}
3319 		if_printf(ifp, "send failed\n");
3320 
3321 		/*
3322 		 * Caller will perform further processing on the
3323 		 * associated mbuf, so don't free it in hn_txdesc_put();
3324 		 * only unload it from the DMA map in hn_txdesc_put(),
3325 		 * if it was loaded.
3326 		 */
3327 		txd->m = NULL;
3328 		freed = hn_txdesc_put(txr, txd);
3329 		KASSERT(freed != 0,
3330 		    ("fail to free txd upon send error"));
3331 
3332 		txr->hn_send_failed++;
3333 	}
3334 
3335 	/* Reset temporary stats, after this sending is done. */
3336 	txr->hn_stat_size = 0;
3337 	txr->hn_stat_pkts = 0;
3338 	txr->hn_stat_mcasts = 0;
3339 
3340 	return (error);
3341 }
3342 
3343 /*
3344  * Append the specified data to the indicated mbuf chain.
3345  * Extend the mbuf chain if the new data does not fit in
3346  * the existing space.
3347  *
3348  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3349  * There should be an equivalent in the kernel mbuf code,
3350  * but there does not appear to be one yet.
3351  *
3352  * Differs from m_append() in that additional mbufs are
3353  * allocated with cluster size MJUMPAGESIZE, and filled
3354  * accordingly.
3355  *
3356  * Return 1 if able to complete the job; otherwise 0.
3357  */
3358 static int
3359 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3360 {
3361 	struct mbuf *m, *n;
3362 	int remainder, space;
3363 
3364 	for (m = m0; m->m_next != NULL; m = m->m_next)
3365 		;
3366 	remainder = len;
3367 	space = M_TRAILINGSPACE(m);
3368 	if (space > 0) {
3369 		/*
3370 		 * Copy into available space.
3371 		 */
3372 		if (space > remainder)
3373 			space = remainder;
3374 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3375 		m->m_len += space;
3376 		cp += space;
3377 		remainder -= space;
3378 	}
3379 	while (remainder > 0) {
3380 		/*
3381 		 * Allocate a new mbuf; could check space
3382 		 * and allocate a cluster instead.
3383 		 */
3384 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3385 		if (n == NULL)
3386 			break;
3387 		n->m_len = min(MJUMPAGESIZE, remainder);
3388 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3389 		cp += n->m_len;
3390 		remainder -= n->m_len;
3391 		m->m_next = n;
3392 		m = n;
3393 	}
3394 	if (m0->m_flags & M_PKTHDR)
3395 		m0->m_pkthdr.len += len - remainder;
3396 
3397 	return (remainder == 0);
3398 }
3399 
3400 #if defined(INET) || defined(INET6)
3401 static __inline int
3402 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3403 {
3404 #if __FreeBSD_version >= 1100095
3405 	if (hn_lro_mbufq_depth) {
3406 		tcp_lro_queue_mbuf(lc, m);
3407 		return 0;
3408 	}
3409 #endif
3410 	return tcp_lro_rx(lc, m, 0);
3411 }
3412 #endif
3413 
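/*
 * Receive one packet: copy the RNDIS data into an mbuf, apply RX
 * checksum offload (or the host-checksum trust settings), VLAN tag
 * and RSS hash information, and decide whether the packet may go
 * through LRO; LRO is bypassed whenever a VF is active.
 */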
3414 static int
3415 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3416     const struct hn_rxinfo *info)
3417 {
3418 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3419 	struct mbuf *m_new;
3420 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3421 	int hash_type = M_HASHTYPE_NONE;
3422 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3423 
3424 	ifp = hn_ifp;
3425 	if (rxr->hn_rxvf_ifp != NULL) {
3426 		/*
3427 		 * Non-transparent mode VF; pretend this packet is from
3428 		 * the VF.
3429 		 */
3430 		ifp = rxr->hn_rxvf_ifp;
3431 		is_vf = 1;
3432 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3433 		/* Transparent mode VF. */
3434 		is_vf = 1;
3435 	}
3436 
3437 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3438 		/*
3439 		 * NOTE:
3440 		 * See the NOTE of hn_rndis_init_fixat().  This
3441 		 * function can be reached immediately after the
3442 		 * RNDIS is initialized but before the ifnet is
3443 		 * set up on the hn_attach() path; drop the unexpected
3444 		 * packets.
3445 		 */
3446 		return (0);
3447 	}
3448 
3449 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3450 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3451 		return (0);
3452 	}
3453 
3454 	if (dlen <= MHLEN) {
3455 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3456 		if (m_new == NULL) {
3457 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3458 			return (0);
3459 		}
3460 		memcpy(mtod(m_new, void *), data, dlen);
3461 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3462 		rxr->hn_small_pkts++;
3463 	} else {
3464 		/*
3465 		 * Get an mbuf with a cluster.  For packets 2K or less,
3466 		 * get a standard 2K cluster.  For anything larger, get a
3467 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3468 		 * if looped around to the Hyper-V TX channel, so avoid them.
3469 		 */
3470 		size = MCLBYTES;
3471 		if (dlen > MCLBYTES) {
3472 			/* 4096 */
3473 			size = MJUMPAGESIZE;
3474 		}
3475 
3476 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3477 		if (m_new == NULL) {
3478 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3479 			return (0);
3480 		}
3481 
3482 		hv_m_append(m_new, dlen, data);
3483 	}
3484 	m_new->m_pkthdr.rcvif = ifp;
3485 
3486 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3487 		do_csum = 0;
3488 
3489 	/* receive side checksum offload */
3490 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3491 		/* IP csum offload */
3492 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3493 			m_new->m_pkthdr.csum_flags |=
3494 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3495 			rxr->hn_csum_ip++;
3496 		}
3497 
3498 		/* TCP/UDP csum offload */
3499 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3500 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3501 			m_new->m_pkthdr.csum_flags |=
3502 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3503 			m_new->m_pkthdr.csum_data = 0xffff;
3504 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3505 				rxr->hn_csum_tcp++;
3506 			else
3507 				rxr->hn_csum_udp++;
3508 		}
3509 
3510 		/*
3511 		 * XXX
3512 		 * As of this writing (Oct 28th, 2016), the host will turn
3513 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3514 		 * the do_lro setting here is actually _not_ accurate.  We
3515 		 * depend on the RSS hash type check to reset do_lro.
3516 		 */
3517 		if ((info->csum_info &
3518 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3519 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3520 			do_lro = 1;
3521 	} else {
3522 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3523 		if (l3proto == ETHERTYPE_IP) {
3524 			if (l4proto == IPPROTO_TCP) {
3525 				if (do_csum &&
3526 				    (rxr->hn_trust_hcsum &
3527 				     HN_TRUST_HCSUM_TCP)) {
3528 					rxr->hn_csum_trusted++;
3529 					m_new->m_pkthdr.csum_flags |=
3530 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3531 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3532 					m_new->m_pkthdr.csum_data = 0xffff;
3533 				}
3534 				do_lro = 1;
3535 			} else if (l4proto == IPPROTO_UDP) {
3536 				if (do_csum &&
3537 				    (rxr->hn_trust_hcsum &
3538 				     HN_TRUST_HCSUM_UDP)) {
3539 					rxr->hn_csum_trusted++;
3540 					m_new->m_pkthdr.csum_flags |=
3541 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3542 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3543 					m_new->m_pkthdr.csum_data = 0xffff;
3544 				}
3545 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3546 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3547 				rxr->hn_csum_trusted++;
3548 				m_new->m_pkthdr.csum_flags |=
3549 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3550 			}
3551 		}
3552 	}
3553 
3554 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3555 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3556 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3557 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3558 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3559 		m_new->m_flags |= M_VLANTAG;
3560 	}
3561 
3562 	/*
3563 	 * If VF is activated (transparent/non-transparent mode does not
3564 	 * matter here).
3565 	 *
3566 	 * - Disable LRO
3567 	 *
3568 	 *   hn(4) will only receive broadcast packets, multicast packets,
3569 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3570 	 *   packet types.
3571 	 *
3572 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3573 	 *   all, since the LRO flush will use hn(4) as the receiving
3574 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3575 	 */
3576 	if (is_vf)
3577 		do_lro = 0;
3578 
3579 	/*
3580 	 * If VF is activated (transparent/non-transparent mode does not
3581 	 * matter here), do _not_ mess with unsupported hash types or
3582 	 * functions.
3583 	 */
3584 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3585 		rxr->hn_rss_pkts++;
3586 		m_new->m_pkthdr.flowid = info->hash_value;
3587 		if (!is_vf)
3588 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3589 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3590 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3591 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3592 			    rxr->hn_mbuf_hash);
3593 
3594 			/*
3595 			 * NOTE:
3596 			 * do_lro is reset if the hash types are not TCP
3597 			 * related.  See the comment in the above csum_flags
3598 			 * setup section.
3599 			 */
3600 			switch (type) {
3601 			case NDIS_HASH_IPV4:
3602 				hash_type = M_HASHTYPE_RSS_IPV4;
3603 				do_lro = 0;
3604 				break;
3605 
3606 			case NDIS_HASH_TCP_IPV4:
3607 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3608 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3609 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3610 
3611 					if (is_vf)
3612 						def_htype = M_HASHTYPE_NONE;
3613 
3614 					/*
3615 					 * UDP 4-tuple hash is delivered as
3616 					 * TCP 4-tuple hash.
3617 					 */
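					/*
					 * If the csum_info path above did
					 * not parse the headers, l3proto is
					 * still ETHERTYPE_MAX; parse them
					 * now.
					 */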
3618 					if (l3proto == ETHERTYPE_MAX) {
3619 						hn_rxpkt_proto(m_new,
3620 						    &l3proto, &l4proto);
3621 					}
3622 					if (l3proto == ETHERTYPE_IP) {
3623 						if (l4proto == IPPROTO_UDP &&
3624 						    (rxr->hn_mbuf_hash &
3625 						     NDIS_HASH_UDP_IPV4_X)) {
3626 							hash_type =
3627 							M_HASHTYPE_RSS_UDP_IPV4;
3628 							do_lro = 0;
3629 						} else if (l4proto !=
3630 						    IPPROTO_TCP) {
3631 							hash_type = def_htype;
3632 							do_lro = 0;
3633 						}
3634 					} else {
3635 						hash_type = def_htype;
3636 						do_lro = 0;
3637 					}
3638 				}
3639 				break;
3640 
3641 			case NDIS_HASH_IPV6:
3642 				hash_type = M_HASHTYPE_RSS_IPV6;
3643 				do_lro = 0;
3644 				break;
3645 
3646 			case NDIS_HASH_IPV6_EX:
3647 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3648 				do_lro = 0;
3649 				break;
3650 
3651 			case NDIS_HASH_TCP_IPV6:
3652 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3653 				break;
3654 
3655 			case NDIS_HASH_TCP_IPV6_EX:
3656 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3657 				break;
3658 			}
3659 		}
3660 	} else if (!is_vf) {
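		/*
		 * No hash value is provided by the host; use the RX ring
		 * index as an opaque flowid.
		 */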
3661 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3662 		hash_type = M_HASHTYPE_OPAQUE;
3663 	}
3664 	M_HASHTYPE_SET(m_new, hash_type);
3665 
3666 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3667 	if (hn_ifp != ifp) {
3668 		const struct ether_header *eh;
3669 
3670 		/*
3671 		 * Non-transparent mode VF is activated.
3672 		 */
3673 
3674 		/*
3675 		 * Allow tapping on hn(4).
3676 		 */
3677 		ETHER_BPF_MTAP(hn_ifp, m_new);
3678 
3679 		/*
3680 		 * Update hn(4)'s stats.
3681 		 */
3682 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3683 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3684 		/* Checked at the beginning of this function. */
3685 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3686 		eh = mtod(m_new, struct ether_header *);
3687 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3688 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3689 	}
3690 	rxr->hn_pkts++;
3691 
3692 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3693 #if defined(INET) || defined(INET6)
3694 		struct lro_ctrl *lro = &rxr->hn_lro;
3695 
3696 		if (lro->lro_cnt) {
3697 			rxr->hn_lro_tried++;
3698 			if (hn_lro_rx(lro, m_new) == 0) {
3699 				/* DONE! */
3700 				return 0;
3701 			}
3702 		}
3703 #endif
3704 	}
3705 	ifp->if_input(ifp, m_new);
3706 
3707 	return (0);
3708 }
3709 
3710 static int
3711 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3712 {
3713 	struct hn_softc *sc = ifp->if_softc;
3714 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3715 	struct ifnet *vf_ifp;
3716 	int mask, error = 0;
3717 	struct ifrsskey *ifrk;
3718 	struct ifrsshash *ifrh;
3719 	uint32_t mtu;
3720 
3721 	switch (cmd) {
3722 	case SIOCSIFMTU:
3723 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3724 			error = EINVAL;
3725 			break;
3726 		}
3727 
3728 		HN_LOCK(sc);
3729 
3730 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3731 			HN_UNLOCK(sc);
3732 			break;
3733 		}
3734 
3735 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3736 			/* Can't change MTU */
3737 			HN_UNLOCK(sc);
3738 			error = EOPNOTSUPP;
3739 			break;
3740 		}
3741 
3742 		if (ifp->if_mtu == ifr->ifr_mtu) {
3743 			HN_UNLOCK(sc);
3744 			break;
3745 		}
3746 
3747 		if (hn_xpnt_vf_isready(sc)) {
3748 			vf_ifp = sc->hn_vf_ifp;
3749 			ifr_vf = *ifr;
3750 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3751 			    sizeof(ifr_vf.ifr_name));
3752 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3753 			    (caddr_t)&ifr_vf);
3754 			if (error) {
3755 				HN_UNLOCK(sc);
3756 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3757 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3758 				break;
3759 			}
3760 		}
3761 
3762 		/*
3763 		 * Suspend this interface before the synthetic parts
3764 		 * are ripped.
3765 		 */
3766 		hn_suspend(sc);
3767 
3768 		/*
3769 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3770 		 */
3771 		hn_synth_detach(sc);
3772 
3773 		/*
3774 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3775 		 * with the new MTU setting.
3776 		 */
3777 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3778 		if (error) {
3779 			HN_UNLOCK(sc);
3780 			break;
3781 		}
3782 
3783 		error = hn_rndis_get_mtu(sc, &mtu);
3784 		if (error)
3785 			mtu = ifr->ifr_mtu;
3786 		else if (bootverbose)
3787 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3788 
3789 		/*
3790 		 * Commit the requested MTU, after the synthetic parts
3791 		 * have been successfully attached.
3792 		 */
3793 		if (mtu >= ifr->ifr_mtu) {
3794 			mtu = ifr->ifr_mtu;
3795 		} else {
3796 			if_printf(ifp, "fixup mtu %d -> %u\n",
3797 			    ifr->ifr_mtu, mtu);
3798 		}
3799 		ifp->if_mtu = mtu;
3800 
3801 		/*
3802 		 * Synthetic parts' reattach may change the chimney
3803 		 * sending size; update it.
3804 		 */
3805 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3806 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3807 
3808 		/*
3809 		 * Make sure that various parameters based on MTU are
3810 		 * still valid, after the MTU change.
3811 		 */
3812 		hn_mtu_change_fixup(sc);
3813 
3814 		/*
3815 		 * All done!  Resume the interface now.
3816 		 */
3817 		hn_resume(sc);
3818 
3819 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3820 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3821 			/*
3822 			 * Since we have reattached the NVS part,
3823 			 * change the datapath to VF again, in case
3824 			 * it was lost when the NVS was detached.
3825 			 */
3826 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3827 		}
3828 
3829 		HN_UNLOCK(sc);
3830 		break;
3831 
3832 	case SIOCSIFFLAGS:
3833 		HN_LOCK(sc);
3834 
3835 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3836 			HN_UNLOCK(sc);
3837 			break;
3838 		}
3839 
3840 		if (hn_xpnt_vf_isready(sc))
3841 			hn_xpnt_vf_saveifflags(sc);
3842 
3843 		if (ifp->if_flags & IFF_UP) {
3844 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3845 				/*
3846 				 * Caller might hold a mutex, e.g.
3847 				 * bpf; use busy-wait for the RNDIS
3848 				 * reply.
3849 				 */
3850 				HN_NO_SLEEPING(sc);
3851 				hn_rxfilter_config(sc);
3852 				HN_SLEEPING_OK(sc);
3853 
3854 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3855 					error = hn_xpnt_vf_iocsetflags(sc);
3856 			} else {
3857 				hn_init_locked(sc);
3858 			}
3859 		} else {
3860 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3861 				hn_stop(sc, false);
3862 		}
3863 		sc->hn_if_flags = ifp->if_flags;
3864 
3865 		HN_UNLOCK(sc);
3866 		break;
3867 
3868 	case SIOCSIFCAP:
3869 		HN_LOCK(sc);
3870 
3871 		if (hn_xpnt_vf_isready(sc)) {
3872 			ifr_vf = *ifr;
3873 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3874 			    sizeof(ifr_vf.ifr_name));
3875 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3876 			HN_UNLOCK(sc);
3877 			break;
3878 		}
3879 
3880 		/*
3881 		 * Fix up requested capabilities w/ supported capabilities,
3882 		 * since the supported capabilities could have been changed.
3883 		 */
3884 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3885 		    ifp->if_capenable;
3886 
3887 		if (mask & IFCAP_TXCSUM) {
3888 			ifp->if_capenable ^= IFCAP_TXCSUM;
3889 			if (ifp->if_capenable & IFCAP_TXCSUM)
3890 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3891 			else
3892 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3893 		}
3894 		if (mask & IFCAP_TXCSUM_IPV6) {
3895 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3896 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3897 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3898 			else
3899 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3900 		}
3901 
3902 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3903 		if (mask & IFCAP_RXCSUM)
3904 			ifp->if_capenable ^= IFCAP_RXCSUM;
3905 #ifdef foo
3906 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3907 		if (mask & IFCAP_RXCSUM_IPV6)
3908 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3909 #endif
3910 
3911 		if (mask & IFCAP_LRO)
3912 			ifp->if_capenable ^= IFCAP_LRO;
3913 
3914 		if (mask & IFCAP_TSO4) {
3915 			ifp->if_capenable ^= IFCAP_TSO4;
3916 			if (ifp->if_capenable & IFCAP_TSO4)
3917 				ifp->if_hwassist |= CSUM_IP_TSO;
3918 			else
3919 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3920 		}
3921 		if (mask & IFCAP_TSO6) {
3922 			ifp->if_capenable ^= IFCAP_TSO6;
3923 			if (ifp->if_capenable & IFCAP_TSO6)
3924 				ifp->if_hwassist |= CSUM_IP6_TSO;
3925 			else
3926 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3927 		}
3928 
3929 		HN_UNLOCK(sc);
3930 		break;
3931 
3932 	case SIOCADDMULTI:
3933 	case SIOCDELMULTI:
3934 		HN_LOCK(sc);
3935 
3936 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3937 			HN_UNLOCK(sc);
3938 			break;
3939 		}
3940 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3941 			/*
3942 			 * Multicast uses a mutex; use busy-wait for
3943 			 * the RNDIS reply.
3944 			 */
3945 			HN_NO_SLEEPING(sc);
3946 			hn_rxfilter_config(sc);
3947 			HN_SLEEPING_OK(sc);
3948 		}
3949 
3950 		/* XXX vlan(4) style mcast addr maintenance */
3951 		if (hn_xpnt_vf_isready(sc)) {
3952 			int old_if_flags;
3953 
3954 			old_if_flags = sc->hn_vf_ifp->if_flags;
3955 			hn_xpnt_vf_saveifflags(sc);
3956 
3957 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3958 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3959 			     IFF_ALLMULTI))
3960 				error = hn_xpnt_vf_iocsetflags(sc);
3961 		}
3962 
3963 		HN_UNLOCK(sc);
3964 		break;
3965 
3966 	case SIOCSIFMEDIA:
3967 	case SIOCGIFMEDIA:
3968 		HN_LOCK(sc);
3969 		if (hn_xpnt_vf_isready(sc)) {
3970 			/*
3971 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3972 			 * create and pass ifr_vf to the VF here; just
3973 			 * replace the ifr_name.
3974 			 */
3975 			vf_ifp = sc->hn_vf_ifp;
3976 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3977 			    sizeof(ifr->ifr_name));
3978 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3979 			/* Restore the ifr_name. */
3980 			strlcpy(ifr->ifr_name, ifp->if_xname,
3981 			    sizeof(ifr->ifr_name));
3982 			HN_UNLOCK(sc);
3983 			break;
3984 		}
3985 		HN_UNLOCK(sc);
3986 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3987 		break;
3988 
3989 	case SIOCGIFRSSHASH:
3990 		ifrh = (struct ifrsshash *)data;
3991 		HN_LOCK(sc);
3992 		if (sc->hn_rx_ring_inuse == 1) {
3993 			HN_UNLOCK(sc);
3994 			ifrh->ifrh_func = RSS_FUNC_NONE;
3995 			ifrh->ifrh_types = 0;
3996 			break;
3997 		}
3998 
3999 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4000 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4001 		else
4002 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4003 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4004 		HN_UNLOCK(sc);
4005 		break;
4006 
4007 	case SIOCGIFRSSKEY:
4008 		ifrk = (struct ifrsskey *)data;
4009 		HN_LOCK(sc);
4010 		if (sc->hn_rx_ring_inuse == 1) {
4011 			HN_UNLOCK(sc);
4012 			ifrk->ifrk_func = RSS_FUNC_NONE;
4013 			ifrk->ifrk_keylen = 0;
4014 			break;
4015 		}
4016 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4017 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4018 		else
4019 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4020 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4021 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4022 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4023 		HN_UNLOCK(sc);
4024 		break;
4025 
4026 	default:
4027 		error = ether_ioctl(ifp, cmd, data);
4028 		break;
4029 	}
4030 	return (error);
4031 }
4032 
4033 static void
4034 hn_stop(struct hn_softc *sc, bool detaching)
4035 {
4036 	struct ifnet *ifp = sc->hn_ifp;
4037 	int i;
4038 
4039 	HN_LOCK_ASSERT(sc);
4040 
4041 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4042 	    ("synthetic parts were not attached"));
4043 
4044 	/* Clear RUNNING bit ASAP. */
4045 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4046 
4047 	/* Disable polling. */
4048 	hn_polling(sc, 0);
4049 
4050 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4051 		KASSERT(sc->hn_vf_ifp != NULL,
4052 		    ("%s: VF is not attached", ifp->if_xname));
4053 
4054 		/* Mark transparent mode VF as disabled. */
4055 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4056 
4057 		/*
4058 		 * NOTE:
4059 		 * Datapath setting must happen _before_ bringing
4060 		 * the VF down.
4061 		 */
4062 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4063 
4064 		/*
4065 		 * Bring the VF down.
4066 		 */
4067 		hn_xpnt_vf_saveifflags(sc);
4068 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4069 		hn_xpnt_vf_iocsetflags(sc);
4070 	}
4071 
4072 	/* Suspend data transfers. */
4073 	hn_suspend_data(sc);
4074 
4075 	/* Clear OACTIVE bit. */
4076 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4077 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4078 		sc->hn_tx_ring[i].hn_oactive = 0;
4079 
4080 	/*
4081 	 * If the non-transparent mode VF is active, make sure
4082 	 * that the RX filter still allows packet reception.
4083 	 */
4084 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4085 		hn_rxfilter_config(sc);
4086 }
4087 
4088 static void
4089 hn_init_locked(struct hn_softc *sc)
4090 {
4091 	struct ifnet *ifp = sc->hn_ifp;
4092 	int i;
4093 
4094 	HN_LOCK_ASSERT(sc);
4095 
4096 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4097 		return;
4098 
4099 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4100 		return;
4101 
4102 	/* Configure RX filter */
4103 	hn_rxfilter_config(sc);
4104 
4105 	/* Clear OACTIVE bit. */
4106 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4107 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4108 		sc->hn_tx_ring[i].hn_oactive = 0;
4109 
4110 	/* Clear TX 'suspended' bit. */
4111 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4112 
4113 	if (hn_xpnt_vf_isready(sc)) {
4114 		/* Initialize transparent VF. */
4115 		hn_xpnt_vf_init(sc);
4116 	}
4117 
4118 	/* Everything is ready; unleash! */
4119 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4120 
4121 	/* Re-enable polling if requested. */
4122 	if (sc->hn_pollhz > 0)
4123 		hn_polling(sc, sc->hn_pollhz);
4124 }
4125 
4126 static void
4127 hn_init(void *xsc)
4128 {
4129 	struct hn_softc *sc = xsc;
4130 
4131 	HN_LOCK(sc);
4132 	hn_init_locked(sc);
4133 	HN_UNLOCK(sc);
4134 }
4135 
4136 #if __FreeBSD_version >= 1100099
4137 
4138 static int
4139 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4140 {
4141 	struct hn_softc *sc = arg1;
4142 	unsigned int lenlim;
4143 	int error;
4144 
4145 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4146 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4147 	if (error || req->newptr == NULL)
4148 		return error;
4149 
4150 	HN_LOCK(sc);
4151 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4152 	    lenlim > TCP_LRO_LENGTH_MAX) {
4153 		HN_UNLOCK(sc);
4154 		return EINVAL;
4155 	}
4156 	hn_set_lro_lenlim(sc, lenlim);
4157 	HN_UNLOCK(sc);
4158 
4159 	return 0;
4160 }
4161 
4162 static int
4163 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4164 {
4165 	struct hn_softc *sc = arg1;
4166 	int ackcnt, error, i;
4167 
4168 	/*
4169 	 * lro_ackcnt_lim is the append count limit;
4170 	 * +1 turns it into the aggregation limit.
4171 	 */
4172 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4173 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4174 	if (error || req->newptr == NULL)
4175 		return error;
4176 
4177 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4178 		return EINVAL;
4179 
4180 	/*
4181 	 * Convert aggregation limit back to append
4182 	 * count limit.
4183 	 */
4184 	--ackcnt;
4185 	HN_LOCK(sc);
4186 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4187 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4188 	HN_UNLOCK(sc);
4189 	return 0;
4190 }
4191 
4192 #endif
4193 
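/*
 * arg2 selects the HN_TRUST_HCSUM_* bit controlled by this sysctl;
 * toggling it updates the trust-host-checksum setting on all RX rings.
 */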
4194 static int
4195 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4196 {
4197 	struct hn_softc *sc = arg1;
4198 	int hcsum = arg2;
4199 	int on, error, i;
4200 
4201 	on = 0;
4202 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4203 		on = 1;
4204 
4205 	error = sysctl_handle_int(oidp, &on, 0, req);
4206 	if (error || req->newptr == NULL)
4207 		return error;
4208 
4209 	HN_LOCK(sc);
4210 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4211 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4212 
4213 		if (on)
4214 			rxr->hn_trust_hcsum |= hcsum;
4215 		else
4216 			rxr->hn_trust_hcsum &= ~hcsum;
4217 	}
4218 	HN_UNLOCK(sc);
4219 	return 0;
4220 }
4221 
4222 static int
4223 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4224 {
4225 	struct hn_softc *sc = arg1;
4226 	int chim_size, error;
4227 
4228 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4229 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4230 	if (error || req->newptr == NULL)
4231 		return error;
4232 
4233 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4234 		return EINVAL;
4235 
4236 	HN_LOCK(sc);
4237 	hn_set_chim_size(sc, chim_size);
4238 	HN_UNLOCK(sc);
4239 	return 0;
4240 }
4241 
4242 #if __FreeBSD_version < 1100095
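/*
 * The per-ring statistics sysctl handlers below take the byte offset of
 * the statistic field (arg2) within the ring structure: the reported
 * value is the sum over all rings, and writing any value through the
 * sysctl clears the statistic on every ring.
 */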
4243 static int
4244 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4245 {
4246 	struct hn_softc *sc = arg1;
4247 	int ofs = arg2, i, error;
4248 	struct hn_rx_ring *rxr;
4249 	uint64_t stat;
4250 
4251 	stat = 0;
4252 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4253 		rxr = &sc->hn_rx_ring[i];
4254 		stat += *((int *)((uint8_t *)rxr + ofs));
4255 	}
4256 
4257 	error = sysctl_handle_64(oidp, &stat, 0, req);
4258 	if (error || req->newptr == NULL)
4259 		return error;
4260 
4261 	/* Zero out this stat. */
4262 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4263 		rxr = &sc->hn_rx_ring[i];
4264 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4265 	}
4266 	return 0;
4267 }
4268 #else
4269 static int
4270 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4271 {
4272 	struct hn_softc *sc = arg1;
4273 	int ofs = arg2, i, error;
4274 	struct hn_rx_ring *rxr;
4275 	uint64_t stat;
4276 
4277 	stat = 0;
4278 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4279 		rxr = &sc->hn_rx_ring[i];
4280 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4281 	}
4282 
4283 	error = sysctl_handle_64(oidp, &stat, 0, req);
4284 	if (error || req->newptr == NULL)
4285 		return error;
4286 
4287 	/* Zero out this stat. */
4288 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4289 		rxr = &sc->hn_rx_ring[i];
4290 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4291 	}
4292 	return 0;
4293 }
4294 
4295 #endif
4296 
4297 static int
4298 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4299 {
4300 	struct hn_softc *sc = arg1;
4301 	int ofs = arg2, i, error;
4302 	struct hn_rx_ring *rxr;
4303 	u_long stat;
4304 
4305 	stat = 0;
4306 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4307 		rxr = &sc->hn_rx_ring[i];
4308 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4309 	}
4310 
4311 	error = sysctl_handle_long(oidp, &stat, 0, req);
4312 	if (error || req->newptr == NULL)
4313 		return error;
4314 
4315 	/* Zero out this stat. */
4316 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4317 		rxr = &sc->hn_rx_ring[i];
4318 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4319 	}
4320 	return 0;
4321 }
4322 
4323 static int
4324 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4325 {
4326 	struct hn_softc *sc = arg1;
4327 	int ofs = arg2, i, error;
4328 	struct hn_tx_ring *txr;
4329 	u_long stat;
4330 
4331 	stat = 0;
4332 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4333 		txr = &sc->hn_tx_ring[i];
4334 		stat += *((u_long *)((uint8_t *)txr + ofs));
4335 	}
4336 
4337 	error = sysctl_handle_long(oidp, &stat, 0, req);
4338 	if (error || req->newptr == NULL)
4339 		return error;
4340 
4341 	/* Zero out this stat. */
4342 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4343 		txr = &sc->hn_tx_ring[i];
4344 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4345 	}
4346 	return 0;
4347 }
4348 
4349 static int
4350 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4351 {
4352 	struct hn_softc *sc = arg1;
4353 	int ofs = arg2, i, error, conf;
4354 	struct hn_tx_ring *txr;
4355 
4356 	txr = &sc->hn_tx_ring[0];
4357 	conf = *((int *)((uint8_t *)txr + ofs));
4358 
4359 	error = sysctl_handle_int(oidp, &conf, 0, req);
4360 	if (error || req->newptr == NULL)
4361 		return error;
4362 
4363 	HN_LOCK(sc);
4364 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4365 		txr = &sc->hn_tx_ring[i];
4366 		*((int *)((uint8_t *)txr + ofs)) = conf;
4367 	}
4368 	HN_UNLOCK(sc);
4369 
4370 	return 0;
4371 }
4372 
4373 static int
4374 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4375 {
4376 	struct hn_softc *sc = arg1;
4377 	int error, size;
4378 
4379 	size = sc->hn_agg_size;
4380 	error = sysctl_handle_int(oidp, &size, 0, req);
4381 	if (error || req->newptr == NULL)
4382 		return (error);
4383 
4384 	HN_LOCK(sc);
4385 	sc->hn_agg_size = size;
4386 	hn_set_txagg(sc);
4387 	HN_UNLOCK(sc);
4388 
4389 	return (0);
4390 }
4391 
4392 static int
4393 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4394 {
4395 	struct hn_softc *sc = arg1;
4396 	int error, pkts;
4397 
4398 	pkts = sc->hn_agg_pkts;
4399 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4400 	if (error || req->newptr == NULL)
4401 		return (error);
4402 
4403 	HN_LOCK(sc);
4404 	sc->hn_agg_pkts = pkts;
4405 	hn_set_txagg(sc);
4406 	HN_UNLOCK(sc);
4407 
4408 	return (0);
4409 }
4410 
4411 static int
4412 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4413 {
4414 	struct hn_softc *sc = arg1;
4415 	int pkts;
4416 
4417 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4418 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4419 }
4420 
4421 static int
4422 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4423 {
4424 	struct hn_softc *sc = arg1;
4425 	int align;
4426 
4427 	align = sc->hn_tx_ring[0].hn_agg_align;
4428 	return (sysctl_handle_int(oidp, &align, 0, req));
4429 }
4430 
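/*
 * pollhz == 0 disables polling on the channel; any other value switches
 * the channel to polling mode at the given frequency.
 */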
4431 static void
4432 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4433 {
4434 	if (pollhz == 0)
4435 		vmbus_chan_poll_disable(chan);
4436 	else
4437 		vmbus_chan_poll_enable(chan, pollhz);
4438 }
4439 
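/*
 * Apply the polling setting to the primary channel and all sub-channels
 * currently in use.
 */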
4440 static void
4441 hn_polling(struct hn_softc *sc, u_int pollhz)
4442 {
4443 	int nsubch = sc->hn_rx_ring_inuse - 1;
4444 
4445 	HN_LOCK_ASSERT(sc);
4446 
4447 	if (nsubch > 0) {
4448 		struct vmbus_channel **subch;
4449 		int i;
4450 
4451 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4452 		for (i = 0; i < nsubch; ++i)
4453 			hn_chan_polling(subch[i], pollhz);
4454 		vmbus_subchan_rel(subch, nsubch);
4455 	}
4456 	hn_chan_polling(sc->hn_prichan, pollhz);
4457 }
4458 
4459 static int
4460 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4461 {
4462 	struct hn_softc *sc = arg1;
4463 	int pollhz, error;
4464 
4465 	pollhz = sc->hn_pollhz;
4466 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4467 	if (error || req->newptr == NULL)
4468 		return (error);
4469 
4470 	if (pollhz != 0 &&
4471 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4472 		return (EINVAL);
4473 
4474 	HN_LOCK(sc);
4475 	if (sc->hn_pollhz != pollhz) {
4476 		sc->hn_pollhz = pollhz;
4477 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4478 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4479 			hn_polling(sc, sc->hn_pollhz);
4480 	}
4481 	HN_UNLOCK(sc);
4482 
4483 	return (0);
4484 }
4485 
4486 static int
4487 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4488 {
4489 	struct hn_softc *sc = arg1;
4490 	char verstr[16];
4491 
4492 	snprintf(verstr, sizeof(verstr), "%u.%u",
4493 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4494 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4495 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4496 }
4497 
4498 static int
4499 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4500 {
4501 	struct hn_softc *sc = arg1;
4502 	char caps_str[128];
4503 	uint32_t caps;
4504 
4505 	HN_LOCK(sc);
4506 	caps = sc->hn_caps;
4507 	HN_UNLOCK(sc);
4508 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4509 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4510 }
4511 
4512 static int
4513 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4514 {
4515 	struct hn_softc *sc = arg1;
4516 	char assist_str[128];
4517 	uint32_t hwassist;
4518 
4519 	HN_LOCK(sc);
4520 	hwassist = sc->hn_ifp->if_hwassist;
4521 	HN_UNLOCK(sc);
4522 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4523 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4524 }
4525 
4526 static int
4527 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4528 {
4529 	struct hn_softc *sc = arg1;
4530 	char filter_str[128];
4531 	uint32_t filter;
4532 
4533 	HN_LOCK(sc);
4534 	filter = sc->hn_rx_filter;
4535 	HN_UNLOCK(sc);
4536 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4537 	    NDIS_PACKET_TYPES);
4538 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4539 }
4540 
4541 #ifndef RSS
4542 
4543 static int
4544 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4545 {
4546 	struct hn_softc *sc = arg1;
4547 	int error;
4548 
4549 	HN_LOCK(sc);
4550 
4551 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4552 	if (error || req->newptr == NULL)
4553 		goto back;
4554 
4555 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4556 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4557 		/*
4558 		 * RSS key is synchronized w/ the VF's; don't allow users
4559 		 * to change it.
4560 		 */
4561 		error = EBUSY;
4562 		goto back;
4563 	}
4564 
4565 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4566 	if (error)
4567 		goto back;
4568 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4569 
4570 	if (sc->hn_rx_ring_inuse > 1) {
4571 		error = hn_rss_reconfig(sc);
4572 	} else {
4573 		/* Not RSS capable, at least for now; just save the RSS key. */
4574 		error = 0;
4575 	}
4576 back:
4577 	HN_UNLOCK(sc);
4578 	return (error);
4579 }
4580 
4581 static int
4582 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4583 {
4584 	struct hn_softc *sc = arg1;
4585 	int error;
4586 
4587 	HN_LOCK(sc);
4588 
4589 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4590 	if (error || req->newptr == NULL)
4591 		goto back;
4592 
4593 	 * Don't allow the RSS indirect table to be changed if this
4594 	 * interface is not currently RSS capable.
4595 	 * RSS capable currently.
4596 	 */
4597 	if (sc->hn_rx_ring_inuse == 1) {
4598 		error = EOPNOTSUPP;
4599 		goto back;
4600 	}
4601 
4602 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4603 	if (error)
4604 		goto back;
4605 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4606 
4607 	hn_rss_ind_fixup(sc);
4608 	error = hn_rss_reconfig(sc);
4609 back:
4610 	HN_UNLOCK(sc);
4611 	return (error);
4612 }
4613 
4614 #endif	/* !RSS */
4615 
4616 static int
4617 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4618 {
4619 	struct hn_softc *sc = arg1;
4620 	char hash_str[128];
4621 	uint32_t hash;
4622 
4623 	HN_LOCK(sc);
4624 	hash = sc->hn_rss_hash;
4625 	HN_UNLOCK(sc);
4626 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4627 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4628 }
4629 
4630 static int
4631 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4632 {
4633 	struct hn_softc *sc = arg1;
4634 	char hash_str[128];
4635 	uint32_t hash;
4636 
4637 	HN_LOCK(sc);
4638 	hash = sc->hn_rss_hcap;
4639 	HN_UNLOCK(sc);
4640 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4641 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4642 }
4643 
4644 static int
4645 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4646 {
4647 	struct hn_softc *sc = arg1;
4648 	char hash_str[128];
4649 	uint32_t hash;
4650 
4651 	HN_LOCK(sc);
4652 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4653 	HN_UNLOCK(sc);
4654 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4655 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4656 }
4657 
4658 static int
4659 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4660 {
4661 	struct hn_softc *sc = arg1;
4662 	char vf_name[IFNAMSIZ + 1];
4663 	struct ifnet *vf_ifp;
4664 
4665 	HN_LOCK(sc);
4666 	vf_name[0] = '\0';
4667 	vf_ifp = sc->hn_vf_ifp;
4668 	if (vf_ifp != NULL)
4669 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4670 	HN_UNLOCK(sc);
4671 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4672 }
4673 
4674 static int
4675 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4676 {
4677 	struct hn_softc *sc = arg1;
4678 	char vf_name[IFNAMSIZ + 1];
4679 	struct ifnet *vf_ifp;
4680 
4681 	HN_LOCK(sc);
4682 	vf_name[0] = '\0';
4683 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4684 	if (vf_ifp != NULL)
4685 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4686 	HN_UNLOCK(sc);
4687 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4688 }
4689 
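/*
 * List the names of all VF ifnets currently registered in the global
 * VF map.
 */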
4690 static int
4691 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4692 {
4693 	struct rm_priotracker pt;
4694 	struct sbuf *sb;
4695 	int error, i;
4696 	bool first;
4697 
4698 	error = sysctl_wire_old_buffer(req, 0);
4699 	if (error != 0)
4700 		return (error);
4701 
4702 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4703 	if (sb == NULL)
4704 		return (ENOMEM);
4705 
4706 	rm_rlock(&hn_vfmap_lock, &pt);
4707 
4708 	first = true;
4709 	for (i = 0; i < hn_vfmap_size; ++i) {
4710 		struct ifnet *ifp;
4711 
4712 		if (hn_vfmap[i] == NULL)
4713 			continue;
4714 
4715 		ifp = ifnet_byindex(i);
4716 		if (ifp != NULL) {
4717 			if (first)
4718 				sbuf_printf(sb, "%s", ifp->if_xname);
4719 			else
4720 				sbuf_printf(sb, " %s", ifp->if_xname);
4721 			first = false;
4722 		}
4723 	}
4724 
4725 	rm_runlock(&hn_vfmap_lock, &pt);
4726 
4727 	error = sbuf_finish(sb);
4728 	sbuf_delete(sb);
4729 	return (error);
4730 }
4731 
4732 static int
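/*
 * List the VF to hn(4) pairings, as "VF:hn", for all VFs currently
 * registered in the global VF map.
 */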
4733 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4734 {
4735 	struct rm_priotracker pt;
4736 	struct sbuf *sb;
4737 	int error, i;
4738 	bool first;
4739 
4740 	error = sysctl_wire_old_buffer(req, 0);
4741 	if (error != 0)
4742 		return (error);
4743 
4744 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4745 	if (sb == NULL)
4746 		return (ENOMEM);
4747 
4748 	rm_rlock(&hn_vfmap_lock, &pt);
4749 
4750 	first = true;
4751 	for (i = 0; i < hn_vfmap_size; ++i) {
4752 		struct ifnet *ifp, *hn_ifp;
4753 
4754 		hn_ifp = hn_vfmap[i];
4755 		if (hn_ifp == NULL)
4756 			continue;
4757 
4758 		ifp = ifnet_byindex(i);
4759 		if (ifp != NULL) {
4760 			if (first) {
4761 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4762 				    hn_ifp->if_xname);
4763 			} else {
4764 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4765 				    hn_ifp->if_xname);
4766 			}
4767 			first = false;
4768 		}
4769 	}
4770 
4771 	rm_runlock(&hn_vfmap_lock, &pt);
4772 
4773 	error = sbuf_finish(sb);
4774 	sbuf_delete(sb);
4775 	return (error);
4776 }
4777 
4778 static int
4779 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4780 {
4781 	struct hn_softc *sc = arg1;
4782 	int error, onoff = 0;
4783 
4784 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4785 		onoff = 1;
4786 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4787 	if (error || req->newptr == NULL)
4788 		return (error);
4789 
4790 	HN_LOCK(sc);
4791 	/* NOTE: hn_vf_lock for hn_transmit() */
4792 	rm_wlock(&sc->hn_vf_lock);
4793 	if (onoff)
4794 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4795 	else
4796 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4797 	rm_wunlock(&sc->hn_vf_lock);
4798 	HN_UNLOCK(sc);
4799 
4800 	return (0);
4801 }
4802 
4803 static int
4804 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4805 {
4806 	struct hn_softc *sc = arg1;
4807 	int enabled = 0;
4808 
4809 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4810 		enabled = 1;
4811 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4812 }
4813 
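/*
 * Validate an IPv4 packet whose IP header starts at offset 'hoff' in
 * the mbuf: the IP header, and the TCP/UDP header if any, must reside
 * entirely in the first mbuf and the stated lengths must be
 * consistent.  Returns the IP protocol on success, or IPPROTO_DONE if
 * the packet should be left alone (e.g. fragments or truncated
 * headers).
 */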
4814 static int
4815 hn_check_iplen(const struct mbuf *m, int hoff)
4816 {
4817 	const struct ip *ip;
4818 	int len, iphlen, iplen;
4819 	const struct tcphdr *th;
4820 	int thoff;				/* TCP data offset */
4821 
4822 	len = hoff + sizeof(struct ip);
4823 
4824 	/* The packet must be at least the size of an IP header. */
4825 	if (m->m_pkthdr.len < len)
4826 		return IPPROTO_DONE;
4827 
4828 	/* The fixed IP header must reside completely in the first mbuf. */
4829 	if (m->m_len < len)
4830 		return IPPROTO_DONE;
4831 
4832 	ip = mtodo(m, hoff);
4833 
4834 	/* Bound check the packet's stated IP header length. */
4835 	iphlen = ip->ip_hl << 2;
4836 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4837 		return IPPROTO_DONE;
4838 
4839 	/* The full IP header must reside completely in the one mbuf. */
4840 	if (m->m_len < hoff + iphlen)
4841 		return IPPROTO_DONE;
4842 
4843 	iplen = ntohs(ip->ip_len);
4844 
4845 	/*
4846 	 * Check that the amount of data in the buffers is
4847 	 * at least as much as the IP header would have us expect.
4848 	 */
4849 	if (m->m_pkthdr.len < hoff + iplen)
4850 		return IPPROTO_DONE;
4851 
4852 	/*
4853 	 * Ignore IP fragments.
4854 	 */
4855 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4856 		return IPPROTO_DONE;
4857 
4858 	/*
4859 	 * The TCP/IP or UDP/IP header must be entirely contained within
4860 	 * the first fragment of a packet.
4861 	 */
4862 	switch (ip->ip_p) {
4863 	case IPPROTO_TCP:
4864 		if (iplen < iphlen + sizeof(struct tcphdr))
4865 			return IPPROTO_DONE;
4866 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4867 			return IPPROTO_DONE;
4868 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4869 		thoff = th->th_off << 2;
4870 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4871 			return IPPROTO_DONE;
4872 		if (m->m_len < hoff + iphlen + thoff)
4873 			return IPPROTO_DONE;
4874 		break;
4875 	case IPPROTO_UDP:
4876 		if (iplen < iphlen + sizeof(struct udphdr))
4877 			return IPPROTO_DONE;
4878 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4879 			return IPPROTO_DONE;
4880 		break;
4881 	default:
4882 		if (iplen < iphlen)
4883 			return IPPROTO_DONE;
4884 		break;
4885 	}
4886 	return ip->ip_p;
4887 }
4888 
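/*
 * Determine the L3 ethertype (skipping a VLAN header, if present) and,
 * for IPv4, the L4 protocol as validated by hn_check_iplen().
 */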
4889 static void
4890 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4891 {
4892 	const struct ether_header *eh;
4893 	uint16_t etype;
4894 	int hoff;
4895 
4896 	hoff = sizeof(*eh);
4897 	/* Checked by the caller. */
4898 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4899 
4900 	eh = mtod(m_new, const struct ether_header *);
4901 	etype = ntohs(eh->ether_type);
4902 	if (etype == ETHERTYPE_VLAN) {
4903 		const struct ether_vlan_header *evl;
4904 
4905 		hoff = sizeof(*evl);
4906 		if (m_new->m_len < hoff)
4907 			return;
4908 		evl = mtod(m_new, const struct ether_vlan_header *);
4909 		etype = ntohs(evl->evl_proto);
4910 	}
4911 	*l3proto = etype;
4912 
4913 	if (etype == ETHERTYPE_IP)
4914 		*l4proto = hn_check_iplen(m_new, hoff);
4915 	else
4916 		*l4proto = IPPROTO_DONE;
4917 }
4918 
4919 static int
4920 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4921 {
4922 	struct sysctl_oid_list *child;
4923 	struct sysctl_ctx_list *ctx;
4924 	device_t dev = sc->hn_dev;
4925 #if defined(INET) || defined(INET6)
4926 #if __FreeBSD_version >= 1100095
4927 	int lroent_cnt;
4928 #endif
4929 #endif
4930 	int i;
4931 
4932 	/*
4933 	 * Create RXBUF for reception.
4934 	 *
4935 	 * NOTE:
4936 	 * - It is shared by all channels.
4937 	 * - A large enough buffer is allocated; certain versions of the NVS
4938 	 *   may further limit the usable space.
4939 	 */
4940 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4941 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4942 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4943 	if (sc->hn_rxbuf == NULL) {
4944 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4945 		return (ENOMEM);
4946 	}
4947 
4948 	sc->hn_rx_ring_cnt = ring_cnt;
4949 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4950 
4951 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4952 	    M_DEVBUF, M_WAITOK | M_ZERO);
4953 
4954 #if defined(INET) || defined(INET6)
4955 #if __FreeBSD_version >= 1100095
4956 	lroent_cnt = hn_lro_entry_count;
4957 	if (lroent_cnt < TCP_LRO_ENTRIES)
4958 		lroent_cnt = TCP_LRO_ENTRIES;
4959 	if (bootverbose)
4960 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4961 #endif
4962 #endif	/* INET || INET6 */
4963 
4964 	ctx = device_get_sysctl_ctx(dev);
4965 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4966 
4967 	/* Create dev.hn.UNIT.rx sysctl tree */
4968 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4969 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4970 
4971 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4972 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4973 
4974 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4975 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4976 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4977 		if (rxr->hn_br == NULL) {
4978 			device_printf(dev, "allocate bufring failed\n");
4979 			return (ENOMEM);
4980 		}
4981 
4982 		if (hn_trust_hosttcp)
4983 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4984 		if (hn_trust_hostudp)
4985 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4986 		if (hn_trust_hostip)
4987 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4988 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4989 		rxr->hn_ifp = sc->hn_ifp;
4990 		if (i < sc->hn_tx_ring_cnt)
4991 			rxr->hn_txr = &sc->hn_tx_ring[i];
4992 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4993 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4994 		rxr->hn_rx_idx = i;
4995 		rxr->hn_rxbuf = sc->hn_rxbuf;
4996 
4997 		/*
4998 		 * Initialize LRO.
4999 		 */
5000 #if defined(INET) || defined(INET6)
5001 #if __FreeBSD_version >= 1100095
5002 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5003 		    hn_lro_mbufq_depth);
5004 #else
5005 		tcp_lro_init(&rxr->hn_lro);
5006 		rxr->hn_lro.ifp = sc->hn_ifp;
5007 #endif
5008 #if __FreeBSD_version >= 1100099
5009 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5010 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5011 #endif
5012 #endif	/* INET || INET6 */
5013 
5014 		if (sc->hn_rx_sysctl_tree != NULL) {
5015 			char name[16];
5016 
5017 			/*
5018 			 * Create per RX ring sysctl tree:
5019 			 * dev.hn.UNIT.rx.RINGID
5020 			 */
5021 			snprintf(name, sizeof(name), "%d", i);
5022 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5023 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5024 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5025 
5026 			if (rxr->hn_rx_sysctl_tree != NULL) {
5027 				SYSCTL_ADD_ULONG(ctx,
5028 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5029 				    OID_AUTO, "packets", CTLFLAG_RW,
5030 				    &rxr->hn_pkts, "# of packets received");
5031 				SYSCTL_ADD_ULONG(ctx,
5032 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5033 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5034 				    &rxr->hn_rss_pkts,
5035 				    "# of packets w/ RSS info received");
5036 				SYSCTL_ADD_INT(ctx,
5037 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5038 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5039 				    &rxr->hn_pktbuf_len, 0,
5040 				    "Temporary channel packet buffer length");
5041 			}
5042 		}
5043 	}
5044 
5045 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5046 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5047 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5048 #if __FreeBSD_version < 1100095
5049 	    hn_rx_stat_int_sysctl,
5050 #else
5051 	    hn_rx_stat_u64_sysctl,
5052 #endif
5053 	    "LU", "LRO queued");
5054 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5055 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5056 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5057 #if __FreeBSD_version < 1100095
5058 	    hn_rx_stat_int_sysctl,
5059 #else
5060 	    hn_rx_stat_u64_sysctl,
5061 #endif
5062 	    "LU", "LRO flushed");
5063 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5064 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5065 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5066 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5067 #if __FreeBSD_version >= 1100099
5068 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5069 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5070 	    hn_lro_lenlim_sysctl, "IU",
5071 	    "Max # of data bytes to be aggregated by LRO");
5072 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5073 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5074 	    hn_lro_ackcnt_sysctl, "I",
5075 	    "Max # of ACKs to be aggregated by LRO");
5076 #endif
5077 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5078 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5079 	    hn_trust_hcsum_sysctl, "I",
5080 	    "Trust TCP segment verification on host side, "
5081 	    "when csum info is missing");
5082 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5083 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5084 	    hn_trust_hcsum_sysctl, "I",
5085 	    "Trust UDP datagram verification on host side, "
5086 	    "when csum info is missing");
5087 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5088 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5089 	    hn_trust_hcsum_sysctl, "I",
5090 	    "Trust IP packet verification on host side, "
5091 	    "when csum info is missing");
5092 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5093 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5094 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5095 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5096 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5097 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5098 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5099 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5100 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5101 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5102 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5103 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5104 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5105 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5106 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5107 	    hn_rx_stat_ulong_sysctl, "LU",
5108 	    "# of packets that we trust host's csum verification");
5109 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5110 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5111 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5112 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5113 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5114 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5115 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5116 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5117 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5118 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5119 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5120 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5121 
5122 	return (0);
5123 }
5124 
5125 static void
5126 hn_destroy_rx_data(struct hn_softc *sc)
5127 {
5128 	int i;
5129 
5130 	if (sc->hn_rxbuf != NULL) {
5131 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5132 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5133 		else
5134 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5135 		sc->hn_rxbuf = NULL;
5136 	}
5137 
5138 	if (sc->hn_rx_ring_cnt == 0)
5139 		return;
5140 
5141 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5142 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5143 
5144 		if (rxr->hn_br == NULL)
5145 			continue;
5146 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5147 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5148 		} else {
5149 			device_printf(sc->hn_dev,
5150 			    "%dth channel bufring is referenced\n", i);
5151 		}
5152 		rxr->hn_br = NULL;
5153 
5154 #if defined(INET) || defined(INET6)
5155 		tcp_lro_free(&rxr->hn_lro);
5156 #endif
5157 		free(rxr->hn_pktbuf, M_DEVBUF);
5158 	}
5159 	free(sc->hn_rx_ring, M_DEVBUF);
5160 	sc->hn_rx_ring = NULL;
5161 
5162 	sc->hn_rx_ring_cnt = 0;
5163 	sc->hn_rx_ring_inuse = 0;
5164 }
5165 
5166 static int
5167 hn_tx_ring_create(struct hn_softc *sc, int id)
5168 {
5169 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5170 	device_t dev = sc->hn_dev;
5171 	bus_dma_tag_t parent_dtag;
5172 	int error, i;
5173 
5174 	txr->hn_sc = sc;
5175 	txr->hn_tx_idx = id;
5176 
5177 #ifndef HN_USE_TXDESC_BUFRING
5178 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5179 #endif
5180 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5181 
5182 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5183 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5184 	    M_DEVBUF, M_WAITOK | M_ZERO);
5185 #ifndef HN_USE_TXDESC_BUFRING
5186 	SLIST_INIT(&txr->hn_txlist);
5187 #else
5188 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5189 	    M_WAITOK, &txr->hn_tx_lock);
5190 #endif
5191 
5192 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5193 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5194 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5195 	} else {
5196 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5197 	}
5198 
5199 #ifdef HN_IFSTART_SUPPORT
5200 	if (hn_use_if_start) {
5201 		txr->hn_txeof = hn_start_txeof;
5202 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5203 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5204 	} else
5205 #endif
5206 	{
5207 		int br_depth;
5208 
5209 		txr->hn_txeof = hn_xmit_txeof;
5210 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5211 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5212 
5213 		br_depth = hn_get_txswq_depth(txr);
5214 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5215 		    M_WAITOK, &txr->hn_tx_lock);
5216 	}
5217 
5218 	txr->hn_direct_tx_size = hn_direct_tx_size;
5219 
5220 	/*
5221 	 * Always schedule transmission instead of trying to do direct
5222 	 * transmission.  This one gives the best performance so far.
5223 	 */
5224 	txr->hn_sched_tx = 1;
5225 
5226 	parent_dtag = bus_get_dma_tag(dev);
5227 
5228 	/* DMA tag for RNDIS packet messages. */
5229 	error = bus_dma_tag_create(parent_dtag, /* parent */
5230 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5231 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5232 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5233 	    BUS_SPACE_MAXADDR,		/* highaddr */
5234 	    NULL, NULL,			/* filter, filterarg */
5235 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5236 	    1,				/* nsegments */
5237 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5238 	    0,				/* flags */
5239 	    NULL,			/* lockfunc */
5240 	    NULL,			/* lockfuncarg */
5241 	    &txr->hn_tx_rndis_dtag);
5242 	if (error) {
5243 		device_printf(dev, "failed to create rndis dmatag\n");
5244 		return error;
5245 	}
5246 
5247 	/* DMA tag for data. */
5248 	error = bus_dma_tag_create(parent_dtag, /* parent */
5249 	    1,				/* alignment */
5250 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5251 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5252 	    BUS_SPACE_MAXADDR,		/* highaddr */
5253 	    NULL, NULL,			/* filter, filterarg */
5254 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5255 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5256 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5257 	    0,				/* flags */
5258 	    NULL,			/* lockfunc */
5259 	    NULL,			/* lockfuncarg */
5260 	    &txr->hn_tx_data_dtag);
5261 	if (error) {
5262 		device_printf(dev, "failed to create data dmatag\n");
5263 		return error;
5264 	}
5265 
5266 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5267 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5268 
5269 		txd->txr = txr;
5270 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5271 		STAILQ_INIT(&txd->agg_list);
5272 
5273 		/*
5274 		 * Allocate and load RNDIS packet message.
5275 		 */
5276 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5277 		    (void **)&txd->rndis_pkt,
5278 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5279 		    &txd->rndis_pkt_dmap);
5280 		if (error) {
5281 			device_printf(dev,
5282 			    "failed to allocate rndis_packet_msg, %d\n", i);
5283 			return error;
5284 		}
5285 
5286 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5287 		    txd->rndis_pkt_dmap,
5288 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5289 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5290 		    BUS_DMA_NOWAIT);
5291 		if (error) {
5292 			device_printf(dev,
5293 			    "failed to load rndis_packet_msg, %d\n", i);
5294 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5295 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5296 			return error;
5297 		}
5298 
5299 		/* DMA map for TX data. */
5300 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5301 		    &txd->data_dmap);
5302 		if (error) {
5303 			device_printf(dev,
5304 			    "failed to allocate tx data dmamap\n");
5305 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5306 			    txd->rndis_pkt_dmap);
5307 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5308 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5309 			return error;
5310 		}
5311 
5312 		/* All set, put it to list */
5313 		txd->flags |= HN_TXD_FLAG_ONLIST;
5314 #ifndef HN_USE_TXDESC_BUFRING
5315 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5316 #else
5317 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5318 #endif
5319 	}
5320 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5321 
5322 	if (sc->hn_tx_sysctl_tree != NULL) {
5323 		struct sysctl_oid_list *child;
5324 		struct sysctl_ctx_list *ctx;
5325 		char name[16];
5326 
5327 		/*
5328 		 * Create per TX ring sysctl tree:
5329 		 * dev.hn.UNIT.tx.RINGID
5330 		 */
5331 		ctx = device_get_sysctl_ctx(dev);
5332 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5333 
5334 		snprintf(name, sizeof(name), "%d", id);
5335 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5336 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5337 
5338 		if (txr->hn_tx_sysctl_tree != NULL) {
5339 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5340 
5341 #ifdef HN_DEBUG
5342 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5343 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5344 			    "# of available TX descs");
5345 #endif
5346 #ifdef HN_IFSTART_SUPPORT
5347 			if (!hn_use_if_start)
5348 #endif
5349 			{
5350 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5351 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5352 				    "over active");
5353 			}
5354 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5355 			    CTLFLAG_RW, &txr->hn_pkts,
5356 			    "# of packets transmitted");
5357 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5358 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5359 		}
5360 	}
5361 
5362 	return 0;
5363 }
5364 
5365 static void
5366 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5367 {
5368 	struct hn_tx_ring *txr = txd->txr;
5369 
5370 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5371 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5372 
5373 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5374 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5375 	    txd->rndis_pkt_dmap);
5376 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5377 }
5378 
5379 static void
5380 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5381 {
5382 
5383 	KASSERT(txd->refs == 0 || txd->refs == 1,
5384 	    ("invalid txd refs %d", txd->refs));
5385 
5386 	/* Aggregated txds will be freed by their aggregating txd. */
5387 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5388 		int freed;
5389 
5390 		freed = hn_txdesc_put(txr, txd);
5391 		KASSERT(freed, ("can't free txdesc"));
5392 	}
5393 }
5394 
5395 static void
5396 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5397 {
5398 	int i;
5399 
5400 	if (txr->hn_txdesc == NULL)
5401 		return;
5402 
5403 	/*
5404 	 * NOTE:
5405 	 * Because the freeing of aggregated txds will be deferred
5406 	 * to the aggregating txd, two passes are used here:
5407 	 * - The first pass GCes any pending txds.  This GC is necessary,
5408 	 *   since if the channels are revoked, the hypervisor will not
5409 	 *   deliver send-done for all pending txds.
5410 	 * - The second pass frees the busdma resources, i.e. after all
5411 	 *   txds have been freed.
5412 	 */
5413 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5414 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5415 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5416 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5417 
5418 	if (txr->hn_tx_data_dtag != NULL)
5419 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5420 	if (txr->hn_tx_rndis_dtag != NULL)
5421 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5422 
5423 #ifdef HN_USE_TXDESC_BUFRING
5424 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5425 #endif
5426 
5427 	free(txr->hn_txdesc, M_DEVBUF);
5428 	txr->hn_txdesc = NULL;
5429 
5430 	if (txr->hn_mbuf_br != NULL)
5431 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5432 
5433 #ifndef HN_USE_TXDESC_BUFRING
5434 	mtx_destroy(&txr->hn_txlist_spin);
5435 #endif
5436 	mtx_destroy(&txr->hn_tx_lock);
5437 }
5438 
5439 static int
5440 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5441 {
5442 	struct sysctl_oid_list *child;
5443 	struct sysctl_ctx_list *ctx;
5444 	int i;
5445 
5446 	/*
5447 	 * Create TXBUF for chimney sending.
5448 	 *
5449 	 * NOTE: It is shared by all channels.
5450 	 */
5451 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5452 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5453 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5454 	if (sc->hn_chim == NULL) {
5455 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5456 		return (ENOMEM);
5457 	}
5458 
5459 	sc->hn_tx_ring_cnt = ring_cnt;
5460 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5461 
5462 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5463 	    M_DEVBUF, M_WAITOK | M_ZERO);
5464 
5465 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5466 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5467 
5468 	/* Create dev.hn.UNIT.tx sysctl tree */
5469 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5470 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5471 
5472 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5473 		int error;
5474 
5475 		error = hn_tx_ring_create(sc, i);
5476 		if (error)
5477 			return error;
5478 	}
5479 
5480 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5481 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5482 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5483 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5484 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5485 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5486 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5487 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5488 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5489 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5490 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5491 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5492 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5493 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5494 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5495 	    hn_tx_stat_ulong_sysctl, "LU",
5496 	    "# of packet transmission aggregation flush failures");
5497 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5498 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5499 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5500 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5501 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5502 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5503 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5504 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5505 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5506 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5507 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5508 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5509 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5510 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5511 	    "# of total TX descs");
5512 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5513 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5514 	    "Chimney send packet size upper boundary");
5515 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5516 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5517 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5518 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5519 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5520 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5521 	    hn_tx_conf_int_sysctl, "I",
5522 	    "Size of the packet for direct transmission");
5523 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5524 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5525 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5526 	    hn_tx_conf_int_sysctl, "I",
5527 	    "Always schedule transmission "
5528 	    "instead of doing direct transmission");
5529 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5530 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5531 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5532 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5533 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5534 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5535 	    "Applied packet transmission aggregation size");
5536 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5537 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5538 	    hn_txagg_pktmax_sysctl, "I",
5539 	    "Applied packet transmission aggregation packets");
5540 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5541 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5542 	    hn_txagg_align_sysctl, "I",
5543 	    "Applied packet transmission aggregation alignment");
5544 
5545 	return 0;
5546 }
5547 
5548 static void
5549 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5550 {
5551 	int i;
5552 
5553 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5554 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5555 }
5556 
5557 static void
5558 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5559 {
5560 	struct ifnet *ifp = sc->hn_ifp;
5561 	u_int hw_tsomax;
5562 	int tso_minlen;
5563 
5564 	HN_LOCK_ASSERT(sc);
5565 
5566 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5567 		return;
5568 
5569 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5570 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5571 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5572 
5573 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5574 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5575 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5576 
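	/*
	 * Clamp the requested TSO size to [tso_minlen, IP_MAXPACKET] and to
	 * the NDIS maximum, then reserve room for the Ethernet + VLAN header.
	 */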
5577 	if (tso_maxlen < tso_minlen)
5578 		tso_maxlen = tso_minlen;
5579 	else if (tso_maxlen > IP_MAXPACKET)
5580 		tso_maxlen = IP_MAXPACKET;
5581 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5582 		tso_maxlen = sc->hn_ndis_tso_szmax;
5583 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5584 
5585 	if (hn_xpnt_vf_isready(sc)) {
5586 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5587 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5588 	}
5589 	ifp->if_hw_tsomax = hw_tsomax;
5590 	if (bootverbose)
5591 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5592 }
5593 
5594 static void
5595 hn_fixup_tx_data(struct hn_softc *sc)
5596 {
5597 	uint64_t csum_assist;
5598 	int i;
5599 
5600 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5601 	if (hn_tx_chimney_size > 0 &&
5602 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5603 		hn_set_chim_size(sc, hn_tx_chimney_size);
5604 
5605 	csum_assist = 0;
5606 	if (sc->hn_caps & HN_CAP_IPCS)
5607 		csum_assist |= CSUM_IP;
5608 	if (sc->hn_caps & HN_CAP_TCP4CS)
5609 		csum_assist |= CSUM_IP_TCP;
5610 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5611 		csum_assist |= CSUM_IP_UDP;
5612 	if (sc->hn_caps & HN_CAP_TCP6CS)
5613 		csum_assist |= CSUM_IP6_TCP;
5614 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5615 		csum_assist |= CSUM_IP6_UDP;
5616 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5617 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5618 
5619 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5620 		/*
5621 		 * Support HASHVAL pktinfo on TX path.
5622 		 */
5623 		if (bootverbose)
5624 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5625 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5626 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5627 	}
5628 }
5629 
5630 static void
5631 hn_fixup_rx_data(struct hn_softc *sc)
5632 {
5633 
5634 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5635 		int i;
5636 
5637 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5638 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5639 	}
5640 }
5641 
5642 static void
5643 hn_destroy_tx_data(struct hn_softc *sc)
5644 {
5645 	int i;
5646 
5647 	if (sc->hn_chim != NULL) {
5648 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5649 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5650 		} else {
5651 			device_printf(sc->hn_dev,
5652 			    "chimney sending buffer is referenced\n");
5653 		}
5654 		sc->hn_chim = NULL;
5655 	}
5656 
5657 	if (sc->hn_tx_ring_cnt == 0)
5658 		return;
5659 
5660 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5661 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5662 
5663 	free(sc->hn_tx_ring, M_DEVBUF);
5664 	sc->hn_tx_ring = NULL;
5665 
5666 	sc->hn_tx_ring_cnt = 0;
5667 	sc->hn_tx_ring_inuse = 0;
5668 }
5669 
5670 #ifdef HN_IFSTART_SUPPORT
5671 
5672 static void
5673 hn_start_taskfunc(void *xtxr, int pending __unused)
5674 {
5675 	struct hn_tx_ring *txr = xtxr;
5676 
5677 	mtx_lock(&txr->hn_tx_lock);
5678 	hn_start_locked(txr, 0);
5679 	mtx_unlock(&txr->hn_tx_lock);
5680 }
5681 
5682 static int
5683 hn_start_locked(struct hn_tx_ring *txr, int len)
5684 {
5685 	struct hn_softc *sc = txr->hn_sc;
5686 	struct ifnet *ifp = sc->hn_ifp;
5687 	int sched = 0;
5688 
5689 	KASSERT(hn_use_if_start,
5690 	    ("hn_start_locked is called when if_start is disabled"));
5691 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5692 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5693 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5694 
5695 	if (__predict_false(txr->hn_suspended))
5696 		return (0);
5697 
5698 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5699 	    IFF_DRV_RUNNING)
5700 		return (0);
5701 
5702 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5703 		struct hn_txdesc *txd;
5704 		struct mbuf *m_head;
5705 		int error;
5706 
5707 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5708 		if (m_head == NULL)
5709 			break;
5710 
5711 		if (len > 0 && m_head->m_pkthdr.len > len) {
5712 			/*
5713 			 * Sending this packet could be time consuming; let
5714 			 * callers dispatch it (and any follow-up packets) to
5715 			 * the tx taskqueue.
5716 			 */
5717 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5718 			sched = 1;
5719 			break;
5720 		}
5721 
5722 #if defined(INET6) || defined(INET)
5723 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5724 			m_head = hn_tso_fixup(m_head);
5725 			if (__predict_false(m_head == NULL)) {
5726 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5727 				continue;
5728 			}
5729 		} else if (m_head->m_pkthdr.csum_flags &
5730 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5731 			m_head = hn_set_hlen(m_head);
5732 			if (__predict_false(m_head == NULL)) {
5733 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5734 				continue;
5735 			}
5736 		}
5737 #endif
5738 
5739 		txd = hn_txdesc_get(txr);
5740 		if (txd == NULL) {
5741 			txr->hn_no_txdescs++;
5742 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5743 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5744 			break;
5745 		}
5746 
5747 		error = hn_encap(ifp, txr, txd, &m_head);
5748 		if (error) {
5749 			/* Both txd and m_head are freed */
5750 			KASSERT(txr->hn_agg_txd == NULL,
5751 			    ("encap failed w/ pending aggregating txdesc"));
5752 			continue;
5753 		}
5754 
5755 		if (txr->hn_agg_pktleft == 0) {
5756 			if (txr->hn_agg_txd != NULL) {
5757 				KASSERT(m_head == NULL,
5758 				    ("pending mbuf for aggregating txdesc"));
5759 				error = hn_flush_txagg(ifp, txr);
5760 				if (__predict_false(error)) {
5761 					atomic_set_int(&ifp->if_drv_flags,
5762 					    IFF_DRV_OACTIVE);
5763 					break;
5764 				}
5765 			} else {
5766 				KASSERT(m_head != NULL, ("mbuf was freed"));
5767 				error = hn_txpkt(ifp, txr, txd);
5768 				if (__predict_false(error)) {
5769 					/* txd is freed, but m_head is not */
5770 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5771 					atomic_set_int(&ifp->if_drv_flags,
5772 					    IFF_DRV_OACTIVE);
5773 					break;
5774 				}
5775 			}
5776 		}
5777 #ifdef INVARIANTS
5778 		else {
5779 			KASSERT(txr->hn_agg_txd != NULL,
5780 			    ("no aggregating txdesc"));
5781 			KASSERT(m_head == NULL,
5782 			    ("pending mbuf for aggregating txdesc"));
5783 		}
5784 #endif
5785 	}
5786 
5787 	/* Flush pending aggregated transmission. */
5788 	if (txr->hn_agg_txd != NULL)
5789 		hn_flush_txagg(ifp, txr);
5790 	return (sched);
5791 }
5792 
5793 static void
5794 hn_start(struct ifnet *ifp)
5795 {
5796 	struct hn_softc *sc = ifp->if_softc;
5797 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5798 
5799 	if (txr->hn_sched_tx)
5800 		goto do_sched;
5801 
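	/*
	 * Try direct transmission if the TX ring lock is uncontended;
	 * otherwise defer to the TX taskqueue.
	 */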
5802 	if (mtx_trylock(&txr->hn_tx_lock)) {
5803 		int sched;
5804 
5805 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5806 		mtx_unlock(&txr->hn_tx_lock);
5807 		if (!sched)
5808 			return;
5809 	}
5810 do_sched:
5811 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5812 }
5813 
5814 static void
5815 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5816 {
5817 	struct hn_tx_ring *txr = xtxr;
5818 
5819 	mtx_lock(&txr->hn_tx_lock);
5820 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5821 	hn_start_locked(txr, 0);
5822 	mtx_unlock(&txr->hn_tx_lock);
5823 }
5824 
5825 static void
5826 hn_start_txeof(struct hn_tx_ring *txr)
5827 {
5828 	struct hn_softc *sc = txr->hn_sc;
5829 	struct ifnet *ifp = sc->hn_ifp;
5830 
5831 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5832 
5833 	if (txr->hn_sched_tx)
5834 		goto do_sched;
5835 
5836 	if (mtx_trylock(&txr->hn_tx_lock)) {
5837 		int sched;
5838 
5839 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5840 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5841 		mtx_unlock(&txr->hn_tx_lock);
5842 		if (sched) {
5843 			taskqueue_enqueue(txr->hn_tx_taskq,
5844 			    &txr->hn_tx_task);
5845 		}
5846 	} else {
5847 do_sched:
5848 		/*
5849 		 * Release the OACTIVE earlier, with the hope that
5850 		 * others could catch up.  The task will clear the
5851 		 * flag again with the hn_tx_lock to avoid possible
5852 		 * races.
5853 		 */
5854 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5855 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5856 	}
5857 }
5858 
5859 #endif	/* HN_IFSTART_SUPPORT */
5860 
5861 static int
5862 hn_xmit(struct hn_tx_ring *txr, int len)
5863 {
5864 	struct hn_softc *sc = txr->hn_sc;
5865 	struct ifnet *ifp = sc->hn_ifp;
5866 	struct mbuf *m_head;
5867 	int sched = 0;
5868 
5869 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5870 #ifdef HN_IFSTART_SUPPORT
5871 	KASSERT(hn_use_if_start == 0,
5872 	    ("hn_xmit is called when if_start is enabled"));
5873 #endif
5874 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5875 
5876 	if (__predict_false(txr->hn_suspended))
5877 		return (0);
5878 
5879 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5880 		return (0);
5881 
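	/*
	 * Peek/advance protocol: the mbuf stays on the buf_ring until it
	 * has been encapsulated or sent, so it can be put back on failure.
	 */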
5882 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5883 		struct hn_txdesc *txd;
5884 		int error;
5885 
5886 		if (len > 0 && m_head->m_pkthdr.len > len) {
5887 			/*
5888 			 * Sending this packet could be time consuming; let
5889 			 * callers dispatch it (and any follow-up packets) to
5890 			 * the tx taskqueue.
5891 			 */
5892 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5893 			sched = 1;
5894 			break;
5895 		}
5896 
5897 		txd = hn_txdesc_get(txr);
5898 		if (txd == NULL) {
5899 			txr->hn_no_txdescs++;
5900 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5901 			txr->hn_oactive = 1;
5902 			break;
5903 		}
5904 
5905 		error = hn_encap(ifp, txr, txd, &m_head);
5906 		if (error) {
5907 			/* Both txd and m_head are freed; discard */
5908 			KASSERT(txr->hn_agg_txd == NULL,
5909 			    ("encap failed w/ pending aggregating txdesc"));
5910 			drbr_advance(ifp, txr->hn_mbuf_br);
5911 			continue;
5912 		}
5913 
5914 		if (txr->hn_agg_pktleft == 0) {
5915 			if (txr->hn_agg_txd != NULL) {
5916 				KASSERT(m_head == NULL,
5917 				    ("pending mbuf for aggregating txdesc"));
5918 				error = hn_flush_txagg(ifp, txr);
5919 				if (__predict_false(error)) {
5920 					txr->hn_oactive = 1;
5921 					break;
5922 				}
5923 			} else {
5924 				KASSERT(m_head != NULL, ("mbuf was freed"));
5925 				error = hn_txpkt(ifp, txr, txd);
5926 				if (__predict_false(error)) {
5927 					/* txd is freed, but m_head is not */
5928 					drbr_putback(ifp, txr->hn_mbuf_br,
5929 					    m_head);
5930 					txr->hn_oactive = 1;
5931 					break;
5932 				}
5933 			}
5934 		}
5935 #ifdef INVARIANTS
5936 		else {
5937 			KASSERT(txr->hn_agg_txd != NULL,
5938 			    ("no aggregating txdesc"));
5939 			KASSERT(m_head == NULL,
5940 			    ("pending mbuf for aggregating txdesc"));
5941 		}
5942 #endif
5943 
5944 		/* Sent */
5945 		drbr_advance(ifp, txr->hn_mbuf_br);
5946 	}
5947 
5948 	/* Flush pending aggregated transmission. */
5949 	if (txr->hn_agg_txd != NULL)
5950 		hn_flush_txagg(ifp, txr);
5951 	return (sched);
5952 }
5953 
5954 static int
5955 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5956 {
5957 	struct hn_softc *sc = ifp->if_softc;
5958 	struct hn_tx_ring *txr;
5959 	int error, idx = 0;
5960 
5961 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5962 		struct rm_priotracker pt;
5963 
5964 		rm_rlock(&sc->hn_vf_lock, &pt);
5965 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5966 			struct mbuf *m_bpf = NULL;
5967 			int obytes, omcast;
5968 
5969 			obytes = m->m_pkthdr.len;
5970 			omcast = (m->m_flags & M_MCAST) != 0;
5971 
5972 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5973 				if (bpf_peers_present(ifp->if_bpf)) {
5974 					m_bpf = m_copypacket(m, M_NOWAIT);
5975 					if (m_bpf == NULL) {
5976 						/*
5977 						 * Failed to grab a shallow
5978 						 * copy; tap now.
5979 						 */
5980 						ETHER_BPF_MTAP(ifp, m);
5981 					}
5982 				}
5983 			} else {
5984 				ETHER_BPF_MTAP(ifp, m);
5985 			}
5986 
5987 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5988 			rm_runlock(&sc->hn_vf_lock, &pt);
5989 
5990 			if (m_bpf != NULL) {
5991 				if (!error)
5992 					ETHER_BPF_MTAP(ifp, m_bpf);
5993 				m_freem(m_bpf);
5994 			}
5995 
5996 			if (error == ENOBUFS) {
5997 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5998 			} else if (error) {
5999 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6000 			} else {
6001 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6002 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6003 				if (omcast) {
6004 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6005 					    omcast);
6006 				}
6007 			}
6008 			return (error);
6009 		}
6010 		rm_runlock(&sc->hn_vf_lock, &pt);
6011 	}
6012 
6013 #if defined(INET6) || defined(INET)
6014 	/*
6015 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6016 	 * since packet headers should be cache-hot.
6017 	 */
6018 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6019 		m = hn_tso_fixup(m);
6020 		if (__predict_false(m == NULL)) {
6021 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6022 			return EIO;
6023 		}
6024 	} else if (m->m_pkthdr.csum_flags &
6025 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6026 		m = hn_set_hlen(m);
6027 		if (__predict_false(m == NULL)) {
6028 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6029 			return EIO;
6030 		}
6031 	}
6032 #endif
6033 
6034 	/*
6035 	 * Select the TX ring based on flowid
6036 	 */
6037 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6038 #ifdef RSS
6039 		uint32_t bid;
6040 
6041 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6042 		    &bid) == 0)
6043 			idx = bid % sc->hn_tx_ring_inuse;
6044 		else
6045 #endif
6046 		{
6047 #if defined(INET6) || defined(INET)
6048 			int tcpsyn = 0;
6049 
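			/*
			 * Small non-TSO TCP segments may be SYNs; SYNs
			 * are always dispatched to the first TX ring
			 * (see the tcpsyn test below).
			 */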
6050 			if (m->m_pkthdr.len < 128 &&
6051 			    (m->m_pkthdr.csum_flags &
6052 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6053 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6054 				m = hn_check_tcpsyn(m, &tcpsyn);
6055 				if (__predict_false(m == NULL)) {
6056 					if_inc_counter(ifp,
6057 					    IFCOUNTER_OERRORS, 1);
6058 					return (EIO);
6059 				}
6060 			}
6061 #else
6062 			const int tcpsyn = 0;
6063 #endif
6064 			if (tcpsyn)
6065 				idx = 0;
6066 			else
6067 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6068 		}
6069 	}
6070 	txr = &sc->hn_tx_ring[idx];
6071 
6072 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6073 	if (error) {
6074 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6075 		return error;
6076 	}
6077 
6078 	if (txr->hn_oactive)
6079 		return 0;
6080 
6081 	if (txr->hn_sched_tx)
6082 		goto do_sched;
6083 
6084 	if (mtx_trylock(&txr->hn_tx_lock)) {
6085 		int sched;
6086 
6087 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6088 		mtx_unlock(&txr->hn_tx_lock);
6089 		if (!sched)
6090 			return 0;
6091 	}
6092 do_sched:
6093 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6094 	return 0;
6095 }
6096 
6097 static void
6098 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6099 {
6100 	struct mbuf *m;
6101 
6102 	mtx_lock(&txr->hn_tx_lock);
6103 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6104 		m_freem(m);
6105 	mtx_unlock(&txr->hn_tx_lock);
6106 }
6107 
6108 static void
6109 hn_xmit_qflush(struct ifnet *ifp)
6110 {
6111 	struct hn_softc *sc = ifp->if_softc;
6112 	struct rm_priotracker pt;
6113 	int i;
6114 
6115 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6116 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6117 	if_qflush(ifp);
6118 
6119 	rm_rlock(&sc->hn_vf_lock, &pt);
6120 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6121 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6122 	rm_runlock(&sc->hn_vf_lock, &pt);
6123 }
6124 
6125 static void
6126 hn_xmit_txeof(struct hn_tx_ring *txr)
6127 {
6128 
6129 	if (txr->hn_sched_tx)
6130 		goto do_sched;
6131 
6132 	if (mtx_trylock(&txr->hn_tx_lock)) {
6133 		int sched;
6134 
6135 		txr->hn_oactive = 0;
6136 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6137 		mtx_unlock(&txr->hn_tx_lock);
6138 		if (sched) {
6139 			taskqueue_enqueue(txr->hn_tx_taskq,
6140 			    &txr->hn_tx_task);
6141 		}
6142 	} else {
6143 do_sched:
6144 		/*
6145 		 * Release the oactive earlier, with the hope that
6146 		 * others could catch up.  The task will clear the
6147 		 * oactive again with the hn_tx_lock to avoid possible
6148 		 * races.
6149 		 */
6150 		txr->hn_oactive = 0;
6151 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6152 	}
6153 }
6154 
6155 static void
6156 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6157 {
6158 	struct hn_tx_ring *txr = xtxr;
6159 
6160 	mtx_lock(&txr->hn_tx_lock);
6161 	hn_xmit(txr, 0);
6162 	mtx_unlock(&txr->hn_tx_lock);
6163 }
6164 
6165 static void
6166 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6167 {
6168 	struct hn_tx_ring *txr = xtxr;
6169 
6170 	mtx_lock(&txr->hn_tx_lock);
6171 	txr->hn_oactive = 0;
6172 	hn_xmit(txr, 0);
6173 	mtx_unlock(&txr->hn_tx_lock);
6174 }
6175 
6176 static int
6177 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6178 {
6179 	struct vmbus_chan_br cbr;
6180 	struct hn_rx_ring *rxr;
6181 	struct hn_tx_ring *txr = NULL;
6182 	int idx, error;
6183 
6184 	idx = vmbus_chan_subidx(chan);
6185 
6186 	/*
6187 	 * Link this channel to RX/TX ring.
6188 	 */
6189 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6190 	    ("invalid channel index %d, should be >= 0 && < %d",
6191 	     idx, sc->hn_rx_ring_inuse));
6192 	rxr = &sc->hn_rx_ring[idx];
6193 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6194 	    ("RX ring %d already attached", idx));
6195 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6196 	rxr->hn_chan = chan;
6197 
6198 	if (bootverbose) {
6199 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6200 		    idx, vmbus_chan_id(chan));
6201 	}
6202 
6203 	if (idx < sc->hn_tx_ring_inuse) {
6204 		txr = &sc->hn_tx_ring[idx];
6205 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6206 		    ("TX ring %d already attached", idx));
6207 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6208 
6209 		txr->hn_chan = chan;
6210 		if (bootverbose) {
6211 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6212 			    idx, vmbus_chan_id(chan));
6213 		}
6214 	}
6215 
6216 	/* Bind this channel to a proper CPU. */
6217 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6218 
6219 	/*
6220 	 * Open this channel
6221 	 */
6222 	cbr.cbr = rxr->hn_br;
6223 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6224 	cbr.cbr_txsz = HN_TXBR_SIZE;
6225 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6226 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6227 	if (error) {
6228 		if (error == EISCONN) {
6229 			if_printf(sc->hn_ifp, "bufring is connected after "
6230 			    "chan%u open failure\n", vmbus_chan_id(chan));
6231 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6232 		} else {
6233 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6234 			    vmbus_chan_id(chan), error);
6235 		}
6236 	}
6237 	return (error);
6238 }
6239 
6240 static void
6241 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6242 {
6243 	struct hn_rx_ring *rxr;
6244 	int idx, error;
6245 
6246 	idx = vmbus_chan_subidx(chan);
6247 
6248 	/*
6249 	 * Unlink this channel from the RX/TX ring.
6250 	 */
6251 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6252 	    ("invalid channel index %d, should be >= 0 && < %d",
6253 	     idx, sc->hn_rx_ring_inuse));
6254 	rxr = &sc->hn_rx_ring[idx];
6255 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6256 	    ("RX ring %d is not attached", idx));
6257 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6258 
6259 	if (idx < sc->hn_tx_ring_inuse) {
6260 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6261 
6262 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6263 		    ("TX ring %d is not attached", idx));
6264 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6265 	}
6266 
6267 	/*
6268 	 * Close this channel.
6269 	 *
6270 	 * NOTE:
6271 	 * Channel closing does _not_ destroy the target channel.
6272 	 */
6273 	error = vmbus_chan_close_direct(chan);
6274 	if (error == EISCONN) {
6275 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6276 		    "after being closed\n", vmbus_chan_id(chan));
6277 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6278 	} else if (error) {
6279 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6280 		    vmbus_chan_id(chan), error);
6281 	}
6282 }
6283 
6284 static int
6285 hn_attach_subchans(struct hn_softc *sc)
6286 {
6287 	struct vmbus_channel **subchans;
6288 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6289 	int i, error = 0;
6290 
6291 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6292 
6293 	/* Attach the sub-channels. */
6294 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6295 	for (i = 0; i < subchan_cnt; ++i) {
6296 		int error1;
6297 
6298 		error1 = hn_chan_attach(sc, subchans[i]);
6299 		if (error1) {
6300 			error = error1;
6301 			/* Move on; all channels will be detached later. */
6302 		}
6303 	}
6304 	vmbus_subchan_rel(subchans, subchan_cnt);
6305 
6306 	if (error) {
6307 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6308 	} else {
6309 		if (bootverbose) {
6310 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6311 			    subchan_cnt);
6312 		}
6313 	}
6314 	return (error);
6315 }
6316 
6317 static void
6318 hn_detach_allchans(struct hn_softc *sc)
6319 {
6320 	struct vmbus_channel **subchans;
6321 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6322 	int i;
6323 
6324 	if (subchan_cnt == 0)
6325 		goto back;
6326 
6327 	/* Detach the sub-channels. */
6328 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6329 	for (i = 0; i < subchan_cnt; ++i)
6330 		hn_chan_detach(sc, subchans[i]);
6331 	vmbus_subchan_rel(subchans, subchan_cnt);
6332 
6333 back:
6334 	/*
6335 	 * Detach the primary channel, _after_ all sub-channels
6336 	 * are detached.
6337 	 */
6338 	hn_chan_detach(sc, sc->hn_prichan);
6339 
6340 	/* Wait for sub-channels to be destroyed, if any. */
6341 	vmbus_subchan_drain(sc->hn_prichan);
6342 
6343 #ifdef INVARIANTS
6344 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6345 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6346 		    HN_RX_FLAG_ATTACHED) == 0,
6347 		    ("%dth RX ring is still attached", i));
6348 	}
6349 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6350 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6351 		    HN_TX_FLAG_ATTACHED) == 0,
6352 		    ("%dth TX ring is still attached", i));
6353 	}
6354 #endif
6355 }
6356 
6357 static int
6358 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6359 {
6360 	struct vmbus_channel **subchans;
6361 	int nchan, rxr_cnt, error;
6362 
6363 	nchan = *nsubch + 1;
6364 	if (nchan == 1) {
6365 		/*
6366 		 * Multiple RX/TX rings are not requested.
6367 		 */
6368 		*nsubch = 0;
6369 		return (0);
6370 	}
6371 
6372 	/*
6373 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6374 	 * table entries.
6375 	 */
6376 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6377 	if (error) {
6378 		/* No RSS; this is benign. */
6379 		*nsubch = 0;
6380 		return (0);
6381 	}
6382 	if (bootverbose) {
6383 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6384 		    rxr_cnt, nchan);
6385 	}
6386 
6387 	if (nchan > rxr_cnt)
6388 		nchan = rxr_cnt;
6389 	if (nchan == 1) {
6390 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6391 		*nsubch = 0;
6392 		return (0);
6393 	}
6394 
6395 	/*
6396 	 * Allocate sub-channels from NVS.
6397 	 */
6398 	*nsubch = nchan - 1;
6399 	error = hn_nvs_alloc_subchans(sc, nsubch);
6400 	if (error || *nsubch == 0) {
6401 		/* Failed to allocate sub-channels. */
6402 		*nsubch = 0;
6403 		return (0);
6404 	}
6405 
6406 	/*
6407 	 * Wait for all sub-channels to become ready before moving on.
6408 	 */
6409 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6410 	vmbus_subchan_rel(subchans, *nsubch);
6411 	return (0);
6412 }
6413 
6414 static bool
6415 hn_synth_attachable(const struct hn_softc *sc)
6416 {
6417 	int i;
6418 
6419 	if (sc->hn_flags & HN_FLAG_ERRORS)
6420 		return (false);
6421 
6422 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6423 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6424 
6425 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6426 			return (false);
6427 	}
6428 	return (true);
6429 }
6430 
6431 /*
6432  * Make sure that the RX filter is zero after the successful
6433  * RNDIS initialization.
6434  *
6435  * NOTE:
6436  * Under certain conditions on certain versions of Hyper-V,
6437  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6438  * after the successful RNDIS initialization, which breaks
6439  * the assumption of any following code (well, it breaks the
6440  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6441  * explicitly, drain packets sneaking through, and drain the
6442  * interrupt taskqueues scheduled due to the stealth packets.
6443  */
6444 static void
6445 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6446 {
6447 
6448 	hn_disable_rx(sc);
6449 	hn_drain_rxtx(sc, nchan);
6450 }
6451 
6452 static int
6453 hn_synth_attach(struct hn_softc *sc, int mtu)
6454 {
6455 #define ATTACHED_NVS		0x0002
6456 #define ATTACHED_RNDIS		0x0004
6457 
6458 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6459 	int error, nsubch, nchan = 1, i, rndis_inited;
6460 	uint32_t old_caps, attached = 0;
6461 
6462 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6463 	    ("synthetic parts were attached"));
6464 
6465 	if (!hn_synth_attachable(sc))
6466 		return (ENXIO);
6467 
6468 	/* Save capabilities for later verification. */
6469 	old_caps = sc->hn_caps;
6470 	sc->hn_caps = 0;
6471 
6472 	/* Clear RSS stuffs. */
6473 	sc->hn_rss_ind_size = 0;
6474 	sc->hn_rss_hash = 0;
6475 	sc->hn_rss_hcap = 0;
6476 
6477 	/*
6478 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6479 	 */
6480 	error = hn_chan_attach(sc, sc->hn_prichan);
6481 	if (error)
6482 		goto failed;
6483 
6484 	/*
6485 	 * Attach NVS.
6486 	 */
6487 	error = hn_nvs_attach(sc, mtu);
6488 	if (error)
6489 		goto failed;
6490 	attached |= ATTACHED_NVS;
6491 
6492 	/*
6493 	 * Attach RNDIS _after_ NVS is attached.
6494 	 */
6495 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6496 	if (rndis_inited)
6497 		attached |= ATTACHED_RNDIS;
6498 	if (error)
6499 		goto failed;
6500 
6501 	/*
6502 	 * Make sure capabilities are not changed.
6503 	 */
6504 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6505 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6506 		    old_caps, sc->hn_caps);
6507 		error = ENXIO;
6508 		goto failed;
6509 	}
6510 
6511 	/*
6512 	 * Allocate sub-channels for multi-TX/RX rings.
6513 	 *
6514 	 * NOTE:
6515 	 * The # of RX rings that can be used is equivalent to the # of
6516 	 * channels to be requested.
6517 	 */
6518 	nsubch = sc->hn_rx_ring_cnt - 1;
6519 	error = hn_synth_alloc_subchans(sc, &nsubch);
6520 	if (error)
6521 		goto failed;
6522 	/* NOTE: _Full_ synthetic parts detach is required now. */
6523 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6524 
6525 	/*
6526 	 * Set the # of TX/RX rings that could be used according to
6527 	 * the # of channels that NVS offered.
6528 	 */
6529 	nchan = nsubch + 1;
6530 	hn_set_ring_inuse(sc, nchan);
6531 	if (nchan == 1) {
6532 		/* Only the primary channel can be used; done */
6533 		goto back;
6534 	}
6535 
6536 	/*
6537 	 * Attach the sub-channels.
6538 	 *
6539 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6540 	 */
6541 	error = hn_attach_subchans(sc);
6542 	if (error)
6543 		goto failed;
6544 
6545 	/*
6546 	 * Configure RSS key and indirect table _after_ all sub-channels
6547 	 * are attached.
6548 	 */
6549 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6550 		/*
6551 		 * RSS key is not set yet; set it to the default RSS key.
6552 		 */
6553 		if (bootverbose)
6554 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6555 #ifdef RSS
6556 		rss_getkey(rss->rss_key);
6557 #else
6558 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6559 #endif
6560 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6561 	}
6562 
6563 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6564 		/*
6565 		 * RSS indirect table is not set yet; set it up in round-
6566 		 * robin fashion.
6567 		 */
6568 		if (bootverbose) {
6569 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6570 			    "table\n");
6571 		}
6572 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6573 			uint32_t subidx;
6574 
6575 #ifdef RSS
6576 			subidx = rss_get_indirection_to_bucket(i);
6577 #else
6578 			subidx = i;
6579 #endif
6580 			rss->rss_ind[i] = subidx % nchan;
6581 		}
6582 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6583 	} else {
6584 		/*
6585 		 * # of usable channels may be changed, so we have to
6586 		 * make sure that all entries in RSS indirect table
6587 		 * are valid.
6588 		 *
6589 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6590 		 */
6591 		hn_rss_ind_fixup(sc);
6592 	}
6593 
6594 	sc->hn_rss_hash = sc->hn_rss_hcap;
6595 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6596 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6597 		/* NOTE: Don't reconfigure RSS here; it is done right below. */
6598 		hn_vf_rss_fixup(sc, false);
6599 	}
6600 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6601 	if (error)
6602 		goto failed;
6603 back:
6604 	/*
6605 	 * Fixup transmission aggregation setup.
6606 	 */
6607 	hn_set_txagg(sc);
6608 	hn_rndis_init_fixat(sc, nchan);
6609 	return (0);
6610 
6611 failed:
6612 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6613 		hn_rndis_init_fixat(sc, nchan);
6614 		hn_synth_detach(sc);
6615 	} else {
6616 		if (attached & ATTACHED_RNDIS) {
6617 			hn_rndis_init_fixat(sc, nchan);
6618 			hn_rndis_detach(sc);
6619 		}
6620 		if (attached & ATTACHED_NVS)
6621 			hn_nvs_detach(sc);
6622 		hn_chan_detach(sc, sc->hn_prichan);
6623 		/* Restore old capabilities. */
6624 		sc->hn_caps = old_caps;
6625 	}
6626 	return (error);
6627 
6628 #undef ATTACHED_RNDIS
6629 #undef ATTACHED_NVS
6630 }
6631 
6632 /*
6633  * NOTE:
6634  * The interface must have been suspended through hn_suspend(), before
6635  * this function gets called.
6636  */
6637 static void
6638 hn_synth_detach(struct hn_softc *sc)
6639 {
6640 
6641 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6642 	    ("synthetic parts were not attached"));
6643 
6644 	/* Detach the RNDIS first. */
6645 	hn_rndis_detach(sc);
6646 
6647 	/* Detach NVS. */
6648 	hn_nvs_detach(sc);
6649 
6650 	/* Detach all of the channels. */
6651 	hn_detach_allchans(sc);
6652 
6653 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6654 		/*
6655 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6656 		 */
6657 		int error;
6658 
6659 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6660 		    sc->hn_rxbuf_gpadl);
6661 		if (error) {
6662 			if_printf(sc->hn_ifp,
6663 			    "rxbuf gpadl disconn failed: %d\n", error);
6664 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6665 		}
6666 		sc->hn_rxbuf_gpadl = 0;
6667 	}
6668 
6669 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6670 		/*
6671 		 * Host is post-Win2016, disconnect chimney sending buffer from
6672 		 * primary channel here.
6673 		 */
6674 		int error;
6675 
6676 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6677 		    sc->hn_chim_gpadl);
6678 		if (error) {
6679 			if_printf(sc->hn_ifp,
6680 			    "chim gpadl disconn failed: %d\n", error);
6681 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6682 		}
6683 		sc->hn_chim_gpadl = 0;
6684 	}
6685 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6686 }
6687 
6688 static void
6689 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6690 {
6691 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6692 	    ("invalid ring count %d", ring_cnt));
6693 
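	/* The # of TX rings in use never exceeds the # of RX rings in use. */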
6694 	if (sc->hn_tx_ring_cnt > ring_cnt)
6695 		sc->hn_tx_ring_inuse = ring_cnt;
6696 	else
6697 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6698 	sc->hn_rx_ring_inuse = ring_cnt;
6699 
6700 #ifdef RSS
6701 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6702 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6703 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6704 		    rss_getnumbuckets());
6705 	}
6706 #endif
6707 
6708 	if (bootverbose) {
6709 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6710 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6711 	}
6712 }
6713 
6714 static void
6715 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6716 {
6717 
6718 	/*
6719 	 * NOTE:
6720 	 * The TX bufring will not be drained by the hypervisor
6721 	 * if the primary channel is revoked.
6722 	 */
6723 	while (!vmbus_chan_rx_empty(chan) ||
6724 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6725 	     !vmbus_chan_tx_empty(chan)))
6726 		pause("waitch", 1);
6727 	vmbus_chan_intr_drain(chan);
6728 }
6729 
6730 static void
6731 hn_disable_rx(struct hn_softc *sc)
6732 {
6733 
6734 	/*
6735 	 * Disable RX by clearing RX filter forcefully.
6736 	 */
6737 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6738 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6739 
6740 	/*
6741 	 * Give RNDIS enough time to flush all pending data packets.
6742 	 */
6743 	pause("waitrx", (200 * hz) / 1000);
6744 }
6745 
6746 /*
6747  * NOTE:
6748  * RX/TX _must_ have been suspended/disabled, before this function
6749  * is called.
6750  */
6751 static void
6752 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6753 {
6754 	struct vmbus_channel **subch = NULL;
6755 	int nsubch;
6756 
6757 	/*
6758 	 * Drain RX/TX bufrings and interrupts.
6759 	 */
6760 	nsubch = nchan - 1;
6761 	if (nsubch > 0)
6762 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6763 
6764 	if (subch != NULL) {
6765 		int i;
6766 
6767 		for (i = 0; i < nsubch; ++i)
6768 			hn_chan_drain(sc, subch[i]);
6769 	}
6770 	hn_chan_drain(sc, sc->hn_prichan);
6771 
6772 	if (subch != NULL)
6773 		vmbus_subchan_rel(subch, nsubch);
6774 }
6775 
6776 static void
6777 hn_suspend_data(struct hn_softc *sc)
6778 {
6779 	struct hn_tx_ring *txr;
6780 	int i;
6781 
6782 	HN_LOCK_ASSERT(sc);
6783 
6784 	/*
6785 	 * Suspend TX.
6786 	 */
6787 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6788 		txr = &sc->hn_tx_ring[i];
6789 
6790 		mtx_lock(&txr->hn_tx_lock);
6791 		txr->hn_suspended = 1;
6792 		mtx_unlock(&txr->hn_tx_lock);
6793 		/* No one is able to send more packets now. */
6794 
6795 		/*
6796 		 * Wait for all pending sends to finish.
6797 		 *
6798 		 * NOTE:
6799 		 * We will _not_ receive all pending send-done if the
6800 		 * primary channel is revoked.
6801 		 */
6802 		while (hn_tx_ring_pending(txr) &&
6803 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6804 			pause("hnwtx", 1 /* 1 tick */);
6805 	}
6806 
6807 	/*
6808 	 * Disable RX.
6809 	 */
6810 	hn_disable_rx(sc);
6811 
6812 	/*
6813 	 * Drain RX/TX.
6814 	 */
6815 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6816 
6817 	/*
6818 	 * Drain any pending TX tasks.
6819 	 *
6820 	 * NOTE:
6821 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6822 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6823 	 */
6824 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6825 		txr = &sc->hn_tx_ring[i];
6826 
6827 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6828 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6829 	}
6830 }
6831 
6832 static void
6833 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6834 {
6835 
6836 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6837 }
6838 
6839 static void
6840 hn_suspend_mgmt(struct hn_softc *sc)
6841 {
6842 	struct task task;
6843 
6844 	HN_LOCK_ASSERT(sc);
6845 
6846 	/*
6847 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6848 	 * through hn_mgmt_taskq.
6849 	 */
6850 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6851 	vmbus_chan_run_task(sc->hn_prichan, &task);
6852 
6853 	/*
6854 	 * Make sure that all pending management tasks are completed.
6855 	 */
6856 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6857 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6858 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6859 }
6860 
6861 static void
6862 hn_suspend(struct hn_softc *sc)
6863 {
6864 
6865 	/* Disable polling. */
6866 	hn_polling(sc, 0);
6867 
6868 	/*
6869 	 * If the non-transparent mode VF is activated, the synthetic
6870 	 * device is receiving packets, so the data path of the
6871 	 * synthetic device must be suspended.
6872 	 */
6873 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6874 	    (sc->hn_flags & HN_FLAG_RXVF))
6875 		hn_suspend_data(sc);
6876 	hn_suspend_mgmt(sc);
6877 }
6878 
6879 static void
6880 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6881 {
6882 	int i;
6883 
6884 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6885 	    ("invalid TX ring count %d", tx_ring_cnt));
6886 
6887 	for (i = 0; i < tx_ring_cnt; ++i) {
6888 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6889 
6890 		mtx_lock(&txr->hn_tx_lock);
6891 		txr->hn_suspended = 0;
6892 		mtx_unlock(&txr->hn_tx_lock);
6893 	}
6894 }
6895 
6896 static void
6897 hn_resume_data(struct hn_softc *sc)
6898 {
6899 	int i;
6900 
6901 	HN_LOCK_ASSERT(sc);
6902 
6903 	/*
6904 	 * Re-enable RX.
6905 	 */
6906 	hn_rxfilter_config(sc);
6907 
6908 	/*
6909 	 * Make sure to clear suspend status on "all" TX rings,
6910 	 * since hn_tx_ring_inuse can be changed after
6911 	 * hn_suspend_data().
6912 	 */
6913 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6914 
6915 #ifdef HN_IFSTART_SUPPORT
6916 	if (!hn_use_if_start)
6917 #endif
6918 	{
6919 		/*
6920 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6921 		 * reduced.
6922 		 */
6923 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6924 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6925 	}
6926 
6927 	/*
6928 	 * Kick start TX.
6929 	 */
6930 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6931 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6932 
6933 		/*
6934 		 * Use txeof task, so that any pending oactive can be
6935 		 * cleared properly.
6936 		 */
6937 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6938 	}
6939 }
6940 
6941 static void
6942 hn_resume_mgmt(struct hn_softc *sc)
6943 {
6944 
6945 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6946 
6947 	/*
6948 	 * Kick off network change detection, if it was pending.
6949 	 * If no network change was pending, start link status
6950 	 * checks, which is more lightweight than network change
6951 	 * detection.
6952 	 */
6953 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6954 		hn_change_network(sc);
6955 	else
6956 		hn_update_link_status(sc);
6957 }
6958 
6959 static void
6960 hn_resume(struct hn_softc *sc)
6961 {
6962 
6963 	/*
6964 	 * If the non-transparent mode VF is activated, the synthetic
6965 	 * device has to receive packets, so the data path of the
6966 	 * synthetic device must be resumed.
6967 	 */
6968 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6969 	    (sc->hn_flags & HN_FLAG_RXVF))
6970 		hn_resume_data(sc);
6971 
6972 	/*
6973 	 * Don't resume link status change if VF is attached/activated.
6974 	 * - In the non-transparent VF mode, the synthetic device marks
6975 	 *   link down until the VF is deactivated; i.e. VF is down.
6976 	 * - In transparent VF mode, VF's media status is used until
6977 	 *   the VF is detached.
6978 	 */
6979 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6980 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6981 		hn_resume_mgmt(sc);
6982 
6983 	/*
6984 	 * Re-enable polling if this interface is running and
6985 	 * the polling is requested.
6986 	 */
6987 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6988 		hn_polling(sc, sc->hn_pollhz);
6989 }
6990 
6991 static void
6992 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6993 {
6994 	const struct rndis_status_msg *msg;
6995 	int ofs;
6996 
6997 	if (dlen < sizeof(*msg)) {
6998 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6999 		return;
7000 	}
7001 	msg = data;
7002 
7003 	switch (msg->rm_status) {
7004 	case RNDIS_STATUS_MEDIA_CONNECT:
7005 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7006 		hn_update_link_status(sc);
7007 		break;
7008 
7009 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7010 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7011 		/* Not really useful; ignore. */
7012 		break;
7013 
7014 	case RNDIS_STATUS_NETWORK_CHANGE:
7015 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7016 		if (dlen < ofs + msg->rm_stbuflen ||
7017 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7018 			if_printf(sc->hn_ifp, "network changed\n");
7019 		} else {
7020 			uint32_t change;
7021 
7022 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7023 			    sizeof(change));
7024 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7025 			    change);
7026 		}
7027 		hn_change_network(sc);
7028 		break;
7029 
7030 	default:
7031 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7032 		    msg->rm_status);
7033 		break;
7034 	}
7035 }
7036 
7037 static int
7038 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7039 {
7040 	const struct rndis_pktinfo *pi = info_data;
7041 	uint32_t mask = 0;
7042 
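	/*
	 * Walk the per-packet-info list; each element occupies rm_size
	 * bytes and its payload starts at rm_pktinfooffset within the
	 * element.
	 */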
7043 	while (info_dlen != 0) {
7044 		const void *data;
7045 		uint32_t dlen;
7046 
7047 		if (__predict_false(info_dlen < sizeof(*pi)))
7048 			return (EINVAL);
7049 		if (__predict_false(info_dlen < pi->rm_size))
7050 			return (EINVAL);
7051 		info_dlen -= pi->rm_size;
7052 
7053 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7054 			return (EINVAL);
7055 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7056 			return (EINVAL);
7057 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7058 		data = pi->rm_data;
7059 
7060 		switch (pi->rm_type) {
7061 		case NDIS_PKTINFO_TYPE_VLAN:
7062 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7063 				return (EINVAL);
7064 			info->vlan_info = *((const uint32_t *)data);
7065 			mask |= HN_RXINFO_VLAN;
7066 			break;
7067 
7068 		case NDIS_PKTINFO_TYPE_CSUM:
7069 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7070 				return (EINVAL);
7071 			info->csum_info = *((const uint32_t *)data);
7072 			mask |= HN_RXINFO_CSUM;
7073 			break;
7074 
7075 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7076 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7077 				return (EINVAL);
7078 			info->hash_value = *((const uint32_t *)data);
7079 			mask |= HN_RXINFO_HASHVAL;
7080 			break;
7081 
7082 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7083 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7084 				return (EINVAL);
7085 			info->hash_info = *((const uint32_t *)data);
7086 			mask |= HN_RXINFO_HASHINF;
7087 			break;
7088 
7089 		default:
7090 			goto next;
7091 		}
7092 
7093 		if (mask == HN_RXINFO_ALL) {
7094 			/* All found; done */
7095 			break;
7096 		}
7097 next:
7098 		pi = (const struct rndis_pktinfo *)
7099 		    ((const uint8_t *)pi + pi->rm_size);
7100 	}
7101 
7102 	/*
7103 	 * Final fixup.
7104 	 * - If there is no hash value, invalidate the hash info.
7105 	 */
7106 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7107 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7108 	return (0);
7109 }
7110 
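/*
 * Return true if the byte ranges [off, off + len) and
 * [check_off, check_off + check_len) overlap.
 */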
7111 static __inline bool
7112 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7113 {
7114 
7115 	if (off < check_off) {
7116 		if (__predict_true(off + len <= check_off))
7117 			return (false);
7118 	} else if (off > check_off) {
7119 		if (__predict_true(check_off + check_len <= off))
7120 			return (false);
7121 	}
7122 	return (true);
7123 }
7124 
7125 static void
7126 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7127 {
7128 	const struct rndis_packet_msg *pkt;
7129 	struct hn_rxinfo info;
7130 	int data_off, pktinfo_off, data_len, pktinfo_len;
7131 
7132 	/*
7133 	 * Check length.
7134 	 */
7135 	if (__predict_false(dlen < sizeof(*pkt))) {
7136 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7137 		return;
7138 	}
7139 	pkt = data;
7140 
7141 	if (__predict_false(dlen < pkt->rm_len)) {
7142 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7143 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7144 		return;
7145 	}
7146 	if (__predict_false(pkt->rm_len <
7147 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7148 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7149 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7150 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7151 		    pkt->rm_pktinfolen);
7152 		return;
7153 	}
7154 	if (__predict_false(pkt->rm_datalen == 0)) {
7155 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7156 		return;
7157 	}
7158 
7159 	/*
7160 	 * Check offsets.
7161 	 */
7162 #define IS_OFFSET_INVALID(ofs)			\
7163 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7164 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7165 
7166 	/* XXX Hyper-V does not meet data offset alignment requirement */
7167 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7168 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7169 		    "data offset %u\n", pkt->rm_dataoffset);
7170 		return;
7171 	}
7172 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7173 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7174 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7175 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7176 		return;
7177 	}
7178 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7179 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7180 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7181 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7182 		return;
7183 	}
7184 
7185 #undef IS_OFFSET_INVALID
7186 
7187 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7188 	data_len = pkt->rm_datalen;
7189 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7190 	pktinfo_len = pkt->rm_pktinfolen;
7191 
7192 	/*
7193 	 * Check OOB coverage.
7194 	 */
7195 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7196 		int oob_off, oob_len;
7197 
7198 		if_printf(rxr->hn_ifp, "got oobdata\n");
7199 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7200 		oob_len = pkt->rm_oobdatalen;
7201 
7202 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7203 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7204 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7205 			    pkt->rm_len, oob_off, oob_len);
7206 			return;
7207 		}
7208 
7209 		/*
7210 		 * Check against data.
7211 		 */
7212 		if (hn_rndis_check_overlap(oob_off, oob_len,
7213 		    data_off, data_len)) {
7214 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7215 			    "oob overlaps data, oob abs %d len %d, "
7216 			    "data abs %d len %d\n",
7217 			    oob_off, oob_len, data_off, data_len);
7218 			return;
7219 		}
7220 
7221 		/*
7222 		 * Check against pktinfo.
7223 		 */
7224 		if (pktinfo_len != 0 &&
7225 		    hn_rndis_check_overlap(oob_off, oob_len,
7226 		    pktinfo_off, pktinfo_len)) {
7227 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7228 			    "oob overlaps pktinfo, oob abs %d len %d, "
7229 			    "pktinfo abs %d len %d\n",
7230 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7231 			return;
7232 		}
7233 	}
7234 
7235 	/*
7236 	 * Check per-packet-info coverage and find useful per-packet-info.
7237 	 */
7238 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7239 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7240 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7241 	if (__predict_true(pktinfo_len != 0)) {
7242 		bool overlap;
7243 		int error;
7244 
7245 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7246 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7247 			    "pktinfo overflow, msglen %u, "
7248 			    "pktinfo abs %d len %d\n",
7249 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7250 			return;
7251 		}
7252 
7253 		/*
7254 		 * Check packet info coverage.
7255 		 */
7256 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7257 		    data_off, data_len);
7258 		if (__predict_false(overlap)) {
7259 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7260 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7261 			    "data abs %d len %d\n",
7262 			    pktinfo_off, pktinfo_len, data_off, data_len);
7263 			return;
7264 		}
7265 
7266 		/*
7267 		 * Find useful per-packet-info.
7268 		 */
7269 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7270 		    pktinfo_len, &info);
7271 		if (__predict_false(error)) {
7272 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7273 			    "pktinfo\n");
7274 			return;
7275 		}
7276 	}
7277 
7278 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7279 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7280 		    "data overflow, msglen %u, data abs %d len %d\n",
7281 		    pkt->rm_len, data_off, data_len);
7282 		return;
7283 	}
7284 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7285 }
7286 
7287 static __inline void
7288 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7289 {
7290 	const struct rndis_msghdr *hdr;
7291 
7292 	if (__predict_false(dlen < sizeof(*hdr))) {
7293 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7294 		return;
7295 	}
7296 	hdr = data;
7297 
7298 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7299 		/* Hot data path. */
7300 		hn_rndis_rx_data(rxr, data, dlen);
7301 		/* Done! */
7302 		return;
7303 	}
7304 
7305 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7306 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7307 	else
7308 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7309 }
7310 
7311 static void
7312 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7313 {
7314 	const struct hn_nvs_hdr *hdr;
7315 
7316 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7317 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7318 		return;
7319 	}
7320 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7321 
7322 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7323 		/* Useless; ignore */
7324 		return;
7325 	}
7326 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7327 }
7328 
7329 static void
7330 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7331     const struct vmbus_chanpkt_hdr *pkt)
7332 {
7333 	struct hn_nvs_sendctx *sndc;
7334 
7335 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7336 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7337 	    VMBUS_CHANPKT_DATALEN(pkt));
7338 	/*
7339 	 * NOTE:
7340 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7341 	 * its callback.
7342 	 */
7343 }
7344 
7345 static void
7346 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7347     const struct vmbus_chanpkt_hdr *pkthdr)
7348 {
7349 	const struct vmbus_chanpkt_rxbuf *pkt;
7350 	const struct hn_nvs_hdr *nvs_hdr;
7351 	int count, i, hlen;
7352 
7353 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7354 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7355 		return;
7356 	}
7357 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7358 
7359 	/* Make sure that this is a RNDIS message. */
7360 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7361 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7362 		    nvs_hdr->nvs_type);
7363 		return;
7364 	}
7365 
7366 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7367 	if (__predict_false(hlen < sizeof(*pkt))) {
7368 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7369 		return;
7370 	}
7371 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7372 
7373 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7374 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7375 		    pkt->cp_rxbuf_id);
7376 		return;
7377 	}
7378 
7379 	count = pkt->cp_rxbuf_cnt;
7380 	if (__predict_false(hlen <
7381 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7382 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7383 		return;
7384 	}
7385 
7386 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7387 	for (i = 0; i < count; ++i) {
7388 		int ofs, len;
7389 
7390 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7391 		len = pkt->cp_rxbuf[i].rb_len;
7392 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7393 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7394 			    "ofs %d, len %d\n", i, ofs, len);
7395 			continue;
7396 		}
7397 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7398 	}
7399 
7400 	/*
7401 	 * Ack the consumed RXBUF associated w/ this channel packet,
7402 	 * so that this RXBUF can be recycled by the hypervisor.
7403 	 */
7404 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7405 }
7406 
7407 static void
7408 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7409     uint64_t tid)
7410 {
7411 	struct hn_nvs_rndis_ack ack;
7412 	int retries, error;
7413 
7414 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7415 	ack.nvs_status = HN_NVS_STATUS_OK;
7416 
7417 	retries = 0;
7418 again:
7419 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7420 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7421 	if (__predict_false(error == EAGAIN)) {
7422 		/*
7423 		 * NOTE:
7424 		 * This should _not_ happen in practice, since the
7425 		 * consumption of the TX bufring from the TX path is
7426 		 * controlled.
7427 		 */
7428 		if (rxr->hn_ack_failed == 0)
7429 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7430 		rxr->hn_ack_failed++;
7431 		retries++;
7432 		if (retries < 10) {
7433 			DELAY(100);
7434 			goto again;
7435 		}
7436 		/* RXBUF leaks! */
7437 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7438 	}
7439 }
7440 
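/*
 * Per-channel receive callback.  Drain all pending channel packets,
 * growing the per-ring packet buffer on ENOBUFS, and dispatch each
 * packet by type: send completion, rxbuf data, or inband notification.
 * Pending RX/TX work is rolled up after the channel is drained.
 */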
7441 static void
7442 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7443 {
7444 	struct hn_rx_ring *rxr = xrxr;
7445 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7446 
7447 	for (;;) {
7448 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7449 		int error, pktlen;
7450 
7451 		pktlen = rxr->hn_pktbuf_len;
7452 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7453 		if (__predict_false(error == ENOBUFS)) {
7454 			void *nbuf;
7455 			int nlen;
7456 
7457 			/*
7458 			 * Expand channel packet buffer.
7459 			 *
7460 			 * XXX
7461 			 * Use M_WAITOK here, since allocation failure
7462 			 * is fatal.
7463 			 */
7464 			nlen = rxr->hn_pktbuf_len * 2;
7465 			while (nlen < pktlen)
7466 				nlen *= 2;
7467 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7468 
7469 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7470 			    rxr->hn_pktbuf_len, nlen);
7471 
7472 			free(rxr->hn_pktbuf, M_DEVBUF);
7473 			rxr->hn_pktbuf = nbuf;
7474 			rxr->hn_pktbuf_len = nlen;
7475 			/* Retry! */
7476 			continue;
7477 		} else if (__predict_false(error == EAGAIN)) {
7478 			/* No more channel packets; done! */
7479 			break;
7480 		}
7481 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7482 
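		/* Dispatch on the channel packet type. */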
7483 		switch (pkt->cph_type) {
7484 		case VMBUS_CHANPKT_TYPE_COMP:
7485 			hn_nvs_handle_comp(sc, chan, pkt);
7486 			break;
7487 
7488 		case VMBUS_CHANPKT_TYPE_RXBUF:
7489 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7490 			break;
7491 
7492 		case VMBUS_CHANPKT_TYPE_INBAND:
7493 			hn_nvs_handle_notify(sc, pkt);
7494 			break;
7495 
7496 		default:
7497 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7498 			    pkt->cph_type);
7499 			break;
7500 		}
7501 	}
7502 	hn_chan_rollup(rxr, rxr->hn_txr);
7503 }
7504 
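/*
 * Driver-wide initialization, run once at SYSINIT time: allocate the
 * hn_udpcs_fixup counter, sanitize the loader tunables, set up the
 * transparent VF map, and create the shared TX taskqueues when the
 * global TX taskqueue mode is selected.
 */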
7505 static void
7506 hn_sysinit(void *arg __unused)
7507 {
7508 	int i;
7509 
7510 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7511 
7512 #ifdef HN_IFSTART_SUPPORT
7513 	/*
7514 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7515 	 * mainly due to the IFF_DRV_OACTIVE flag.
7516 	 */
7517 	if (hn_xpnt_vf && hn_use_if_start) {
7518 		hn_use_if_start = 0;
7519 		printf("hn: transparent VF mode, if_transmit will be used, "
7520 		    "instead of if_start\n");
7521 	}
7522 #endif
7523 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7524 		printf("hn: invalid transparent VF attach routing "
7525 		    "wait timeout %d, reset to %d\n",
7526 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7527 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7528 	}
7529 
7530 	/*
7531 	 * Initialize VF map.
7532 	 */
7533 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7534 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7535 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7536 	    M_WAITOK | M_ZERO);
7537 
7538 	/*
7539 	 * Clamp the number of TX taskqueues to the [1, mp_ncpus] range.
7540 	 */
7541 	if (hn_tx_taskq_cnt <= 0)
7542 		hn_tx_taskq_cnt = 1;
7543 	else if (hn_tx_taskq_cnt > mp_ncpus)
7544 		hn_tx_taskq_cnt = mp_ncpus;
7545 
7546 	/*
7547 	 * Validate the TX taskqueue mode; default to independent taskqueues.
7548 	 */
7549 	switch (hn_tx_taskq_mode) {
7550 	case HN_TX_TASKQ_M_INDEP:
7551 	case HN_TX_TASKQ_M_GLOBAL:
7552 	case HN_TX_TASKQ_M_EVTTQ:
7553 		break;
7554 	default:
7555 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7556 		break;
7557 	}
7558 
7559 	if (vm_guest != VM_GUEST_HV)
7560 		return;
7561 
7562 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7563 		return;
7564 
7565 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7566 	    M_DEVBUF, M_WAITOK);
7567 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7568 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7569 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7570 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7571 		    "hn tx%d", i);
7572 	}
7573 }
7574 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7575 
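/*
 * Driver-wide teardown: release the global TX taskqueues, the VF map
 * and its lock, and the hn_udpcs_fixup counter allocated by
 * hn_sysinit().
 */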
7576 static void
7577 hn_sysuninit(void *arg __unused)
7578 {
7579 
7580 	if (hn_tx_taskque != NULL) {
7581 		int i;
7582 
7583 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7584 			taskqueue_free(hn_tx_taskque[i]);
7585 		free(hn_tx_taskque, M_DEVBUF);
7586 	}
7587 
7588 	if (hn_vfmap != NULL)
7589 		free(hn_vfmap, M_DEVBUF);
7590 	rm_destroy(&hn_vfmap_lock);
7591 
7592 	counter_u64_free(hn_udpcs_fixup);
7593 }
7594 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7595