xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 38a52bd3b5cac3da6f7f6eef3dd050e6aa08ebb3)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90 
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
/*
 * Build-time switch: when defined, the ifnet.if_start based transmit
 * path (hn_start() and its helpers) is compiled in, in addition to
 * the if_transmit path.
 */
#define HN_IFSTART_SUPPORT

/* NOTE(review): presumably the default cap on the # of rings; confirm at the use site. */
#define HN_RING_CNT_DEF_MAX		8

/* NOTE(review): presumably the initial # of slots in hn_vfmap; confirm at the use site. */
#define HN_VFMAP_SIZE_DEF		8

/* Default (and, by its name, minimum) of hw.hn.vf_xpnt_attwait. */
#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

/*
 * Size of an RNDIS packet message plus room for the per-packet-info
 * records for hash value, VLAN, LSO2 and TX checksum.
 */
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

/* Boundary, maximum size and segment limits for TX packet data. */
#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

/* Default of hw.hn.direct_tx_size. */
#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

/* Default of hw.hn.lro_entry_count. */
#define HN_LROENT_CNT_DEF		128

/* LRO aggregation length limits, expressed in multiples of the MTU. */
#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
164 
/*
 * Wrappers around the softc's hn_lock sx lock.  The lock is named after
 * the device's name+unit, and HN_LOCK_ASSERT() checks that it is held
 * exclusively.
 */
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
/*
 * Acquire hn_lock exclusively by polling sx_try_xlock() instead of
 * blocking in the lock: after each failed attempt the current thread
 * yields the cpu and then waits in DELAY(1000) before retrying.
 */
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
		/* Relinquish cpu to avoid deadlock */	\
		sched_relinquish(curthread);		\
		DELAY(1000);				\
	}						\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
/*
 * Checksum offload flag groups for IPv4 and IPv6, and the subset of
 * each that is set in the first TX ring's hn_csum_assist.
 */
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

/*
 * Packet sizes including the RNDIS packet message, rounded up to
 * `align' with roundup2():
 * - HN_PKTSIZE_MIN: for a minimum-length Ethernet frame with a VLAN
 *   tag and without the CRC.
 * - HN_PKTSIZE: for the mbuf packet `m'.
 */
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

/*
 * Map a ring index to a CPU.  With the RSS kernel option the index is
 * reduced modulo the # of RSS buckets and looked up with rss_getcpu();
 * otherwise it is offset by the softc's hn_cpu and wrapped at mp_ncpus.
 */
#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
197 
/*
 * TX descriptor: the per-send state kept for a packet handed to a TX
 * ring (see the txr back pointer).
 *
 * The free-list linkage `link' only exists when HN_USE_TXDESC_BUFRING
 * is not defined.
 */
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	/* Linkage on another txdesc's agg_list. */
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;	/* NOTE(review): presumably a reference count; confirm */
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index and size.  The index is
	 * HN_NVS_CHIM_IDX_INVALID and the size 0 when the packet is
	 * sent through an SG list instead (see hn_txpkt_sglist() and
	 * hn_txpkt_chim()).
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

/* Bits for hn_txdesc.flags. */
#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
226 
/*
 * NOTE(review): these look like the bit values carried in
 * packet_info_id.flag; confirm against the RX path.
 */
#define	HN_NDIS_PKTINFO_SUBALLOC	0x01
#define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
#define	HN_NDIS_PKTINFO_LAST_FRAG	0x04

/* Packet id per-packet-info record: 4 bytes (ver, flag, pkt_id). */
struct packet_info_id {
	uint8_t				ver;
	uint8_t				flag;
	uint16_t			pkt_id;
};

#define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
/*
 * Read-only pointers to the pieces of RX per-packet information.
 * NOTE(review): presumably filled in by hn_rndis_rxinfo(), which takes
 * a pointer to this structure; confirm there.
 */
struct hn_rxinfo {
	const uint32_t			*vlan_info;
	const uint32_t			*csum_info;
	const uint32_t			*hash_info;
	const uint32_t			*hash_value;
	const struct packet_info_id	*pktinfo_id;
};
247 
/*
 * Pairs an RX ring with a VF ifnet.
 * NOTE(review): presumably the argument handed to hn_rxvf_set_task();
 * confirm at the use site.
 */
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};
252 
/*
 * Bit flags, one per kind of RX per-packet information; HN_RXINFO_ALL
 * is the union of all of them.
 * NOTE(review): presumably each bit marks the matching hn_rxinfo field
 * as found; confirm in hn_rndis_rxinfo().
 */
#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_PKTINFO_ID		0x0010
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL |		\
	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(struct ifnet *);
276 #endif
277 static int			hn_transmit(struct ifnet *, struct mbuf *);
278 static void			hn_xmit_qflush(struct ifnet *);
279 static int			hn_ifmedia_upd(struct ifnet *);
280 static void			hn_ifmedia_sts(struct ifnet *,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, struct ifnet *, int);
284 static void			hn_ifaddr_event(void *, struct ifnet *);
285 static void			hn_ifnet_attevent(void *, struct ifnet *);
286 static void			hn_ifnet_detevent(void *, struct ifnet *);
287 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const struct ifnet *);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    struct ifnet *, bool);
293 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 #if __FreeBSD_version >= 1100099
329 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 #if __FreeBSD_version < 1100095
335 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
336 #else
337 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
338 #endif
339 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
346 #ifndef RSS
347 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
349 #endif
350 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
360 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
361 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
362 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
363 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
364 
365 static void			hn_stop(struct hn_softc *, bool);
366 static void			hn_init_locked(struct hn_softc *);
367 static int			hn_chan_attach(struct hn_softc *,
368 				    struct vmbus_channel *);
369 static void			hn_chan_detach(struct hn_softc *,
370 				    struct vmbus_channel *);
371 static int			hn_attach_subchans(struct hn_softc *);
372 static void			hn_detach_allchans(struct hn_softc *);
373 static void			hn_chan_rollup(struct hn_rx_ring *,
374 				    struct hn_tx_ring *);
375 static void			hn_set_ring_inuse(struct hn_softc *, int);
376 static int			hn_synth_attach(struct hn_softc *, int);
377 static void			hn_synth_detach(struct hn_softc *);
378 static int			hn_synth_alloc_subchans(struct hn_softc *,
379 				    int *);
380 static bool			hn_synth_attachable(const struct hn_softc *);
381 static void			hn_suspend(struct hn_softc *);
382 static void			hn_suspend_data(struct hn_softc *);
383 static void			hn_suspend_mgmt(struct hn_softc *);
384 static void			hn_resume(struct hn_softc *);
385 static void			hn_resume_data(struct hn_softc *);
386 static void			hn_resume_mgmt(struct hn_softc *);
387 static void			hn_suspend_mgmt_taskfunc(void *, int);
388 static void			hn_chan_drain(struct hn_softc *,
389 				    struct vmbus_channel *);
390 static void			hn_disable_rx(struct hn_softc *);
391 static void			hn_drain_rxtx(struct hn_softc *, int);
392 static void			hn_polling(struct hn_softc *, u_int);
393 static void			hn_chan_polling(struct vmbus_channel *, u_int);
394 static void			hn_mtu_change_fixup(struct hn_softc *);
395 
396 static void			hn_update_link_status(struct hn_softc *);
397 static void			hn_change_network(struct hn_softc *);
398 static void			hn_link_taskfunc(void *, int);
399 static void			hn_netchg_init_taskfunc(void *, int);
400 static void			hn_netchg_status_taskfunc(void *, int);
401 static void			hn_link_status(struct hn_softc *);
402 
403 static int			hn_create_rx_data(struct hn_softc *, int);
404 static void			hn_destroy_rx_data(struct hn_softc *);
405 static int			hn_check_iplen(const struct mbuf *, int);
406 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
407 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
408 static int			hn_rxfilter_config(struct hn_softc *);
409 static int			hn_rss_reconfig(struct hn_softc *);
410 static void			hn_rss_ind_fixup(struct hn_softc *);
411 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
412 static int			hn_rxpkt(struct hn_rx_ring *);
413 static uint32_t			hn_rss_type_fromndis(uint32_t);
414 static uint32_t			hn_rss_type_tondis(uint32_t);
415 
416 static int			hn_tx_ring_create(struct hn_softc *, int);
417 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
418 static int			hn_create_tx_data(struct hn_softc *, int);
419 static void			hn_fixup_tx_data(struct hn_softc *);
420 static void			hn_fixup_rx_data(struct hn_softc *);
421 static void			hn_destroy_tx_data(struct hn_softc *);
422 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
423 static void			hn_txdesc_gc(struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
426 				    struct hn_txdesc *, struct mbuf **);
427 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static void			hn_set_chim_size(struct hn_softc *, int);
430 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
431 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
432 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
433 static void			hn_resume_tx(struct hn_softc *, int);
434 static void			hn_set_txagg(struct hn_softc *);
435 static void			*hn_try_txagg(struct ifnet *,
436 				    struct hn_tx_ring *, struct hn_txdesc *,
437 				    int);
438 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
439 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
440 				    struct hn_softc *, struct vmbus_channel *,
441 				    const void *, int);
442 static int			hn_txpkt_sglist(struct hn_tx_ring *,
443 				    struct hn_txdesc *);
444 static int			hn_txpkt_chim(struct hn_tx_ring *,
445 				    struct hn_txdesc *);
446 static int			hn_xmit(struct hn_tx_ring *, int);
447 static void			hn_xmit_taskfunc(void *, int);
448 static void			hn_xmit_txeof(struct hn_tx_ring *);
449 static void			hn_xmit_txeof_taskfunc(void *, int);
450 #ifdef HN_IFSTART_SUPPORT
451 static int			hn_start_locked(struct hn_tx_ring *, int);
452 static void			hn_start_taskfunc(void *, int);
453 static void			hn_start_txeof(struct hn_tx_ring *);
454 static void			hn_start_txeof_taskfunc(void *, int);
455 #endif
456 
457 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
458 
/* Root of the hw.hn sysctl tree; the global knobs below hang off it. */
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on host side (default: on). */
static int			hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side (default: on). */
static int			hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side (default: on). */
static int			hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");
482 
/*
 * Offload UDP/IPv4 checksum (default: on).
 */
static int			hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum (default: on).
 */
static int			hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/*
 * Stats: # of UDP datagrams whose checksum was computed in software
 * by hn_set_hlen() instead of being offloaded.
 */
static counter_u64_t		hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int			hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
511 
/* Limit TSO burst size; defaults to IP_MAXPACKET. */
static int			hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/*
 * Limit chimney send size.
 * NOTE(review): the default 0 presumably means "no explicit limit";
 * confirm where the value is consumed.
 */
static int			hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/*
 * # of LRO entries per RX ring.  Only present when INET or INET6 is
 * configured and __FreeBSD_version >= 1100095.
 */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif
535 
/* # of TX taskqueues (default: 1). */
static int			hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

/* Values of hw.hn.tx_taskq_mode; see the sysctl description below. */
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

/* TX taskqueue mode; defaults to independent taskqueues. */
static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

/*
 * Read-only report of whether the driver was built with
 * HN_USE_TXDESC_BUFRING; the value is fixed at compile time.
 */
#ifndef HN_USE_TXDESC_BUFRING
static int			hn_use_txdesc_bufring = 0;
#else
static int			hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit (default: off) */
static int			hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
563 
/* # of channels to use */
static int			hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int			hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int			hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int			hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/*
 * Packet transmission aggregation size limit.
 * NOTE(review): the default -1 presumably selects an automatic value;
 * confirm in hn_set_txagg().
 */
static int			hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/*
 * Packet transmission aggregation count limit.
 * NOTE(review): the default -1 presumably selects an automatic value;
 * confirm in hn_set_txagg().
 */
static int			hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
596 
/* VF list: read-only string produced by hn_vflist_sysctl(). */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping: read-only string produced by hn_vfmap_sysctl(). */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");
608 
609 /* Transparent VF */
610 static int			hn_xpnt_vf = 1;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
612     &hn_xpnt_vf, 0, "Transparent VF mod");
613 
/* Accurate BPF support for Transparent VF (default: off) */
static int			hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/*
 * Extra wait for transparent VF attach routing; unit seconds.
 * Defaults to HN_XPNT_VF_ATTWAIT_MIN and, unlike most knobs here, is
 * registered read-write (CTLFLAG_RWTUN).
 */
static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routing; unit: seconds");
624 
/* Driver-wide state shared by all hn(4) instances. */
static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */

/*
 * VF map: an array of ifnet pointers with its size, and an rmlock.
 * NOTE(review): presumably hn_vfmap_lock protects hn_vfmap and
 * hn_vfmap_size; confirm at the use sites.
 */
static struct rmlock		hn_vfmap_lock;
static int			hn_vfmap_size;
static struct ifnet		**hn_vfmap;
631 
#ifndef RSS
/*
 * Built-in hash key of NDIS_HASH_KEYSIZE_TOEPLITZ bytes.  Only
 * compiled when the kernel RSS option is not configured.
 * NOTE(review): presumably used as the default RSS key in that case;
 * confirm at the use site.
 */
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */
642 
/*
 * 16-byte Hyper-V GUID for this driver.
 * NOTE(review): presumably the device type GUID matched in hn_probe();
 * confirm there.
 */
static const struct hyperv_guid	hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
648 
/* Device methods implemented by this driver. */
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

/* Driver "hn"; the per-device softc is a struct hn_softc. */
static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

/* Register hn on the vmbus bus and declare the module dependency on it. */
DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
667 
#if __FreeBSD_version >= 1100099
/*
 * Store `lenlim' as the LRO length limit (lro_length_lim) of every RX
 * ring of `sc'.
 */
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int ring = 0;

	while (ring < sc->hn_rx_ring_cnt) {
		sc->hn_rx_ring[ring].hn_lro.lro_length_lim = lenlim;
		ring++;
	}
}
#endif
678 
679 static int
680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 
683 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
684 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
685 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
686 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
687 }
688 
689 static int
690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
691 {
692 	struct hn_nvs_rndis rndis;
693 
694 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
695 	    txd->chim_size > 0, ("invalid rndis chim txd"));
696 
697 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
698 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
699 	rndis.nvs_chim_idx = txd->chim_index;
700 	rndis.nvs_chim_sz = txd->chim_size;
701 
702 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
703 	    &rndis, sizeof(rndis), &txd->send_ctx));
704 }
705 
706 static __inline uint32_t
707 hn_chim_alloc(struct hn_softc *sc)
708 {
709 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
710 	u_long *bmap = sc->hn_chim_bmap;
711 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
712 
713 	for (i = 0; i < bmap_cnt; ++i) {
714 		int idx;
715 
716 		idx = ffsl(~bmap[i]);
717 		if (idx == 0)
718 			continue;
719 
720 		--idx; /* ffsl is 1-based */
721 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
722 		    ("invalid i %d and idx %d", i, idx));
723 
724 		if (atomic_testandset_long(&bmap[i], idx))
725 			continue;
726 
727 		ret = i * LONG_BIT + idx;
728 		break;
729 	}
730 	return (ret);
731 }
732 
733 static __inline void
734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
735 {
736 	u_long mask;
737 	uint32_t idx;
738 
739 	idx = chim_idx / LONG_BIT;
740 	KASSERT(idx < sc->hn_chim_bmap_cnt,
741 	    ("invalid chimney index 0x%x", chim_idx));
742 
743 	mask = 1UL << (chim_idx % LONG_BIT);
744 	KASSERT(sc->hn_chim_bmap[idx] & mask,
745 	    ("index bitmap 0x%lx, chimney index %u, "
746 	     "bitmap idx %d, bitmask 0x%lx",
747 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
748 
749 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
750 }
751 
752 #if defined(INET6) || defined(INET)
753 
/*
 * Make sure the first `len' bytes of the mbuf chain `m' are contiguous
 * in its first mbuf, calling m_pullup() only when they are not.
 *
 * `m' is reassigned to the chain returned by m_pullup().  On failure
 * the macro makes the ENCLOSING function return NULL, so it may only
 * be used in functions returning a pointer.  Pointers into the packet
 * taken before this macro should be re-derived afterwards, since the
 * chain head may have changed.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
762 
763 /*
764  * NOTE: If this function failed, the m_head would be freed.
765  */
766 static __inline struct mbuf *
767 hn_tso_fixup(struct mbuf *m_head)
768 {
769 	struct ether_vlan_header *evl;
770 	struct tcphdr *th;
771 	int ehlen;
772 
773 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
774 
775 	PULLUP_HDR(m_head, sizeof(*evl));
776 	evl = mtod(m_head, struct ether_vlan_header *);
777 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
778 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
779 	else
780 		ehlen = ETHER_HDR_LEN;
781 	m_head->m_pkthdr.l2hlen = ehlen;
782 
783 #ifdef INET
784 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
785 		struct ip *ip;
786 		int iphlen;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
789 		ip = mtodo(m_head, ehlen);
790 		iphlen = ip->ip_hl << 2;
791 		m_head->m_pkthdr.l3hlen = iphlen;
792 
793 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
794 		th = mtodo(m_head, ehlen + iphlen);
795 
796 		ip->ip_len = 0;
797 		ip->ip_sum = 0;
798 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
799 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
800 	}
801 #endif
802 #if defined(INET6) && defined(INET)
803 	else
804 #endif
805 #ifdef INET6
806 	{
807 		struct ip6_hdr *ip6;
808 
809 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
810 		ip6 = mtodo(m_head, ehlen);
811 		if (ip6->ip6_nxt != IPPROTO_TCP) {
812 			m_freem(m_head);
813 			return (NULL);
814 		}
815 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
816 
817 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
818 		th = mtodo(m_head, ehlen + sizeof(*ip6));
819 
820 		ip6->ip6_plen = 0;
821 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
822 	}
823 #endif
824 	return (m_head);
825 }
826 
/*
 * Record the L2 and L3 header lengths of a checksum-offloaded packet
 * in its packet header (l2hlen / l3hlen).
 *
 * For IPv4 UDP datagrams that match the conditions described below,
 * the UDP checksum is computed in software here and the CSUM_IP_UDP
 * offload request is dropped.
 *
 * Returns the (possibly different) mbuf chain head, or NULL on
 * failure.
 *
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	/* A VLAN tag lengthens the Ethernet header. */
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions meet:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fallback to software checksum for these UDP datagrams.
		 *
		 * The threshold actually tested is hn_udpcs_fixup_mtu
		 * (hw.hn.udpcs_fixup_mtu), whose default is 1420.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			/* Offset of the UDP header within the packet. */
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			/*
			 * csum_data is the offset of the checksum field
			 * from the start of the UDP header; store the
			 * checksum computed from `off' to the packet end.
			 */
			*(uint16_t *)(m_head->m_data + off +
                            m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		/* Only TCP/UDP directly following the IPv6 header is handled. */
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}
895 
896 /*
897  * NOTE: If this function failed, the m_head would be freed.
898  */
899 static __inline struct mbuf *
900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
901 {
902 	const struct tcphdr *th;
903 	int ehlen, iphlen;
904 
905 	*tcpsyn = 0;
906 	ehlen = m_head->m_pkthdr.l2hlen;
907 	iphlen = m_head->m_pkthdr.l3hlen;
908 
909 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
910 	th = mtodo(m_head, ehlen + iphlen);
911 	if (th->th_flags & TH_SYN)
912 		*tcpsyn = 1;
913 	return (m_head);
914 }
915 
916 #undef PULLUP_HDR
917 
918 #endif	/* INET6 || INET */
919 
920 static int
921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
922 {
923 	int error = 0;
924 
925 	HN_LOCK_ASSERT(sc);
926 
927 	if (sc->hn_rx_filter != filter) {
928 		error = hn_rndis_set_rxfilter(sc, filter);
929 		if (!error)
930 			sc->hn_rx_filter = filter;
931 	}
932 	return (error);
933 }
934 
935 static int
936 hn_rxfilter_config(struct hn_softc *sc)
937 {
938 	struct ifnet *ifp = sc->hn_ifp;
939 	uint32_t filter;
940 
941 	HN_LOCK_ASSERT(sc);
942 
943 	/*
944 	 * If the non-transparent mode VF is activated, we don't know how
945 	 * its RX filter is configured, so stick the synthetic device in
946 	 * the promiscous mode.
947 	 */
948 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
949 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
950 	} else {
951 		filter = NDIS_PACKET_TYPE_DIRECTED;
952 		if (ifp->if_flags & IFF_BROADCAST)
953 			filter |= NDIS_PACKET_TYPE_BROADCAST;
954 		/* TODO: support multicast list */
955 		if ((ifp->if_flags & IFF_ALLMULTI) ||
956 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
957 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
958 	}
959 	return (hn_set_rxfilter(sc, filter));
960 }
961 
962 static void
963 hn_set_txagg(struct hn_softc *sc)
964 {
965 	uint32_t size, pkts;
966 	int i;
967 
968 	/*
969 	 * Setup aggregation size.
970 	 */
971 	if (sc->hn_agg_size < 0)
972 		size = UINT32_MAX;
973 	else
974 		size = sc->hn_agg_size;
975 
976 	if (sc->hn_rndis_agg_size < size)
977 		size = sc->hn_rndis_agg_size;
978 
979 	/* NOTE: We only aggregate packets using chimney sending buffers. */
980 	if (size > (uint32_t)sc->hn_chim_szmax)
981 		size = sc->hn_chim_szmax;
982 
983 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
984 		/* Disable */
985 		size = 0;
986 		pkts = 0;
987 		goto done;
988 	}
989 
990 	/* NOTE: Type of the per TX ring setting is 'int'. */
991 	if (size > INT_MAX)
992 		size = INT_MAX;
993 
994 	/*
995 	 * Setup aggregation packet count.
996 	 */
997 	if (sc->hn_agg_pkts < 0)
998 		pkts = UINT32_MAX;
999 	else
1000 		pkts = sc->hn_agg_pkts;
1001 
1002 	if (sc->hn_rndis_agg_pkts < pkts)
1003 		pkts = sc->hn_rndis_agg_pkts;
1004 
1005 	if (pkts <= 1) {
1006 		/* Disable */
1007 		size = 0;
1008 		pkts = 0;
1009 		goto done;
1010 	}
1011 
1012 	/* NOTE: Type of the per TX ring setting is 'short'. */
1013 	if (pkts > SHRT_MAX)
1014 		pkts = SHRT_MAX;
1015 
1016 done:
1017 	/* NOTE: Type of the per TX ring setting is 'short'. */
1018 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1019 		/* Disable */
1020 		size = 0;
1021 		pkts = 0;
1022 	}
1023 
1024 	if (bootverbose) {
1025 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1026 		    size, pkts, sc->hn_rndis_agg_align);
1027 	}
1028 
1029 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1030 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1031 
1032 		mtx_lock(&txr->hn_tx_lock);
1033 		txr->hn_agg_szmax = size;
1034 		txr->hn_agg_pktmax = pkts;
1035 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1036 		mtx_unlock(&txr->hn_tx_lock);
1037 	}
1038 }
1039 
1040 static int
1041 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1042 {
1043 
1044 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1045 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1046 		return txr->hn_txdesc_cnt;
1047 	return hn_tx_swq_depth;
1048 }
1049 
1050 static int
1051 hn_rss_reconfig(struct hn_softc *sc)
1052 {
1053 	int error;
1054 
1055 	HN_LOCK_ASSERT(sc);
1056 
1057 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1058 		return (ENXIO);
1059 
1060 	/*
1061 	 * Disable RSS first.
1062 	 *
1063 	 * NOTE:
1064 	 * Direct reconfiguration by setting the UNCHG flags does
1065 	 * _not_ work properly.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "disable RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1072 		return (error);
1073 	}
1074 
1075 	/*
1076 	 * Reenable the RSS w/ the updated RSS key or indirect
1077 	 * table.
1078 	 */
1079 	if (bootverbose)
1080 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1081 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1082 	if (error) {
1083 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1084 		return (error);
1085 	}
1086 	return (0);
1087 }
1088 
1089 static void
1090 hn_rss_ind_fixup(struct hn_softc *sc)
1091 {
1092 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1093 	int i, nchan;
1094 
1095 	nchan = sc->hn_rx_ring_inuse;
1096 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1097 
1098 	/*
1099 	 * Check indirect table to make sure that all channels in it
1100 	 * can be used.
1101 	 */
1102 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1103 		if (rss->rss_ind[i] >= nchan) {
1104 			if_printf(sc->hn_ifp,
1105 			    "RSS indirect table %d fixup: %u -> %d\n",
1106 			    i, rss->rss_ind[i], nchan - 1);
1107 			rss->rss_ind[i] = nchan - 1;
1108 		}
1109 	}
1110 }
1111 
1112 static int
1113 hn_ifmedia_upd(struct ifnet *ifp __unused)
1114 {
1115 
1116 	return EOPNOTSUPP;
1117 }
1118 
1119 static void
1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1121 {
1122 	struct hn_softc *sc = ifp->if_softc;
1123 
1124 	ifmr->ifm_status = IFM_AVALID;
1125 	ifmr->ifm_active = IFM_ETHER;
1126 
1127 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1128 		ifmr->ifm_active |= IFM_NONE;
1129 		return;
1130 	}
1131 	ifmr->ifm_status |= IFM_ACTIVE;
1132 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1133 }
1134 
1135 static void
1136 hn_rxvf_set_task(void *xarg, int pending __unused)
1137 {
1138 	struct hn_rxvf_setarg *arg = xarg;
1139 
1140 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1141 }
1142 
1143 static void
1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1145 {
1146 	struct hn_rx_ring *rxr;
1147 	struct hn_rxvf_setarg arg;
1148 	struct task task;
1149 	int i;
1150 
1151 	HN_LOCK_ASSERT(sc);
1152 
1153 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1154 
1155 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1156 		rxr = &sc->hn_rx_ring[i];
1157 
1158 		if (i < sc->hn_rx_ring_inuse) {
1159 			arg.rxr = rxr;
1160 			arg.vf_ifp = vf_ifp;
1161 			vmbus_chan_run_task(rxr->hn_chan, &task);
1162 		} else {
1163 			rxr->hn_rxvf_ifp = vf_ifp;
1164 		}
1165 	}
1166 }
1167 
1168 static bool
1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1170 {
1171 	const struct ifnet *hn_ifp;
1172 
1173 	hn_ifp = sc->hn_ifp;
1174 
1175 	if (ifp == hn_ifp)
1176 		return (false);
1177 
1178 	if (ifp->if_alloctype != IFT_ETHER)
1179 		return (false);
1180 
1181 	/* Ignore lagg/vlan interfaces */
1182 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1183 	    strcmp(ifp->if_dname, "vlan") == 0)
1184 		return (false);
1185 
1186 	/*
1187 	 * During detach events ifp->if_addr might be NULL.
1188 	 * Make sure the bcmp() below doesn't panic on that:
1189 	 */
1190 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1191 		return (false);
1192 
1193 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1194 		return (false);
1195 
1196 	return (true);
1197 }
1198 
1199 static void
1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1201 {
1202 	struct ifnet *hn_ifp;
1203 
1204 	HN_LOCK(sc);
1205 
1206 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1207 		goto out;
1208 
1209 	if (!hn_ismyvf(sc, ifp))
1210 		goto out;
1211 	hn_ifp = sc->hn_ifp;
1212 
1213 	if (rxvf) {
1214 		if (sc->hn_flags & HN_FLAG_RXVF)
1215 			goto out;
1216 
1217 		sc->hn_flags |= HN_FLAG_RXVF;
1218 		hn_rxfilter_config(sc);
1219 	} else {
1220 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1221 			goto out;
1222 
1223 		sc->hn_flags &= ~HN_FLAG_RXVF;
1224 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1225 			hn_rxfilter_config(sc);
1226 		else
1227 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1228 	}
1229 
1230 	hn_nvs_set_datapath(sc,
1231 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1232 
1233 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1234 
1235 	if (rxvf) {
1236 		hn_vf_rss_fixup(sc, true);
1237 		hn_suspend_mgmt(sc);
1238 		sc->hn_link_flags &=
1239 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1240 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1241 	} else {
1242 		hn_vf_rss_restore(sc);
1243 		hn_resume_mgmt(sc);
1244 	}
1245 
1246 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1247 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1248 
1249 	if (bootverbose) {
1250 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1251 		    rxvf ? "to" : "from", ifp->if_xname);
1252 	}
1253 out:
1254 	HN_UNLOCK(sc);
1255 }
1256 
1257 static void
1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1259 {
1260 
1261 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1262 		return;
1263 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1264 }
1265 
1266 static void
1267 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1268 {
1269 
1270 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1271 }
1272 
1273 static int
1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1275 {
1276 	struct ifnet *ifp, *vf_ifp;
1277 	uint64_t tmp;
1278 	int error;
1279 
1280 	HN_LOCK_ASSERT(sc);
1281 	ifp = sc->hn_ifp;
1282 	vf_ifp = sc->hn_vf_ifp;
1283 
1284 	/*
1285 	 * Fix up requested capabilities w/ supported capabilities,
1286 	 * since the supported capabilities could have been changed.
1287 	 */
1288 	ifr->ifr_reqcap &= ifp->if_capabilities;
1289 	/* Pass SIOCSIFCAP to VF. */
1290 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1291 
1292 	/*
1293 	 * NOTE:
1294 	 * The error will be propagated to the callers, however, it
1295 	 * is _not_ useful here.
1296 	 */
1297 
1298 	/*
1299 	 * Merge VF's enabled capabilities.
1300 	 */
1301 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1302 
1303 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1304 	if (ifp->if_capenable & IFCAP_TXCSUM)
1305 		ifp->if_hwassist |= tmp;
1306 	else
1307 		ifp->if_hwassist &= ~tmp;
1308 
1309 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1310 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1311 		ifp->if_hwassist |= tmp;
1312 	else
1313 		ifp->if_hwassist &= ~tmp;
1314 
1315 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1316 	if (ifp->if_capenable & IFCAP_TSO4)
1317 		ifp->if_hwassist |= tmp;
1318 	else
1319 		ifp->if_hwassist &= ~tmp;
1320 
1321 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1322 	if (ifp->if_capenable & IFCAP_TSO6)
1323 		ifp->if_hwassist |= tmp;
1324 	else
1325 		ifp->if_hwassist &= ~tmp;
1326 
1327 	return (error);
1328 }
1329 
1330 static int
1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1332 {
1333 	struct ifnet *vf_ifp;
1334 	struct ifreq ifr;
1335 
1336 	HN_LOCK_ASSERT(sc);
1337 	vf_ifp = sc->hn_vf_ifp;
1338 
1339 	memset(&ifr, 0, sizeof(ifr));
1340 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1341 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1342 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1343 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1344 }
1345 
1346 static void
1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1348 {
1349 	struct ifnet *ifp = sc->hn_ifp;
1350 	int allmulti = 0;
1351 
1352 	HN_LOCK_ASSERT(sc);
1353 
1354 	/* XXX vlan(4) style mcast addr maintenance */
1355 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1356 		allmulti = IFF_ALLMULTI;
1357 
1358 	/* Always set the VF's if_flags */
1359 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1360 }
1361 
1362 static void
1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1364 {
1365 	struct rm_priotracker pt;
1366 	struct ifnet *hn_ifp = NULL;
1367 	struct mbuf *mn;
1368 
1369 	/*
1370 	 * XXX racy, if hn(4) ever detached.
1371 	 */
1372 	rm_rlock(&hn_vfmap_lock, &pt);
1373 	if (vf_ifp->if_index < hn_vfmap_size)
1374 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1375 	rm_runlock(&hn_vfmap_lock, &pt);
1376 
1377 	if (hn_ifp != NULL) {
1378 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1379 			/*
1380 			 * Allow tapping on the VF.
1381 			 */
1382 			ETHER_BPF_MTAP(vf_ifp, mn);
1383 
1384 			/*
1385 			 * Update VF stats.
1386 			 */
1387 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1388 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1389 				    mn->m_pkthdr.len);
1390 			}
1391 			/*
1392 			 * XXX IFCOUNTER_IMCAST
1393 			 * This stat updating is kinda invasive, since it
1394 			 * requires two checks on the mbuf: the length check
1395 			 * and the ethernet header check.  As of this write,
1396 			 * all multicast packets go directly to hn(4), which
1397 			 * makes imcast stat updating in the VF a try in vian.
1398 			 */
1399 
1400 			/*
1401 			 * Fix up rcvif and increase hn(4)'s ipackets.
1402 			 */
1403 			mn->m_pkthdr.rcvif = hn_ifp;
1404 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1405 		}
1406 		/*
1407 		 * Go through hn(4)'s if_input.
1408 		 */
1409 		hn_ifp->if_input(hn_ifp, m);
1410 	} else {
1411 		/*
1412 		 * In the middle of the transition; free this
1413 		 * mbuf chain.
1414 		 */
1415 		while (m != NULL) {
1416 			mn = m->m_nextpkt;
1417 			m->m_nextpkt = NULL;
1418 			m_freem(m);
1419 			m = mn;
1420 		}
1421 	}
1422 }
1423 
1424 static void
1425 hn_mtu_change_fixup(struct hn_softc *sc)
1426 {
1427 	struct ifnet *ifp;
1428 
1429 	HN_LOCK_ASSERT(sc);
1430 	ifp = sc->hn_ifp;
1431 
1432 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1433 #if __FreeBSD_version >= 1100099
1434 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1435 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1436 #endif
1437 }
1438 
1439 static uint32_t
1440 hn_rss_type_fromndis(uint32_t rss_hash)
1441 {
1442 	uint32_t types = 0;
1443 
1444 	if (rss_hash & NDIS_HASH_IPV4)
1445 		types |= RSS_TYPE_IPV4;
1446 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1447 		types |= RSS_TYPE_TCP_IPV4;
1448 	if (rss_hash & NDIS_HASH_IPV6)
1449 		types |= RSS_TYPE_IPV6;
1450 	if (rss_hash & NDIS_HASH_IPV6_EX)
1451 		types |= RSS_TYPE_IPV6_EX;
1452 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1453 		types |= RSS_TYPE_TCP_IPV6;
1454 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1455 		types |= RSS_TYPE_TCP_IPV6_EX;
1456 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1457 		types |= RSS_TYPE_UDP_IPV4;
1458 	return (types);
1459 }
1460 
1461 static uint32_t
1462 hn_rss_type_tondis(uint32_t types)
1463 {
1464 	uint32_t rss_hash = 0;
1465 
1466 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1467 	    ("UDP6 and UDP6EX are not supported"));
1468 
1469 	if (types & RSS_TYPE_IPV4)
1470 		rss_hash |= NDIS_HASH_IPV4;
1471 	if (types & RSS_TYPE_TCP_IPV4)
1472 		rss_hash |= NDIS_HASH_TCP_IPV4;
1473 	if (types & RSS_TYPE_IPV6)
1474 		rss_hash |= NDIS_HASH_IPV6;
1475 	if (types & RSS_TYPE_IPV6_EX)
1476 		rss_hash |= NDIS_HASH_IPV6_EX;
1477 	if (types & RSS_TYPE_TCP_IPV6)
1478 		rss_hash |= NDIS_HASH_TCP_IPV6;
1479 	if (types & RSS_TYPE_TCP_IPV6_EX)
1480 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1481 	if (types & RSS_TYPE_UDP_IPV4)
1482 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1483 	return (rss_hash);
1484 }
1485 
1486 static void
1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1488 {
1489 	int i;
1490 
1491 	HN_LOCK_ASSERT(sc);
1492 
1493 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1494 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1495 }
1496 
1497 static void
1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1499 {
1500 	struct ifnet *ifp, *vf_ifp;
1501 	struct ifrsshash ifrh;
1502 	struct ifrsskey ifrk;
1503 	int error;
1504 	uint32_t my_types, diff_types, mbuf_types = 0;
1505 
1506 	HN_LOCK_ASSERT(sc);
1507 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1508 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1509 
1510 	if (sc->hn_rx_ring_inuse == 1) {
1511 		/* No RSS on synthetic parts; done. */
1512 		return;
1513 	}
1514 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1515 		/* Synthetic parts do not support Toeplitz; done. */
1516 		return;
1517 	}
1518 
1519 	ifp = sc->hn_ifp;
1520 	vf_ifp = sc->hn_vf_ifp;
1521 
1522 	/*
1523 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1524 	 * supported.
1525 	 */
1526 	memset(&ifrk, 0, sizeof(ifrk));
1527 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1528 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1529 	if (error) {
1530 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1531 		    vf_ifp->if_xname, error);
1532 		goto done;
1533 	}
1534 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1535 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1536 		    vf_ifp->if_xname, ifrk.ifrk_func);
1537 		goto done;
1538 	}
1539 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1540 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1541 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1542 		goto done;
1543 	}
1544 
1545 	/*
1546 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1547 	 */
1548 	memset(&ifrh, 0, sizeof(ifrh));
1549 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1550 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1551 	if (error) {
1552 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1553 		    vf_ifp->if_xname, error);
1554 		goto done;
1555 	}
1556 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1557 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1558 		    vf_ifp->if_xname, ifrh.ifrh_func);
1559 		goto done;
1560 	}
1561 
1562 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1563 	if ((ifrh.ifrh_types & my_types) == 0) {
1564 		/* This disables RSS; ignore it then */
1565 		if_printf(ifp, "%s intersection of RSS types failed.  "
1566 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1567 		    ifrh.ifrh_types, my_types);
1568 		goto done;
1569 	}
1570 
1571 	diff_types = my_types ^ ifrh.ifrh_types;
1572 	my_types &= ifrh.ifrh_types;
1573 	mbuf_types = my_types;
1574 
1575 	/*
1576 	 * Detect RSS hash value/type confliction.
1577 	 *
1578 	 * NOTE:
1579 	 * We don't disable the hash type, but stop delivery the hash
1580 	 * value/type through mbufs on RX path.
1581 	 *
1582 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1583 	 * hash is delivered with type of TCP_IPV4.  This means if
1584 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1585 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1586 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1587 	 * here.
1588 	 */
1589 	if ((my_types & RSS_TYPE_IPV4) &&
1590 	    (diff_types & ifrh.ifrh_types &
1591 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1592 		/* Conflict; disable IPV4 hash type/value delivery. */
1593 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1594 		mbuf_types &= ~RSS_TYPE_IPV4;
1595 	}
1596 	if ((my_types & RSS_TYPE_IPV6) &&
1597 	    (diff_types & ifrh.ifrh_types &
1598 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1599 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1600 	      RSS_TYPE_IPV6_EX))) {
1601 		/* Conflict; disable IPV6 hash type/value delivery. */
1602 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1603 		mbuf_types &= ~RSS_TYPE_IPV6;
1604 	}
1605 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1606 	    (diff_types & ifrh.ifrh_types &
1607 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1608 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1609 	      RSS_TYPE_IPV6))) {
1610 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1611 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1612 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1613 	}
1614 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1615 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1616 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1617 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1618 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1619 	}
1620 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1621 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1622 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1623 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1624 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1625 	}
1626 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1627 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1628 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1629 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1630 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1631 	}
1632 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1633 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1634 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1635 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1636 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1637 	}
1638 
1639 	/*
1640 	 * Indirect table does not matter.
1641 	 */
1642 
1643 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1644 	    hn_rss_type_tondis(my_types);
1645 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1646 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1647 
1648 	if (reconf) {
1649 		error = hn_rss_reconfig(sc);
1650 		if (error) {
1651 			/* XXX roll-back? */
1652 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1653 			/* XXX keep going. */
1654 		}
1655 	}
1656 done:
1657 	/* Hash deliverability for mbufs. */
1658 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1659 }
1660 
1661 static void
1662 hn_vf_rss_restore(struct hn_softc *sc)
1663 {
1664 
1665 	HN_LOCK_ASSERT(sc);
1666 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1667 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1668 
1669 	if (sc->hn_rx_ring_inuse == 1)
1670 		goto done;
1671 
1672 	/*
1673 	 * Restore hash types.  Key does _not_ matter.
1674 	 */
1675 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1676 		int error;
1677 
1678 		sc->hn_rss_hash = sc->hn_rss_hcap;
1679 		error = hn_rss_reconfig(sc);
1680 		if (error) {
1681 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1682 			    error);
1683 			/* XXX keep going. */
1684 		}
1685 	}
1686 done:
1687 	/* Hash deliverability for mbufs. */
1688 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1689 }
1690 
1691 static void
1692 hn_xpnt_vf_setready(struct hn_softc *sc)
1693 {
1694 	struct ifnet *ifp, *vf_ifp;
1695 	struct ifreq ifr;
1696 
1697 	HN_LOCK_ASSERT(sc);
1698 	ifp = sc->hn_ifp;
1699 	vf_ifp = sc->hn_vf_ifp;
1700 
1701 	/*
1702 	 * Mark the VF ready.
1703 	 */
1704 	sc->hn_vf_rdytick = 0;
1705 
1706 	/*
1707 	 * Save information for restoration.
1708 	 */
1709 	sc->hn_saved_caps = ifp->if_capabilities;
1710 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1711 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1712 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1713 
1714 	/*
1715 	 * Intersect supported/enabled capabilities.
1716 	 *
1717 	 * NOTE:
1718 	 * if_hwassist is not changed here.
1719 	 */
1720 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1721 	ifp->if_capenable &= ifp->if_capabilities;
1722 
1723 	/*
1724 	 * Fix TSO settings.
1725 	 */
1726 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1727 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1728 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1729 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1730 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1731 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1732 
1733 	/*
1734 	 * Change VF's enabled capabilities.
1735 	 */
1736 	memset(&ifr, 0, sizeof(ifr));
1737 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1738 	ifr.ifr_reqcap = ifp->if_capenable;
1739 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1740 
1741 	if (ifp->if_mtu != ETHERMTU) {
1742 		int error;
1743 
1744 		/*
1745 		 * Change VF's MTU.
1746 		 */
1747 		memset(&ifr, 0, sizeof(ifr));
1748 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1749 		ifr.ifr_mtu = ifp->if_mtu;
1750 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1751 		if (error) {
1752 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1753 			    vf_ifp->if_xname, ifp->if_mtu);
1754 			if (ifp->if_mtu > ETHERMTU) {
1755 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1756 
1757 				/*
1758 				 * XXX
1759 				 * No need to adjust the synthetic parts' MTU;
1760 				 * failure of the adjustment will cause us
1761 				 * infinite headache.
1762 				 */
1763 				ifp->if_mtu = ETHERMTU;
1764 				hn_mtu_change_fixup(sc);
1765 			}
1766 		}
1767 	}
1768 }
1769 
1770 static bool
1771 hn_xpnt_vf_isready(struct hn_softc *sc)
1772 {
1773 
1774 	HN_LOCK_ASSERT(sc);
1775 
1776 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1777 		return (false);
1778 
1779 	if (sc->hn_vf_rdytick == 0)
1780 		return (true);
1781 
1782 	if (sc->hn_vf_rdytick > ticks)
1783 		return (false);
1784 
1785 	/* Mark VF as ready. */
1786 	hn_xpnt_vf_setready(sc);
1787 	return (true);
1788 }
1789 
1790 static void
1791 hn_xpnt_vf_setenable(struct hn_softc *sc)
1792 {
1793 	int i;
1794 
1795 	HN_LOCK_ASSERT(sc);
1796 
1797 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1798 	rm_wlock(&sc->hn_vf_lock);
1799 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1800 	rm_wunlock(&sc->hn_vf_lock);
1801 
1802 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1803 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1804 }
1805 
1806 static void
1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1808 {
1809 	int i;
1810 
1811 	HN_LOCK_ASSERT(sc);
1812 
1813 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1814 	rm_wlock(&sc->hn_vf_lock);
1815 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1816 	if (clear_vf)
1817 		sc->hn_vf_ifp = NULL;
1818 	rm_wunlock(&sc->hn_vf_lock);
1819 
1820 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1821 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1822 }
1823 
1824 static void
1825 hn_xpnt_vf_init(struct hn_softc *sc)
1826 {
1827 	int error;
1828 
1829 	HN_LOCK_ASSERT(sc);
1830 
1831 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1832 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1833 
1834 	if (bootverbose) {
1835 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1836 		    sc->hn_vf_ifp->if_xname);
1837 	}
1838 
1839 	/*
1840 	 * Bring the VF up.
1841 	 */
1842 	hn_xpnt_vf_saveifflags(sc);
1843 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1844 	error = hn_xpnt_vf_iocsetflags(sc);
1845 	if (error) {
1846 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1847 		    sc->hn_vf_ifp->if_xname, error);
1848 		return;
1849 	}
1850 
1851 	/*
1852 	 * NOTE:
1853 	 * Datapath setting must happen _after_ bringing the VF up.
1854 	 */
1855 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1856 
1857 	/*
1858 	 * NOTE:
1859 	 * Fixup RSS related bits _after_ the VF is brought up, since
1860 	 * many VFs generate RSS key during it's initialization.
1861 	 */
1862 	hn_vf_rss_fixup(sc, true);
1863 
1864 	/* Mark transparent mode VF as enabled. */
1865 	hn_xpnt_vf_setenable(sc);
1866 }
1867 
1868 static void
1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1870 {
1871 	struct hn_softc *sc = xsc;
1872 
1873 	HN_LOCK(sc);
1874 
1875 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1876 		goto done;
1877 	if (sc->hn_vf_ifp == NULL)
1878 		goto done;
1879 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1880 		goto done;
1881 
1882 	if (sc->hn_vf_rdytick != 0) {
1883 		/* Mark VF as ready. */
1884 		hn_xpnt_vf_setready(sc);
1885 	}
1886 
1887 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1888 		/*
1889 		 * Delayed VF initialization.
1890 		 */
1891 		if (bootverbose) {
1892 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1893 			    sc->hn_vf_ifp->if_xname);
1894 		}
1895 		hn_xpnt_vf_init(sc);
1896 	}
1897 done:
1898 	HN_UNLOCK(sc);
1899 }
1900 
1901 static void
1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1903 {
1904 	struct hn_softc *sc = xsc;
1905 
1906 	HN_LOCK(sc);
1907 
1908 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1909 		goto done;
1910 
1911 	if (!hn_ismyvf(sc, ifp))
1912 		goto done;
1913 
1914 	if (sc->hn_vf_ifp != NULL) {
1915 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1916 		    sc->hn_vf_ifp->if_xname);
1917 		goto done;
1918 	}
1919 
1920 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1921 		/*
1922 		 * ifnet.if_start is _not_ supported by transparent
1923 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1924 		 */
1925 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1926 		    "in transparent VF mode.\n", ifp->if_xname);
1927 		goto done;
1928 	}
1929 
1930 	rm_wlock(&hn_vfmap_lock);
1931 
1932 	if (ifp->if_index >= hn_vfmap_size) {
1933 		struct ifnet **newmap;
1934 		int newsize;
1935 
1936 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1937 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1938 		    M_WAITOK | M_ZERO);
1939 
1940 		memcpy(newmap, hn_vfmap,
1941 		    sizeof(struct ifnet *) * hn_vfmap_size);
1942 		free(hn_vfmap, M_DEVBUF);
1943 		hn_vfmap = newmap;
1944 		hn_vfmap_size = newsize;
1945 	}
1946 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1947 	    ("%s: ifindex %d was mapped to %s",
1948 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1949 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1950 
1951 	rm_wunlock(&hn_vfmap_lock);
1952 
1953 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1954 	rm_wlock(&sc->hn_vf_lock);
1955 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1956 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1957 	sc->hn_vf_ifp = ifp;
1958 	rm_wunlock(&sc->hn_vf_lock);
1959 
1960 	if (hn_xpnt_vf) {
1961 		int wait_ticks;
1962 
1963 		/*
1964 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1965 		 * Save vf_ifp's current if_input for later restoration.
1966 		 */
1967 		sc->hn_vf_input = ifp->if_input;
1968 		ifp->if_input = hn_xpnt_vf_input;
1969 
1970 		/*
1971 		 * Stop link status management; use the VF's.
1972 		 */
1973 		hn_suspend_mgmt(sc);
1974 
1975 		/*
1976 		 * Give VF sometime to complete its attach routing.
1977 		 */
1978 		wait_ticks = hn_xpnt_vf_attwait * hz;
1979 		sc->hn_vf_rdytick = ticks + wait_ticks;
1980 
1981 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1982 		    wait_ticks);
1983 	}
1984 done:
1985 	HN_UNLOCK(sc);
1986 }
1987 
1988 static void
1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1990 {
1991 	struct hn_softc *sc = xsc;
1992 
1993 	HN_LOCK(sc);
1994 
1995 	if (sc->hn_vf_ifp == NULL)
1996 		goto done;
1997 
1998 	if (!hn_ismyvf(sc, ifp))
1999 		goto done;
2000 
2001 	if (hn_xpnt_vf) {
2002 		/*
2003 		 * Make sure that the delayed initialization is not running.
2004 		 *
2005 		 * NOTE:
2006 		 * - This lock _must_ be released, since the hn_vf_init task
2007 		 *   will try holding this lock.
2008 		 * - It is safe to release this lock here, since the
2009 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2010 		 *
2011 		 * XXX racy, if hn(4) ever detached.
2012 		 */
2013 		HN_UNLOCK(sc);
2014 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2015 		HN_LOCK(sc);
2016 
2017 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2018 		    sc->hn_ifp->if_xname));
2019 		ifp->if_input = sc->hn_vf_input;
2020 		sc->hn_vf_input = NULL;
2021 
2022 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2023 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2024 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2025 
2026 		if (sc->hn_vf_rdytick == 0) {
2027 			/*
2028 			 * The VF was ready; restore some settings.
2029 			 */
2030 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2031 			/*
2032 			 * NOTE:
2033 			 * There is _no_ need to fixup if_capenable and
2034 			 * if_hwassist, since the if_capabilities before
2035 			 * restoration was an intersection of the VF's
2036 			 * if_capabilites and the synthetic device's
2037 			 * if_capabilites.
2038 			 */
2039 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2040 			sc->hn_ifp->if_hw_tsomaxsegcount =
2041 			    sc->hn_saved_tsosegcnt;
2042 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2043 		}
2044 
2045 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2046 			/*
2047 			 * Restore RSS settings.
2048 			 */
2049 			hn_vf_rss_restore(sc);
2050 
2051 			/*
2052 			 * Resume link status management, which was suspended
2053 			 * by hn_ifnet_attevent().
2054 			 */
2055 			hn_resume_mgmt(sc);
2056 		}
2057 	}
2058 
2059 	/* Mark transparent mode VF as disabled. */
2060 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2061 
2062 	rm_wlock(&hn_vfmap_lock);
2063 
2064 	KASSERT(ifp->if_index < hn_vfmap_size,
2065 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2066 	if (hn_vfmap[ifp->if_index] != NULL) {
2067 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2068 		    ("%s: ifindex %d was mapped to %s",
2069 		     ifp->if_xname, ifp->if_index,
2070 		     hn_vfmap[ifp->if_index]->if_xname));
2071 		hn_vfmap[ifp->if_index] = NULL;
2072 	}
2073 
2074 	rm_wunlock(&hn_vfmap_lock);
2075 done:
2076 	HN_UNLOCK(sc);
2077 }
2078 
2079 static void
2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2081 {
2082 	struct hn_softc *sc = xsc;
2083 
2084 	if (sc->hn_vf_ifp == ifp)
2085 		if_link_state_change(sc->hn_ifp, link_state);
2086 }
2087 
2088 static int
2089 hn_probe(device_t dev)
2090 {
2091 
2092 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2093 		device_set_desc(dev, "Hyper-V Network Interface");
2094 		return BUS_PROBE_DEFAULT;
2095 	}
2096 	return ENXIO;
2097 }
2098 
2099 static int
2100 hn_attach(device_t dev)
2101 {
2102 	struct hn_softc *sc = device_get_softc(dev);
2103 	struct sysctl_oid_list *child;
2104 	struct sysctl_ctx_list *ctx;
2105 	uint8_t eaddr[ETHER_ADDR_LEN];
2106 	struct ifnet *ifp = NULL;
2107 	int error, ring_cnt, tx_ring_cnt;
2108 	uint32_t mtu;
2109 
2110 	sc->hn_dev = dev;
2111 	sc->hn_prichan = vmbus_get_channel(dev);
2112 	HN_LOCK_INIT(sc);
2113 	rm_init(&sc->hn_vf_lock, "hnvf");
2114 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2115 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2116 
2117 	/*
2118 	 * Initialize these tunables once.
2119 	 */
2120 	sc->hn_agg_size = hn_tx_agg_size;
2121 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2122 
2123 	/*
2124 	 * Setup taskqueue for transmission.
2125 	 */
2126 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2127 		int i;
2128 
2129 		sc->hn_tx_taskqs =
2130 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2131 		    M_DEVBUF, M_WAITOK);
2132 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2133 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2134 			    M_WAITOK, taskqueue_thread_enqueue,
2135 			    &sc->hn_tx_taskqs[i]);
2136 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2137 			    "%s tx%d", device_get_nameunit(dev), i);
2138 		}
2139 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2140 		sc->hn_tx_taskqs = hn_tx_taskque;
2141 	}
2142 
2143 	/*
2144 	 * Setup taskqueue for mangement tasks, e.g. link status.
2145 	 */
2146 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2147 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2148 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2149 	    device_get_nameunit(dev));
2150 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2151 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2152 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2153 	    hn_netchg_status_taskfunc, sc);
2154 
2155 	if (hn_xpnt_vf) {
2156 		/*
2157 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2158 		 */
2159 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2160 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2161 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2162 		    device_get_nameunit(dev));
2163 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2164 		    hn_xpnt_vf_init_taskfunc, sc);
2165 	}
2166 
2167 	/*
2168 	 * Allocate ifnet and setup its name earlier, so that if_printf
2169 	 * can be used by functions, which will be called after
2170 	 * ether_ifattach().
2171 	 */
2172 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2173 	ifp->if_softc = sc;
2174 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2175 
2176 	/*
2177 	 * Initialize ifmedia earlier so that it can be unconditionally
2178 	 * destroyed, if error happened later on.
2179 	 */
2180 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2181 
2182 	/*
2183 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2184 	 * to use (tx_ring_cnt).
2185 	 *
2186 	 * NOTE:
2187 	 * The # of RX rings to use is same as the # of channels to use.
2188 	 */
2189 	ring_cnt = hn_chan_cnt;
2190 	if (ring_cnt <= 0) {
2191 		/* Default */
2192 		ring_cnt = mp_ncpus;
2193 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2194 			ring_cnt = HN_RING_CNT_DEF_MAX;
2195 	} else if (ring_cnt > mp_ncpus) {
2196 		ring_cnt = mp_ncpus;
2197 	}
2198 #ifdef RSS
2199 	if (ring_cnt > rss_getnumbuckets())
2200 		ring_cnt = rss_getnumbuckets();
2201 #endif
2202 
2203 	tx_ring_cnt = hn_tx_ring_cnt;
2204 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2205 		tx_ring_cnt = ring_cnt;
2206 #ifdef HN_IFSTART_SUPPORT
2207 	if (hn_use_if_start) {
2208 		/* ifnet.if_start only needs one TX ring. */
2209 		tx_ring_cnt = 1;
2210 	}
2211 #endif
2212 
2213 	/*
2214 	 * Set the leader CPU for channels.
2215 	 */
2216 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2217 
2218 	/*
2219 	 * Create enough TX/RX rings, even if only limited number of
2220 	 * channels can be allocated.
2221 	 */
2222 	error = hn_create_tx_data(sc, tx_ring_cnt);
2223 	if (error)
2224 		goto failed;
2225 	error = hn_create_rx_data(sc, ring_cnt);
2226 	if (error)
2227 		goto failed;
2228 
2229 	/*
2230 	 * Create transaction context for NVS and RNDIS transactions.
2231 	 */
2232 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2233 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2234 	if (sc->hn_xact == NULL) {
2235 		error = ENXIO;
2236 		goto failed;
2237 	}
2238 
2239 	/*
2240 	 * Install orphan handler for the revocation of this device's
2241 	 * primary channel.
2242 	 *
2243 	 * NOTE:
2244 	 * The processing order is critical here:
2245 	 * Install the orphan handler, _before_ testing whether this
2246 	 * device's primary channel has been revoked or not.
2247 	 */
2248 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2249 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2250 		error = ENXIO;
2251 		goto failed;
2252 	}
2253 
2254 	/*
2255 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2256 	 */
2257 	error = hn_synth_attach(sc, ETHERMTU);
2258 	if (error)
2259 		goto failed;
2260 
2261 	error = hn_rndis_get_eaddr(sc, eaddr);
2262 	if (error)
2263 		goto failed;
2264 
2265 	error = hn_rndis_get_mtu(sc, &mtu);
2266 	if (error)
2267 		mtu = ETHERMTU;
2268 	else if (bootverbose)
2269 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2270 
2271 #if __FreeBSD_version >= 1100099
2272 	if (sc->hn_rx_ring_inuse > 1) {
2273 		/*
2274 		 * Reduce TCP segment aggregation limit for multiple
2275 		 * RX rings to increase ACK timeliness.
2276 		 */
2277 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2278 	}
2279 #endif
2280 
2281 	/*
2282 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2283 	 */
2284 	hn_fixup_tx_data(sc);
2285 	hn_fixup_rx_data(sc);
2286 
2287 	ctx = device_get_sysctl_ctx(dev);
2288 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2289 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2290 	    &sc->hn_nvs_ver, 0, "NVS version");
2291 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2292 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2293 	    hn_ndis_version_sysctl, "A", "NDIS version");
2294 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2295 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2296 	    hn_caps_sysctl, "A", "capabilities");
2297 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2298 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2299 	    hn_hwassist_sysctl, "A", "hwassist");
2300 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2301 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2302 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2303 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2304 	    "max # of TSO segments");
2305 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2306 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2307 	    "max size of TSO segment");
2308 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2309 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2310 	    hn_rxfilter_sysctl, "A", "rxfilter");
2311 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2312 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2313 	    hn_rss_hash_sysctl, "A", "RSS hash");
2314 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2315 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2316 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2318 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2320 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2321 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2322 #ifndef RSS
2323 	/*
2324 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2325 	 */
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2327 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2328 	    hn_rss_key_sysctl, "IU", "RSS key");
2329 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2330 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2331 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2332 #endif
2333 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2334 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2335 	    "RNDIS offered packet transmission aggregation size limit");
2336 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2337 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2338 	    "RNDIS offered packet transmission aggregation count limit");
2339 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2340 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2341 	    "RNDIS packet transmission aggregation alignment");
2342 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2343 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2344 	    hn_txagg_size_sysctl, "I",
2345 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2346 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2347 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2348 	    hn_txagg_pkts_sysctl, "I",
2349 	    "Packet transmission aggregation packets, "
2350 	    "0 -- disable, -1 -- auto");
2351 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2352 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2353 	    hn_polling_sysctl, "I",
2354 	    "Polling frequency: [100,1000000], 0 disable polling");
2355 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2356 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2357 	    hn_vf_sysctl, "A", "Virtual Function's name");
2358 	if (!hn_xpnt_vf) {
2359 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2360 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2361 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2362 	} else {
2363 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2364 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2365 		    hn_xpnt_vf_enabled_sysctl, "I",
2366 		    "Transparent VF enabled");
2367 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2368 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2369 		    hn_xpnt_vf_accbpf_sysctl, "I",
2370 		    "Accurate BPF for transparent VF");
2371 	}
2372 
2373 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2374 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2375 	    "switch to rsc");
2376 
2377 	/*
2378 	 * Setup the ifmedia, which has been initialized earlier.
2379 	 */
2380 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2381 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2382 	/* XXX ifmedia_set really should do this for us */
2383 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2384 
2385 	/*
2386 	 * Setup the ifnet for this interface.
2387 	 */
2388 
2389 	ifp->if_baudrate = IF_Gbps(10);
2390 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2391 	ifp->if_ioctl = hn_ioctl;
2392 	ifp->if_init = hn_init;
2393 #ifdef HN_IFSTART_SUPPORT
2394 	if (hn_use_if_start) {
2395 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2396 
2397 		ifp->if_start = hn_start;
2398 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2399 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2400 		IFQ_SET_READY(&ifp->if_snd);
2401 	} else
2402 #endif
2403 	{
2404 		ifp->if_transmit = hn_transmit;
2405 		ifp->if_qflush = hn_xmit_qflush;
2406 	}
2407 
2408 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2409 #ifdef foo
2410 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2411 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2412 #endif
2413 	if (sc->hn_caps & HN_CAP_VLAN) {
2414 		/* XXX not sure about VLAN_MTU. */
2415 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2416 	}
2417 
2418 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2419 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2420 		ifp->if_capabilities |= IFCAP_TXCSUM;
2421 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2422 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2423 	if (sc->hn_caps & HN_CAP_TSO4) {
2424 		ifp->if_capabilities |= IFCAP_TSO4;
2425 		ifp->if_hwassist |= CSUM_IP_TSO;
2426 	}
2427 	if (sc->hn_caps & HN_CAP_TSO6) {
2428 		ifp->if_capabilities |= IFCAP_TSO6;
2429 		ifp->if_hwassist |= CSUM_IP6_TSO;
2430 	}
2431 
2432 	/* Enable all available capabilities by default. */
2433 	ifp->if_capenable = ifp->if_capabilities;
2434 
2435 	/*
2436 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2437 	 * be enabled through SIOCSIFCAP.
2438 	 */
2439 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2440 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2441 
2442 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2443 		/*
2444 		 * Lock hn_set_tso_maxsize() to simplify its
2445 		 * internal logic.
2446 		 */
2447 		HN_LOCK(sc);
2448 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2449 		HN_UNLOCK(sc);
2450 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2451 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2452 	}
2453 
2454 	ether_ifattach(ifp, eaddr);
2455 
2456 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2457 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2458 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2459 	}
2460 	if (mtu < ETHERMTU) {
2461 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2462 		ifp->if_mtu = mtu;
2463 	}
2464 
2465 	/* Inform the upper layer about the long frame support. */
2466 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2467 
2468 	/*
2469 	 * Kick off link status check.
2470 	 */
2471 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2472 	hn_update_link_status(sc);
2473 
2474 	if (!hn_xpnt_vf) {
2475 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2476 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2477 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2478 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2479 	} else {
2480 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2481 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2482 	}
2483 
2484 	/*
2485 	 * NOTE:
2486 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2487 	 * since interface's LLADDR is needed; interface LLADDR is not
2488 	 * available when ifnet_arrival event is triggered.
2489 	 */
2490 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2491 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2492 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2493 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2494 
2495 	return (0);
2496 failed:
2497 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2498 		hn_synth_detach(sc);
2499 	hn_detach(dev);
2500 	return (error);
2501 }
2502 
2503 static int
2504 hn_detach(device_t dev)
2505 {
2506 	struct hn_softc *sc = device_get_softc(dev);
2507 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2508 
2509 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2510 		/*
2511 		 * In case that the vmbus missed the orphan handler
2512 		 * installation.
2513 		 */
2514 		vmbus_xact_ctx_orphan(sc->hn_xact);
2515 	}
2516 
2517 	if (sc->hn_ifaddr_evthand != NULL)
2518 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2519 	if (sc->hn_ifnet_evthand != NULL)
2520 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2521 	if (sc->hn_ifnet_atthand != NULL) {
2522 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2523 		    sc->hn_ifnet_atthand);
2524 	}
2525 	if (sc->hn_ifnet_dethand != NULL) {
2526 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2527 		    sc->hn_ifnet_dethand);
2528 	}
2529 	if (sc->hn_ifnet_lnkhand != NULL)
2530 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2531 
2532 	vf_ifp = sc->hn_vf_ifp;
2533 	__compiler_membar();
2534 	if (vf_ifp != NULL)
2535 		hn_ifnet_detevent(sc, vf_ifp);
2536 
2537 	if (device_is_attached(dev)) {
2538 		HN_LOCK(sc);
2539 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2540 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2541 				hn_stop(sc, true);
2542 			/*
2543 			 * NOTE:
2544 			 * hn_stop() only suspends data, so managment
2545 			 * stuffs have to be suspended manually here.
2546 			 */
2547 			hn_suspend_mgmt(sc);
2548 			hn_synth_detach(sc);
2549 		}
2550 		HN_UNLOCK(sc);
2551 		ether_ifdetach(ifp);
2552 	}
2553 
2554 	ifmedia_removeall(&sc->hn_media);
2555 	hn_destroy_rx_data(sc);
2556 	hn_destroy_tx_data(sc);
2557 
2558 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2559 		int i;
2560 
2561 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2562 			taskqueue_free(sc->hn_tx_taskqs[i]);
2563 		free(sc->hn_tx_taskqs, M_DEVBUF);
2564 	}
2565 	taskqueue_free(sc->hn_mgmt_taskq0);
2566 	if (sc->hn_vf_taskq != NULL)
2567 		taskqueue_free(sc->hn_vf_taskq);
2568 
2569 	if (sc->hn_xact != NULL) {
2570 		/*
2571 		 * Uninstall the orphan handler _before_ the xact is
2572 		 * destructed.
2573 		 */
2574 		vmbus_chan_unset_orphan(sc->hn_prichan);
2575 		vmbus_xact_ctx_destroy(sc->hn_xact);
2576 	}
2577 
2578 	if_free(ifp);
2579 
2580 	HN_LOCK_DESTROY(sc);
2581 	rm_destroy(&sc->hn_vf_lock);
2582 	return (0);
2583 }
2584 
2585 static int
2586 hn_shutdown(device_t dev)
2587 {
2588 
2589 	return (0);
2590 }
2591 
2592 static void
2593 hn_link_status(struct hn_softc *sc)
2594 {
2595 	uint32_t link_status;
2596 	int error;
2597 
2598 	error = hn_rndis_get_linkstatus(sc, &link_status);
2599 	if (error) {
2600 		/* XXX what to do? */
2601 		return;
2602 	}
2603 
2604 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2605 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2606 	else
2607 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2608 	if_link_state_change(sc->hn_ifp,
2609 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2610 	    LINK_STATE_UP : LINK_STATE_DOWN);
2611 }
2612 
2613 static void
2614 hn_link_taskfunc(void *xsc, int pending __unused)
2615 {
2616 	struct hn_softc *sc = xsc;
2617 
2618 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2619 		return;
2620 	hn_link_status(sc);
2621 }
2622 
2623 static void
2624 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2625 {
2626 	struct hn_softc *sc = xsc;
2627 
2628 	/* Prevent any link status checks from running. */
2629 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2630 
2631 	/*
2632 	 * Fake up a [link down --> link up] state change; 5 seconds
2633 	 * delay is used, which closely simulates miibus reaction
2634 	 * upon link down event.
2635 	 */
2636 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2637 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2638 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2639 	    &sc->hn_netchg_status, 5 * hz);
2640 }
2641 
2642 static void
2643 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2644 {
2645 	struct hn_softc *sc = xsc;
2646 
2647 	/* Re-allow link status checks. */
2648 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2649 	hn_link_status(sc);
2650 }
2651 
2652 static void
2653 hn_update_link_status(struct hn_softc *sc)
2654 {
2655 
2656 	if (sc->hn_mgmt_taskq != NULL)
2657 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2658 }
2659 
2660 static void
2661 hn_change_network(struct hn_softc *sc)
2662 {
2663 
2664 	if (sc->hn_mgmt_taskq != NULL)
2665 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2666 }
2667 
2668 static __inline int
2669 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2670     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2671 {
2672 	struct mbuf *m = *m_head;
2673 	int error;
2674 
2675 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2676 
2677 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2678 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2679 	if (error == EFBIG) {
2680 		struct mbuf *m_new;
2681 
2682 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2683 		if (m_new == NULL)
2684 			return ENOBUFS;
2685 		else
2686 			*m_head = m = m_new;
2687 		txr->hn_tx_collapsed++;
2688 
2689 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2690 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2691 	}
2692 	if (!error) {
2693 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2694 		    BUS_DMASYNC_PREWRITE);
2695 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2696 	}
2697 	return error;
2698 }
2699 
2700 static __inline int
2701 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2702 {
2703 
2704 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2705 	    ("put an onlist txd %#x", txd->flags));
2706 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2707 	    ("put an onagg txd %#x", txd->flags));
2708 
2709 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2710 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2711 		return 0;
2712 
2713 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2714 		struct hn_txdesc *tmp_txd;
2715 
2716 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2717 			int freed __diagused;
2718 
2719 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2720 			    ("resursive aggregation on aggregated txdesc"));
2721 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2722 			    ("not aggregated txdesc"));
2723 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2724 			    ("aggregated txdesc uses dmamap"));
2725 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2726 			    ("aggregated txdesc consumes "
2727 			     "chimney sending buffer"));
2728 			KASSERT(tmp_txd->chim_size == 0,
2729 			    ("aggregated txdesc has non-zero "
2730 			     "chimney sending size"));
2731 
2732 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2733 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2734 			freed = hn_txdesc_put(txr, tmp_txd);
2735 			KASSERT(freed, ("failed to free aggregated txdesc"));
2736 		}
2737 	}
2738 
2739 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2740 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2741 		    ("chim txd uses dmamap"));
2742 		hn_chim_free(txr->hn_sc, txd->chim_index);
2743 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2744 		txd->chim_size = 0;
2745 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2746 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2747 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2748 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2749 		    txd->data_dmap);
2750 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2751 	}
2752 
2753 	if (txd->m != NULL) {
2754 		m_freem(txd->m);
2755 		txd->m = NULL;
2756 	}
2757 
2758 	txd->flags |= HN_TXD_FLAG_ONLIST;
2759 #ifndef HN_USE_TXDESC_BUFRING
2760 	mtx_lock_spin(&txr->hn_txlist_spin);
2761 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2762 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2763 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2764 	txr->hn_txdesc_avail++;
2765 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2766 	mtx_unlock_spin(&txr->hn_txlist_spin);
2767 #else	/* HN_USE_TXDESC_BUFRING */
2768 #ifdef HN_DEBUG
2769 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2770 #endif
2771 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2772 #endif	/* !HN_USE_TXDESC_BUFRING */
2773 
2774 	return 1;
2775 }
2776 
2777 static __inline struct hn_txdesc *
2778 hn_txdesc_get(struct hn_tx_ring *txr)
2779 {
2780 	struct hn_txdesc *txd;
2781 
2782 #ifndef HN_USE_TXDESC_BUFRING
2783 	mtx_lock_spin(&txr->hn_txlist_spin);
2784 	txd = SLIST_FIRST(&txr->hn_txlist);
2785 	if (txd != NULL) {
2786 		KASSERT(txr->hn_txdesc_avail > 0,
2787 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2788 		txr->hn_txdesc_avail--;
2789 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2790 	}
2791 	mtx_unlock_spin(&txr->hn_txlist_spin);
2792 #else
2793 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2794 #endif
2795 
2796 	if (txd != NULL) {
2797 #ifdef HN_USE_TXDESC_BUFRING
2798 #ifdef HN_DEBUG
2799 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2800 #endif
2801 #endif	/* HN_USE_TXDESC_BUFRING */
2802 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2803 		    STAILQ_EMPTY(&txd->agg_list) &&
2804 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2805 		    txd->chim_size == 0 &&
2806 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2807 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2808 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2809 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2810 		txd->refs = 1;
2811 	}
2812 	return txd;
2813 }
2814 
2815 static __inline void
2816 hn_txdesc_hold(struct hn_txdesc *txd)
2817 {
2818 
2819 	/* 0->1 transition will never work */
2820 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2821 	atomic_add_int(&txd->refs, 1);
2822 }
2823 
2824 static __inline void
2825 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2826 {
2827 
2828 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2829 	    ("recursive aggregation on aggregating txdesc"));
2830 
2831 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2832 	    ("already aggregated"));
2833 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2834 	    ("recursive aggregation on to-be-aggregated txdesc"));
2835 
2836 	txd->flags |= HN_TXD_FLAG_ONAGG;
2837 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2838 }
2839 
2840 static bool
2841 hn_tx_ring_pending(struct hn_tx_ring *txr)
2842 {
2843 	bool pending = false;
2844 
2845 #ifndef HN_USE_TXDESC_BUFRING
2846 	mtx_lock_spin(&txr->hn_txlist_spin);
2847 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2848 		pending = true;
2849 	mtx_unlock_spin(&txr->hn_txlist_spin);
2850 #else
2851 	if (!buf_ring_full(txr->hn_txdesc_br))
2852 		pending = true;
2853 #endif
2854 	return (pending);
2855 }
2856 
2857 static __inline void
2858 hn_txeof(struct hn_tx_ring *txr)
2859 {
2860 	txr->hn_has_txeof = 0;
2861 	txr->hn_txeof(txr);
2862 }
2863 
2864 static void
2865 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2866     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2867 {
2868 	struct hn_txdesc *txd = sndc->hn_cbarg;
2869 	struct hn_tx_ring *txr;
2870 
2871 	txr = txd->txr;
2872 	KASSERT(txr->hn_chan == chan,
2873 	    ("channel mismatch, on chan%u, should be chan%u",
2874 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2875 
2876 	txr->hn_has_txeof = 1;
2877 	hn_txdesc_put(txr, txd);
2878 
2879 	++txr->hn_txdone_cnt;
2880 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2881 		txr->hn_txdone_cnt = 0;
2882 		if (txr->hn_oactive)
2883 			hn_txeof(txr);
2884 	}
2885 }
2886 
2887 static void
2888 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2889 {
2890 #if defined(INET) || defined(INET6)
2891 	struct epoch_tracker et;
2892 
2893 	NET_EPOCH_ENTER(et);
2894 	tcp_lro_flush_all(&rxr->hn_lro);
2895 	NET_EPOCH_EXIT(et);
2896 #endif
2897 
2898 	/*
2899 	 * NOTE:
2900 	 * 'txr' could be NULL, if multiple channels and
2901 	 * ifnet.if_start method are enabled.
2902 	 */
2903 	if (txr == NULL || !txr->hn_has_txeof)
2904 		return;
2905 
2906 	txr->hn_txdone_cnt = 0;
2907 	hn_txeof(txr);
2908 }
2909 
2910 static __inline uint32_t
2911 hn_rndis_pktmsg_offset(uint32_t ofs)
2912 {
2913 
2914 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2915 	    ("invalid RNDIS packet msg offset %u", ofs));
2916 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2917 }
2918 
2919 static __inline void *
2920 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2921     size_t pi_dlen, uint32_t pi_type)
2922 {
2923 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2924 	struct rndis_pktinfo *pi;
2925 
2926 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2927 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2928 
2929 	/*
2930 	 * Per-packet-info does not move; it only grows.
2931 	 *
2932 	 * NOTE:
2933 	 * rm_pktinfooffset in this phase counts from the beginning
2934 	 * of rndis_packet_msg.
2935 	 */
2936 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2937 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2938 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2939 	    pkt->rm_pktinfolen);
2940 	pkt->rm_pktinfolen += pi_size;
2941 
2942 	pi->rm_size = pi_size;
2943 	pi->rm_type = pi_type;
2944 	pi->rm_internal = 0;
2945 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2946 
2947 	return (pi->rm_data);
2948 }
2949 
2950 static __inline int
2951 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2952 {
2953 	struct hn_txdesc *txd;
2954 	struct mbuf *m;
2955 	int error, pkts;
2956 
2957 	txd = txr->hn_agg_txd;
2958 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2959 
2960 	/*
2961 	 * Since hn_txpkt() will reset this temporary stat, save
2962 	 * it now, so that oerrors can be updated properly, if
2963 	 * hn_txpkt() ever fails.
2964 	 */
2965 	pkts = txr->hn_stat_pkts;
2966 
2967 	/*
2968 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2969 	 * failure, save it for later freeing, if hn_txpkt() ever
2970 	 * fails.
2971 	 */
2972 	m = txd->m;
2973 	error = hn_txpkt(ifp, txr, txd);
2974 	if (__predict_false(error)) {
2975 		/* txd is freed, but m is not. */
2976 		m_freem(m);
2977 
2978 		txr->hn_flush_failed++;
2979 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2980 	}
2981 
2982 	/* Reset all aggregation states. */
2983 	txr->hn_agg_txd = NULL;
2984 	txr->hn_agg_szleft = 0;
2985 	txr->hn_agg_pktleft = 0;
2986 	txr->hn_agg_prevpkt = NULL;
2987 
2988 	return (error);
2989 }
2990 
2991 static void *
2992 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2993     int pktsize)
2994 {
2995 	void *chim;
2996 
2997 	if (txr->hn_agg_txd != NULL) {
2998 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2999 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
3000 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
3001 			int olen;
3002 
3003 			/*
3004 			 * Update the previous RNDIS packet's total length,
3005 			 * it can be increased due to the mandatory alignment
3006 			 * padding for this RNDIS packet.  And update the
3007 			 * aggregating txdesc's chimney sending buffer size
3008 			 * accordingly.
3009 			 *
3010 			 * XXX
3011 			 * Zero-out the padding, as required by the RNDIS spec.
3012 			 */
3013 			olen = pkt->rm_len;
3014 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3015 			agg_txd->chim_size += pkt->rm_len - olen;
3016 
3017 			/* Link this txdesc to the parent. */
3018 			hn_txdesc_agg(agg_txd, txd);
3019 
3020 			chim = (uint8_t *)pkt + pkt->rm_len;
3021 			/* Save the current packet for later fixup. */
3022 			txr->hn_agg_prevpkt = chim;
3023 
3024 			txr->hn_agg_pktleft--;
3025 			txr->hn_agg_szleft -= pktsize;
3026 			if (txr->hn_agg_szleft <=
3027 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3028 				/*
3029 				 * Probably can't aggregate more packets,
3030 				 * flush this aggregating txdesc proactively.
3031 				 */
3032 				txr->hn_agg_pktleft = 0;
3033 			}
3034 			/* Done! */
3035 			return (chim);
3036 		}
3037 		hn_flush_txagg(ifp, txr);
3038 	}
3039 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3040 
3041 	txr->hn_tx_chimney_tried++;
3042 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3043 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3044 		return (NULL);
3045 	txr->hn_tx_chimney++;
3046 
3047 	chim = txr->hn_sc->hn_chim +
3048 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3049 
3050 	if (txr->hn_agg_pktmax > 1 &&
3051 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3052 		txr->hn_agg_txd = txd;
3053 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3054 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3055 		txr->hn_agg_prevpkt = chim;
3056 	}
3057 	return (chim);
3058 }
3059 
3060 /*
3061  * NOTE:
3062  * If this function fails, then both txd and m_head0 will be freed.
3063  */
3064 static int
3065 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3066     struct mbuf **m_head0)
3067 {
3068 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3069 	int error, nsegs, i;
3070 	struct mbuf *m_head = *m_head0;
3071 	struct rndis_packet_msg *pkt;
3072 	uint32_t *pi_data;
3073 	void *chim = NULL;
3074 	int pkt_hlen, pkt_size;
3075 
3076 	pkt = txd->rndis_pkt;
3077 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3078 	if (pkt_size < txr->hn_chim_size) {
3079 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3080 		if (chim != NULL)
3081 			pkt = chim;
3082 	} else {
3083 		if (txr->hn_agg_txd != NULL)
3084 			hn_flush_txagg(ifp, txr);
3085 	}
3086 
3087 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3088 	pkt->rm_len = m_head->m_pkthdr.len;
3089 	pkt->rm_dataoffset = 0;
3090 	pkt->rm_datalen = m_head->m_pkthdr.len;
3091 	pkt->rm_oobdataoffset = 0;
3092 	pkt->rm_oobdatalen = 0;
3093 	pkt->rm_oobdataelements = 0;
3094 	pkt->rm_pktinfooffset = sizeof(*pkt);
3095 	pkt->rm_pktinfolen = 0;
3096 	pkt->rm_vchandle = 0;
3097 	pkt->rm_reserved = 0;
3098 
3099 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3100 		/*
3101 		 * Set the hash value for this packet.
3102 		 */
3103 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3104 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3105 
3106 		if (M_HASHTYPE_ISHASH(m_head))
3107 			/*
3108 			 * The flowid field contains the hash value host
3109 			 * set in the rx queue if it is a ip forwarding pkt.
3110 			 * Set the same hash value so host can send on the
3111 			 * cpu it was received.
3112 			 */
3113 			*pi_data = m_head->m_pkthdr.flowid;
3114 		else
3115 			/*
3116 			 * Otherwise just put the tx queue index.
3117 			 */
3118 			*pi_data = txr->hn_tx_idx;
3119 	}
3120 
3121 	if (m_head->m_flags & M_VLANTAG) {
3122 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3123 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3124 		*pi_data = NDIS_VLAN_INFO_MAKE(
3125 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3126 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3127 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3128 	}
3129 
3130 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3131 #if defined(INET6) || defined(INET)
3132 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3133 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3134 #ifdef INET
3135 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3136 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3137 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3138 			    m_head->m_pkthdr.tso_segsz);
3139 		}
3140 #endif
3141 #if defined(INET6) && defined(INET)
3142 		else
3143 #endif
3144 #ifdef INET6
3145 		{
3146 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3147 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3148 			    m_head->m_pkthdr.tso_segsz);
3149 		}
3150 #endif
3151 #endif	/* INET6 || INET */
3152 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3153 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3154 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3155 		if (m_head->m_pkthdr.csum_flags &
3156 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3157 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3158 		} else {
3159 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3160 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3161 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3162 		}
3163 
3164 		if (m_head->m_pkthdr.csum_flags &
3165 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3166 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3167 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3168 		} else if (m_head->m_pkthdr.csum_flags &
3169 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3170 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3171 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3172 		}
3173 	}
3174 
3175 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3176 	/* Fixup RNDIS packet message total length */
3177 	pkt->rm_len += pkt_hlen;
3178 	/* Convert RNDIS packet message offsets */
3179 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3180 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3181 
3182 	/*
3183 	 * Fast path: Chimney sending.
3184 	 */
3185 	if (chim != NULL) {
3186 		struct hn_txdesc *tgt_txd = txd;
3187 
3188 		if (txr->hn_agg_txd != NULL) {
3189 			tgt_txd = txr->hn_agg_txd;
3190 #ifdef INVARIANTS
3191 			*m_head0 = NULL;
3192 #endif
3193 		}
3194 
3195 		KASSERT(pkt == chim,
3196 		    ("RNDIS pkt not in chimney sending buffer"));
3197 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3198 		    ("chimney sending buffer is not used"));
3199 		tgt_txd->chim_size += pkt->rm_len;
3200 
3201 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3202 		    ((uint8_t *)chim) + pkt_hlen);
3203 
3204 		txr->hn_gpa_cnt = 0;
3205 		txr->hn_sendpkt = hn_txpkt_chim;
3206 		goto done;
3207 	}
3208 
3209 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3210 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3211 	    ("chimney buffer is used"));
3212 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3213 
3214 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3215 	if (__predict_false(error)) {
3216 		int freed __diagused;
3217 
3218 		/*
3219 		 * This mbuf is not linked w/ the txd yet, so free it now.
3220 		 */
3221 		m_freem(m_head);
3222 		*m_head0 = NULL;
3223 
3224 		freed = hn_txdesc_put(txr, txd);
3225 		KASSERT(freed != 0,
3226 		    ("fail to free txd upon txdma error"));
3227 
3228 		txr->hn_txdma_failed++;
3229 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3230 		return error;
3231 	}
3232 	*m_head0 = m_head;
3233 
3234 	/* +1 RNDIS packet message */
3235 	txr->hn_gpa_cnt = nsegs + 1;
3236 
3237 	/* send packet with page buffer */
3238 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3239 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3240 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3241 
3242 	/*
3243 	 * Fill the page buffers with mbuf info after the page
3244 	 * buffer for RNDIS packet message.
3245 	 */
3246 	for (i = 0; i < nsegs; ++i) {
3247 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3248 
3249 		gpa->gpa_page = atop(segs[i].ds_addr);
3250 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3251 		gpa->gpa_len = segs[i].ds_len;
3252 	}
3253 
3254 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3255 	txd->chim_size = 0;
3256 	txr->hn_sendpkt = hn_txpkt_sglist;
3257 done:
3258 	txd->m = m_head;
3259 
3260 	/* Set the completion routine */
3261 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3262 
3263 	/* Update temporary stats for later use. */
3264 	txr->hn_stat_pkts++;
3265 	txr->hn_stat_size += m_head->m_pkthdr.len;
3266 	if (m_head->m_flags & M_MCAST)
3267 		txr->hn_stat_mcasts++;
3268 
3269 	return 0;
3270 }
3271 
3272 /*
3273  * NOTE:
3274  * If this function fails, then txd will be freed, but the mbuf
3275  * associated w/ the txd will _not_ be freed.
3276  */
3277 static int
3278 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3279 {
3280 	int error, send_failed = 0, has_bpf;
3281 
3282 again:
3283 	has_bpf = bpf_peers_present(ifp->if_bpf);
3284 	if (has_bpf) {
3285 		/*
3286 		 * Make sure that this txd and any aggregated txds are not
3287 		 * freed before ETHER_BPF_MTAP.
3288 		 */
3289 		hn_txdesc_hold(txd);
3290 	}
3291 	error = txr->hn_sendpkt(txr, txd);
3292 	if (!error) {
3293 		if (has_bpf) {
3294 			const struct hn_txdesc *tmp_txd;
3295 
3296 			ETHER_BPF_MTAP(ifp, txd->m);
3297 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3298 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3299 		}
3300 
3301 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3302 #ifdef HN_IFSTART_SUPPORT
3303 		if (!hn_use_if_start)
3304 #endif
3305 		{
3306 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3307 			    txr->hn_stat_size);
3308 			if (txr->hn_stat_mcasts != 0) {
3309 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3310 				    txr->hn_stat_mcasts);
3311 			}
3312 		}
3313 		txr->hn_pkts += txr->hn_stat_pkts;
3314 		txr->hn_sends++;
3315 	}
3316 	if (has_bpf)
3317 		hn_txdesc_put(txr, txd);
3318 
3319 	if (__predict_false(error)) {
3320 		int freed __diagused;
3321 
3322 		/*
3323 		 * This should "really rarely" happen.
3324 		 *
3325 		 * XXX Too many RX to be acked or too many sideband
3326 		 * commands to run?  Ask netvsc_channel_rollup()
3327 		 * to kick start later.
3328 		 */
3329 		txr->hn_has_txeof = 1;
3330 		if (!send_failed) {
3331 			txr->hn_send_failed++;
3332 			send_failed = 1;
3333 			/*
3334 			 * Try sending again after set hn_has_txeof;
3335 			 * in case that we missed the last
3336 			 * netvsc_channel_rollup().
3337 			 */
3338 			goto again;
3339 		}
3340 		if_printf(ifp, "send failed\n");
3341 
3342 		/*
3343 		 * Caller will perform further processing on the
3344 		 * associated mbuf, so don't free it in hn_txdesc_put();
3345 		 * only unload it from the DMA map in hn_txdesc_put(),
3346 		 * if it was loaded.
3347 		 */
3348 		txd->m = NULL;
3349 		freed = hn_txdesc_put(txr, txd);
3350 		KASSERT(freed != 0,
3351 		    ("fail to free txd upon send error"));
3352 
3353 		txr->hn_send_failed++;
3354 	}
3355 
3356 	/* Reset temporary stats, after this sending is done. */
3357 	txr->hn_stat_size = 0;
3358 	txr->hn_stat_pkts = 0;
3359 	txr->hn_stat_mcasts = 0;
3360 
3361 	return (error);
3362 }
3363 
3364 /*
3365  * Append the specified data to the indicated mbuf chain,
3366  * Extend the mbuf chain if the new data does not fit in
3367  * existing space.
3368  *
3369  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3370  * There should be an equivalent in the kernel mbuf code,
3371  * but there does not appear to be one yet.
3372  *
3373  * Differs from m_append() in that additional mbufs are
3374  * allocated with cluster size MJUMPAGESIZE, and filled
3375  * accordingly.
3376  *
3377  * Return the last mbuf in the chain or NULL if failed to
3378  * allocate new mbuf.
3379  */
3380 static struct mbuf *
3381 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3382 {
3383 	struct mbuf *m, *n;
3384 	int remainder, space;
3385 
3386 	for (m = m0; m->m_next != NULL; m = m->m_next)
3387 		;
3388 	remainder = len;
3389 	space = M_TRAILINGSPACE(m);
3390 	if (space > 0) {
3391 		/*
3392 		 * Copy into available space.
3393 		 */
3394 		if (space > remainder)
3395 			space = remainder;
3396 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3397 		m->m_len += space;
3398 		cp += space;
3399 		remainder -= space;
3400 	}
3401 	while (remainder > 0) {
3402 		/*
3403 		 * Allocate a new mbuf; could check space
3404 		 * and allocate a cluster instead.
3405 		 */
3406 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3407 		if (n == NULL)
3408 			return NULL;
3409 		n->m_len = min(MJUMPAGESIZE, remainder);
3410 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3411 		cp += n->m_len;
3412 		remainder -= n->m_len;
3413 		m->m_next = n;
3414 		m = n;
3415 	}
3416 
3417 	return m;
3418 }
3419 
#if defined(INET) || defined(INET6)
/*
 * Feed one received mbuf to LRO.
 *
 * If hn_lro_mbufq_depth is set (and the kernel is recent enough), the
 * mbuf is queued with tcp_lro_queue_mbuf() and 0 is returned.
 * Otherwise the result of tcp_lro_rx() is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
	int rc;

#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth != 0) {
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	rc = tcp_lro_rx(lc, m, 0);
	return (rc);
}
#endif
3433 
3434 static int
3435 hn_rxpkt(struct hn_rx_ring *rxr)
3436 {
3437 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3438 	struct mbuf *m_new, *n;
3439 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3440 	int hash_type = M_HASHTYPE_NONE;
3441 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3442 	int i;
3443 
3444 	ifp = hn_ifp;
3445 	if (rxr->hn_rxvf_ifp != NULL) {
3446 		/*
3447 		 * Non-transparent mode VF; pretend this packet is from
3448 		 * the VF.
3449 		 */
3450 		ifp = rxr->hn_rxvf_ifp;
3451 		is_vf = 1;
3452 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3453 		/* Transparent mode VF. */
3454 		is_vf = 1;
3455 	}
3456 
3457 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3458 		/*
3459 		 * NOTE:
3460 		 * See the NOTE of hn_rndis_init_fixat().  This
3461 		 * function can be reached, immediately after the
3462 		 * RNDIS is initialized but before the ifnet is
3463 		 * setup on the hn_attach() path; drop the unexpected
3464 		 * packets.
3465 		 */
3466 		return (0);
3467 	}
3468 
3469 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3470 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3471 		return (0);
3472 	}
3473 
3474 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3475 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3476 		if (m_new == NULL) {
3477 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3478 			return (0);
3479 		}
3480 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3481 		    rxr->rsc.frag_len[0]);
3482 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3483 	} else {
3484 		/*
3485 		 * Get an mbuf with a cluster.  For packets 2K or less,
3486 		 * get a standard 2K cluster.  For anything larger, get a
3487 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3488 		 * if looped around to the Hyper-V TX channel, so avoid them.
3489 		 */
3490 		size = MCLBYTES;
3491 		if (rxr->rsc.pktlen > MCLBYTES) {
3492 			/* 4096 */
3493 			size = MJUMPAGESIZE;
3494 		}
3495 
3496 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3497 		if (m_new == NULL) {
3498 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3499 			return (0);
3500 		}
3501 
3502 		n = m_new;
3503 		for (i = 0; i < rxr->rsc.cnt; i++) {
3504 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3505 			    rxr->rsc.frag_data[i]);
3506 			if (n == NULL) {
3507 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3508 				return (0);
3509 			} else {
3510 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3511 			}
3512 		}
3513 	}
3514 	if (rxr->rsc.pktlen <= MHLEN)
3515 		rxr->hn_small_pkts++;
3516 
3517 	m_new->m_pkthdr.rcvif = ifp;
3518 
3519 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3520 		do_csum = 0;
3521 
3522 	/* receive side checksum offload */
3523 	if (rxr->rsc.csum_info != NULL) {
3524 		/* IP csum offload */
3525 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3526 			m_new->m_pkthdr.csum_flags |=
3527 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3528 			rxr->hn_csum_ip++;
3529 		}
3530 
3531 		/* TCP/UDP csum offload */
3532 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3533 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3534 			m_new->m_pkthdr.csum_flags |=
3535 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3536 			m_new->m_pkthdr.csum_data = 0xffff;
3537 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3538 				rxr->hn_csum_tcp++;
3539 			else
3540 				rxr->hn_csum_udp++;
3541 		}
3542 
3543 		/*
3544 		 * XXX
3545 		 * As of this write (Oct 28th, 2016), host side will turn
3546 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3547 		 * the do_lro setting here is actually _not_ accurate.  We
3548 		 * depend on the RSS hash type check to reset do_lro.
3549 		 */
3550 		if ((*(rxr->rsc.csum_info) &
3551 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3552 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3553 			do_lro = 1;
3554 	} else {
3555 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3556 		if (l3proto == ETHERTYPE_IP) {
3557 			if (l4proto == IPPROTO_TCP) {
3558 				if (do_csum &&
3559 				    (rxr->hn_trust_hcsum &
3560 				     HN_TRUST_HCSUM_TCP)) {
3561 					rxr->hn_csum_trusted++;
3562 					m_new->m_pkthdr.csum_flags |=
3563 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3564 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3565 					m_new->m_pkthdr.csum_data = 0xffff;
3566 				}
3567 				do_lro = 1;
3568 			} else if (l4proto == IPPROTO_UDP) {
3569 				if (do_csum &&
3570 				    (rxr->hn_trust_hcsum &
3571 				     HN_TRUST_HCSUM_UDP)) {
3572 					rxr->hn_csum_trusted++;
3573 					m_new->m_pkthdr.csum_flags |=
3574 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3575 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3576 					m_new->m_pkthdr.csum_data = 0xffff;
3577 				}
3578 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3579 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3580 				rxr->hn_csum_trusted++;
3581 				m_new->m_pkthdr.csum_flags |=
3582 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3583 			}
3584 		}
3585 	}
3586 
3587 	if (rxr->rsc.vlan_info != NULL) {
3588 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3589 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3590 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3591 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3592 		m_new->m_flags |= M_VLANTAG;
3593 	}
3594 
3595 	/*
3596 	 * If VF is activated (tranparent/non-transparent mode does not
3597 	 * matter here).
3598 	 *
3599 	 * - Disable LRO
3600 	 *
3601 	 *   hn(4) will only receive broadcast packets, multicast packets,
3602 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3603 	 *   packet types.
3604 	 *
3605 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3606 	 *   all, since the LRO flush will use hn(4) as the receiving
3607 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3608 	 */
3609 	if (is_vf)
3610 		do_lro = 0;
3611 
3612 	/*
3613 	 * If VF is activated (tranparent/non-transparent mode does not
3614 	 * matter here), do _not_ mess with unsupported hash types or
3615 	 * functions.
3616 	 */
3617 	if (rxr->rsc.hash_info != NULL) {
3618 		rxr->hn_rss_pkts++;
3619 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3620 		if (!is_vf)
3621 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3622 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3623 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3624 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3625 			    rxr->hn_mbuf_hash);
3626 
3627 			/*
3628 			 * NOTE:
3629 			 * do_lro is resetted, if the hash types are not TCP
3630 			 * related.  See the comment in the above csum_flags
3631 			 * setup section.
3632 			 */
3633 			switch (type) {
3634 			case NDIS_HASH_IPV4:
3635 				hash_type = M_HASHTYPE_RSS_IPV4;
3636 				do_lro = 0;
3637 				break;
3638 
3639 			case NDIS_HASH_TCP_IPV4:
3640 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3641 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3642 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3643 
3644 					if (is_vf)
3645 						def_htype = M_HASHTYPE_NONE;
3646 
3647 					/*
3648 					 * UDP 4-tuple hash is delivered as
3649 					 * TCP 4-tuple hash.
3650 					 */
3651 					if (l3proto == ETHERTYPE_MAX) {
3652 						hn_rxpkt_proto(m_new,
3653 						    &l3proto, &l4proto);
3654 					}
3655 					if (l3proto == ETHERTYPE_IP) {
3656 						if (l4proto == IPPROTO_UDP &&
3657 						    (rxr->hn_mbuf_hash &
3658 						     NDIS_HASH_UDP_IPV4_X)) {
3659 							hash_type =
3660 							M_HASHTYPE_RSS_UDP_IPV4;
3661 							do_lro = 0;
3662 						} else if (l4proto !=
3663 						    IPPROTO_TCP) {
3664 							hash_type = def_htype;
3665 							do_lro = 0;
3666 						}
3667 					} else {
3668 						hash_type = def_htype;
3669 						do_lro = 0;
3670 					}
3671 				}
3672 				break;
3673 
3674 			case NDIS_HASH_IPV6:
3675 				hash_type = M_HASHTYPE_RSS_IPV6;
3676 				do_lro = 0;
3677 				break;
3678 
3679 			case NDIS_HASH_IPV6_EX:
3680 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3681 				do_lro = 0;
3682 				break;
3683 
3684 			case NDIS_HASH_TCP_IPV6:
3685 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3686 				break;
3687 
3688 			case NDIS_HASH_TCP_IPV6_EX:
3689 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3690 				break;
3691 			}
3692 		}
3693 	} else if (!is_vf) {
3694 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3695 		hash_type = M_HASHTYPE_OPAQUE;
3696 	}
3697 	M_HASHTYPE_SET(m_new, hash_type);
3698 
3699 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3700 	if (hn_ifp != ifp) {
3701 		const struct ether_header *eh;
3702 
3703 		/*
3704 		 * Non-transparent mode VF is activated.
3705 		 */
3706 
3707 		/*
3708 		 * Allow tapping on hn(4).
3709 		 */
3710 		ETHER_BPF_MTAP(hn_ifp, m_new);
3711 
3712 		/*
3713 		 * Update hn(4)'s stats.
3714 		 */
3715 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3716 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3717 		/* Checked at the beginning of this function. */
3718 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3719 		eh = mtod(m_new, struct ether_header *);
3720 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3721 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3722 	}
3723 	rxr->hn_pkts++;
3724 
3725 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3726 #if defined(INET) || defined(INET6)
3727 		struct lro_ctrl *lro = &rxr->hn_lro;
3728 
3729 		if (lro->lro_cnt) {
3730 			rxr->hn_lro_tried++;
3731 			if (hn_lro_rx(lro, m_new) == 0) {
3732 				/* DONE! */
3733 				return 0;
3734 			}
3735 		}
3736 #endif
3737 	}
3738 	ifp->if_input(ifp, m_new);
3739 
3740 	return (0);
3741 }
3742 
3743 static int
3744 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3745 {
3746 	struct hn_softc *sc = ifp->if_softc;
3747 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3748 	struct ifnet *vf_ifp;
3749 	int mask, error = 0;
3750 	struct ifrsskey *ifrk;
3751 	struct ifrsshash *ifrh;
3752 	uint32_t mtu;
3753 
3754 	switch (cmd) {
3755 	case SIOCSIFMTU:
3756 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3757 			error = EINVAL;
3758 			break;
3759 		}
3760 
3761 		HN_LOCK(sc);
3762 
3763 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3764 			HN_UNLOCK(sc);
3765 			break;
3766 		}
3767 
3768 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3769 			/* Can't change MTU */
3770 			HN_UNLOCK(sc);
3771 			error = EOPNOTSUPP;
3772 			break;
3773 		}
3774 
3775 		if (ifp->if_mtu == ifr->ifr_mtu) {
3776 			HN_UNLOCK(sc);
3777 			break;
3778 		}
3779 
3780 		if (hn_xpnt_vf_isready(sc)) {
3781 			vf_ifp = sc->hn_vf_ifp;
3782 			ifr_vf = *ifr;
3783 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3784 			    sizeof(ifr_vf.ifr_name));
3785 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3786 			    (caddr_t)&ifr_vf);
3787 			if (error) {
3788 				HN_UNLOCK(sc);
3789 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3790 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3791 				break;
3792 			}
3793 		}
3794 
3795 		/*
3796 		 * Suspend this interface before the synthetic parts
3797 		 * are ripped.
3798 		 */
3799 		hn_suspend(sc);
3800 
3801 		/*
3802 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3803 		 */
3804 		hn_synth_detach(sc);
3805 
3806 		/*
3807 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3808 		 * with the new MTU setting.
3809 		 */
3810 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3811 		if (error) {
3812 			HN_UNLOCK(sc);
3813 			break;
3814 		}
3815 
3816 		error = hn_rndis_get_mtu(sc, &mtu);
3817 		if (error)
3818 			mtu = ifr->ifr_mtu;
3819 		else if (bootverbose)
3820 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3821 
3822 		/*
3823 		 * Commit the requested MTU, after the synthetic parts
3824 		 * have been successfully attached.
3825 		 */
3826 		if (mtu >= ifr->ifr_mtu) {
3827 			mtu = ifr->ifr_mtu;
3828 		} else {
3829 			if_printf(ifp, "fixup mtu %d -> %u\n",
3830 			    ifr->ifr_mtu, mtu);
3831 		}
3832 		ifp->if_mtu = mtu;
3833 
3834 		/*
3835 		 * Synthetic parts' reattach may change the chimney
3836 		 * sending size; update it.
3837 		 */
3838 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3839 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3840 
3841 		/*
3842 		 * Make sure that various parameters based on MTU are
3843 		 * still valid, after the MTU change.
3844 		 */
3845 		hn_mtu_change_fixup(sc);
3846 
3847 		/*
3848 		 * All done!  Resume the interface now.
3849 		 */
3850 		hn_resume(sc);
3851 
3852 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3853 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3854 			/*
3855 			 * Since we have reattached the NVS part,
3856 			 * change the datapath to VF again; in case
3857 			 * that it is lost, after the NVS was detached.
3858 			 */
3859 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3860 		}
3861 
3862 		HN_UNLOCK(sc);
3863 		break;
3864 
3865 	case SIOCSIFFLAGS:
3866 		HN_LOCK(sc);
3867 
3868 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3869 			HN_UNLOCK(sc);
3870 			break;
3871 		}
3872 
3873 		if (hn_xpnt_vf_isready(sc))
3874 			hn_xpnt_vf_saveifflags(sc);
3875 
3876 		if (ifp->if_flags & IFF_UP) {
3877 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3878 				/*
3879 				 * Caller meight hold mutex, e.g.
3880 				 * bpf; use busy-wait for the RNDIS
3881 				 * reply.
3882 				 */
3883 				HN_NO_SLEEPING(sc);
3884 				hn_rxfilter_config(sc);
3885 				HN_SLEEPING_OK(sc);
3886 
3887 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3888 					error = hn_xpnt_vf_iocsetflags(sc);
3889 			} else {
3890 				hn_init_locked(sc);
3891 			}
3892 		} else {
3893 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3894 				hn_stop(sc, false);
3895 		}
3896 		sc->hn_if_flags = ifp->if_flags;
3897 
3898 		HN_UNLOCK(sc);
3899 		break;
3900 
3901 	case SIOCSIFCAP:
3902 		HN_LOCK(sc);
3903 
3904 		if (hn_xpnt_vf_isready(sc)) {
3905 			ifr_vf = *ifr;
3906 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3907 			    sizeof(ifr_vf.ifr_name));
3908 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3909 			HN_UNLOCK(sc);
3910 			break;
3911 		}
3912 
3913 		/*
3914 		 * Fix up requested capabilities w/ supported capabilities,
3915 		 * since the supported capabilities could have been changed.
3916 		 */
3917 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3918 		    ifp->if_capenable;
3919 
3920 		if (mask & IFCAP_TXCSUM) {
3921 			ifp->if_capenable ^= IFCAP_TXCSUM;
3922 			if (ifp->if_capenable & IFCAP_TXCSUM)
3923 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3924 			else
3925 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3926 		}
3927 		if (mask & IFCAP_TXCSUM_IPV6) {
3928 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3929 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3930 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3931 			else
3932 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3933 		}
3934 
3935 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3936 		if (mask & IFCAP_RXCSUM)
3937 			ifp->if_capenable ^= IFCAP_RXCSUM;
3938 #ifdef foo
3939 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3940 		if (mask & IFCAP_RXCSUM_IPV6)
3941 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3942 #endif
3943 
3944 		if (mask & IFCAP_LRO)
3945 			ifp->if_capenable ^= IFCAP_LRO;
3946 
3947 		if (mask & IFCAP_TSO4) {
3948 			ifp->if_capenable ^= IFCAP_TSO4;
3949 			if (ifp->if_capenable & IFCAP_TSO4)
3950 				ifp->if_hwassist |= CSUM_IP_TSO;
3951 			else
3952 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3953 		}
3954 		if (mask & IFCAP_TSO6) {
3955 			ifp->if_capenable ^= IFCAP_TSO6;
3956 			if (ifp->if_capenable & IFCAP_TSO6)
3957 				ifp->if_hwassist |= CSUM_IP6_TSO;
3958 			else
3959 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3960 		}
3961 
3962 		HN_UNLOCK(sc);
3963 		break;
3964 
3965 	case SIOCADDMULTI:
3966 	case SIOCDELMULTI:
3967 		HN_LOCK(sc);
3968 
3969 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3970 			HN_UNLOCK(sc);
3971 			break;
3972 		}
3973 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3974 			/*
3975 			 * Multicast uses mutex; use busy-wait for
3976 			 * the RNDIS reply.
3977 			 */
3978 			HN_NO_SLEEPING(sc);
3979 			hn_rxfilter_config(sc);
3980 			HN_SLEEPING_OK(sc);
3981 		}
3982 
3983 		/* XXX vlan(4) style mcast addr maintenance */
3984 		if (hn_xpnt_vf_isready(sc)) {
3985 			int old_if_flags;
3986 
3987 			old_if_flags = sc->hn_vf_ifp->if_flags;
3988 			hn_xpnt_vf_saveifflags(sc);
3989 
3990 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3991 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3992 			     IFF_ALLMULTI))
3993 				error = hn_xpnt_vf_iocsetflags(sc);
3994 		}
3995 
3996 		HN_UNLOCK(sc);
3997 		break;
3998 
3999 	case SIOCSIFMEDIA:
4000 	case SIOCGIFMEDIA:
4001 		HN_LOCK(sc);
4002 		if (hn_xpnt_vf_isready(sc)) {
4003 			/*
4004 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4005 			 * create and pass ifr_vf to the VF here; just
4006 			 * replace the ifr_name.
4007 			 */
4008 			vf_ifp = sc->hn_vf_ifp;
4009 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4010 			    sizeof(ifr->ifr_name));
4011 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4012 			/* Restore the ifr_name. */
4013 			strlcpy(ifr->ifr_name, ifp->if_xname,
4014 			    sizeof(ifr->ifr_name));
4015 			HN_UNLOCK(sc);
4016 			break;
4017 		}
4018 		HN_UNLOCK(sc);
4019 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4020 		break;
4021 
4022 	case SIOCGIFRSSHASH:
4023 		ifrh = (struct ifrsshash *)data;
4024 		HN_LOCK(sc);
4025 		if (sc->hn_rx_ring_inuse == 1) {
4026 			HN_UNLOCK(sc);
4027 			ifrh->ifrh_func = RSS_FUNC_NONE;
4028 			ifrh->ifrh_types = 0;
4029 			break;
4030 		}
4031 
4032 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4033 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4034 		else
4035 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4036 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4037 		HN_UNLOCK(sc);
4038 		break;
4039 
4040 	case SIOCGIFRSSKEY:
4041 		ifrk = (struct ifrsskey *)data;
4042 		HN_LOCK(sc);
4043 		if (sc->hn_rx_ring_inuse == 1) {
4044 			HN_UNLOCK(sc);
4045 			ifrk->ifrk_func = RSS_FUNC_NONE;
4046 			ifrk->ifrk_keylen = 0;
4047 			break;
4048 		}
4049 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4050 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4051 		else
4052 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4053 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4054 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4055 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4056 		HN_UNLOCK(sc);
4057 		break;
4058 
4059 	default:
4060 		error = ether_ioctl(ifp, cmd, data);
4061 		break;
4062 	}
4063 	return (error);
4064 }
4065 
4066 static void
4067 hn_stop(struct hn_softc *sc, bool detaching)
4068 {
4069 	struct ifnet *ifp = sc->hn_ifp;
4070 	int i;
4071 
4072 	HN_LOCK_ASSERT(sc);
4073 
4074 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4075 	    ("synthetic parts were not attached"));
4076 
4077 	/* Clear RUNNING bit ASAP. */
4078 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4079 
4080 	/* Disable polling. */
4081 	hn_polling(sc, 0);
4082 
4083 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4084 		KASSERT(sc->hn_vf_ifp != NULL,
4085 		    ("%s: VF is not attached", ifp->if_xname));
4086 
4087 		/* Mark transparent mode VF as disabled. */
4088 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4089 
4090 		/*
4091 		 * NOTE:
4092 		 * Datapath setting must happen _before_ bringing
4093 		 * the VF down.
4094 		 */
4095 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4096 
4097 		/*
4098 		 * Bring the VF down.
4099 		 */
4100 		hn_xpnt_vf_saveifflags(sc);
4101 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4102 		hn_xpnt_vf_iocsetflags(sc);
4103 	}
4104 
4105 	/* Suspend data transfers. */
4106 	hn_suspend_data(sc);
4107 
4108 	/* Clear OACTIVE bit. */
4109 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4110 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4111 		sc->hn_tx_ring[i].hn_oactive = 0;
4112 
4113 	/*
4114 	 * If the non-transparent mode VF is active, make sure
4115 	 * that the RX filter still allows packet reception.
4116 	 */
4117 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4118 		hn_rxfilter_config(sc);
4119 }
4120 
4121 static void
4122 hn_init_locked(struct hn_softc *sc)
4123 {
4124 	struct ifnet *ifp = sc->hn_ifp;
4125 	int i;
4126 
4127 	HN_LOCK_ASSERT(sc);
4128 
4129 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4130 		return;
4131 
4132 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4133 		return;
4134 
4135 	/* Configure RX filter */
4136 	hn_rxfilter_config(sc);
4137 
4138 	/* Clear OACTIVE bit. */
4139 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4140 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4141 		sc->hn_tx_ring[i].hn_oactive = 0;
4142 
4143 	/* Clear TX 'suspended' bit. */
4144 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4145 
4146 	if (hn_xpnt_vf_isready(sc)) {
4147 		/* Initialize transparent VF. */
4148 		hn_xpnt_vf_init(sc);
4149 	}
4150 
4151 	/* Everything is ready; unleash! */
4152 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4153 
4154 	/* Re-enable polling if requested. */
4155 	if (sc->hn_pollhz > 0)
4156 		hn_polling(sc, sc->hn_pollhz);
4157 }
4158 
/*
 * Locked wrapper around hn_init_locked(); 'xsc' is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *softc;

	softc = xsc;
	HN_LOCK(softc);
	hn_init_locked(softc);
	HN_UNLOCK(softc);
}
4168 
4169 #if __FreeBSD_version >= 1100099
4170 
4171 static int
4172 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4173 {
4174 	struct hn_softc *sc = arg1;
4175 	unsigned int lenlim;
4176 	int error;
4177 
4178 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4179 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4180 	if (error || req->newptr == NULL)
4181 		return error;
4182 
4183 	HN_LOCK(sc);
4184 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4185 	    lenlim > TCP_LRO_LENGTH_MAX) {
4186 		HN_UNLOCK(sc);
4187 		return EINVAL;
4188 	}
4189 	hn_set_lro_lenlim(sc, lenlim);
4190 	HN_UNLOCK(sc);
4191 
4192 	return 0;
4193 }
4194 
4195 static int
4196 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4197 {
4198 	struct hn_softc *sc = arg1;
4199 	int ackcnt, error, i;
4200 
4201 	/*
4202 	 * lro_ackcnt_lim is append count limit,
4203 	 * +1 to turn it into aggregation limit.
4204 	 */
4205 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4206 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4207 	if (error || req->newptr == NULL)
4208 		return error;
4209 
4210 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4211 		return EINVAL;
4212 
4213 	/*
4214 	 * Convert aggregation limit back to append
4215 	 * count limit.
4216 	 */
4217 	--ackcnt;
4218 	HN_LOCK(sc);
4219 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4220 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4221 	HN_UNLOCK(sc);
4222 	return 0;
4223 }
4224 
4225 #endif
4226 
4227 static int
4228 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4229 {
4230 	struct hn_softc *sc = arg1;
4231 	int hcsum = arg2;
4232 	int on, error, i;
4233 
4234 	on = 0;
4235 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4236 		on = 1;
4237 
4238 	error = sysctl_handle_int(oidp, &on, 0, req);
4239 	if (error || req->newptr == NULL)
4240 		return error;
4241 
4242 	HN_LOCK(sc);
4243 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4244 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4245 
4246 		if (on)
4247 			rxr->hn_trust_hcsum |= hcsum;
4248 		else
4249 			rxr->hn_trust_hcsum &= ~hcsum;
4250 	}
4251 	HN_UNLOCK(sc);
4252 	return 0;
4253 }
4254 
4255 static int
4256 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4257 {
4258 	struct hn_softc *sc = arg1;
4259 	int chim_size, error;
4260 
4261 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4262 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4263 	if (error || req->newptr == NULL)
4264 		return error;
4265 
4266 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4267 		return EINVAL;
4268 
4269 	HN_LOCK(sc);
4270 	hn_set_chim_size(sc, chim_size);
4271 	HN_UNLOCK(sc);
4272 	return 0;
4273 }
4274 
4275 #if __FreeBSD_version < 1100095
4276 static int
4277 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4278 {
4279 	struct hn_softc *sc = arg1;
4280 	int ofs = arg2, i, error;
4281 	struct hn_rx_ring *rxr;
4282 	uint64_t stat;
4283 
4284 	stat = 0;
4285 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4286 		rxr = &sc->hn_rx_ring[i];
4287 		stat += *((int *)((uint8_t *)rxr + ofs));
4288 	}
4289 
4290 	error = sysctl_handle_64(oidp, &stat, 0, req);
4291 	if (error || req->newptr == NULL)
4292 		return error;
4293 
4294 	/* Zero out this stat. */
4295 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4296 		rxr = &sc->hn_rx_ring[i];
4297 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4298 	}
4299 	return 0;
4300 }
4301 #else
4302 static int
4303 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4304 {
4305 	struct hn_softc *sc = arg1;
4306 	int ofs = arg2, i, error;
4307 	struct hn_rx_ring *rxr;
4308 	uint64_t stat;
4309 
4310 	stat = 0;
4311 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4312 		rxr = &sc->hn_rx_ring[i];
4313 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4314 	}
4315 
4316 	error = sysctl_handle_64(oidp, &stat, 0, req);
4317 	if (error || req->newptr == NULL)
4318 		return error;
4319 
4320 	/* Zero out this stat. */
4321 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4322 		rxr = &sc->hn_rx_ring[i];
4323 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4324 	}
4325 	return 0;
4326 }
4327 
4328 #endif
4329 
4330 static int
4331 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4332 {
4333 	struct hn_softc *sc = arg1;
4334 	int ofs = arg2, i, error;
4335 	struct hn_rx_ring *rxr;
4336 	u_long stat;
4337 
4338 	stat = 0;
4339 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4340 		rxr = &sc->hn_rx_ring[i];
4341 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4342 	}
4343 
4344 	error = sysctl_handle_long(oidp, &stat, 0, req);
4345 	if (error || req->newptr == NULL)
4346 		return error;
4347 
4348 	/* Zero out this stat. */
4349 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4350 		rxr = &sc->hn_rx_ring[i];
4351 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4352 	}
4353 	return 0;
4354 }
4355 
4356 static int
4357 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4358 {
4359 	struct hn_softc *sc = arg1;
4360 	int ofs = arg2, i, error;
4361 	struct hn_tx_ring *txr;
4362 	u_long stat;
4363 
4364 	stat = 0;
4365 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4366 		txr = &sc->hn_tx_ring[i];
4367 		stat += *((u_long *)((uint8_t *)txr + ofs));
4368 	}
4369 
4370 	error = sysctl_handle_long(oidp, &stat, 0, req);
4371 	if (error || req->newptr == NULL)
4372 		return error;
4373 
4374 	/* Zero out this stat. */
4375 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4376 		txr = &sc->hn_tx_ring[i];
4377 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4378 	}
4379 	return 0;
4380 }
4381 
4382 static int
4383 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4384 {
4385 	struct hn_softc *sc = arg1;
4386 	int ofs = arg2, i, error, conf;
4387 	struct hn_tx_ring *txr;
4388 
4389 	txr = &sc->hn_tx_ring[0];
4390 	conf = *((int *)((uint8_t *)txr + ofs));
4391 
4392 	error = sysctl_handle_int(oidp, &conf, 0, req);
4393 	if (error || req->newptr == NULL)
4394 		return error;
4395 
4396 	HN_LOCK(sc);
4397 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4398 		txr = &sc->hn_tx_ring[i];
4399 		*((int *)((uint8_t *)txr + ofs)) = conf;
4400 	}
4401 	HN_UNLOCK(sc);
4402 
4403 	return 0;
4404 }
4405 
4406 static int
4407 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4408 {
4409 	struct hn_softc *sc = arg1;
4410 	int error, size;
4411 
4412 	size = sc->hn_agg_size;
4413 	error = sysctl_handle_int(oidp, &size, 0, req);
4414 	if (error || req->newptr == NULL)
4415 		return (error);
4416 
4417 	HN_LOCK(sc);
4418 	sc->hn_agg_size = size;
4419 	hn_set_txagg(sc);
4420 	HN_UNLOCK(sc);
4421 
4422 	return (0);
4423 }
4424 
4425 static int
4426 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4427 {
4428 	struct hn_softc *sc = arg1;
4429 	int error, pkts;
4430 
4431 	pkts = sc->hn_agg_pkts;
4432 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4433 	if (error || req->newptr == NULL)
4434 		return (error);
4435 
4436 	HN_LOCK(sc);
4437 	sc->hn_agg_pkts = pkts;
4438 	hn_set_txagg(sc);
4439 	HN_UNLOCK(sc);
4440 
4441 	return (0);
4442 }
4443 
4444 static int
4445 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4446 {
4447 	struct hn_softc *sc = arg1;
4448 	int pkts;
4449 
4450 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4451 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4452 }
4453 
4454 static int
4455 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4456 {
4457 	struct hn_softc *sc = arg1;
4458 	int align;
4459 
4460 	align = sc->hn_tx_ring[0].hn_agg_align;
4461 	return (sysctl_handle_int(oidp, &align, 0, req));
4462 }
4463 
4464 static void
4465 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4466 {
4467 	if (pollhz == 0)
4468 		vmbus_chan_poll_disable(chan);
4469 	else
4470 		vmbus_chan_poll_enable(chan, pollhz);
4471 }
4472 
4473 static void
4474 hn_polling(struct hn_softc *sc, u_int pollhz)
4475 {
4476 	int nsubch = sc->hn_rx_ring_inuse - 1;
4477 
4478 	HN_LOCK_ASSERT(sc);
4479 
4480 	if (nsubch > 0) {
4481 		struct vmbus_channel **subch;
4482 		int i;
4483 
4484 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4485 		for (i = 0; i < nsubch; ++i)
4486 			hn_chan_polling(subch[i], pollhz);
4487 		vmbus_subchan_rel(subch, nsubch);
4488 	}
4489 	hn_chan_polling(sc->hn_prichan, pollhz);
4490 }
4491 
4492 static int
4493 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4494 {
4495 	struct hn_softc *sc = arg1;
4496 	int pollhz, error;
4497 
4498 	pollhz = sc->hn_pollhz;
4499 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4500 	if (error || req->newptr == NULL)
4501 		return (error);
4502 
4503 	if (pollhz != 0 &&
4504 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4505 		return (EINVAL);
4506 
4507 	HN_LOCK(sc);
4508 	if (sc->hn_pollhz != pollhz) {
4509 		sc->hn_pollhz = pollhz;
4510 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4511 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4512 			hn_polling(sc, sc->hn_pollhz);
4513 	}
4514 	HN_UNLOCK(sc);
4515 
4516 	return (0);
4517 }
4518 
4519 static int
4520 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4521 {
4522 	struct hn_softc *sc = arg1;
4523 	char verstr[16];
4524 
4525 	snprintf(verstr, sizeof(verstr), "%u.%u",
4526 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4527 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4528 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4529 }
4530 
4531 static int
4532 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4533 {
4534 	struct hn_softc *sc = arg1;
4535 	char caps_str[128];
4536 	uint32_t caps;
4537 
4538 	HN_LOCK(sc);
4539 	caps = sc->hn_caps;
4540 	HN_UNLOCK(sc);
4541 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4542 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4543 }
4544 
4545 static int
4546 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4547 {
4548 	struct hn_softc *sc = arg1;
4549 	char assist_str[128];
4550 	uint32_t hwassist;
4551 
4552 	HN_LOCK(sc);
4553 	hwassist = sc->hn_ifp->if_hwassist;
4554 	HN_UNLOCK(sc);
4555 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4556 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4557 }
4558 
4559 static int
4560 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4561 {
4562 	struct hn_softc *sc = arg1;
4563 	char filter_str[128];
4564 	uint32_t filter;
4565 
4566 	HN_LOCK(sc);
4567 	filter = sc->hn_rx_filter;
4568 	HN_UNLOCK(sc);
4569 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4570 	    NDIS_PACKET_TYPES);
4571 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4572 }
4573 
4574 static int
4575 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4576 {
4577 	struct hn_softc *sc = arg1;
4578 	uint32_t mtu;
4579 	int error;
4580 	HN_LOCK(sc);
4581 	error = hn_rndis_get_mtu(sc, &mtu);
4582 	if (error) {
4583 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4584 		goto back;
4585 	}
4586 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4587 	if (error || req->newptr == NULL)
4588 		goto back;
4589 
4590 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4591 	if (error)
4592 		goto back;
4593 	error = hn_rndis_reconf_offload(sc, mtu);
4594 back:
4595 	HN_UNLOCK(sc);
4596 	return (error);
4597 }
4598 #ifndef RSS
4599 
4600 static int
4601 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4602 {
4603 	struct hn_softc *sc = arg1;
4604 	int error;
4605 
4606 	HN_LOCK(sc);
4607 
4608 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4609 	if (error || req->newptr == NULL)
4610 		goto back;
4611 
4612 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4613 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4614 		/*
4615 		 * RSS key is synchronized w/ VF's, don't allow users
4616 		 * to change it.
4617 		 */
4618 		error = EBUSY;
4619 		goto back;
4620 	}
4621 
4622 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4623 	if (error)
4624 		goto back;
4625 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4626 
4627 	if (sc->hn_rx_ring_inuse > 1) {
4628 		error = hn_rss_reconfig(sc);
4629 	} else {
4630 		/* Not RSS capable, at least for now; just save the RSS key. */
4631 		error = 0;
4632 	}
4633 back:
4634 	HN_UNLOCK(sc);
4635 	return (error);
4636 }
4637 
4638 static int
4639 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4640 {
4641 	struct hn_softc *sc = arg1;
4642 	int error;
4643 
4644 	HN_LOCK(sc);
4645 
4646 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4647 	if (error || req->newptr == NULL)
4648 		goto back;
4649 
4650 	/*
4651 	 * Don't allow RSS indirect table change, if this interface is not
4652 	 * RSS capable currently.
4653 	 */
4654 	if (sc->hn_rx_ring_inuse == 1) {
4655 		error = EOPNOTSUPP;
4656 		goto back;
4657 	}
4658 
4659 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4660 	if (error)
4661 		goto back;
4662 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4663 
4664 	hn_rss_ind_fixup(sc);
4665 	error = hn_rss_reconfig(sc);
4666 back:
4667 	HN_UNLOCK(sc);
4668 	return (error);
4669 }
4670 
4671 #endif	/* !RSS */
4672 
4673 static int
4674 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4675 {
4676 	struct hn_softc *sc = arg1;
4677 	char hash_str[128];
4678 	uint32_t hash;
4679 
4680 	HN_LOCK(sc);
4681 	hash = sc->hn_rss_hash;
4682 	HN_UNLOCK(sc);
4683 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4684 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4685 }
4686 
4687 static int
4688 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4689 {
4690 	struct hn_softc *sc = arg1;
4691 	char hash_str[128];
4692 	uint32_t hash;
4693 
4694 	HN_LOCK(sc);
4695 	hash = sc->hn_rss_hcap;
4696 	HN_UNLOCK(sc);
4697 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4698 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4699 }
4700 
4701 static int
4702 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4703 {
4704 	struct hn_softc *sc = arg1;
4705 	char hash_str[128];
4706 	uint32_t hash;
4707 
4708 	HN_LOCK(sc);
4709 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4710 	HN_UNLOCK(sc);
4711 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4712 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4713 }
4714 
4715 static int
4716 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4717 {
4718 	struct hn_softc *sc = arg1;
4719 	char vf_name[IFNAMSIZ + 1];
4720 	struct ifnet *vf_ifp;
4721 
4722 	HN_LOCK(sc);
4723 	vf_name[0] = '\0';
4724 	vf_ifp = sc->hn_vf_ifp;
4725 	if (vf_ifp != NULL)
4726 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4727 	HN_UNLOCK(sc);
4728 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4729 }
4730 
4731 static int
4732 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4733 {
4734 	struct hn_softc *sc = arg1;
4735 	char vf_name[IFNAMSIZ + 1];
4736 	struct ifnet *vf_ifp;
4737 
4738 	HN_LOCK(sc);
4739 	vf_name[0] = '\0';
4740 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4741 	if (vf_ifp != NULL)
4742 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4743 	HN_UNLOCK(sc);
4744 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4745 }
4746 
4747 static int
4748 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4749 {
4750 	struct rm_priotracker pt;
4751 	struct sbuf *sb;
4752 	int error, i;
4753 	bool first;
4754 
4755 	error = sysctl_wire_old_buffer(req, 0);
4756 	if (error != 0)
4757 		return (error);
4758 
4759 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4760 	if (sb == NULL)
4761 		return (ENOMEM);
4762 
4763 	rm_rlock(&hn_vfmap_lock, &pt);
4764 
4765 	first = true;
4766 	for (i = 0; i < hn_vfmap_size; ++i) {
4767 		struct epoch_tracker et;
4768 		struct ifnet *ifp;
4769 
4770 		if (hn_vfmap[i] == NULL)
4771 			continue;
4772 
4773 		NET_EPOCH_ENTER(et);
4774 		ifp = ifnet_byindex(i);
4775 		if (ifp != NULL) {
4776 			if (first)
4777 				sbuf_printf(sb, "%s", ifp->if_xname);
4778 			else
4779 				sbuf_printf(sb, " %s", ifp->if_xname);
4780 			first = false;
4781 		}
4782 		NET_EPOCH_EXIT(et);
4783 	}
4784 
4785 	rm_runlock(&hn_vfmap_lock, &pt);
4786 
4787 	error = sbuf_finish(sb);
4788 	sbuf_delete(sb);
4789 	return (error);
4790 }
4791 
4792 static int
4793 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4794 {
4795 	struct rm_priotracker pt;
4796 	struct sbuf *sb;
4797 	int error, i;
4798 	bool first;
4799 
4800 	error = sysctl_wire_old_buffer(req, 0);
4801 	if (error != 0)
4802 		return (error);
4803 
4804 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4805 	if (sb == NULL)
4806 		return (ENOMEM);
4807 
4808 	rm_rlock(&hn_vfmap_lock, &pt);
4809 
4810 	first = true;
4811 	for (i = 0; i < hn_vfmap_size; ++i) {
4812 		struct epoch_tracker et;
4813 		struct ifnet *ifp, *hn_ifp;
4814 
4815 		hn_ifp = hn_vfmap[i];
4816 		if (hn_ifp == NULL)
4817 			continue;
4818 
4819 		NET_EPOCH_ENTER(et);
4820 		ifp = ifnet_byindex(i);
4821 		if (ifp != NULL) {
4822 			if (first) {
4823 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4824 				    hn_ifp->if_xname);
4825 			} else {
4826 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4827 				    hn_ifp->if_xname);
4828 			}
4829 			first = false;
4830 		}
4831 		NET_EPOCH_EXIT(et);
4832 	}
4833 
4834 	rm_runlock(&hn_vfmap_lock, &pt);
4835 
4836 	error = sbuf_finish(sb);
4837 	sbuf_delete(sb);
4838 	return (error);
4839 }
4840 
4841 static int
4842 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4843 {
4844 	struct hn_softc *sc = arg1;
4845 	int error, onoff = 0;
4846 
4847 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4848 		onoff = 1;
4849 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4850 	if (error || req->newptr == NULL)
4851 		return (error);
4852 
4853 	HN_LOCK(sc);
4854 	/* NOTE: hn_vf_lock for hn_transmit() */
4855 	rm_wlock(&sc->hn_vf_lock);
4856 	if (onoff)
4857 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4858 	else
4859 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4860 	rm_wunlock(&sc->hn_vf_lock);
4861 	HN_UNLOCK(sc);
4862 
4863 	return (0);
4864 }
4865 
4866 static int
4867 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4868 {
4869 	struct hn_softc *sc = arg1;
4870 	int enabled = 0;
4871 
4872 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4873 		enabled = 1;
4874 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4875 }
4876 
4877 static int
4878 hn_check_iplen(const struct mbuf *m, int hoff)
4879 {
4880 	const struct ip *ip;
4881 	int len, iphlen, iplen;
4882 	const struct tcphdr *th;
4883 	int thoff;				/* TCP data offset */
4884 
4885 	len = hoff + sizeof(struct ip);
4886 
4887 	/* The packet must be at least the size of an IP header. */
4888 	if (m->m_pkthdr.len < len)
4889 		return IPPROTO_DONE;
4890 
4891 	/* The fixed IP header must reside completely in the first mbuf. */
4892 	if (m->m_len < len)
4893 		return IPPROTO_DONE;
4894 
4895 	ip = mtodo(m, hoff);
4896 
4897 	/* Bound check the packet's stated IP header length. */
4898 	iphlen = ip->ip_hl << 2;
4899 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4900 		return IPPROTO_DONE;
4901 
4902 	/* The full IP header must reside completely in the one mbuf. */
4903 	if (m->m_len < hoff + iphlen)
4904 		return IPPROTO_DONE;
4905 
4906 	iplen = ntohs(ip->ip_len);
4907 
4908 	/*
4909 	 * Check that the amount of data in the buffers is as
4910 	 * at least much as the IP header would have us expect.
4911 	 */
4912 	if (m->m_pkthdr.len < hoff + iplen)
4913 		return IPPROTO_DONE;
4914 
4915 	/*
4916 	 * Ignore IP fragments.
4917 	 */
4918 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4919 		return IPPROTO_DONE;
4920 
4921 	/*
4922 	 * The TCP/IP or UDP/IP header must be entirely contained within
4923 	 * the first fragment of a packet.
4924 	 */
4925 	switch (ip->ip_p) {
4926 	case IPPROTO_TCP:
4927 		if (iplen < iphlen + sizeof(struct tcphdr))
4928 			return IPPROTO_DONE;
4929 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4930 			return IPPROTO_DONE;
4931 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4932 		thoff = th->th_off << 2;
4933 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4934 			return IPPROTO_DONE;
4935 		if (m->m_len < hoff + iphlen + thoff)
4936 			return IPPROTO_DONE;
4937 		break;
4938 	case IPPROTO_UDP:
4939 		if (iplen < iphlen + sizeof(struct udphdr))
4940 			return IPPROTO_DONE;
4941 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4942 			return IPPROTO_DONE;
4943 		break;
4944 	default:
4945 		if (iplen < iphlen)
4946 			return IPPROTO_DONE;
4947 		break;
4948 	}
4949 	return ip->ip_p;
4950 }
4951 
4952 static void
4953 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4954 {
4955 	const struct ether_header *eh;
4956 	uint16_t etype;
4957 	int hoff;
4958 
4959 	hoff = sizeof(*eh);
4960 	/* Checked at the beginning of this function. */
4961 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4962 
4963 	eh = mtod(m_new, const struct ether_header *);
4964 	etype = ntohs(eh->ether_type);
4965 	if (etype == ETHERTYPE_VLAN) {
4966 		const struct ether_vlan_header *evl;
4967 
4968 		hoff = sizeof(*evl);
4969 		if (m_new->m_len < hoff)
4970 			return;
4971 		evl = mtod(m_new, const struct ether_vlan_header *);
4972 		etype = ntohs(evl->evl_proto);
4973 	}
4974 	*l3proto = etype;
4975 
4976 	if (etype == ETHERTYPE_IP)
4977 		*l4proto = hn_check_iplen(m_new, hoff);
4978 	else
4979 		*l4proto = IPPROTO_DONE;
4980 }
4981 
4982 static int
4983 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4984 {
4985 	struct sysctl_oid_list *child;
4986 	struct sysctl_ctx_list *ctx;
4987 	device_t dev = sc->hn_dev;
4988 #if defined(INET) || defined(INET6)
4989 #if __FreeBSD_version >= 1100095
4990 	int lroent_cnt;
4991 #endif
4992 #endif
4993 	int i;
4994 
4995 	/*
4996 	 * Create RXBUF for reception.
4997 	 *
4998 	 * NOTE:
4999 	 * - It is shared by all channels.
5000 	 * - A large enough buffer is allocated, certain version of NVSes
5001 	 *   may further limit the usable space.
5002 	 */
5003 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5004 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
5005 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5006 	if (sc->hn_rxbuf == NULL) {
5007 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
5008 		return (ENOMEM);
5009 	}
5010 
5011 	sc->hn_rx_ring_cnt = ring_cnt;
5012 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
5013 
5014 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
5015 	    M_DEVBUF, M_WAITOK | M_ZERO);
5016 
5017 #if defined(INET) || defined(INET6)
5018 #if __FreeBSD_version >= 1100095
5019 	lroent_cnt = hn_lro_entry_count;
5020 	if (lroent_cnt < TCP_LRO_ENTRIES)
5021 		lroent_cnt = TCP_LRO_ENTRIES;
5022 	if (bootverbose)
5023 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
5024 #endif
5025 #endif	/* INET || INET6 */
5026 
5027 	ctx = device_get_sysctl_ctx(dev);
5028 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5029 
5030 	/* Create dev.hn.UNIT.rx sysctl tree */
5031 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5032 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5033 
5034 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5035 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5036 
5037 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5038 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5039 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5040 		if (rxr->hn_br == NULL) {
5041 			device_printf(dev, "allocate bufring failed\n");
5042 			return (ENOMEM);
5043 		}
5044 
5045 		if (hn_trust_hosttcp)
5046 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5047 		if (hn_trust_hostudp)
5048 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5049 		if (hn_trust_hostip)
5050 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5051 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5052 		rxr->hn_ifp = sc->hn_ifp;
5053 		if (i < sc->hn_tx_ring_cnt)
5054 			rxr->hn_txr = &sc->hn_tx_ring[i];
5055 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5056 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5057 		rxr->hn_rx_idx = i;
5058 		rxr->hn_rxbuf = sc->hn_rxbuf;
5059 
5060 		/*
5061 		 * Initialize LRO.
5062 		 */
5063 #if defined(INET) || defined(INET6)
5064 #if __FreeBSD_version >= 1100095
5065 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5066 		    hn_lro_mbufq_depth);
5067 #else
5068 		tcp_lro_init(&rxr->hn_lro);
5069 		rxr->hn_lro.ifp = sc->hn_ifp;
5070 #endif
5071 #if __FreeBSD_version >= 1100099
5072 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5073 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5074 #endif
5075 #endif	/* INET || INET6 */
5076 
5077 		if (sc->hn_rx_sysctl_tree != NULL) {
5078 			char name[16];
5079 
5080 			/*
5081 			 * Create per RX ring sysctl tree:
5082 			 * dev.hn.UNIT.rx.RINGID
5083 			 */
5084 			snprintf(name, sizeof(name), "%d", i);
5085 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5086 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5087 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5088 
5089 			if (rxr->hn_rx_sysctl_tree != NULL) {
5090 				SYSCTL_ADD_ULONG(ctx,
5091 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5092 				    OID_AUTO, "packets",
5093 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5094 				    "# of packets received");
5095 				SYSCTL_ADD_ULONG(ctx,
5096 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5097 				    OID_AUTO, "rss_pkts",
5098 				    CTLFLAG_RW | CTLFLAG_STATS,
5099 				    &rxr->hn_rss_pkts,
5100 				    "# of packets w/ RSS info received");
5101 				SYSCTL_ADD_ULONG(ctx,
5102 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5103 				    OID_AUTO, "rsc_pkts",
5104 				    CTLFLAG_RW | CTLFLAG_STATS,
5105 				    &rxr->hn_rsc_pkts,
5106 				    "# of RSC packets received");
5107 				SYSCTL_ADD_ULONG(ctx,
5108 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5109 				    OID_AUTO, "rsc_drop",
5110 				    CTLFLAG_RW | CTLFLAG_STATS,
5111 				    &rxr->hn_rsc_drop,
5112 				    "# of RSC fragments dropped");
5113 				SYSCTL_ADD_INT(ctx,
5114 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5115 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5116 				    &rxr->hn_pktbuf_len, 0,
5117 				    "Temporary channel packet buffer length");
5118 			}
5119 		}
5120 	}
5121 
5122 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5123 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5124 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5125 #if __FreeBSD_version < 1100095
5126 	    hn_rx_stat_int_sysctl,
5127 #else
5128 	    hn_rx_stat_u64_sysctl,
5129 #endif
5130 	    "LU", "LRO queued");
5131 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5132 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5133 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5134 #if __FreeBSD_version < 1100095
5135 	    hn_rx_stat_int_sysctl,
5136 #else
5137 	    hn_rx_stat_u64_sysctl,
5138 #endif
5139 	    "LU", "LRO flushed");
5140 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5141 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5142 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5143 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5144 #if __FreeBSD_version >= 1100099
5145 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5146 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5147 	    hn_lro_lenlim_sysctl, "IU",
5148 	    "Max # of data bytes to be aggregated by LRO");
5149 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5150 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5151 	    hn_lro_ackcnt_sysctl, "I",
5152 	    "Max # of ACKs to be aggregated by LRO");
5153 #endif
5154 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5155 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5156 	    hn_trust_hcsum_sysctl, "I",
5157 	    "Trust tcp segment verification on host side, "
5158 	    "when csum info is missing");
5159 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5160 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5161 	    hn_trust_hcsum_sysctl, "I",
5162 	    "Trust udp datagram verification on host side, "
5163 	    "when csum info is missing");
5164 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5165 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5166 	    hn_trust_hcsum_sysctl, "I",
5167 	    "Trust ip packet verification on host side, "
5168 	    "when csum info is missing");
5169 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5170 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5171 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5172 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5173 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5174 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5175 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5176 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5177 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5178 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5179 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5180 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5181 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5182 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5183 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5184 	    hn_rx_stat_ulong_sysctl, "LU",
5185 	    "# of packets that we trust host's csum verification");
5186 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5187 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5188 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5189 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5190 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5191 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5192 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5193 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5194 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5195 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5196 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5197 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5198 
5199 	return (0);
5200 }
5201 
5202 static void
5203 hn_destroy_rx_data(struct hn_softc *sc)
5204 {
5205 	int i;
5206 
5207 	if (sc->hn_rxbuf != NULL) {
5208 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5209 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5210 		else
5211 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5212 		sc->hn_rxbuf = NULL;
5213 	}
5214 
5215 	if (sc->hn_rx_ring_cnt == 0)
5216 		return;
5217 
5218 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5219 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5220 
5221 		if (rxr->hn_br == NULL)
5222 			continue;
5223 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5224 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5225 		} else {
5226 			device_printf(sc->hn_dev,
5227 			    "%dth channel bufring is referenced", i);
5228 		}
5229 		rxr->hn_br = NULL;
5230 
5231 #if defined(INET) || defined(INET6)
5232 		tcp_lro_free(&rxr->hn_lro);
5233 #endif
5234 		free(rxr->hn_pktbuf, M_DEVBUF);
5235 	}
5236 	free(sc->hn_rx_ring, M_DEVBUF);
5237 	sc->hn_rx_ring = NULL;
5238 
5239 	sc->hn_rx_ring_cnt = 0;
5240 	sc->hn_rx_ring_inuse = 0;
5241 }
5242 
/*
 * Set up TX ring 'id' of 'sc': the ring locks, the txdesc array and
 * its free list (or bufring), the TX taskqueue and tasks, the two
 * busdma tags, and for every txdesc the RNDIS packet message plus the
 * data DMA map.  When the "tx" sysctl tree exists, a per-ring sysctl
 * node is created as well.
 *
 * Returns 0 on success, or the busdma error code of the step that
 * failed.
 *
 * NOTE(review): the error returns only release what belongs to the
 * txdesc being set up; everything created earlier in this function is
 * left in place.  Presumably the caller tears the ring down through
 * hn_tx_ring_destroy() -- confirm.
 */
static int
hn_tx_ring_create(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	/* Descriptor array; the free txdescs live on a list or a bufring. */
	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	/*
	 * Pick the taskqueue running this ring's TX tasks: either the
	 * vmbus event taskqueue selected through HN_RING_IDX2CPU(), or
	 * one of the driver's own TX taskqueues, chosen by ring index
	 * modulo the number of those taskqueues.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
	} else {
		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
	}

	/*
	 * Install the txeof handler and tasks matching the transmit
	 * model in use.  Only the non-if_start model gets a per-ring
	 * mbuf bufring.
	 */
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else
#endif
	{
		int br_depth;

		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;

	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}

	/* Per-descriptor setup; each finished txdesc is queued as free. */
	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		STAILQ_INIT(&txd->agg_list);

		/*
		 * Allocate and load RNDIS packet message.
		 */
        	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		/* The load callback stores the bus address in rndis_pkt_paddr. */
		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

	/* The per-ring sysctl nodes are optional; failures are ignored. */
	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

#ifdef HN_DEBUG
			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
#endif
			/* "oactive" is only exported for the non-if_start model. */
#ifdef HN_IFSTART_SUPPORT
			if (!hn_use_if_start)
#endif
			{
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
			    "# of packets transmitted");
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
			    "# of sends");
		}
	}

	return 0;
}
5442 
5443 static void
5444 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5445 {
5446 	struct hn_tx_ring *txr = txd->txr;
5447 
5448 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5449 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5450 
5451 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5452 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5453 	    txd->rndis_pkt_dmap);
5454 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5455 }
5456 
5457 static void
5458 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5459 {
5460 
5461 	KASSERT(txd->refs == 0 || txd->refs == 1,
5462 	    ("invalid txd refs %d", txd->refs));
5463 
5464 	/* Aggregated txds will be freed by their aggregating txd. */
5465 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5466 		int freed __diagused;
5467 
5468 		freed = hn_txdesc_put(txr, txd);
5469 		KASSERT(freed, ("can't free txdesc"));
5470 	}
5471 }
5472 
5473 static void
5474 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5475 {
5476 	int i;
5477 
5478 	if (txr->hn_txdesc == NULL)
5479 		return;
5480 
5481 	/*
5482 	 * NOTE:
5483 	 * Because the freeing of aggregated txds will be deferred
5484 	 * to the aggregating txd, two passes are used here:
5485 	 * - The first pass GCes any pending txds.  This GC is necessary,
5486 	 *   since if the channels are revoked, hypervisor will not
5487 	 *   deliver send-done for all pending txds.
5488 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5489 	 *   were freed.
5490 	 */
5491 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5492 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5493 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5494 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5495 
5496 	if (txr->hn_tx_data_dtag != NULL)
5497 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5498 	if (txr->hn_tx_rndis_dtag != NULL)
5499 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5500 
5501 #ifdef HN_USE_TXDESC_BUFRING
5502 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5503 #endif
5504 
5505 	free(txr->hn_txdesc, M_DEVBUF);
5506 	txr->hn_txdesc = NULL;
5507 
5508 	if (txr->hn_mbuf_br != NULL)
5509 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5510 
5511 #ifndef HN_USE_TXDESC_BUFRING
5512 	mtx_destroy(&txr->hn_txlist_spin);
5513 #endif
5514 	mtx_destroy(&txr->hn_tx_lock);
5515 }
5516 
5517 static int
5518 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5519 {
5520 	struct sysctl_oid_list *child;
5521 	struct sysctl_ctx_list *ctx;
5522 	int i;
5523 
5524 	/*
5525 	 * Create TXBUF for chimney sending.
5526 	 *
5527 	 * NOTE: It is shared by all channels.
5528 	 */
5529 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5530 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5531 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5532 	if (sc->hn_chim == NULL) {
5533 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5534 		return (ENOMEM);
5535 	}
5536 
5537 	sc->hn_tx_ring_cnt = ring_cnt;
5538 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5539 
5540 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5541 	    M_DEVBUF, M_WAITOK | M_ZERO);
5542 
5543 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5544 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5545 
5546 	/* Create dev.hn.UNIT.tx sysctl tree */
5547 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5548 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5549 
5550 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5551 		int error;
5552 
5553 		error = hn_tx_ring_create(sc, i);
5554 		if (error)
5555 			return error;
5556 	}
5557 
5558 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5559 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5560 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5561 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5562 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5563 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5564 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5565 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5566 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5567 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5568 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5569 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5570 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5571 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5572 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5573 	    hn_tx_stat_ulong_sysctl, "LU",
5574 	    "# of packet transmission aggregation flush failure");
5575 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5576 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5577 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5578 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5579 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5580 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5581 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5582 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5583 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5584 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5585 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5586 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5587 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5588 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5589 	    "# of total TX descs");
5590 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5591 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5592 	    "Chimney send packet size upper boundary");
5593 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5594 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5595 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5596 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5597 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5598 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5599 	    hn_tx_conf_int_sysctl, "I",
5600 	    "Size of the packet for direct transmission");
5601 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5602 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5603 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5604 	    hn_tx_conf_int_sysctl, "I",
5605 	    "Always schedule transmission "
5606 	    "instead of doing direct transmission");
5607 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5608 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5609 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5610 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5611 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5612 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5613 	    "Applied packet transmission aggregation size");
5614 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5615 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5616 	    hn_txagg_pktmax_sysctl, "I",
5617 	    "Applied packet transmission aggregation packets");
5618 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5619 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5620 	    hn_txagg_align_sysctl, "I",
5621 	    "Applied packet transmission aggregation alignment");
5622 
5623 	return 0;
5624 }
5625 
5626 static void
5627 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5628 {
5629 	int i;
5630 
5631 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5632 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5633 }
5634 
5635 static void
5636 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5637 {
5638 	struct ifnet *ifp = sc->hn_ifp;
5639 	u_int hw_tsomax;
5640 	int tso_minlen;
5641 
5642 	HN_LOCK_ASSERT(sc);
5643 
5644 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5645 		return;
5646 
5647 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5648 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5649 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5650 
5651 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5652 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5653 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5654 
5655 	if (tso_maxlen < tso_minlen)
5656 		tso_maxlen = tso_minlen;
5657 	else if (tso_maxlen > IP_MAXPACKET)
5658 		tso_maxlen = IP_MAXPACKET;
5659 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5660 		tso_maxlen = sc->hn_ndis_tso_szmax;
5661 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5662 
5663 	if (hn_xpnt_vf_isready(sc)) {
5664 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5665 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5666 	}
5667 	ifp->if_hw_tsomax = hw_tsomax;
5668 	if (bootverbose)
5669 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5670 }
5671 
5672 static void
5673 hn_fixup_tx_data(struct hn_softc *sc)
5674 {
5675 	uint64_t csum_assist;
5676 	int i;
5677 
5678 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5679 	if (hn_tx_chimney_size > 0 &&
5680 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5681 		hn_set_chim_size(sc, hn_tx_chimney_size);
5682 
5683 	csum_assist = 0;
5684 	if (sc->hn_caps & HN_CAP_IPCS)
5685 		csum_assist |= CSUM_IP;
5686 	if (sc->hn_caps & HN_CAP_TCP4CS)
5687 		csum_assist |= CSUM_IP_TCP;
5688 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5689 		csum_assist |= CSUM_IP_UDP;
5690 	if (sc->hn_caps & HN_CAP_TCP6CS)
5691 		csum_assist |= CSUM_IP6_TCP;
5692 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5693 		csum_assist |= CSUM_IP6_UDP;
5694 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5695 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5696 
5697 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5698 		/*
5699 		 * Support HASHVAL pktinfo on TX path.
5700 		 */
5701 		if (bootverbose)
5702 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5703 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5704 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5705 	}
5706 }
5707 
5708 static void
5709 hn_fixup_rx_data(struct hn_softc *sc)
5710 {
5711 
5712 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5713 		int i;
5714 
5715 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5716 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5717 	}
5718 }
5719 
5720 static void
5721 hn_destroy_tx_data(struct hn_softc *sc)
5722 {
5723 	int i;
5724 
5725 	if (sc->hn_chim != NULL) {
5726 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5727 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5728 		} else {
5729 			device_printf(sc->hn_dev,
5730 			    "chimney sending buffer is referenced");
5731 		}
5732 		sc->hn_chim = NULL;
5733 	}
5734 
5735 	if (sc->hn_tx_ring_cnt == 0)
5736 		return;
5737 
5738 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5739 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5740 
5741 	free(sc->hn_tx_ring, M_DEVBUF);
5742 	sc->hn_tx_ring = NULL;
5743 
5744 	sc->hn_tx_ring_cnt = 0;
5745 	sc->hn_tx_ring_inuse = 0;
5746 }
5747 
5748 #ifdef HN_IFSTART_SUPPORT
5749 
5750 static void
5751 hn_start_taskfunc(void *xtxr, int pending __unused)
5752 {
5753 	struct hn_tx_ring *txr = xtxr;
5754 
5755 	mtx_lock(&txr->hn_tx_lock);
5756 	hn_start_locked(txr, 0);
5757 	mtx_unlock(&txr->hn_tx_lock);
5758 }
5759 
/*
 * Drain the interface send queue (if_snd) through the first TX ring.
 * The ring's hn_tx_lock must be held (asserted), and if_start support
 * must be in use (asserted).
 *
 * When 'len' is greater than zero, a packet longer than 'len' stops
 * the loop: it is put back on the queue and 1 is returned, so that the
 * caller can hand the work over to the TX taskqueue.  In all other
 * cases 0 is returned, including when the ring is suspended or the
 * interface is not running or is marked OACTIVE.
 *
 * IFF_DRV_OACTIVE is set when no txdesc is available or a send fails.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Only proceed if running and not OACTIVE. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		/*
		 * Header fixups; a NULL return means the packet is gone
		 * and it is accounted as an output error.
		 */
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
			m_head = hn_set_hlen(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		/* Out of txdescs: requeue the packet and mark OACTIVE. */
		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		/*
		 * No aggregation room left: either flush the aggregating
		 * txdesc, or send this packet on its own.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5870 
5871 static void
5872 hn_start(struct ifnet *ifp)
5873 {
5874 	struct hn_softc *sc = ifp->if_softc;
5875 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5876 
5877 	if (txr->hn_sched_tx)
5878 		goto do_sched;
5879 
5880 	if (mtx_trylock(&txr->hn_tx_lock)) {
5881 		int sched;
5882 
5883 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5884 		mtx_unlock(&txr->hn_tx_lock);
5885 		if (!sched)
5886 			return;
5887 	}
5888 do_sched:
5889 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5890 }
5891 
5892 static void
5893 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5894 {
5895 	struct hn_tx_ring *txr = xtxr;
5896 
5897 	mtx_lock(&txr->hn_tx_lock);
5898 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5899 	hn_start_locked(txr, 0);
5900 	mtx_unlock(&txr->hn_tx_lock);
5901 }
5902 
5903 static void
5904 hn_start_txeof(struct hn_tx_ring *txr)
5905 {
5906 	struct hn_softc *sc = txr->hn_sc;
5907 	struct ifnet *ifp = sc->hn_ifp;
5908 
5909 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5910 
5911 	if (txr->hn_sched_tx)
5912 		goto do_sched;
5913 
5914 	if (mtx_trylock(&txr->hn_tx_lock)) {
5915 		int sched;
5916 
5917 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5918 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5919 		mtx_unlock(&txr->hn_tx_lock);
5920 		if (sched) {
5921 			taskqueue_enqueue(txr->hn_tx_taskq,
5922 			    &txr->hn_tx_task);
5923 		}
5924 	} else {
5925 do_sched:
5926 		/*
5927 		 * Release the OACTIVE earlier, with the hope, that
5928 		 * others could catch up.  The task will clear the
5929 		 * flag again with the hn_tx_lock to avoid possible
5930 		 * races.
5931 		 */
5932 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5933 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5934 	}
5935 }
5936 
5937 #endif	/* HN_IFSTART_SUPPORT */
5938 
/*
 * Drain the ring's mbuf bufring (hn_mbuf_br) onto the TX ring.  The
 * ring's hn_tx_lock must be held (asserted).
 *
 * When 'len' is greater than zero, a packet longer than 'len' stops
 * the loop: it is put back and 1 is returned, so that the caller can
 * hand the work over to the TX taskqueue.  In all other cases 0 is
 * returned, including when the ring is suspended or oactive, or the
 * interface is not running.
 *
 * Packets are peeked first and only advanced past once they have been
 * consumed (sent, aggregated, or dropped by a failed encap).
 * hn_oactive is set when no txdesc is available or a send fails.
 */
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		/* Out of txdescs: leave the packet queued and mark oactive. */
		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		/*
		 * No aggregation room left: either flush the aggregating
		 * txdesc, or send this packet on its own.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
6031 
/*
 * Transmit entry point taking a single mbuf chain.
 *
 * When HN_XVFFLAG_ENABLED is set, the packet is handed to the VF
 * interface (sc->hn_vf_ifp) under the VF read lock; BPF tapping and
 * the output counters of 'ifp' are handled here, and the VF's result
 * is returned.
 *
 * Otherwise the packet headers are fixed up as needed, a TX ring is
 * selected from the mbuf's flowid, and the packet is enqueued on that
 * ring's mbuf bufring.  Transmission is then either attempted directly
 * or handed to the ring's TX task.
 *
 * Returns 0 on success, EIO if a header fixup fails, the
 * drbr_enqueue() error if the packet cannot be queued, or the error
 * from the VF's if_transmit.
 */
static int
hn_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct hn_softc *sc = ifp->if_softc;
	struct hn_tx_ring *txr;
	int error, idx = 0;

	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
		struct rm_priotracker pt;

		/* Re-check the flag now that the VF lock is held. */
		rm_rlock(&sc->hn_vf_lock, &pt);
		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
			struct mbuf *m_bpf = NULL;
			int obytes, omcast;

			/*
			 * Record what the counters need before the mbuf is
			 * handed over -- presumably because 'm' must not be
			 * touched after if_transmit; confirm.
			 */
			obytes = m->m_pkthdr.len;
			omcast = (m->m_flags & M_MCAST) != 0;

			/*
			 * With HN_XVFFLAG_ACCBPF, keep a shallow copy so that
			 * the tap can be done after a successful transmit.
			 */
			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
				if (bpf_peers_present(ifp->if_bpf)) {
					m_bpf = m_copypacket(m, M_NOWAIT);
					if (m_bpf == NULL) {
						/*
						 * Failed to grab a shallow
						 * copy; tap now.
						 */
						ETHER_BPF_MTAP(ifp, m);
					}
				}
			} else {
				ETHER_BPF_MTAP(ifp, m);
			}

			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
			rm_runlock(&sc->hn_vf_lock, &pt);

			if (m_bpf != NULL) {
				if (!error)
					ETHER_BPF_MTAP(ifp, m_bpf);
				m_freem(m_bpf);
			}

			/* ENOBUFS counts as a queue drop, others as errors. */
			if (error == ENOBUFS) {
				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
			} else if (error) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			} else {
				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
				if (omcast) {
					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
					    omcast);
				}
			}
			return (error);
		}
		rm_runlock(&sc->hn_vf_lock, &pt);
	}

#if defined(INET6) || defined(INET)
	/*
	 * Perform TSO packet header fixup or get l2/l3 header length now,
	 * since packet headers should be cache-hot.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
		m = hn_tso_fixup(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	} else if (m->m_pkthdr.csum_flags &
	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
		m = hn_set_hlen(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	}
#endif

	/*
	 * Select the TX ring based on flowid
	 */
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
#ifdef RSS
		uint32_t bid;

		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
		    &bid) == 0)
			idx = bid % sc->hn_tx_ring_inuse;
		else
#endif
		{
#if defined(INET6) || defined(INET)
			int tcpsyn = 0;

			/*
			 * Small non-TSO TCP packets are checked by
			 * hn_check_tcpsyn(); when it sets 'tcpsyn' the
			 * packet goes to ring 0 instead of the flowid ring.
			 */
			if (m->m_pkthdr.len < 128 &&
			    (m->m_pkthdr.csum_flags &
			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
				m = hn_check_tcpsyn(m, &tcpsyn);
				if (__predict_false(m == NULL)) {
					if_inc_counter(ifp,
					    IFCOUNTER_OERRORS, 1);
					return (EIO);
				}
			}
#else
			const int tcpsyn = 0;
#endif
			if (tcpsyn)
				idx = 0;
			else
				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
		}
	}
	txr = &sc->hn_tx_ring[idx];

	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
	if (error) {
		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
		return error;
	}

	/* Queued; an oactive ring is not kicked here. */
	if (txr->hn_oactive)
		return 0;

	if (txr->hn_sched_tx)
		goto do_sched;

	/*
	 * Try direct transmission; fall back to the TX task if the lock
	 * is contended or hn_xmit() asks for scheduling.
	 */
	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return 0;
	}
do_sched:
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
	return 0;
}
6174 
6175 static void
6176 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6177 {
6178 	struct mbuf *m;
6179 
6180 	mtx_lock(&txr->hn_tx_lock);
6181 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6182 		m_freem(m);
6183 	mtx_unlock(&txr->hn_tx_lock);
6184 }
6185 
6186 static void
6187 hn_xmit_qflush(struct ifnet *ifp)
6188 {
6189 	struct hn_softc *sc = ifp->if_softc;
6190 	struct rm_priotracker pt;
6191 	int i;
6192 
6193 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6194 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6195 	if_qflush(ifp);
6196 
6197 	rm_rlock(&sc->hn_vf_lock, &pt);
6198 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6199 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6200 	rm_runlock(&sc->hn_vf_lock, &pt);
6201 }
6202 
6203 static void
6204 hn_xmit_txeof(struct hn_tx_ring *txr)
6205 {
6206 
6207 	if (txr->hn_sched_tx)
6208 		goto do_sched;
6209 
6210 	if (mtx_trylock(&txr->hn_tx_lock)) {
6211 		int sched;
6212 
6213 		txr->hn_oactive = 0;
6214 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6215 		mtx_unlock(&txr->hn_tx_lock);
6216 		if (sched) {
6217 			taskqueue_enqueue(txr->hn_tx_taskq,
6218 			    &txr->hn_tx_task);
6219 		}
6220 	} else {
6221 do_sched:
6222 		/*
6223 		 * Release the oactive earlier, with the hope, that
6224 		 * others could catch up.  The task will clear the
6225 		 * oactive again with the hn_tx_lock to avoid possible
6226 		 * races.
6227 		 */
6228 		txr->hn_oactive = 0;
6229 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6230 	}
6231 }
6232 
6233 static void
6234 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6235 {
6236 	struct hn_tx_ring *txr = xtxr;
6237 
6238 	mtx_lock(&txr->hn_tx_lock);
6239 	hn_xmit(txr, 0);
6240 	mtx_unlock(&txr->hn_tx_lock);
6241 }
6242 
6243 static void
6244 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6245 {
6246 	struct hn_tx_ring *txr = xtxr;
6247 
6248 	mtx_lock(&txr->hn_tx_lock);
6249 	txr->hn_oactive = 0;
6250 	hn_xmit(txr, 0);
6251 	mtx_unlock(&txr->hn_tx_lock);
6252 }
6253 
6254 static int
6255 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6256 {
6257 	struct vmbus_chan_br cbr;
6258 	struct hn_rx_ring *rxr;
6259 	struct hn_tx_ring *txr = NULL;
6260 	int idx, error;
6261 
6262 	idx = vmbus_chan_subidx(chan);
6263 
6264 	/*
6265 	 * Link this channel to RX/TX ring.
6266 	 */
6267 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6268 	    ("invalid channel index %d, should > 0 && < %d",
6269 	     idx, sc->hn_rx_ring_inuse));
6270 	rxr = &sc->hn_rx_ring[idx];
6271 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6272 	    ("RX ring %d already attached", idx));
6273 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6274 	rxr->hn_chan = chan;
6275 
6276 	if (bootverbose) {
6277 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6278 		    idx, vmbus_chan_id(chan));
6279 	}
6280 
6281 	if (idx < sc->hn_tx_ring_inuse) {
6282 		txr = &sc->hn_tx_ring[idx];
6283 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6284 		    ("TX ring %d already attached", idx));
6285 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6286 
6287 		txr->hn_chan = chan;
6288 		if (bootverbose) {
6289 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6290 			    idx, vmbus_chan_id(chan));
6291 		}
6292 	}
6293 
6294 	/* Bind this channel to a proper CPU. */
6295 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6296 
6297 	/*
6298 	 * Open this channel
6299 	 */
6300 	cbr.cbr = rxr->hn_br;
6301 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6302 	cbr.cbr_txsz = HN_TXBR_SIZE;
6303 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6304 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6305 	if (error) {
6306 		if (error == EISCONN) {
6307 			if_printf(sc->hn_ifp, "bufring is connected after "
6308 			    "chan%u open failure\n", vmbus_chan_id(chan));
6309 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6310 		} else {
6311 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6312 			    vmbus_chan_id(chan), error);
6313 		}
6314 	}
6315 	return (error);
6316 }
6317 
6318 static void
6319 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6320 {
6321 	struct hn_rx_ring *rxr;
6322 	int idx, error;
6323 
6324 	idx = vmbus_chan_subidx(chan);
6325 
6326 	/*
6327 	 * Link this channel to RX/TX ring.
6328 	 */
6329 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6330 	    ("invalid channel index %d, should > 0 && < %d",
6331 	     idx, sc->hn_rx_ring_inuse));
6332 	rxr = &sc->hn_rx_ring[idx];
6333 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6334 	    ("RX ring %d is not attached", idx));
6335 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6336 
6337 	if (idx < sc->hn_tx_ring_inuse) {
6338 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6339 
6340 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6341 		    ("TX ring %d is not attached attached", idx));
6342 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6343 	}
6344 
6345 	/*
6346 	 * Close this channel.
6347 	 *
6348 	 * NOTE:
6349 	 * Channel closing does _not_ destroy the target channel.
6350 	 */
6351 	error = vmbus_chan_close_direct(chan);
6352 	if (error == EISCONN) {
6353 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6354 		    "after being closed\n", vmbus_chan_id(chan));
6355 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6356 	} else if (error) {
6357 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6358 		    vmbus_chan_id(chan), error);
6359 	}
6360 }
6361 
6362 static int
6363 hn_attach_subchans(struct hn_softc *sc)
6364 {
6365 	struct vmbus_channel **subchans;
6366 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6367 	int i, error = 0;
6368 
6369 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6370 
6371 	/* Attach the sub-channels. */
6372 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6373 	for (i = 0; i < subchan_cnt; ++i) {
6374 		int error1;
6375 
6376 		error1 = hn_chan_attach(sc, subchans[i]);
6377 		if (error1) {
6378 			error = error1;
6379 			/* Move on; all channels will be detached later. */
6380 		}
6381 	}
6382 	vmbus_subchan_rel(subchans, subchan_cnt);
6383 
6384 	if (error) {
6385 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6386 	} else {
6387 		if (bootverbose) {
6388 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6389 			    subchan_cnt);
6390 		}
6391 	}
6392 	return (error);
6393 }
6394 
6395 static void
6396 hn_detach_allchans(struct hn_softc *sc)
6397 {
6398 	struct vmbus_channel **subchans;
6399 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6400 	int i;
6401 
6402 	if (subchan_cnt == 0)
6403 		goto back;
6404 
6405 	/* Detach the sub-channels. */
6406 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6407 	for (i = 0; i < subchan_cnt; ++i)
6408 		hn_chan_detach(sc, subchans[i]);
6409 	vmbus_subchan_rel(subchans, subchan_cnt);
6410 
6411 back:
6412 	/*
6413 	 * Detach the primary channel, _after_ all sub-channels
6414 	 * are detached.
6415 	 */
6416 	hn_chan_detach(sc, sc->hn_prichan);
6417 
6418 	/* Wait for sub-channels to be destroyed, if any. */
6419 	vmbus_subchan_drain(sc->hn_prichan);
6420 
6421 #ifdef INVARIANTS
6422 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6423 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6424 		    HN_RX_FLAG_ATTACHED) == 0,
6425 		    ("%dth RX ring is still attached", i));
6426 	}
6427 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6428 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6429 		    HN_TX_FLAG_ATTACHED) == 0,
6430 		    ("%dth TX ring is still attached", i));
6431 	}
6432 #endif
6433 }
6434 
6435 static int
6436 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6437 {
6438 	struct vmbus_channel **subchans;
6439 	int nchan, rxr_cnt, error;
6440 
6441 	nchan = *nsubch + 1;
6442 	if (nchan == 1) {
6443 		/*
6444 		 * Multiple RX/TX rings are not requested.
6445 		 */
6446 		*nsubch = 0;
6447 		return (0);
6448 	}
6449 
6450 	/*
6451 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6452 	 * table entries.
6453 	 */
6454 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6455 	if (error) {
6456 		/* No RSS; this is benign. */
6457 		*nsubch = 0;
6458 		return (0);
6459 	}
6460 	if (bootverbose) {
6461 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6462 		    rxr_cnt, nchan);
6463 	}
6464 
6465 	if (nchan > rxr_cnt)
6466 		nchan = rxr_cnt;
6467 	if (nchan == 1) {
6468 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6469 		*nsubch = 0;
6470 		return (0);
6471 	}
6472 
6473 	/*
6474 	 * Allocate sub-channels from NVS.
6475 	 */
6476 	*nsubch = nchan - 1;
6477 	error = hn_nvs_alloc_subchans(sc, nsubch);
6478 	if (error || *nsubch == 0) {
6479 		/* Failed to allocate sub-channels. */
6480 		*nsubch = 0;
6481 		return (0);
6482 	}
6483 
6484 	/*
6485 	 * Wait for all sub-channels to become ready before moving on.
6486 	 */
6487 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6488 	vmbus_subchan_rel(subchans, *nsubch);
6489 	return (0);
6490 }
6491 
6492 static bool
6493 hn_synth_attachable(const struct hn_softc *sc)
6494 {
6495 	int i;
6496 
6497 	if (sc->hn_flags & HN_FLAG_ERRORS)
6498 		return (false);
6499 
6500 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6501 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6502 
6503 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6504 			return (false);
6505 	}
6506 	return (true);
6507 }
6508 
/*
 * Force the RX filter to zero once RNDIS has been initialized
 * successfully.
 *
 * NOTE:
 * On certain versions of Hyper-V, under certain conditions, the
 * hypervisor side RNDIS rxfilter is _not_ zero after a successful RNDIS
 * initialization.  That violates the RNDIS API contract and breaks the
 * assumptions of the code that follows.  So the RNDIS rxfilter is
 * cleared explicitly, then any packets that sneaked through, and the
 * interrupt taskqueues scheduled because of them, are drained.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the filter first, then drain what already got in. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
6529 
6530 static int
6531 hn_synth_attach(struct hn_softc *sc, int mtu)
6532 {
6533 #define ATTACHED_NVS		0x0002
6534 #define ATTACHED_RNDIS		0x0004
6535 
6536 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6537 	int error, nsubch, nchan = 1, i, rndis_inited;
6538 	uint32_t old_caps, attached = 0;
6539 
6540 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6541 	    ("synthetic parts were attached"));
6542 
6543 	if (!hn_synth_attachable(sc))
6544 		return (ENXIO);
6545 
6546 	/* Save capabilities for later verification. */
6547 	old_caps = sc->hn_caps;
6548 	sc->hn_caps = 0;
6549 
6550 	/* Clear RSS stuffs. */
6551 	sc->hn_rss_ind_size = 0;
6552 	sc->hn_rss_hash = 0;
6553 	sc->hn_rss_hcap = 0;
6554 
6555 	/*
6556 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6557 	 */
6558 	error = hn_chan_attach(sc, sc->hn_prichan);
6559 	if (error)
6560 		goto failed;
6561 
6562 	/*
6563 	 * Attach NVS.
6564 	 */
6565 	error = hn_nvs_attach(sc, mtu);
6566 	if (error)
6567 		goto failed;
6568 	attached |= ATTACHED_NVS;
6569 
6570 	/*
6571 	 * Attach RNDIS _after_ NVS is attached.
6572 	 */
6573 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6574 	if (rndis_inited)
6575 		attached |= ATTACHED_RNDIS;
6576 	if (error)
6577 		goto failed;
6578 
6579 	/*
6580 	 * Make sure capabilities are not changed.
6581 	 */
6582 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6583 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6584 		    old_caps, sc->hn_caps);
6585 		error = ENXIO;
6586 		goto failed;
6587 	}
6588 
6589 	/*
6590 	 * Allocate sub-channels for multi-TX/RX rings.
6591 	 *
6592 	 * NOTE:
6593 	 * The # of RX rings that can be used is equivalent to the # of
6594 	 * channels to be requested.
6595 	 */
6596 	nsubch = sc->hn_rx_ring_cnt - 1;
6597 	error = hn_synth_alloc_subchans(sc, &nsubch);
6598 	if (error)
6599 		goto failed;
6600 	/* NOTE: _Full_ synthetic parts detach is required now. */
6601 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6602 
6603 	/*
6604 	 * Set the # of TX/RX rings that could be used according to
6605 	 * the # of channels that NVS offered.
6606 	 */
6607 	nchan = nsubch + 1;
6608 	hn_set_ring_inuse(sc, nchan);
6609 	if (nchan == 1) {
6610 		/* Only the primary channel can be used; done */
6611 		goto back;
6612 	}
6613 
6614 	/*
6615 	 * Attach the sub-channels.
6616 	 *
6617 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6618 	 */
6619 	error = hn_attach_subchans(sc);
6620 	if (error)
6621 		goto failed;
6622 
6623 	/*
6624 	 * Configure RSS key and indirect table _after_ all sub-channels
6625 	 * are attached.
6626 	 */
6627 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6628 		/*
6629 		 * RSS key is not set yet; set it to the default RSS key.
6630 		 */
6631 		if (bootverbose)
6632 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6633 #ifdef RSS
6634 		rss_getkey(rss->rss_key);
6635 #else
6636 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6637 #endif
6638 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6639 	}
6640 
6641 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6642 		/*
6643 		 * RSS indirect table is not set yet; set it up in round-
6644 		 * robin fashion.
6645 		 */
6646 		if (bootverbose) {
6647 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6648 			    "table\n");
6649 		}
6650 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6651 			uint32_t subidx;
6652 
6653 #ifdef RSS
6654 			subidx = rss_get_indirection_to_bucket(i);
6655 #else
6656 			subidx = i;
6657 #endif
6658 			rss->rss_ind[i] = subidx % nchan;
6659 		}
6660 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6661 	} else {
6662 		/*
6663 		 * # of usable channels may be changed, so we have to
6664 		 * make sure that all entries in RSS indirect table
6665 		 * are valid.
6666 		 *
6667 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6668 		 */
6669 		hn_rss_ind_fixup(sc);
6670 	}
6671 
6672 	sc->hn_rss_hash = sc->hn_rss_hcap;
6673 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6674 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6675 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6676 		hn_vf_rss_fixup(sc, false);
6677 	}
6678 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6679 	if (error)
6680 		goto failed;
6681 back:
6682 	/*
6683 	 * Fixup transmission aggregation setup.
6684 	 */
6685 	hn_set_txagg(sc);
6686 	hn_rndis_init_fixat(sc, nchan);
6687 	return (0);
6688 
6689 failed:
6690 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6691 		hn_rndis_init_fixat(sc, nchan);
6692 		hn_synth_detach(sc);
6693 	} else {
6694 		if (attached & ATTACHED_RNDIS) {
6695 			hn_rndis_init_fixat(sc, nchan);
6696 			hn_rndis_detach(sc);
6697 		}
6698 		if (attached & ATTACHED_NVS)
6699 			hn_nvs_detach(sc);
6700 		hn_chan_detach(sc, sc->hn_prichan);
6701 		/* Restore old capabilities. */
6702 		sc->hn_caps = old_caps;
6703 	}
6704 	return (error);
6705 
6706 #undef ATTACHED_RNDIS
6707 #undef ATTACHED_NVS
6708 }
6709 
6710 /*
6711  * NOTE:
6712  * The interface must have been suspended though hn_suspend(), before
6713  * this function get called.
6714  */
6715 static void
6716 hn_synth_detach(struct hn_softc *sc)
6717 {
6718 
6719 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6720 	    ("synthetic parts were not attached"));
6721 
6722 	/* Detach the RNDIS first. */
6723 	hn_rndis_detach(sc);
6724 
6725 	/* Detach NVS. */
6726 	hn_nvs_detach(sc);
6727 
6728 	/* Detach all of the channels. */
6729 	hn_detach_allchans(sc);
6730 
6731 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6732 		/*
6733 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6734 		 */
6735 		int error;
6736 
6737 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6738 		    sc->hn_rxbuf_gpadl);
6739 		if (error) {
6740 			if_printf(sc->hn_ifp,
6741 			    "rxbuf gpadl disconn failed: %d\n", error);
6742 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6743 		}
6744 		sc->hn_rxbuf_gpadl = 0;
6745 	}
6746 
6747 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6748 		/*
6749 		 * Host is post-Win2016, disconnect chimney sending buffer from
6750 		 * primary channel here.
6751 		 */
6752 		int error;
6753 
6754 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6755 		    sc->hn_chim_gpadl);
6756 		if (error) {
6757 			if_printf(sc->hn_ifp,
6758 			    "chim gpadl disconn failed: %d\n", error);
6759 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6760 		}
6761 		sc->hn_chim_gpadl = 0;
6762 	}
6763 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6764 }
6765 
6766 static void
6767 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6768 {
6769 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6770 	    ("invalid ring count %d", ring_cnt));
6771 
6772 	if (sc->hn_tx_ring_cnt > ring_cnt)
6773 		sc->hn_tx_ring_inuse = ring_cnt;
6774 	else
6775 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6776 	sc->hn_rx_ring_inuse = ring_cnt;
6777 
6778 #ifdef RSS
6779 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6780 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6781 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6782 		    rss_getnumbuckets());
6783 	}
6784 #endif
6785 
6786 	if (bootverbose) {
6787 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6788 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6789 	}
6790 }
6791 
6792 static void
6793 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6794 {
6795 
6796 	/*
6797 	 * NOTE:
6798 	 * The TX bufring will not be drained by the hypervisor,
6799 	 * if the primary channel is revoked.
6800 	 */
6801 	while (!vmbus_chan_rx_empty(chan) ||
6802 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6803 	     !vmbus_chan_tx_empty(chan)))
6804 		pause("waitch", 1);
6805 	vmbus_chan_intr_drain(chan);
6806 }
6807 
6808 static void
6809 hn_disable_rx(struct hn_softc *sc)
6810 {
6811 
6812 	/*
6813 	 * Disable RX by clearing RX filter forcefully.
6814 	 */
6815 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6816 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6817 
6818 	/*
6819 	 * Give RNDIS enough time to flush all pending data packets.
6820 	 */
6821 	pause("waitrx", (200 * hz) / 1000);
6822 }
6823 
6824 /*
6825  * NOTE:
6826  * RX/TX _must_ have been suspended/disabled, before this function
6827  * is called.
6828  */
6829 static void
6830 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6831 {
6832 	struct vmbus_channel **subch = NULL;
6833 	int nsubch;
6834 
6835 	/*
6836 	 * Drain RX/TX bufrings and interrupts.
6837 	 */
6838 	nsubch = nchan - 1;
6839 	if (nsubch > 0)
6840 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6841 
6842 	if (subch != NULL) {
6843 		int i;
6844 
6845 		for (i = 0; i < nsubch; ++i)
6846 			hn_chan_drain(sc, subch[i]);
6847 	}
6848 	hn_chan_drain(sc, sc->hn_prichan);
6849 
6850 	if (subch != NULL)
6851 		vmbus_subchan_rel(subch, nsubch);
6852 }
6853 
6854 static void
6855 hn_suspend_data(struct hn_softc *sc)
6856 {
6857 	struct hn_tx_ring *txr;
6858 	int i;
6859 
6860 	HN_LOCK_ASSERT(sc);
6861 
6862 	/*
6863 	 * Suspend TX.
6864 	 */
6865 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6866 		txr = &sc->hn_tx_ring[i];
6867 
6868 		mtx_lock(&txr->hn_tx_lock);
6869 		txr->hn_suspended = 1;
6870 		mtx_unlock(&txr->hn_tx_lock);
6871 		/* No one is able send more packets now. */
6872 
6873 		/*
6874 		 * Wait for all pending sends to finish.
6875 		 *
6876 		 * NOTE:
6877 		 * We will _not_ receive all pending send-done, if the
6878 		 * primary channel is revoked.
6879 		 */
6880 		while (hn_tx_ring_pending(txr) &&
6881 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6882 			pause("hnwtx", 1 /* 1 tick */);
6883 	}
6884 
6885 	/*
6886 	 * Disable RX.
6887 	 */
6888 	hn_disable_rx(sc);
6889 
6890 	/*
6891 	 * Drain RX/TX.
6892 	 */
6893 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6894 
6895 	/*
6896 	 * Drain any pending TX tasks.
6897 	 *
6898 	 * NOTE:
6899 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6900 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6901 	 */
6902 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6903 		txr = &sc->hn_tx_ring[i];
6904 
6905 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6906 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6907 	}
6908 }
6909 
6910 static void
6911 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6912 {
6913 
6914 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6915 }
6916 
6917 static void
6918 hn_suspend_mgmt(struct hn_softc *sc)
6919 {
6920 	struct task task;
6921 
6922 	HN_LOCK_ASSERT(sc);
6923 
6924 	/*
6925 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6926 	 * through hn_mgmt_taskq.
6927 	 */
6928 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6929 	vmbus_chan_run_task(sc->hn_prichan, &task);
6930 
6931 	/*
6932 	 * Make sure that all pending management tasks are completed.
6933 	 */
6934 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6935 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6936 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6937 }
6938 
6939 static void
6940 hn_suspend(struct hn_softc *sc)
6941 {
6942 
6943 	/* Disable polling. */
6944 	hn_polling(sc, 0);
6945 
6946 	/*
6947 	 * If the non-transparent mode VF is activated, the synthetic
6948 	 * device is receiving packets, so the data path of the
6949 	 * synthetic device must be suspended.
6950 	 */
6951 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6952 	    (sc->hn_flags & HN_FLAG_RXVF))
6953 		hn_suspend_data(sc);
6954 	hn_suspend_mgmt(sc);
6955 }
6956 
6957 static void
6958 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6959 {
6960 	int i;
6961 
6962 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6963 	    ("invalid TX ring count %d", tx_ring_cnt));
6964 
6965 	for (i = 0; i < tx_ring_cnt; ++i) {
6966 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6967 
6968 		mtx_lock(&txr->hn_tx_lock);
6969 		txr->hn_suspended = 0;
6970 		mtx_unlock(&txr->hn_tx_lock);
6971 	}
6972 }
6973 
6974 static void
6975 hn_resume_data(struct hn_softc *sc)
6976 {
6977 	int i;
6978 
6979 	HN_LOCK_ASSERT(sc);
6980 
6981 	/*
6982 	 * Re-enable RX.
6983 	 */
6984 	hn_rxfilter_config(sc);
6985 
6986 	/*
6987 	 * Make sure to clear suspend status on "all" TX rings,
6988 	 * since hn_tx_ring_inuse can be changed after
6989 	 * hn_suspend_data().
6990 	 */
6991 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6992 
6993 #ifdef HN_IFSTART_SUPPORT
6994 	if (!hn_use_if_start)
6995 #endif
6996 	{
6997 		/*
6998 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6999 		 * reduced.
7000 		 */
7001 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
7002 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
7003 	}
7004 
7005 	/*
7006 	 * Kick start TX.
7007 	 */
7008 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
7009 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
7010 
7011 		/*
7012 		 * Use txeof task, so that any pending oactive can be
7013 		 * cleared properly.
7014 		 */
7015 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
7016 	}
7017 }
7018 
7019 static void
7020 hn_resume_mgmt(struct hn_softc *sc)
7021 {
7022 
7023 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
7024 
7025 	/*
7026 	 * Kick off network change detection, if it was pending.
7027 	 * If no network change was pending, start link status
7028 	 * checks, which is more lightweight than network change
7029 	 * detection.
7030 	 */
7031 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
7032 		hn_change_network(sc);
7033 	else
7034 		hn_update_link_status(sc);
7035 }
7036 
7037 static void
7038 hn_resume(struct hn_softc *sc)
7039 {
7040 
7041 	/*
7042 	 * If the non-transparent mode VF is activated, the synthetic
7043 	 * device have to receive packets, so the data path of the
7044 	 * synthetic device must be resumed.
7045 	 */
7046 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7047 	    (sc->hn_flags & HN_FLAG_RXVF))
7048 		hn_resume_data(sc);
7049 
7050 	/*
7051 	 * Don't resume link status change if VF is attached/activated.
7052 	 * - In the non-transparent VF mode, the synthetic device marks
7053 	 *   link down until the VF is deactivated; i.e. VF is down.
7054 	 * - In transparent VF mode, VF's media status is used until
7055 	 *   the VF is detached.
7056 	 */
7057 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7058 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7059 		hn_resume_mgmt(sc);
7060 
7061 	/*
7062 	 * Re-enable polling if this interface is running and
7063 	 * the polling is requested.
7064 	 */
7065 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7066 		hn_polling(sc, sc->hn_pollhz);
7067 }
7068 
7069 static void
7070 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7071 {
7072 	const struct rndis_status_msg *msg;
7073 	int ofs;
7074 
7075 	if (dlen < sizeof(*msg)) {
7076 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7077 		return;
7078 	}
7079 	msg = data;
7080 
7081 	switch (msg->rm_status) {
7082 	case RNDIS_STATUS_MEDIA_CONNECT:
7083 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7084 		hn_update_link_status(sc);
7085 		break;
7086 
7087 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7088 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7089 		/* Not really useful; ignore. */
7090 		break;
7091 
7092 	case RNDIS_STATUS_NETWORK_CHANGE:
7093 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7094 		if (dlen < ofs + msg->rm_stbuflen ||
7095 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7096 			if_printf(sc->hn_ifp, "network changed\n");
7097 		} else {
7098 			uint32_t change;
7099 
7100 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7101 			    sizeof(change));
7102 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7103 			    change);
7104 		}
7105 		hn_change_network(sc);
7106 		break;
7107 
7108 	default:
7109 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7110 		    msg->rm_status);
7111 		break;
7112 	}
7113 }
7114 
7115 static int
7116 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7117 {
7118 	const struct rndis_pktinfo *pi = info_data;
7119 	uint32_t mask = 0;
7120 
7121 	while (info_dlen != 0) {
7122 		const void *data;
7123 		uint32_t dlen;
7124 
7125 		if (__predict_false(info_dlen < sizeof(*pi)))
7126 			return (EINVAL);
7127 		if (__predict_false(info_dlen < pi->rm_size))
7128 			return (EINVAL);
7129 		info_dlen -= pi->rm_size;
7130 
7131 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7132 			return (EINVAL);
7133 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7134 			return (EINVAL);
7135 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7136 		data = pi->rm_data;
7137 
7138 		if (pi->rm_internal == 1) {
7139 			switch (pi->rm_type) {
7140 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7141 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7142 					return (EINVAL);
7143 				info->pktinfo_id =
7144 				    (const struct packet_info_id *)data;
7145 				mask |= HN_RXINFO_PKTINFO_ID;
7146 				break;
7147 
7148 			default:
7149 				goto next;
7150 			}
7151 		} else {
7152 			switch (pi->rm_type) {
7153 			case NDIS_PKTINFO_TYPE_VLAN:
7154 				if (__predict_false(dlen
7155 				    < NDIS_VLAN_INFO_SIZE))
7156 					return (EINVAL);
7157 				info->vlan_info = (const uint32_t *)data;
7158 				mask |= HN_RXINFO_VLAN;
7159 				break;
7160 
7161 			case NDIS_PKTINFO_TYPE_CSUM:
7162 				if (__predict_false(dlen
7163 				    < NDIS_RXCSUM_INFO_SIZE))
7164 					return (EINVAL);
7165 				info->csum_info = (const uint32_t *)data;
7166 				mask |= HN_RXINFO_CSUM;
7167 				break;
7168 
7169 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7170 				if (__predict_false(dlen
7171 				    < HN_NDIS_HASH_VALUE_SIZE))
7172 					return (EINVAL);
7173 				info->hash_value = (const uint32_t *)data;
7174 				mask |= HN_RXINFO_HASHVAL;
7175 				break;
7176 
7177 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7178 				if (__predict_false(dlen
7179 				    < HN_NDIS_HASH_INFO_SIZE))
7180 					return (EINVAL);
7181 				info->hash_info = (const uint32_t *)data;
7182 				mask |= HN_RXINFO_HASHINF;
7183 				break;
7184 
7185 			default:
7186 				goto next;
7187 			}
7188 		}
7189 
7190 		if (mask == HN_RXINFO_ALL) {
7191 			/* All found; done */
7192 			break;
7193 		}
7194 next:
7195 		pi = (const struct rndis_pktinfo *)
7196 		    ((const uint8_t *)pi + pi->rm_size);
7197 	}
7198 
7199 	/*
7200 	 * Final fixup.
7201 	 * - If there is no hash value, invalidate the hash info.
7202 	 */
7203 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7204 		info->hash_info = NULL;
7205 	return (0);
7206 }
7207 
/*
 * Tell whether the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len).
 *
 * Two regions that start at the same offset are always reported as
 * overlapping, regardless of their lengths.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{
	bool disjoint;

	if (off == check_off)
		return (true);

	if (off < check_off) {
		/* The first region must end before the second starts. */
		disjoint = __predict_true(off + len <= check_off);
	} else {
		/* The second region must end before the first starts. */
		disjoint = __predict_true(check_off + check_len <= off);
	}
	return (!disjoint);
}
7221 
7222 static __inline void
7223 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7224 		uint32_t len, struct hn_rxinfo *info)
7225 {
7226 	uint32_t cnt = rxr->rsc.cnt;
7227 
7228 	if (cnt) {
7229 		rxr->rsc.pktlen += len;
7230 	} else {
7231 		rxr->rsc.vlan_info = info->vlan_info;
7232 		rxr->rsc.csum_info = info->csum_info;
7233 		rxr->rsc.hash_info = info->hash_info;
7234 		rxr->rsc.hash_value = info->hash_value;
7235 		rxr->rsc.pktlen = len;
7236 	}
7237 
7238 	rxr->rsc.frag_data[cnt] = data;
7239 	rxr->rsc.frag_len[cnt] = len;
7240 	rxr->rsc.cnt++;
7241 }
7242 
7243 static void
7244 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7245 {
7246 	const struct rndis_packet_msg *pkt;
7247 	struct hn_rxinfo info;
7248 	int data_off, pktinfo_off, data_len, pktinfo_len;
7249 	bool rsc_more= false;
7250 
7251 	/*
7252 	 * Check length.
7253 	 */
7254 	if (__predict_false(dlen < sizeof(*pkt))) {
7255 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7256 		return;
7257 	}
7258 	pkt = data;
7259 
7260 	if (__predict_false(dlen < pkt->rm_len)) {
7261 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7262 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7263 		return;
7264 	}
7265 	if (__predict_false(pkt->rm_len <
7266 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7267 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7268 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7269 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7270 		    pkt->rm_pktinfolen);
7271 		return;
7272 	}
7273 	if (__predict_false(pkt->rm_datalen == 0)) {
7274 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7275 		return;
7276 	}
7277 
7278 	/*
7279 	 * Check offests.
7280 	 */
7281 #define IS_OFFSET_INVALID(ofs)			\
7282 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7283 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7284 
7285 	/* XXX Hyper-V does not meet data offset alignment requirement */
7286 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7287 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7288 		    "data offset %u\n", pkt->rm_dataoffset);
7289 		return;
7290 	}
7291 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7292 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7293 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7294 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7295 		return;
7296 	}
7297 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7298 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7299 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7300 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7301 		return;
7302 	}
7303 
7304 #undef IS_OFFSET_INVALID
7305 
7306 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7307 	data_len = pkt->rm_datalen;
7308 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7309 	pktinfo_len = pkt->rm_pktinfolen;
7310 
7311 	/*
7312 	 * Check OOB coverage.
7313 	 */
7314 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7315 		int oob_off, oob_len;
7316 
7317 		if_printf(rxr->hn_ifp, "got oobdata\n");
7318 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7319 		oob_len = pkt->rm_oobdatalen;
7320 
7321 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7322 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7323 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7324 			    pkt->rm_len, oob_off, oob_len);
7325 			return;
7326 		}
7327 
7328 		/*
7329 		 * Check against data.
7330 		 */
7331 		if (hn_rndis_check_overlap(oob_off, oob_len,
7332 		    data_off, data_len)) {
7333 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7334 			    "oob overlaps data, oob abs %d len %d, "
7335 			    "data abs %d len %d\n",
7336 			    oob_off, oob_len, data_off, data_len);
7337 			return;
7338 		}
7339 
7340 		/*
7341 		 * Check against pktinfo.
7342 		 */
7343 		if (pktinfo_len != 0 &&
7344 		    hn_rndis_check_overlap(oob_off, oob_len,
7345 		    pktinfo_off, pktinfo_len)) {
7346 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7347 			    "oob overlaps pktinfo, oob abs %d len %d, "
7348 			    "pktinfo abs %d len %d\n",
7349 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7350 			return;
7351 		}
7352 	}
7353 
7354 	/*
7355 	 * Check per-packet-info coverage and find useful per-packet-info.
7356 	 */
7357 	info.vlan_info = NULL;
7358 	info.csum_info = NULL;
7359 	info.hash_info = NULL;
7360 	info.pktinfo_id = NULL;
7361 
7362 	if (__predict_true(pktinfo_len != 0)) {
7363 		bool overlap;
7364 		int error;
7365 
7366 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7367 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7368 			    "pktinfo overflow, msglen %u, "
7369 			    "pktinfo abs %d len %d\n",
7370 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7371 			return;
7372 		}
7373 
7374 		/*
7375 		 * Check packet info coverage.
7376 		 */
7377 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7378 		    data_off, data_len);
7379 		if (__predict_false(overlap)) {
7380 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7381 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7382 			    "data abs %d len %d\n",
7383 			    pktinfo_off, pktinfo_len, data_off, data_len);
7384 			return;
7385 		}
7386 
7387 		/*
7388 		 * Find useful per-packet-info.
7389 		 */
7390 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7391 		    pktinfo_len, &info);
7392 		if (__predict_false(error)) {
7393 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7394 			    "pktinfo\n");
7395 			return;
7396 		}
7397 	}
7398 
7399 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7400 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7401 		    "data overflow, msglen %u, data abs %d len %d\n",
7402 		    pkt->rm_len, data_off, data_len);
7403 		return;
7404 	}
7405 
7406 	/* Identify RSC fragments, drop invalid packets */
7407 	if ((info.pktinfo_id != NULL) &&
7408 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7409 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7410 			rxr->rsc.cnt = 0;
7411 			rxr->hn_rsc_pkts++;
7412 		} else if (rxr->rsc.cnt == 0)
7413 			goto drop;
7414 
7415 		rsc_more = true;
7416 
7417 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7418 			rsc_more = false;
7419 
7420 		if (rsc_more && rxr->rsc.is_last)
7421 			goto drop;
7422 	} else {
7423 		rxr->rsc.cnt = 0;
7424 	}
7425 
7426 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7427 		goto drop;
7428 
7429 	/* Store data in per rx ring structure */
7430 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7431 	    data_len, &info);
7432 
7433 	if (rsc_more)
7434 		return;
7435 
7436 	hn_rxpkt(rxr);
7437 	rxr->rsc.cnt = 0;
7438 	return;
7439 drop:
7440 	rxr->hn_rsc_drop++;
7441 	return;
7442 }
7443 
7444 static __inline void
7445 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7446 {
7447 	const struct rndis_msghdr *hdr;
7448 
7449 	if (__predict_false(dlen < sizeof(*hdr))) {
7450 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7451 		return;
7452 	}
7453 	hdr = data;
7454 
7455 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7456 		/* Hot data path. */
7457 		hn_rndis_rx_data(rxr, data, dlen);
7458 		/* Done! */
7459 		return;
7460 	}
7461 
7462 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7463 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7464 	else
7465 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7466 }
7467 
7468 static void
7469 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7470 {
7471 	const struct hn_nvs_hdr *hdr;
7472 
7473 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7474 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7475 		return;
7476 	}
7477 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7478 
7479 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7480 		/* Useless; ignore */
7481 		return;
7482 	}
7483 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7484 }
7485 
7486 static void
7487 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7488     const struct vmbus_chanpkt_hdr *pkt)
7489 {
7490 	struct hn_nvs_sendctx *sndc;
7491 
7492 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7493 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7494 	    VMBUS_CHANPKT_DATALEN(pkt));
7495 	/*
7496 	 * NOTE:
7497 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7498 	 * its callback.
7499 	 */
7500 }
7501 
7502 static void
7503 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7504     const struct vmbus_chanpkt_hdr *pkthdr)
7505 {
7506 	struct epoch_tracker et;
7507 	const struct vmbus_chanpkt_rxbuf *pkt;
7508 	const struct hn_nvs_hdr *nvs_hdr;
7509 	int count, i, hlen;
7510 
7511 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7512 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7513 		return;
7514 	}
7515 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7516 
7517 	/* Make sure that this is a RNDIS message. */
7518 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7519 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7520 		    nvs_hdr->nvs_type);
7521 		return;
7522 	}
7523 
7524 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7525 	if (__predict_false(hlen < sizeof(*pkt))) {
7526 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7527 		return;
7528 	}
7529 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7530 
7531 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7532 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7533 		    pkt->cp_rxbuf_id);
7534 		return;
7535 	}
7536 
7537 	count = pkt->cp_rxbuf_cnt;
7538 	if (__predict_false(hlen <
7539 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7540 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7541 		return;
7542 	}
7543 
7544 	NET_EPOCH_ENTER(et);
7545 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7546 	for (i = 0; i < count; ++i) {
7547 		int ofs, len;
7548 
7549 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7550 		len = pkt->cp_rxbuf[i].rb_len;
7551 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7552 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7553 			    "ofs %d, len %d\n", i, ofs, len);
7554 			continue;
7555 		}
7556 
7557 		rxr->rsc.is_last = (i == (count - 1));
7558 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7559 	}
7560 	NET_EPOCH_EXIT(et);
7561 
7562 	/*
7563 	 * Ack the consumed RXBUF associated w/ this channel packet,
7564 	 * so that this RXBUF can be recycled by the hypervisor.
7565 	 */
7566 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7567 }
7568 
7569 static void
7570 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7571     uint64_t tid)
7572 {
7573 	struct hn_nvs_rndis_ack ack;
7574 	int retries, error;
7575 
7576 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7577 	ack.nvs_status = HN_NVS_STATUS_OK;
7578 
7579 	retries = 0;
7580 again:
7581 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7582 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7583 	if (__predict_false(error == EAGAIN)) {
7584 		/*
7585 		 * NOTE:
7586 		 * This should _not_ happen in real world, since the
7587 		 * consumption of the TX bufring from the TX path is
7588 		 * controlled.
7589 		 */
7590 		if (rxr->hn_ack_failed == 0)
7591 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7592 		rxr->hn_ack_failed++;
7593 		retries++;
7594 		if (retries < 10) {
7595 			DELAY(100);
7596 			goto again;
7597 		}
7598 		/* RXBUF leaks! */
7599 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7600 	}
7601 }
7602 
7603 static void
7604 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7605 {
7606 	struct hn_rx_ring *rxr = xrxr;
7607 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7608 
7609 	for (;;) {
7610 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7611 		int error, pktlen;
7612 
7613 		pktlen = rxr->hn_pktbuf_len;
7614 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7615 		if (__predict_false(error == ENOBUFS)) {
7616 			void *nbuf;
7617 			int nlen;
7618 
7619 			/*
7620 			 * Expand channel packet buffer.
7621 			 *
7622 			 * XXX
7623 			 * Use M_WAITOK here, since allocation failure
7624 			 * is fatal.
7625 			 */
7626 			nlen = rxr->hn_pktbuf_len * 2;
7627 			while (nlen < pktlen)
7628 				nlen *= 2;
7629 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7630 
7631 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7632 			    rxr->hn_pktbuf_len, nlen);
7633 
7634 			free(rxr->hn_pktbuf, M_DEVBUF);
7635 			rxr->hn_pktbuf = nbuf;
7636 			rxr->hn_pktbuf_len = nlen;
7637 			/* Retry! */
7638 			continue;
7639 		} else if (__predict_false(error == EAGAIN)) {
7640 			/* No more channel packets; done! */
7641 			break;
7642 		}
7643 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7644 
7645 		switch (pkt->cph_type) {
7646 		case VMBUS_CHANPKT_TYPE_COMP:
7647 			hn_nvs_handle_comp(sc, chan, pkt);
7648 			break;
7649 
7650 		case VMBUS_CHANPKT_TYPE_RXBUF:
7651 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7652 			break;
7653 
7654 		case VMBUS_CHANPKT_TYPE_INBAND:
7655 			hn_nvs_handle_notify(sc, pkt);
7656 			break;
7657 
7658 		default:
7659 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7660 			    pkt->cph_type);
7661 			break;
7662 		}
7663 	}
7664 	hn_chan_rollup(rxr, rxr->hn_txr);
7665 }
7666 
7667 static void
7668 hn_sysinit(void *arg __unused)
7669 {
7670 	int i;
7671 
7672 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7673 
7674 #ifdef HN_IFSTART_SUPPORT
7675 	/*
7676 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7677 	 * mainly due to the IFF_DRV_OACTIVE flag.
7678 	 */
7679 	if (hn_xpnt_vf && hn_use_if_start) {
7680 		hn_use_if_start = 0;
7681 		printf("hn: tranparent VF mode, if_transmit will be used, "
7682 		    "instead of if_start\n");
7683 	}
7684 #endif
7685 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7686 		printf("hn: invalid transparent VF attach routing "
7687 		    "wait timeout %d, reset to %d\n",
7688 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7689 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7690 	}
7691 
7692 	/*
7693 	 * Initialize VF map.
7694 	 */
7695 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7696 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7697 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7698 	    M_WAITOK | M_ZERO);
7699 
7700 	/*
7701 	 * Fix the # of TX taskqueues.
7702 	 */
7703 	if (hn_tx_taskq_cnt <= 0)
7704 		hn_tx_taskq_cnt = 1;
7705 	else if (hn_tx_taskq_cnt > mp_ncpus)
7706 		hn_tx_taskq_cnt = mp_ncpus;
7707 
7708 	/*
7709 	 * Fix the TX taskqueue mode.
7710 	 */
7711 	switch (hn_tx_taskq_mode) {
7712 	case HN_TX_TASKQ_M_INDEP:
7713 	case HN_TX_TASKQ_M_GLOBAL:
7714 	case HN_TX_TASKQ_M_EVTTQ:
7715 		break;
7716 	default:
7717 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7718 		break;
7719 	}
7720 
7721 	if (vm_guest != VM_GUEST_HV)
7722 		return;
7723 
7724 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7725 		return;
7726 
7727 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7728 	    M_DEVBUF, M_WAITOK);
7729 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7730 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7731 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7732 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7733 		    "hn tx%d", i);
7734 	}
7735 }
7736 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7737 
7738 static void
7739 hn_sysuninit(void *arg __unused)
7740 {
7741 
7742 	if (hn_tx_taskque != NULL) {
7743 		int i;
7744 
7745 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7746 			taskqueue_free(hn_tx_taskque[i]);
7747 		free(hn_tx_taskque, M_DEVBUF);
7748 	}
7749 
7750 	if (hn_vfmap != NULL)
7751 		free(hn_vfmap, M_DEVBUF);
7752 	rm_destroy(&hn_vfmap_lock);
7753 
7754 	counter_u64_free(hn_udpcs_fixup);
7755 }
7756 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7757