xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 40427cca7a9ae77b095936fb1954417c290cfb17)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/rmlock.h>
73 #include <sys/sbuf.h>
74 #include <sys/smp.h>
75 #include <sys/socket.h>
76 #include <sys/sockio.h>
77 #include <sys/sx.h>
78 #include <sys/sysctl.h>
79 #include <sys/systm.h>
80 #include <sys/taskqueue.h>
81 #include <sys/buf_ring.h>
82 #include <sys/eventhandler.h>
83 
84 #include <machine/atomic.h>
85 #include <machine/in_cksum.h>
86 
87 #include <net/bpf.h>
88 #include <net/ethernet.h>
89 #include <net/if.h>
90 #include <net/if_dl.h>
91 #include <net/if_media.h>
92 #include <net/if_types.h>
93 #include <net/if_var.h>
94 #include <net/rndis.h>
95 #ifdef RSS
96 #include <net/rss_config.h>
97 #endif
98 
99 #include <netinet/in_systm.h>
100 #include <netinet/in.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_lro.h>
105 #include <netinet/udp.h>
106 
107 #include <dev/hyperv/include/hyperv.h>
108 #include <dev/hyperv/include/hyperv_busdma.h>
109 #include <dev/hyperv/include/vmbus.h>
110 #include <dev/hyperv/include/vmbus_xact.h>
111 
112 #include <dev/hyperv/netvsc/ndis.h>
113 #include <dev/hyperv/netvsc/if_hnreg.h>
114 #include <dev/hyperv/netvsc/if_hnvar.h>
115 #include <dev/hyperv/netvsc/hn_nvs.h>
116 #include <dev/hyperv/netvsc/hn_rndis.h>
117 
118 #include "vmbus_if.h"
119 
120 #define HN_IFSTART_SUPPORT
121 
122 #define HN_RING_CNT_DEF_MAX		8
123 
124 #define HN_VFMAP_SIZE_DEF		8
125 
126 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
127 
128 /* YYY should get it from the underlying channel */
129 #define HN_TX_DESC_CNT			512
130 
131 #define HN_RNDIS_PKT_LEN					\
132 	(sizeof(struct rndis_packet_msg) +			\
133 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
134 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
137 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
138 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
139 
140 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
141 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
142 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
143 /* -1 for RNDIS packet message */
144 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
145 
146 #define HN_DIRECT_TX_SIZE_DEF		128
147 
148 #define HN_EARLY_TXEOF_THRESH		8
149 
150 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
151 
152 #define HN_LROENT_CNT_DEF		128
153 
154 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
155 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
156 /* YYY 2*MTU is a bit rough, but should be good enough. */
157 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
158 
159 #define HN_LRO_ACKCNT_DEF		1
160 
161 #define HN_LOCK_INIT(sc)		\
162 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
163 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
164 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
165 #define HN_LOCK(sc)					\
166 do {							\
167 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
168 		DELAY(1000);				\
169 } while (0)
170 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
171 
172 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
173 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
174 #define HN_CSUM_IP_HWASSIST(sc)		\
175 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
176 #define HN_CSUM_IP6_HWASSIST(sc)	\
177 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
178 
179 #define HN_PKTSIZE_MIN(align)		\
180 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
181 	    HN_RNDIS_PKT_LEN, (align))
182 #define HN_PKTSIZE(m, align)		\
183 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
184 
185 #ifdef RSS
186 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
187 #else
188 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
189 #endif
190 
191 struct hn_txdesc {
192 #ifndef HN_USE_TXDESC_BUFRING
193 	SLIST_ENTRY(hn_txdesc)		link;
194 #endif
195 	STAILQ_ENTRY(hn_txdesc)		agg_link;
196 
197 	/* Aggregated txdescs, in sending order. */
198 	STAILQ_HEAD(, hn_txdesc)	agg_list;
199 
200 	/* The oldest packet, if transmission aggregation happens. */
201 	struct mbuf			*m;
202 	struct hn_tx_ring		*txr;
203 	int				refs;
204 	uint32_t			flags;	/* HN_TXD_FLAG_ */
205 	struct hn_nvs_sendctx		send_ctx;
206 	uint32_t			chim_index;
207 	int				chim_size;
208 
209 	bus_dmamap_t			data_dmap;
210 
211 	bus_addr_t			rndis_pkt_paddr;
212 	struct rndis_packet_msg		*rndis_pkt;
213 	bus_dmamap_t			rndis_pkt_dmap;
214 };
215 
216 #define HN_TXD_FLAG_ONLIST		0x0001
217 #define HN_TXD_FLAG_DMAMAP		0x0002
218 #define HN_TXD_FLAG_ONAGG		0x0004
219 
220 struct hn_rxinfo {
221 	uint32_t			vlan_info;
222 	uint32_t			csum_info;
223 	uint32_t			hash_info;
224 	uint32_t			hash_value;
225 };
226 
227 struct hn_rxvf_setarg {
228 	struct hn_rx_ring	*rxr;
229 	struct ifnet		*vf_ifp;
230 };
231 
232 #define HN_RXINFO_VLAN			0x0001
233 #define HN_RXINFO_CSUM			0x0002
234 #define HN_RXINFO_HASHINF		0x0004
235 #define HN_RXINFO_HASHVAL		0x0008
236 #define HN_RXINFO_ALL			\
237 	(HN_RXINFO_VLAN |		\
238 	 HN_RXINFO_CSUM |		\
239 	 HN_RXINFO_HASHINF |		\
240 	 HN_RXINFO_HASHVAL)
241 
242 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
243 #define HN_NDIS_RXCSUM_INFO_INVALID	0
244 #define HN_NDIS_HASH_INFO_INVALID	0
245 
246 static int			hn_probe(device_t);
247 static int			hn_attach(device_t);
248 static int			hn_detach(device_t);
249 static int			hn_shutdown(device_t);
250 static void			hn_chan_callback(struct vmbus_channel *,
251 				    void *);
252 
253 static void			hn_init(void *);
254 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
255 #ifdef HN_IFSTART_SUPPORT
256 static void			hn_start(struct ifnet *);
257 #endif
258 static int			hn_transmit(struct ifnet *, struct mbuf *);
259 static void			hn_xmit_qflush(struct ifnet *);
260 static int			hn_ifmedia_upd(struct ifnet *);
261 static void			hn_ifmedia_sts(struct ifnet *,
262 				    struct ifmediareq *);
263 
264 static void			hn_ifnet_event(void *, struct ifnet *, int);
265 static void			hn_ifaddr_event(void *, struct ifnet *);
266 static void			hn_ifnet_attevent(void *, struct ifnet *);
267 static void			hn_ifnet_detevent(void *, struct ifnet *);
268 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
269 
270 static bool			hn_ismyvf(const struct hn_softc *,
271 				    const struct ifnet *);
272 static void			hn_rxvf_change(struct hn_softc *,
273 				    struct ifnet *, bool);
274 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
275 static void			hn_rxvf_set_task(void *, int);
276 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
277 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
278 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
279 				    struct ifreq *);
280 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
281 static bool			hn_xpnt_vf_isready(struct hn_softc *);
282 static void			hn_xpnt_vf_setready(struct hn_softc *);
283 static void			hn_xpnt_vf_init_taskfunc(void *, int);
284 static void			hn_xpnt_vf_init(struct hn_softc *);
285 static void			hn_xpnt_vf_setenable(struct hn_softc *);
286 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
287 
288 static int			hn_rndis_rxinfo(const void *, int,
289 				    struct hn_rxinfo *);
290 static void			hn_rndis_rx_data(struct hn_rx_ring *,
291 				    const void *, int);
292 static void			hn_rndis_rx_status(struct hn_softc *,
293 				    const void *, int);
294 static void			hn_rndis_init_fixat(struct hn_softc *, int);
295 
296 static void			hn_nvs_handle_notify(struct hn_softc *,
297 				    const struct vmbus_chanpkt_hdr *);
298 static void			hn_nvs_handle_comp(struct hn_softc *,
299 				    struct vmbus_channel *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *, uint64_t);
306 
307 #if __FreeBSD_version >= 1100099
308 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
309 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
310 #endif
311 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
313 #if __FreeBSD_version < 1100095
314 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
315 #else
316 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
317 #endif
318 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
319 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
320 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
321 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
325 #ifndef RSS
326 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
328 #endif
329 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
341 
342 static void			hn_stop(struct hn_softc *, bool);
343 static void			hn_init_locked(struct hn_softc *);
344 static int			hn_chan_attach(struct hn_softc *,
345 				    struct vmbus_channel *);
346 static void			hn_chan_detach(struct hn_softc *,
347 				    struct vmbus_channel *);
348 static int			hn_attach_subchans(struct hn_softc *);
349 static void			hn_detach_allchans(struct hn_softc *);
350 static void			hn_chan_rollup(struct hn_rx_ring *,
351 				    struct hn_tx_ring *);
352 static void			hn_set_ring_inuse(struct hn_softc *, int);
353 static int			hn_synth_attach(struct hn_softc *, int);
354 static void			hn_synth_detach(struct hn_softc *);
355 static int			hn_synth_alloc_subchans(struct hn_softc *,
356 				    int *);
357 static bool			hn_synth_attachable(const struct hn_softc *);
358 static void			hn_suspend(struct hn_softc *);
359 static void			hn_suspend_data(struct hn_softc *);
360 static void			hn_suspend_mgmt(struct hn_softc *);
361 static void			hn_resume(struct hn_softc *);
362 static void			hn_resume_data(struct hn_softc *);
363 static void			hn_resume_mgmt(struct hn_softc *);
364 static void			hn_suspend_mgmt_taskfunc(void *, int);
365 static void			hn_chan_drain(struct hn_softc *,
366 				    struct vmbus_channel *);
367 static void			hn_disable_rx(struct hn_softc *);
368 static void			hn_drain_rxtx(struct hn_softc *, int);
369 static void			hn_polling(struct hn_softc *, u_int);
370 static void			hn_chan_polling(struct vmbus_channel *, u_int);
371 static void			hn_mtu_change_fixup(struct hn_softc *);
372 
373 static void			hn_update_link_status(struct hn_softc *);
374 static void			hn_change_network(struct hn_softc *);
375 static void			hn_link_taskfunc(void *, int);
376 static void			hn_netchg_init_taskfunc(void *, int);
377 static void			hn_netchg_status_taskfunc(void *, int);
378 static void			hn_link_status(struct hn_softc *);
379 
380 static int			hn_create_rx_data(struct hn_softc *, int);
381 static void			hn_destroy_rx_data(struct hn_softc *);
382 static int			hn_check_iplen(const struct mbuf *, int);
383 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
384 static int			hn_rxfilter_config(struct hn_softc *);
385 #ifndef RSS
386 static int			hn_rss_reconfig(struct hn_softc *);
387 #endif
388 static void			hn_rss_ind_fixup(struct hn_softc *);
389 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
390 				    int, const struct hn_rxinfo *);
391 
392 static int			hn_tx_ring_create(struct hn_softc *, int);
393 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
394 static int			hn_create_tx_data(struct hn_softc *, int);
395 static void			hn_fixup_tx_data(struct hn_softc *);
396 static void			hn_destroy_tx_data(struct hn_softc *);
397 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
398 static void			hn_txdesc_gc(struct hn_tx_ring *,
399 				    struct hn_txdesc *);
400 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
401 				    struct hn_txdesc *, struct mbuf **);
402 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
403 				    struct hn_txdesc *);
404 static void			hn_set_chim_size(struct hn_softc *, int);
405 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
406 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
407 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
408 static void			hn_resume_tx(struct hn_softc *, int);
409 static void			hn_set_txagg(struct hn_softc *);
410 static void			*hn_try_txagg(struct ifnet *,
411 				    struct hn_tx_ring *, struct hn_txdesc *,
412 				    int);
413 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
414 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
415 				    struct hn_softc *, struct vmbus_channel *,
416 				    const void *, int);
417 static int			hn_txpkt_sglist(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_txpkt_chim(struct hn_tx_ring *,
420 				    struct hn_txdesc *);
421 static int			hn_xmit(struct hn_tx_ring *, int);
422 static void			hn_xmit_taskfunc(void *, int);
423 static void			hn_xmit_txeof(struct hn_tx_ring *);
424 static void			hn_xmit_txeof_taskfunc(void *, int);
425 #ifdef HN_IFSTART_SUPPORT
426 static int			hn_start_locked(struct hn_tx_ring *, int);
427 static void			hn_start_taskfunc(void *, int);
428 static void			hn_start_txeof(struct hn_tx_ring *);
429 static void			hn_start_txeof_taskfunc(void *, int);
430 #endif
431 
432 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
433     "Hyper-V network interface");
434 
435 /* Trust tcp segements verification on host side. */
436 static int			hn_trust_hosttcp = 1;
437 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
438     &hn_trust_hosttcp, 0,
439     "Trust tcp segement verification on host side, "
440     "when csum info is missing (global setting)");
441 
442 /* Trust udp datagrams verification on host side. */
443 static int			hn_trust_hostudp = 1;
444 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
445     &hn_trust_hostudp, 0,
446     "Trust udp datagram verification on host side, "
447     "when csum info is missing (global setting)");
448 
449 /* Trust ip packets verification on host side. */
450 static int			hn_trust_hostip = 1;
451 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
452     &hn_trust_hostip, 0,
453     "Trust ip packet verification on host side, "
454     "when csum info is missing (global setting)");
455 
456 /* Limit TSO burst size */
457 static int			hn_tso_maxlen = IP_MAXPACKET;
458 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
459     &hn_tso_maxlen, 0, "TSO burst limit");
460 
461 /* Limit chimney send size */
462 static int			hn_tx_chimney_size = 0;
463 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
464     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
465 
466 /* Limit the size of packet for direct transmission */
467 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
468 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
469     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
470 
471 /* # of LRO entries per RX ring */
472 #if defined(INET) || defined(INET6)
473 #if __FreeBSD_version >= 1100095
474 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
475 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
476     &hn_lro_entry_count, 0, "LRO entry count");
477 #endif
478 #endif
479 
480 static int			hn_tx_taskq_cnt = 1;
481 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
482     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
483 
484 #define HN_TX_TASKQ_M_INDEP	0
485 #define HN_TX_TASKQ_M_GLOBAL	1
486 #define HN_TX_TASKQ_M_EVTTQ	2
487 
488 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
489 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
490     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
491     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
492 
493 #ifndef HN_USE_TXDESC_BUFRING
494 static int			hn_use_txdesc_bufring = 0;
495 #else
496 static int			hn_use_txdesc_bufring = 1;
497 #endif
498 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
499     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
500 
501 #ifdef HN_IFSTART_SUPPORT
502 /* Use ifnet.if_start instead of ifnet.if_transmit */
503 static int			hn_use_if_start = 0;
504 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
505     &hn_use_if_start, 0, "Use if_start TX method");
506 #endif
507 
508 /* # of channels to use */
509 static int			hn_chan_cnt = 0;
510 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
511     &hn_chan_cnt, 0,
512     "# of channels to use; each channel has one RX ring and one TX ring");
513 
514 /* # of transmit rings to use */
515 static int			hn_tx_ring_cnt = 0;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
517     &hn_tx_ring_cnt, 0, "# of TX rings to use");
518 
519 /* Software TX ring deptch */
520 static int			hn_tx_swq_depth = 0;
521 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
522     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
523 
524 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
525 #if __FreeBSD_version >= 1100095
526 static u_int			hn_lro_mbufq_depth = 0;
527 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
528     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
529 #endif
530 
531 /* Packet transmission aggregation size limit */
532 static int			hn_tx_agg_size = -1;
533 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
534     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
535 
536 /* Packet transmission aggregation count limit */
537 static int			hn_tx_agg_pkts = -1;
538 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
539     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
540 
541 /* VF list */
542 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
543     0, 0, hn_vflist_sysctl, "A", "VF list");
544 
545 /* VF mapping */
546 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
547     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
548 
549 /* Transparent VF */
550 static int			hn_xpnt_vf = 0;
551 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
552     &hn_xpnt_vf, 0, "Transparent VF mod");
553 
554 /* Accurate BPF support for Transparent VF */
555 static int			hn_xpnt_vf_accbpf = 0;
556 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
557     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
558 
559 /* Extra wait for transparent VF attach routing; unit seconds. */
560 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
561 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
562     &hn_xpnt_vf_attwait, 0,
563     "Extra wait for transparent VF attach routing; unit: seconds");
564 
565 static u_int			hn_cpu_index;	/* next CPU for channel */
566 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
567 
568 static struct rmlock		hn_vfmap_lock;
569 static int			hn_vfmap_size;
570 static struct ifnet		**hn_vfmap;
571 
572 #ifndef RSS
573 static const uint8_t
574 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
575 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
576 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
577 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
578 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
579 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
580 };
581 #endif	/* !RSS */
582 
583 static const struct hyperv_guid	hn_guid = {
584 	.hv_guid = {
585 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
586 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
587 };
588 
589 static device_method_t hn_methods[] = {
590 	/* Device interface */
591 	DEVMETHOD(device_probe,		hn_probe),
592 	DEVMETHOD(device_attach,	hn_attach),
593 	DEVMETHOD(device_detach,	hn_detach),
594 	DEVMETHOD(device_shutdown,	hn_shutdown),
595 	DEVMETHOD_END
596 };
597 
598 static driver_t hn_driver = {
599 	"hn",
600 	hn_methods,
601 	sizeof(struct hn_softc)
602 };
603 
604 static devclass_t hn_devclass;
605 
606 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
607 MODULE_VERSION(hn, 1);
608 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
609 
610 #if __FreeBSD_version >= 1100099
611 static void
612 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
613 {
614 	int i;
615 
616 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
617 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
618 }
619 #endif
620 
621 static int
622 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
623 {
624 
625 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
626 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
627 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
628 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
629 }
630 
631 static int
632 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
633 {
634 	struct hn_nvs_rndis rndis;
635 
636 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
637 	    txd->chim_size > 0, ("invalid rndis chim txd"));
638 
639 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
640 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
641 	rndis.nvs_chim_idx = txd->chim_index;
642 	rndis.nvs_chim_sz = txd->chim_size;
643 
644 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
645 	    &rndis, sizeof(rndis), &txd->send_ctx));
646 }
647 
648 static __inline uint32_t
649 hn_chim_alloc(struct hn_softc *sc)
650 {
651 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
652 	u_long *bmap = sc->hn_chim_bmap;
653 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
654 
655 	for (i = 0; i < bmap_cnt; ++i) {
656 		int idx;
657 
658 		idx = ffsl(~bmap[i]);
659 		if (idx == 0)
660 			continue;
661 
662 		--idx; /* ffsl is 1-based */
663 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
664 		    ("invalid i %d and idx %d", i, idx));
665 
666 		if (atomic_testandset_long(&bmap[i], idx))
667 			continue;
668 
669 		ret = i * LONG_BIT + idx;
670 		break;
671 	}
672 	return (ret);
673 }
674 
675 static __inline void
676 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
677 {
678 	u_long mask;
679 	uint32_t idx;
680 
681 	idx = chim_idx / LONG_BIT;
682 	KASSERT(idx < sc->hn_chim_bmap_cnt,
683 	    ("invalid chimney index 0x%x", chim_idx));
684 
685 	mask = 1UL << (chim_idx % LONG_BIT);
686 	KASSERT(sc->hn_chim_bmap[idx] & mask,
687 	    ("index bitmap 0x%lx, chimney index %u, "
688 	     "bitmap idx %d, bitmask 0x%lx",
689 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
690 
691 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
692 }
693 
694 #if defined(INET6) || defined(INET)
695 
696 #define PULLUP_HDR(m, len)				\
697 do {							\
698 	if (__predict_false((m)->m_len < (len))) {	\
699 		(m) = m_pullup((m), (len));		\
700 		if ((m) == NULL)			\
701 			return (NULL);			\
702 	}						\
703 } while (0)
704 
705 /*
706  * NOTE: If this function failed, the m_head would be freed.
707  */
708 static __inline struct mbuf *
709 hn_tso_fixup(struct mbuf *m_head)
710 {
711 	struct ether_vlan_header *evl;
712 	struct tcphdr *th;
713 	int ehlen;
714 
715 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
716 
717 	PULLUP_HDR(m_head, sizeof(*evl));
718 	evl = mtod(m_head, struct ether_vlan_header *);
719 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
720 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
721 	else
722 		ehlen = ETHER_HDR_LEN;
723 
724 #ifdef INET
725 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
726 		struct ip *ip;
727 		int iphlen;
728 
729 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
730 		ip = mtodo(m_head, ehlen);
731 		iphlen = ip->ip_hl << 2;
732 
733 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
734 		th = mtodo(m_head, ehlen + iphlen);
735 
736 		ip->ip_len = 0;
737 		ip->ip_sum = 0;
738 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
739 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
740 	}
741 #endif
742 #if defined(INET6) && defined(INET)
743 	else
744 #endif
745 #ifdef INET6
746 	{
747 		struct ip6_hdr *ip6;
748 
749 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
750 		ip6 = mtodo(m_head, ehlen);
751 		if (ip6->ip6_nxt != IPPROTO_TCP) {
752 			m_freem(m_head);
753 			return (NULL);
754 		}
755 
756 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
757 		th = mtodo(m_head, ehlen + sizeof(*ip6));
758 
759 		ip6->ip6_plen = 0;
760 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
761 	}
762 #endif
763 	return (m_head);
764 
765 }
766 
767 /*
768  * NOTE: If this function failed, the m_head would be freed.
769  */
770 static __inline struct mbuf *
771 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
772 {
773 	const struct ether_vlan_header *evl;
774 	const struct tcphdr *th;
775 	int ehlen;
776 
777 	*tcpsyn = 0;
778 
779 	PULLUP_HDR(m_head, sizeof(*evl));
780 	evl = mtod(m_head, const struct ether_vlan_header *);
781 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
782 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
783 	else
784 		ehlen = ETHER_HDR_LEN;
785 
786 #ifdef INET
787 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
788 		const struct ip *ip;
789 		int iphlen;
790 
791 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
792 		ip = mtodo(m_head, ehlen);
793 		iphlen = ip->ip_hl << 2;
794 
795 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
796 		th = mtodo(m_head, ehlen + iphlen);
797 		if (th->th_flags & TH_SYN)
798 			*tcpsyn = 1;
799 	}
800 #endif
801 #if defined(INET6) && defined(INET)
802 	else
803 #endif
804 #ifdef INET6
805 	{
806 		const struct ip6_hdr *ip6;
807 
808 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
809 		ip6 = mtodo(m_head, ehlen);
810 		if (ip6->ip6_nxt != IPPROTO_TCP)
811 			return (m_head);
812 
813 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
814 		th = mtodo(m_head, ehlen + sizeof(*ip6));
815 		if (th->th_flags & TH_SYN)
816 			*tcpsyn = 1;
817 	}
818 #endif
819 	return (m_head);
820 }
821 
822 #undef PULLUP_HDR
823 
824 #endif	/* INET6 || INET */
825 
826 static int
827 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
828 {
829 	int error = 0;
830 
831 	HN_LOCK_ASSERT(sc);
832 
833 	if (sc->hn_rx_filter != filter) {
834 		error = hn_rndis_set_rxfilter(sc, filter);
835 		if (!error)
836 			sc->hn_rx_filter = filter;
837 	}
838 	return (error);
839 }
840 
841 static int
842 hn_rxfilter_config(struct hn_softc *sc)
843 {
844 	struct ifnet *ifp = sc->hn_ifp;
845 	uint32_t filter;
846 
847 	HN_LOCK_ASSERT(sc);
848 
849 	/*
850 	 * If the non-transparent mode VF is activated, we don't know how
851 	 * its RX filter is configured, so stick the synthetic device in
852 	 * the promiscous mode.
853 	 */
854 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
855 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
856 	} else {
857 		filter = NDIS_PACKET_TYPE_DIRECTED;
858 		if (ifp->if_flags & IFF_BROADCAST)
859 			filter |= NDIS_PACKET_TYPE_BROADCAST;
860 		/* TODO: support multicast list */
861 		if ((ifp->if_flags & IFF_ALLMULTI) ||
862 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
863 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
864 	}
865 	return (hn_set_rxfilter(sc, filter));
866 }
867 
868 static void
869 hn_set_txagg(struct hn_softc *sc)
870 {
871 	uint32_t size, pkts;
872 	int i;
873 
874 	/*
875 	 * Setup aggregation size.
876 	 */
877 	if (sc->hn_agg_size < 0)
878 		size = UINT32_MAX;
879 	else
880 		size = sc->hn_agg_size;
881 
882 	if (sc->hn_rndis_agg_size < size)
883 		size = sc->hn_rndis_agg_size;
884 
885 	/* NOTE: We only aggregate packets using chimney sending buffers. */
886 	if (size > (uint32_t)sc->hn_chim_szmax)
887 		size = sc->hn_chim_szmax;
888 
889 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
890 		/* Disable */
891 		size = 0;
892 		pkts = 0;
893 		goto done;
894 	}
895 
896 	/* NOTE: Type of the per TX ring setting is 'int'. */
897 	if (size > INT_MAX)
898 		size = INT_MAX;
899 
900 	/*
901 	 * Setup aggregation packet count.
902 	 */
903 	if (sc->hn_agg_pkts < 0)
904 		pkts = UINT32_MAX;
905 	else
906 		pkts = sc->hn_agg_pkts;
907 
908 	if (sc->hn_rndis_agg_pkts < pkts)
909 		pkts = sc->hn_rndis_agg_pkts;
910 
911 	if (pkts <= 1) {
912 		/* Disable */
913 		size = 0;
914 		pkts = 0;
915 		goto done;
916 	}
917 
918 	/* NOTE: Type of the per TX ring setting is 'short'. */
919 	if (pkts > SHRT_MAX)
920 		pkts = SHRT_MAX;
921 
922 done:
923 	/* NOTE: Type of the per TX ring setting is 'short'. */
924 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
925 		/* Disable */
926 		size = 0;
927 		pkts = 0;
928 	}
929 
930 	if (bootverbose) {
931 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
932 		    size, pkts, sc->hn_rndis_agg_align);
933 	}
934 
935 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
936 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
937 
938 		mtx_lock(&txr->hn_tx_lock);
939 		txr->hn_agg_szmax = size;
940 		txr->hn_agg_pktmax = pkts;
941 		txr->hn_agg_align = sc->hn_rndis_agg_align;
942 		mtx_unlock(&txr->hn_tx_lock);
943 	}
944 }
945 
946 static int
947 hn_get_txswq_depth(const struct hn_tx_ring *txr)
948 {
949 
950 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
951 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
952 		return txr->hn_txdesc_cnt;
953 	return hn_tx_swq_depth;
954 }
955 
956 #ifndef RSS
957 static int
958 hn_rss_reconfig(struct hn_softc *sc)
959 {
960 	int error;
961 
962 	HN_LOCK_ASSERT(sc);
963 
964 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
965 		return (ENXIO);
966 
967 	/*
968 	 * Disable RSS first.
969 	 *
970 	 * NOTE:
971 	 * Direct reconfiguration by setting the UNCHG flags does
972 	 * _not_ work properly.
973 	 */
974 	if (bootverbose)
975 		if_printf(sc->hn_ifp, "disable RSS\n");
976 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
977 	if (error) {
978 		if_printf(sc->hn_ifp, "RSS disable failed\n");
979 		return (error);
980 	}
981 
982 	/*
983 	 * Reenable the RSS w/ the updated RSS key or indirect
984 	 * table.
985 	 */
986 	if (bootverbose)
987 		if_printf(sc->hn_ifp, "reconfig RSS\n");
988 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
989 	if (error) {
990 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
991 		return (error);
992 	}
993 	return (0);
994 }
995 #endif	/* !RSS */
996 
997 static void
998 hn_rss_ind_fixup(struct hn_softc *sc)
999 {
1000 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1001 	int i, nchan;
1002 
1003 	nchan = sc->hn_rx_ring_inuse;
1004 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1005 
1006 	/*
1007 	 * Check indirect table to make sure that all channels in it
1008 	 * can be used.
1009 	 */
1010 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1011 		if (rss->rss_ind[i] >= nchan) {
1012 			if_printf(sc->hn_ifp,
1013 			    "RSS indirect table %d fixup: %u -> %d\n",
1014 			    i, rss->rss_ind[i], nchan - 1);
1015 			rss->rss_ind[i] = nchan - 1;
1016 		}
1017 	}
1018 }
1019 
1020 static int
1021 hn_ifmedia_upd(struct ifnet *ifp __unused)
1022 {
1023 
1024 	return EOPNOTSUPP;
1025 }
1026 
1027 static void
1028 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1029 {
1030 	struct hn_softc *sc = ifp->if_softc;
1031 
1032 	ifmr->ifm_status = IFM_AVALID;
1033 	ifmr->ifm_active = IFM_ETHER;
1034 
1035 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1036 		ifmr->ifm_active |= IFM_NONE;
1037 		return;
1038 	}
1039 	ifmr->ifm_status |= IFM_ACTIVE;
1040 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1041 }
1042 
1043 static void
1044 hn_rxvf_set_task(void *xarg, int pending __unused)
1045 {
1046 	struct hn_rxvf_setarg *arg = xarg;
1047 
1048 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1049 }
1050 
1051 static void
1052 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1053 {
1054 	struct hn_rx_ring *rxr;
1055 	struct hn_rxvf_setarg arg;
1056 	struct task task;
1057 	int i;
1058 
1059 	HN_LOCK_ASSERT(sc);
1060 
1061 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1062 
1063 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1064 		rxr = &sc->hn_rx_ring[i];
1065 
1066 		if (i < sc->hn_rx_ring_inuse) {
1067 			arg.rxr = rxr;
1068 			arg.vf_ifp = vf_ifp;
1069 			vmbus_chan_run_task(rxr->hn_chan, &task);
1070 		} else {
1071 			rxr->hn_rxvf_ifp = vf_ifp;
1072 		}
1073 	}
1074 }
1075 
1076 static bool
1077 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1078 {
1079 	const struct ifnet *hn_ifp;
1080 
1081 	hn_ifp = sc->hn_ifp;
1082 
1083 	if (ifp == hn_ifp)
1084 		return (false);
1085 
1086 	if (ifp->if_alloctype != IFT_ETHER)
1087 		return (false);
1088 
1089 	/* Ignore lagg/vlan interfaces */
1090 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1091 	    strcmp(ifp->if_dname, "vlan") == 0)
1092 		return (false);
1093 
1094 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1095 		return (false);
1096 
1097 	return (true);
1098 }
1099 
1100 static void
1101 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1102 {
1103 	struct ifnet *hn_ifp;
1104 
1105 	HN_LOCK(sc);
1106 
1107 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1108 		goto out;
1109 
1110 	if (!hn_ismyvf(sc, ifp))
1111 		goto out;
1112 	hn_ifp = sc->hn_ifp;
1113 
1114 	if (rxvf) {
1115 		if (sc->hn_flags & HN_FLAG_RXVF)
1116 			goto out;
1117 
1118 		sc->hn_flags |= HN_FLAG_RXVF;
1119 		hn_rxfilter_config(sc);
1120 	} else {
1121 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1122 			goto out;
1123 
1124 		sc->hn_flags &= ~HN_FLAG_RXVF;
1125 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1126 			hn_rxfilter_config(sc);
1127 		else
1128 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1129 	}
1130 
1131 	hn_nvs_set_datapath(sc,
1132 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1133 
1134 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1135 
1136 	if (rxvf) {
1137 		hn_suspend_mgmt(sc);
1138 		sc->hn_link_flags &=
1139 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1140 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1141 	} else {
1142 		hn_resume_mgmt(sc);
1143 	}
1144 
1145 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1146 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1147 
1148 	if (bootverbose) {
1149 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1150 		    rxvf ? "to" : "from", ifp->if_xname);
1151 	}
1152 out:
1153 	HN_UNLOCK(sc);
1154 }
1155 
1156 static void
1157 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1158 {
1159 
1160 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1161 		return;
1162 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1163 }
1164 
1165 static void
1166 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1167 {
1168 
1169 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1170 }
1171 
1172 static int
1173 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1174 {
1175 	struct ifnet *ifp, *vf_ifp;
1176 	uint64_t tmp;
1177 	int error;
1178 
1179 	HN_LOCK_ASSERT(sc);
1180 	ifp = sc->hn_ifp;
1181 	vf_ifp = sc->hn_vf_ifp;
1182 
1183 	/*
1184 	 * Fix up requested capabilities w/ supported capabilities,
1185 	 * since the supported capabilities could have been changed.
1186 	 */
1187 	ifr->ifr_reqcap &= ifp->if_capabilities;
1188 	/* Pass SIOCSIFCAP to VF. */
1189 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1190 
1191 	/*
1192 	 * NOTE:
1193 	 * The error will be propagated to the callers, however, it
1194 	 * is _not_ useful here.
1195 	 */
1196 
1197 	/*
1198 	 * Merge VF's enabled capabilities.
1199 	 */
1200 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1201 
1202 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1203 	if (ifp->if_capenable & IFCAP_TXCSUM)
1204 		ifp->if_hwassist |= tmp;
1205 	else
1206 		ifp->if_hwassist &= ~tmp;
1207 
1208 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1209 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1210 		ifp->if_hwassist |= tmp;
1211 	else
1212 		ifp->if_hwassist &= ~tmp;
1213 
1214 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1215 	if (ifp->if_capenable & IFCAP_TSO4)
1216 		ifp->if_hwassist |= tmp;
1217 	else
1218 		ifp->if_hwassist &= ~tmp;
1219 
1220 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1221 	if (ifp->if_capenable & IFCAP_TSO6)
1222 		ifp->if_hwassist |= tmp;
1223 	else
1224 		ifp->if_hwassist &= ~tmp;
1225 
1226 	return (error);
1227 }
1228 
1229 static int
1230 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1231 {
1232 	struct ifnet *vf_ifp;
1233 	struct ifreq ifr;
1234 
1235 	HN_LOCK_ASSERT(sc);
1236 	vf_ifp = sc->hn_vf_ifp;
1237 
1238 	memset(&ifr, 0, sizeof(ifr));
1239 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1240 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1241 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1242 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1243 }
1244 
1245 static void
1246 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1247 {
1248 	struct ifnet *ifp = sc->hn_ifp;
1249 	int allmulti = 0;
1250 
1251 	HN_LOCK_ASSERT(sc);
1252 
1253 	/* XXX vlan(4) style mcast addr maintenance */
1254 	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1255 		allmulti = IFF_ALLMULTI;
1256 
1257 	/* Always set the VF's if_flags */
1258 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1259 }
1260 
1261 static void
1262 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1263 {
1264 	struct rm_priotracker pt;
1265 	struct ifnet *hn_ifp = NULL;
1266 	struct mbuf *mn;
1267 
1268 	/*
1269 	 * XXX racy, if hn(4) ever detached.
1270 	 */
1271 	rm_rlock(&hn_vfmap_lock, &pt);
1272 	if (vf_ifp->if_index < hn_vfmap_size)
1273 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1274 	rm_runlock(&hn_vfmap_lock, &pt);
1275 
1276 	if (hn_ifp != NULL) {
1277 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1278 			/*
1279 			 * Allow tapping on the VF.
1280 			 */
1281 			ETHER_BPF_MTAP(vf_ifp, mn);
1282 
1283 			/*
1284 			 * Update VF stats.
1285 			 */
1286 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1287 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1288 				    mn->m_pkthdr.len);
1289 			}
1290 			/*
1291 			 * XXX IFCOUNTER_IMCAST
1292 			 * This stat updating is kinda invasive, since it
1293 			 * requires two checks on the mbuf: the length check
1294 			 * and the ethernet header check.  As of this write,
1295 			 * all multicast packets go directly to hn(4), which
1296 			 * makes imcast stat updating in the VF a try in vian.
1297 			 */
1298 
1299 			/*
1300 			 * Fix up rcvif and increase hn(4)'s ipackets.
1301 			 */
1302 			mn->m_pkthdr.rcvif = hn_ifp;
1303 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1304 		}
1305 		/*
1306 		 * Go through hn(4)'s if_input.
1307 		 */
1308 		hn_ifp->if_input(hn_ifp, m);
1309 	} else {
1310 		/*
1311 		 * In the middle of the transition; free this
1312 		 * mbuf chain.
1313 		 */
1314 		while (m != NULL) {
1315 			mn = m->m_nextpkt;
1316 			m->m_nextpkt = NULL;
1317 			m_freem(m);
1318 			m = mn;
1319 		}
1320 	}
1321 }
1322 
1323 static void
1324 hn_mtu_change_fixup(struct hn_softc *sc)
1325 {
1326 	struct ifnet *ifp;
1327 
1328 	HN_LOCK_ASSERT(sc);
1329 	ifp = sc->hn_ifp;
1330 
1331 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1332 #if __FreeBSD_version >= 1100099
1333 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1334 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1335 #endif
1336 }
1337 
1338 static void
1339 hn_xpnt_vf_setready(struct hn_softc *sc)
1340 {
1341 	struct ifnet *ifp, *vf_ifp;
1342 	struct ifreq ifr;
1343 
1344 	HN_LOCK_ASSERT(sc);
1345 	ifp = sc->hn_ifp;
1346 	vf_ifp = sc->hn_vf_ifp;
1347 
1348 	/*
1349 	 * Mark the VF ready.
1350 	 */
1351 	sc->hn_vf_rdytick = 0;
1352 
1353 	/*
1354 	 * Save information for restoration.
1355 	 */
1356 	sc->hn_saved_caps = ifp->if_capabilities;
1357 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1358 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1359 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1360 
1361 	/*
1362 	 * Intersect supported/enabled capabilities.
1363 	 *
1364 	 * NOTE:
1365 	 * if_hwassist is not changed here.
1366 	 */
1367 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1368 	ifp->if_capenable &= ifp->if_capabilities;
1369 
1370 	/*
1371 	 * Fix TSO settings.
1372 	 */
1373 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1374 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1375 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1376 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1377 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1378 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1379 
1380 	/*
1381 	 * Change VF's enabled capabilities.
1382 	 */
1383 	memset(&ifr, 0, sizeof(ifr));
1384 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1385 	ifr.ifr_reqcap = ifp->if_capenable;
1386 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1387 
1388 	if (ifp->if_mtu != ETHERMTU) {
1389 		int error;
1390 
1391 		/*
1392 		 * Change VF's MTU.
1393 		 */
1394 		memset(&ifr, 0, sizeof(ifr));
1395 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1396 		ifr.ifr_mtu = ifp->if_mtu;
1397 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1398 		if (error) {
1399 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1400 			    vf_ifp->if_xname, ifp->if_mtu);
1401 			if (ifp->if_mtu > ETHERMTU) {
1402 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1403 
1404 				/*
1405 				 * XXX
1406 				 * No need to adjust the synthetic parts' MTU;
1407 				 * failure of the adjustment will cause us
1408 				 * infinite headache.
1409 				 */
1410 				ifp->if_mtu = ETHERMTU;
1411 				hn_mtu_change_fixup(sc);
1412 			}
1413 		}
1414 	}
1415 }
1416 
1417 static bool
1418 hn_xpnt_vf_isready(struct hn_softc *sc)
1419 {
1420 
1421 	HN_LOCK_ASSERT(sc);
1422 
1423 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1424 		return (false);
1425 
1426 	if (sc->hn_vf_rdytick == 0)
1427 		return (true);
1428 
1429 	if (sc->hn_vf_rdytick > ticks)
1430 		return (false);
1431 
1432 	/* Mark VF as ready. */
1433 	hn_xpnt_vf_setready(sc);
1434 	return (true);
1435 }
1436 
1437 static void
1438 hn_xpnt_vf_setenable(struct hn_softc *sc)
1439 {
1440 	int i;
1441 
1442 	HN_LOCK_ASSERT(sc);
1443 
1444 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1445 	rm_wlock(&sc->hn_vf_lock);
1446 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1447 	rm_wunlock(&sc->hn_vf_lock);
1448 
1449 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1450 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1451 }
1452 
1453 static void
1454 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1455 {
1456 	int i;
1457 
1458 	HN_LOCK_ASSERT(sc);
1459 
1460 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1461 	rm_wlock(&sc->hn_vf_lock);
1462 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1463 	if (clear_vf)
1464 		sc->hn_vf_ifp = NULL;
1465 	rm_wunlock(&sc->hn_vf_lock);
1466 
1467 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1468 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1469 }
1470 
1471 static void
1472 hn_xpnt_vf_init(struct hn_softc *sc)
1473 {
1474 	int error;
1475 
1476 	HN_LOCK_ASSERT(sc);
1477 
1478 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1479 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1480 
1481 	if (bootverbose) {
1482 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1483 		    sc->hn_vf_ifp->if_xname);
1484 	}
1485 
1486 	/*
1487 	 * Bring the VF up.
1488 	 */
1489 	hn_xpnt_vf_saveifflags(sc);
1490 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1491 	error = hn_xpnt_vf_iocsetflags(sc);
1492 	if (error) {
1493 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1494 		    sc->hn_vf_ifp->if_xname, error);
1495 		return;
1496 	}
1497 
1498 	/*
1499 	 * NOTE:
1500 	 * Datapath setting must happen _after_ bringing the VF up.
1501 	 */
1502 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1503 
1504 	/* Mark transparent mode VF as enabled. */
1505 	hn_xpnt_vf_setenable(sc);
1506 }
1507 
1508 static void
1509 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1510 {
1511 	struct hn_softc *sc = xsc;
1512 
1513 	HN_LOCK(sc);
1514 
1515 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1516 		goto done;
1517 	if (sc->hn_vf_ifp == NULL)
1518 		goto done;
1519 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1520 		goto done;
1521 
1522 	if (sc->hn_vf_rdytick != 0) {
1523 		/* Mark VF as ready. */
1524 		hn_xpnt_vf_setready(sc);
1525 	}
1526 
1527 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1528 		/*
1529 		 * Delayed VF initialization.
1530 		 */
1531 		if (bootverbose) {
1532 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1533 			    sc->hn_vf_ifp->if_xname);
1534 		}
1535 		hn_xpnt_vf_init(sc);
1536 	}
1537 done:
1538 	HN_UNLOCK(sc);
1539 }
1540 
1541 static void
1542 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1543 {
1544 	struct hn_softc *sc = xsc;
1545 
1546 	HN_LOCK(sc);
1547 
1548 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1549 		goto done;
1550 
1551 	if (!hn_ismyvf(sc, ifp))
1552 		goto done;
1553 
1554 	if (sc->hn_vf_ifp != NULL) {
1555 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1556 		    sc->hn_vf_ifp->if_xname);
1557 		goto done;
1558 	}
1559 
1560 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1561 		/*
1562 		 * ifnet.if_start is _not_ supported by transparent
1563 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1564 		 */
1565 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1566 		    "in transparent VF mode.\n", ifp->if_xname);
1567 		goto done;
1568 	}
1569 
1570 	rm_wlock(&hn_vfmap_lock);
1571 
1572 	if (ifp->if_index >= hn_vfmap_size) {
1573 		struct ifnet **newmap;
1574 		int newsize;
1575 
1576 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1577 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1578 		    M_WAITOK | M_ZERO);
1579 
1580 		memcpy(newmap, hn_vfmap,
1581 		    sizeof(struct ifnet *) * hn_vfmap_size);
1582 		free(hn_vfmap, M_DEVBUF);
1583 		hn_vfmap = newmap;
1584 		hn_vfmap_size = newsize;
1585 	}
1586 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1587 	    ("%s: ifindex %d was mapped to %s",
1588 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1589 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1590 
1591 	rm_wunlock(&hn_vfmap_lock);
1592 
1593 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1594 	rm_wlock(&sc->hn_vf_lock);
1595 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1596 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1597 	sc->hn_vf_ifp = ifp;
1598 	rm_wunlock(&sc->hn_vf_lock);
1599 
1600 	if (hn_xpnt_vf) {
1601 		int wait_ticks;
1602 
1603 		/*
1604 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1605 		 * Save vf_ifp's current if_input for later restoration.
1606 		 */
1607 		sc->hn_vf_input = ifp->if_input;
1608 		ifp->if_input = hn_xpnt_vf_input;
1609 
1610 		/*
1611 		 * Stop link status management; use the VF's.
1612 		 */
1613 		hn_suspend_mgmt(sc);
1614 
1615 		/*
1616 		 * Give VF sometime to complete its attach routing.
1617 		 */
1618 		wait_ticks = hn_xpnt_vf_attwait * hz;
1619 		sc->hn_vf_rdytick = ticks + wait_ticks;
1620 
1621 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1622 		    wait_ticks);
1623 	}
1624 done:
1625 	HN_UNLOCK(sc);
1626 }
1627 
1628 static void
1629 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1630 {
1631 	struct hn_softc *sc = xsc;
1632 
1633 	HN_LOCK(sc);
1634 
1635 	if (sc->hn_vf_ifp == NULL)
1636 		goto done;
1637 
1638 	if (!hn_ismyvf(sc, ifp))
1639 		goto done;
1640 
1641 	if (hn_xpnt_vf) {
1642 		/*
1643 		 * Make sure that the delayed initialization is not running.
1644 		 *
1645 		 * NOTE:
1646 		 * - This lock _must_ be released, since the hn_vf_init task
1647 		 *   will try holding this lock.
1648 		 * - It is safe to release this lock here, since the
1649 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1650 		 *
1651 		 * XXX racy, if hn(4) ever detached.
1652 		 */
1653 		HN_UNLOCK(sc);
1654 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1655 		HN_LOCK(sc);
1656 
1657 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1658 		    sc->hn_ifp->if_xname));
1659 		ifp->if_input = sc->hn_vf_input;
1660 		sc->hn_vf_input = NULL;
1661 
1662 		if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1663 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1664 
1665 		if (sc->hn_vf_rdytick == 0) {
1666 			/*
1667 			 * The VF was ready; restore some settings.
1668 			 */
1669 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
1670 			/*
1671 			 * NOTE:
1672 			 * There is _no_ need to fixup if_capenable and
1673 			 * if_hwassist, since the if_capabilities before
1674 			 * restoration was an intersection of the VF's
1675 			 * if_capabilites and the synthetic device's
1676 			 * if_capabilites.
1677 			 */
1678 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
1679 			sc->hn_ifp->if_hw_tsomaxsegcount =
1680 			    sc->hn_saved_tsosegcnt;
1681 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
1682 		}
1683 
1684 		/*
1685 		 * Resume link status management, which was suspended
1686 		 * by hn_ifnet_attevent().
1687 		 */
1688 		hn_resume_mgmt(sc);
1689 	}
1690 
1691 	/* Mark transparent mode VF as disabled. */
1692 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
1693 
1694 	rm_wlock(&hn_vfmap_lock);
1695 
1696 	KASSERT(ifp->if_index < hn_vfmap_size,
1697 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
1698 	if (hn_vfmap[ifp->if_index] != NULL) {
1699 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
1700 		    ("%s: ifindex %d was mapped to %s",
1701 		     ifp->if_xname, ifp->if_index,
1702 		     hn_vfmap[ifp->if_index]->if_xname));
1703 		hn_vfmap[ifp->if_index] = NULL;
1704 	}
1705 
1706 	rm_wunlock(&hn_vfmap_lock);
1707 done:
1708 	HN_UNLOCK(sc);
1709 }
1710 
1711 static void
1712 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
1713 {
1714 	struct hn_softc *sc = xsc;
1715 
1716 	if (sc->hn_vf_ifp == ifp)
1717 		if_link_state_change(sc->hn_ifp, link_state);
1718 }
1719 
1720 static int
1721 hn_probe(device_t dev)
1722 {
1723 
1724 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
1725 		device_set_desc(dev, "Hyper-V Network Interface");
1726 		return BUS_PROBE_DEFAULT;
1727 	}
1728 	return ENXIO;
1729 }
1730 
1731 static int
1732 hn_attach(device_t dev)
1733 {
1734 	struct hn_softc *sc = device_get_softc(dev);
1735 	struct sysctl_oid_list *child;
1736 	struct sysctl_ctx_list *ctx;
1737 	uint8_t eaddr[ETHER_ADDR_LEN];
1738 	struct ifnet *ifp = NULL;
1739 	int error, ring_cnt, tx_ring_cnt;
1740 
1741 	sc->hn_dev = dev;
1742 	sc->hn_prichan = vmbus_get_channel(dev);
1743 	HN_LOCK_INIT(sc);
1744 	rm_init(&sc->hn_vf_lock, "hnvf");
1745 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
1746 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
1747 
1748 	/*
1749 	 * Initialize these tunables once.
1750 	 */
1751 	sc->hn_agg_size = hn_tx_agg_size;
1752 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1753 
1754 	/*
1755 	 * Setup taskqueue for transmission.
1756 	 */
1757 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1758 		int i;
1759 
1760 		sc->hn_tx_taskqs =
1761 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1762 		    M_DEVBUF, M_WAITOK);
1763 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1764 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1765 			    M_WAITOK, taskqueue_thread_enqueue,
1766 			    &sc->hn_tx_taskqs[i]);
1767 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1768 			    "%s tx%d", device_get_nameunit(dev), i);
1769 		}
1770 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1771 		sc->hn_tx_taskqs = hn_tx_taskque;
1772 	}
1773 
1774 	/*
1775 	 * Setup taskqueue for mangement tasks, e.g. link status.
1776 	 */
1777 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1778 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1779 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1780 	    device_get_nameunit(dev));
1781 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1782 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1783 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1784 	    hn_netchg_status_taskfunc, sc);
1785 
1786 	if (hn_xpnt_vf) {
1787 		/*
1788 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
1789 		 */
1790 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
1791 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
1792 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
1793 		    device_get_nameunit(dev));
1794 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
1795 		    hn_xpnt_vf_init_taskfunc, sc);
1796 	}
1797 
1798 	/*
1799 	 * Allocate ifnet and setup its name earlier, so that if_printf
1800 	 * can be used by functions, which will be called after
1801 	 * ether_ifattach().
1802 	 */
1803 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1804 	ifp->if_softc = sc;
1805 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1806 
1807 	/*
1808 	 * Initialize ifmedia earlier so that it can be unconditionally
1809 	 * destroyed, if error happened later on.
1810 	 */
1811 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1812 
1813 	/*
1814 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1815 	 * to use (tx_ring_cnt).
1816 	 *
1817 	 * NOTE:
1818 	 * The # of RX rings to use is same as the # of channels to use.
1819 	 */
1820 	ring_cnt = hn_chan_cnt;
1821 	if (ring_cnt <= 0) {
1822 		/* Default */
1823 		ring_cnt = mp_ncpus;
1824 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1825 			ring_cnt = HN_RING_CNT_DEF_MAX;
1826 	} else if (ring_cnt > mp_ncpus) {
1827 		ring_cnt = mp_ncpus;
1828 	}
1829 #ifdef RSS
1830 	if (ring_cnt > rss_getnumbuckets())
1831 		ring_cnt = rss_getnumbuckets();
1832 #endif
1833 
1834 	tx_ring_cnt = hn_tx_ring_cnt;
1835 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1836 		tx_ring_cnt = ring_cnt;
1837 #ifdef HN_IFSTART_SUPPORT
1838 	if (hn_use_if_start) {
1839 		/* ifnet.if_start only needs one TX ring. */
1840 		tx_ring_cnt = 1;
1841 	}
1842 #endif
1843 
1844 	/*
1845 	 * Set the leader CPU for channels.
1846 	 */
1847 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1848 
1849 	/*
1850 	 * Create enough TX/RX rings, even if only limited number of
1851 	 * channels can be allocated.
1852 	 */
1853 	error = hn_create_tx_data(sc, tx_ring_cnt);
1854 	if (error)
1855 		goto failed;
1856 	error = hn_create_rx_data(sc, ring_cnt);
1857 	if (error)
1858 		goto failed;
1859 
1860 	/*
1861 	 * Create transaction context for NVS and RNDIS transactions.
1862 	 */
1863 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1864 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1865 	if (sc->hn_xact == NULL) {
1866 		error = ENXIO;
1867 		goto failed;
1868 	}
1869 
1870 	/*
1871 	 * Install orphan handler for the revocation of this device's
1872 	 * primary channel.
1873 	 *
1874 	 * NOTE:
1875 	 * The processing order is critical here:
1876 	 * Install the orphan handler, _before_ testing whether this
1877 	 * device's primary channel has been revoked or not.
1878 	 */
1879 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1880 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1881 		error = ENXIO;
1882 		goto failed;
1883 	}
1884 
1885 	/*
1886 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1887 	 */
1888 	error = hn_synth_attach(sc, ETHERMTU);
1889 	if (error)
1890 		goto failed;
1891 
1892 	error = hn_rndis_get_eaddr(sc, eaddr);
1893 	if (error)
1894 		goto failed;
1895 
1896 #if __FreeBSD_version >= 1100099
1897 	if (sc->hn_rx_ring_inuse > 1) {
1898 		/*
1899 		 * Reduce TCP segment aggregation limit for multiple
1900 		 * RX rings to increase ACK timeliness.
1901 		 */
1902 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1903 	}
1904 #endif
1905 
1906 	/*
1907 	 * Fixup TX stuffs after synthetic parts are attached.
1908 	 */
1909 	hn_fixup_tx_data(sc);
1910 
1911 	ctx = device_get_sysctl_ctx(dev);
1912 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1913 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1914 	    &sc->hn_nvs_ver, 0, "NVS version");
1915 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1916 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1917 	    hn_ndis_version_sysctl, "A", "NDIS version");
1918 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1919 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1920 	    hn_caps_sysctl, "A", "capabilities");
1921 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1922 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1923 	    hn_hwassist_sysctl, "A", "hwassist");
1924 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
1925 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
1926 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
1927 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
1928 	    "max # of TSO segments");
1929 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
1930 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
1931 	    "max size of TSO segment");
1932 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1933 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1934 	    hn_rxfilter_sysctl, "A", "rxfilter");
1935 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1936 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1937 	    hn_rss_hash_sysctl, "A", "RSS hash");
1938 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1939 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1940 #ifndef RSS
1941 	/*
1942 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
1943 	 */
1944 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1945 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1946 	    hn_rss_key_sysctl, "IU", "RSS key");
1947 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1948 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1949 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1950 #endif
1951 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1952 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1953 	    "RNDIS offered packet transmission aggregation size limit");
1954 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1955 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1956 	    "RNDIS offered packet transmission aggregation count limit");
1957 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1958 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1959 	    "RNDIS packet transmission aggregation alignment");
1960 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1961 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1962 	    hn_txagg_size_sysctl, "I",
1963 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1964 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1965 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1966 	    hn_txagg_pkts_sysctl, "I",
1967 	    "Packet transmission aggregation packets, "
1968 	    "0 -- disable, -1 -- auto");
1969 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1970 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1971 	    hn_polling_sysctl, "I",
1972 	    "Polling frequency: [100,1000000], 0 disable polling");
1973 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1974 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1975 	    hn_vf_sysctl, "A", "Virtual Function's name");
1976 	if (!hn_xpnt_vf) {
1977 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
1978 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1979 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
1980 	} else {
1981 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
1982 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1983 		    hn_xpnt_vf_enabled_sysctl, "I",
1984 		    "Transparent VF enabled");
1985 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
1986 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1987 		    hn_xpnt_vf_accbpf_sysctl, "I",
1988 		    "Accurate BPF for transparent VF");
1989 	}
1990 
1991 	/*
1992 	 * Setup the ifmedia, which has been initialized earlier.
1993 	 */
1994 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1995 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1996 	/* XXX ifmedia_set really should do this for us */
1997 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1998 
1999 	/*
2000 	 * Setup the ifnet for this interface.
2001 	 */
2002 
2003 	ifp->if_baudrate = IF_Gbps(10);
2004 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2005 	ifp->if_ioctl = hn_ioctl;
2006 	ifp->if_init = hn_init;
2007 #ifdef HN_IFSTART_SUPPORT
2008 	if (hn_use_if_start) {
2009 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2010 
2011 		ifp->if_start = hn_start;
2012 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2013 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2014 		IFQ_SET_READY(&ifp->if_snd);
2015 	} else
2016 #endif
2017 	{
2018 		ifp->if_transmit = hn_transmit;
2019 		ifp->if_qflush = hn_xmit_qflush;
2020 	}
2021 
2022 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2023 #ifdef foo
2024 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2025 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2026 #endif
2027 	if (sc->hn_caps & HN_CAP_VLAN) {
2028 		/* XXX not sure about VLAN_MTU. */
2029 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2030 	}
2031 
2032 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2033 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2034 		ifp->if_capabilities |= IFCAP_TXCSUM;
2035 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2036 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2037 	if (sc->hn_caps & HN_CAP_TSO4) {
2038 		ifp->if_capabilities |= IFCAP_TSO4;
2039 		ifp->if_hwassist |= CSUM_IP_TSO;
2040 	}
2041 	if (sc->hn_caps & HN_CAP_TSO6) {
2042 		ifp->if_capabilities |= IFCAP_TSO6;
2043 		ifp->if_hwassist |= CSUM_IP6_TSO;
2044 	}
2045 
2046 	/* Enable all available capabilities by default. */
2047 	ifp->if_capenable = ifp->if_capabilities;
2048 
2049 	/*
2050 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2051 	 * be enabled through SIOCSIFCAP.
2052 	 */
2053 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2054 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2055 
2056 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2057 		/*
2058 		 * Lock hn_set_tso_maxsize() to simplify its
2059 		 * internal logic.
2060 		 */
2061 		HN_LOCK(sc);
2062 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2063 		HN_UNLOCK(sc);
2064 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2065 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2066 	}
2067 
2068 	ether_ifattach(ifp, eaddr);
2069 
2070 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2071 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2072 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2073 	}
2074 
2075 	/* Inform the upper layer about the long frame support. */
2076 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2077 
2078 	/*
2079 	 * Kick off link status check.
2080 	 */
2081 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2082 	hn_update_link_status(sc);
2083 
2084 	if (!hn_xpnt_vf) {
2085 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2086 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2087 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2088 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2089 	} else {
2090 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2091 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2092 	}
2093 
2094 	/*
2095 	 * NOTE:
2096 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2097 	 * since interface's LLADDR is needed; interface LLADDR is not
2098 	 * available when ifnet_arrival event is triggered.
2099 	 */
2100 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2101 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2102 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2103 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2104 
2105 	return (0);
2106 failed:
2107 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2108 		hn_synth_detach(sc);
2109 	hn_detach(dev);
2110 	return (error);
2111 }
2112 
2113 static int
2114 hn_detach(device_t dev)
2115 {
2116 	struct hn_softc *sc = device_get_softc(dev);
2117 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2118 
2119 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2120 		/*
2121 		 * In case that the vmbus missed the orphan handler
2122 		 * installation.
2123 		 */
2124 		vmbus_xact_ctx_orphan(sc->hn_xact);
2125 	}
2126 
2127 	if (sc->hn_ifaddr_evthand != NULL)
2128 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2129 	if (sc->hn_ifnet_evthand != NULL)
2130 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2131 	if (sc->hn_ifnet_atthand != NULL) {
2132 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2133 		    sc->hn_ifnet_atthand);
2134 	}
2135 	if (sc->hn_ifnet_dethand != NULL) {
2136 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2137 		    sc->hn_ifnet_dethand);
2138 	}
2139 	if (sc->hn_ifnet_lnkhand != NULL)
2140 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2141 
2142 	vf_ifp = sc->hn_vf_ifp;
2143 	__compiler_membar();
2144 	if (vf_ifp != NULL)
2145 		hn_ifnet_detevent(sc, vf_ifp);
2146 
2147 	if (device_is_attached(dev)) {
2148 		HN_LOCK(sc);
2149 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2150 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2151 				hn_stop(sc, true);
2152 			/*
2153 			 * NOTE:
2154 			 * hn_stop() only suspends data, so managment
2155 			 * stuffs have to be suspended manually here.
2156 			 */
2157 			hn_suspend_mgmt(sc);
2158 			hn_synth_detach(sc);
2159 		}
2160 		HN_UNLOCK(sc);
2161 		ether_ifdetach(ifp);
2162 	}
2163 
2164 	ifmedia_removeall(&sc->hn_media);
2165 	hn_destroy_rx_data(sc);
2166 	hn_destroy_tx_data(sc);
2167 
2168 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2169 		int i;
2170 
2171 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2172 			taskqueue_free(sc->hn_tx_taskqs[i]);
2173 		free(sc->hn_tx_taskqs, M_DEVBUF);
2174 	}
2175 	taskqueue_free(sc->hn_mgmt_taskq0);
2176 	if (sc->hn_vf_taskq != NULL)
2177 		taskqueue_free(sc->hn_vf_taskq);
2178 
2179 	if (sc->hn_xact != NULL) {
2180 		/*
2181 		 * Uninstall the orphan handler _before_ the xact is
2182 		 * destructed.
2183 		 */
2184 		vmbus_chan_unset_orphan(sc->hn_prichan);
2185 		vmbus_xact_ctx_destroy(sc->hn_xact);
2186 	}
2187 
2188 	if_free(ifp);
2189 
2190 	HN_LOCK_DESTROY(sc);
2191 	rm_destroy(&sc->hn_vf_lock);
2192 	return (0);
2193 }
2194 
2195 static int
2196 hn_shutdown(device_t dev)
2197 {
2198 
2199 	return (0);
2200 }
2201 
2202 static void
2203 hn_link_status(struct hn_softc *sc)
2204 {
2205 	uint32_t link_status;
2206 	int error;
2207 
2208 	error = hn_rndis_get_linkstatus(sc, &link_status);
2209 	if (error) {
2210 		/* XXX what to do? */
2211 		return;
2212 	}
2213 
2214 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2215 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2216 	else
2217 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2218 	if_link_state_change(sc->hn_ifp,
2219 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2220 	    LINK_STATE_UP : LINK_STATE_DOWN);
2221 }
2222 
2223 static void
2224 hn_link_taskfunc(void *xsc, int pending __unused)
2225 {
2226 	struct hn_softc *sc = xsc;
2227 
2228 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2229 		return;
2230 	hn_link_status(sc);
2231 }
2232 
2233 static void
2234 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2235 {
2236 	struct hn_softc *sc = xsc;
2237 
2238 	/* Prevent any link status checks from running. */
2239 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2240 
2241 	/*
2242 	 * Fake up a [link down --> link up] state change; 5 seconds
2243 	 * delay is used, which closely simulates miibus reaction
2244 	 * upon link down event.
2245 	 */
2246 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2247 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2248 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2249 	    &sc->hn_netchg_status, 5 * hz);
2250 }
2251 
2252 static void
2253 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2254 {
2255 	struct hn_softc *sc = xsc;
2256 
2257 	/* Re-allow link status checks. */
2258 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2259 	hn_link_status(sc);
2260 }
2261 
2262 static void
2263 hn_update_link_status(struct hn_softc *sc)
2264 {
2265 
2266 	if (sc->hn_mgmt_taskq != NULL)
2267 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2268 }
2269 
2270 static void
2271 hn_change_network(struct hn_softc *sc)
2272 {
2273 
2274 	if (sc->hn_mgmt_taskq != NULL)
2275 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2276 }
2277 
2278 static __inline int
2279 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2280     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2281 {
2282 	struct mbuf *m = *m_head;
2283 	int error;
2284 
2285 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2286 
2287 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2288 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2289 	if (error == EFBIG) {
2290 		struct mbuf *m_new;
2291 
2292 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2293 		if (m_new == NULL)
2294 			return ENOBUFS;
2295 		else
2296 			*m_head = m = m_new;
2297 		txr->hn_tx_collapsed++;
2298 
2299 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2300 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2301 	}
2302 	if (!error) {
2303 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2304 		    BUS_DMASYNC_PREWRITE);
2305 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2306 	}
2307 	return error;
2308 }
2309 
2310 static __inline int
2311 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2312 {
2313 
2314 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2315 	    ("put an onlist txd %#x", txd->flags));
2316 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2317 	    ("put an onagg txd %#x", txd->flags));
2318 
2319 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2320 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2321 		return 0;
2322 
2323 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2324 		struct hn_txdesc *tmp_txd;
2325 
2326 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2327 			int freed;
2328 
2329 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2330 			    ("resursive aggregation on aggregated txdesc"));
2331 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2332 			    ("not aggregated txdesc"));
2333 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2334 			    ("aggregated txdesc uses dmamap"));
2335 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2336 			    ("aggregated txdesc consumes "
2337 			     "chimney sending buffer"));
2338 			KASSERT(tmp_txd->chim_size == 0,
2339 			    ("aggregated txdesc has non-zero "
2340 			     "chimney sending size"));
2341 
2342 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2343 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2344 			freed = hn_txdesc_put(txr, tmp_txd);
2345 			KASSERT(freed, ("failed to free aggregated txdesc"));
2346 		}
2347 	}
2348 
2349 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2350 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2351 		    ("chim txd uses dmamap"));
2352 		hn_chim_free(txr->hn_sc, txd->chim_index);
2353 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2354 		txd->chim_size = 0;
2355 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2356 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2357 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2358 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2359 		    txd->data_dmap);
2360 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2361 	}
2362 
2363 	if (txd->m != NULL) {
2364 		m_freem(txd->m);
2365 		txd->m = NULL;
2366 	}
2367 
2368 	txd->flags |= HN_TXD_FLAG_ONLIST;
2369 #ifndef HN_USE_TXDESC_BUFRING
2370 	mtx_lock_spin(&txr->hn_txlist_spin);
2371 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2372 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2373 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2374 	txr->hn_txdesc_avail++;
2375 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2376 	mtx_unlock_spin(&txr->hn_txlist_spin);
2377 #else	/* HN_USE_TXDESC_BUFRING */
2378 #ifdef HN_DEBUG
2379 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2380 #endif
2381 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2382 #endif	/* !HN_USE_TXDESC_BUFRING */
2383 
2384 	return 1;
2385 }
2386 
2387 static __inline struct hn_txdesc *
2388 hn_txdesc_get(struct hn_tx_ring *txr)
2389 {
2390 	struct hn_txdesc *txd;
2391 
2392 #ifndef HN_USE_TXDESC_BUFRING
2393 	mtx_lock_spin(&txr->hn_txlist_spin);
2394 	txd = SLIST_FIRST(&txr->hn_txlist);
2395 	if (txd != NULL) {
2396 		KASSERT(txr->hn_txdesc_avail > 0,
2397 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2398 		txr->hn_txdesc_avail--;
2399 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2400 	}
2401 	mtx_unlock_spin(&txr->hn_txlist_spin);
2402 #else
2403 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2404 #endif
2405 
2406 	if (txd != NULL) {
2407 #ifdef HN_USE_TXDESC_BUFRING
2408 #ifdef HN_DEBUG
2409 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2410 #endif
2411 #endif	/* HN_USE_TXDESC_BUFRING */
2412 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2413 		    STAILQ_EMPTY(&txd->agg_list) &&
2414 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2415 		    txd->chim_size == 0 &&
2416 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2417 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2418 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2419 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2420 		txd->refs = 1;
2421 	}
2422 	return txd;
2423 }
2424 
2425 static __inline void
2426 hn_txdesc_hold(struct hn_txdesc *txd)
2427 {
2428 
2429 	/* 0->1 transition will never work */
2430 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2431 	atomic_add_int(&txd->refs, 1);
2432 }
2433 
2434 static __inline void
2435 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2436 {
2437 
2438 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2439 	    ("recursive aggregation on aggregating txdesc"));
2440 
2441 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2442 	    ("already aggregated"));
2443 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2444 	    ("recursive aggregation on to-be-aggregated txdesc"));
2445 
2446 	txd->flags |= HN_TXD_FLAG_ONAGG;
2447 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2448 }
2449 
2450 static bool
2451 hn_tx_ring_pending(struct hn_tx_ring *txr)
2452 {
2453 	bool pending = false;
2454 
2455 #ifndef HN_USE_TXDESC_BUFRING
2456 	mtx_lock_spin(&txr->hn_txlist_spin);
2457 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2458 		pending = true;
2459 	mtx_unlock_spin(&txr->hn_txlist_spin);
2460 #else
2461 	if (!buf_ring_full(txr->hn_txdesc_br))
2462 		pending = true;
2463 #endif
2464 	return (pending);
2465 }
2466 
2467 static __inline void
2468 hn_txeof(struct hn_tx_ring *txr)
2469 {
2470 	txr->hn_has_txeof = 0;
2471 	txr->hn_txeof(txr);
2472 }
2473 
2474 static void
2475 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2476     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2477 {
2478 	struct hn_txdesc *txd = sndc->hn_cbarg;
2479 	struct hn_tx_ring *txr;
2480 
2481 	txr = txd->txr;
2482 	KASSERT(txr->hn_chan == chan,
2483 	    ("channel mismatch, on chan%u, should be chan%u",
2484 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2485 
2486 	txr->hn_has_txeof = 1;
2487 	hn_txdesc_put(txr, txd);
2488 
2489 	++txr->hn_txdone_cnt;
2490 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2491 		txr->hn_txdone_cnt = 0;
2492 		if (txr->hn_oactive)
2493 			hn_txeof(txr);
2494 	}
2495 }
2496 
2497 static void
2498 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2499 {
2500 #if defined(INET) || defined(INET6)
2501 	tcp_lro_flush_all(&rxr->hn_lro);
2502 #endif
2503 
2504 	/*
2505 	 * NOTE:
2506 	 * 'txr' could be NULL, if multiple channels and
2507 	 * ifnet.if_start method are enabled.
2508 	 */
2509 	if (txr == NULL || !txr->hn_has_txeof)
2510 		return;
2511 
2512 	txr->hn_txdone_cnt = 0;
2513 	hn_txeof(txr);
2514 }
2515 
2516 static __inline uint32_t
2517 hn_rndis_pktmsg_offset(uint32_t ofs)
2518 {
2519 
2520 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2521 	    ("invalid RNDIS packet msg offset %u", ofs));
2522 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2523 }
2524 
2525 static __inline void *
2526 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2527     size_t pi_dlen, uint32_t pi_type)
2528 {
2529 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2530 	struct rndis_pktinfo *pi;
2531 
2532 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2533 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2534 
2535 	/*
2536 	 * Per-packet-info does not move; it only grows.
2537 	 *
2538 	 * NOTE:
2539 	 * rm_pktinfooffset in this phase counts from the beginning
2540 	 * of rndis_packet_msg.
2541 	 */
2542 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2543 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2544 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2545 	    pkt->rm_pktinfolen);
2546 	pkt->rm_pktinfolen += pi_size;
2547 
2548 	pi->rm_size = pi_size;
2549 	pi->rm_type = pi_type;
2550 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2551 
2552 	return (pi->rm_data);
2553 }
2554 
2555 static __inline int
2556 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2557 {
2558 	struct hn_txdesc *txd;
2559 	struct mbuf *m;
2560 	int error, pkts;
2561 
2562 	txd = txr->hn_agg_txd;
2563 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2564 
2565 	/*
2566 	 * Since hn_txpkt() will reset this temporary stat, save
2567 	 * it now, so that oerrors can be updated properly, if
2568 	 * hn_txpkt() ever fails.
2569 	 */
2570 	pkts = txr->hn_stat_pkts;
2571 
2572 	/*
2573 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2574 	 * failure, save it for later freeing, if hn_txpkt() ever
2575 	 * fails.
2576 	 */
2577 	m = txd->m;
2578 	error = hn_txpkt(ifp, txr, txd);
2579 	if (__predict_false(error)) {
2580 		/* txd is freed, but m is not. */
2581 		m_freem(m);
2582 
2583 		txr->hn_flush_failed++;
2584 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2585 	}
2586 
2587 	/* Reset all aggregation states. */
2588 	txr->hn_agg_txd = NULL;
2589 	txr->hn_agg_szleft = 0;
2590 	txr->hn_agg_pktleft = 0;
2591 	txr->hn_agg_prevpkt = NULL;
2592 
2593 	return (error);
2594 }
2595 
2596 static void *
2597 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2598     int pktsize)
2599 {
2600 	void *chim;
2601 
2602 	if (txr->hn_agg_txd != NULL) {
2603 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2604 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2605 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2606 			int olen;
2607 
2608 			/*
2609 			 * Update the previous RNDIS packet's total length,
2610 			 * it can be increased due to the mandatory alignment
2611 			 * padding for this RNDIS packet.  And update the
2612 			 * aggregating txdesc's chimney sending buffer size
2613 			 * accordingly.
2614 			 *
2615 			 * XXX
2616 			 * Zero-out the padding, as required by the RNDIS spec.
2617 			 */
2618 			olen = pkt->rm_len;
2619 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2620 			agg_txd->chim_size += pkt->rm_len - olen;
2621 
2622 			/* Link this txdesc to the parent. */
2623 			hn_txdesc_agg(agg_txd, txd);
2624 
2625 			chim = (uint8_t *)pkt + pkt->rm_len;
2626 			/* Save the current packet for later fixup. */
2627 			txr->hn_agg_prevpkt = chim;
2628 
2629 			txr->hn_agg_pktleft--;
2630 			txr->hn_agg_szleft -= pktsize;
2631 			if (txr->hn_agg_szleft <=
2632 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2633 				/*
2634 				 * Probably can't aggregate more packets,
2635 				 * flush this aggregating txdesc proactively.
2636 				 */
2637 				txr->hn_agg_pktleft = 0;
2638 			}
2639 			/* Done! */
2640 			return (chim);
2641 		}
2642 		hn_flush_txagg(ifp, txr);
2643 	}
2644 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2645 
2646 	txr->hn_tx_chimney_tried++;
2647 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
2648 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2649 		return (NULL);
2650 	txr->hn_tx_chimney++;
2651 
2652 	chim = txr->hn_sc->hn_chim +
2653 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2654 
2655 	if (txr->hn_agg_pktmax > 1 &&
2656 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2657 		txr->hn_agg_txd = txd;
2658 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2659 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2660 		txr->hn_agg_prevpkt = chim;
2661 	}
2662 	return (chim);
2663 }
2664 
2665 /*
2666  * NOTE:
2667  * If this function fails, then both txd and m_head0 will be freed.
2668  */
2669 static int
2670 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2671     struct mbuf **m_head0)
2672 {
2673 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2674 	int error, nsegs, i;
2675 	struct mbuf *m_head = *m_head0;
2676 	struct rndis_packet_msg *pkt;
2677 	uint32_t *pi_data;
2678 	void *chim = NULL;
2679 	int pkt_hlen, pkt_size;
2680 
2681 	pkt = txd->rndis_pkt;
2682 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2683 	if (pkt_size < txr->hn_chim_size) {
2684 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2685 		if (chim != NULL)
2686 			pkt = chim;
2687 	} else {
2688 		if (txr->hn_agg_txd != NULL)
2689 			hn_flush_txagg(ifp, txr);
2690 	}
2691 
2692 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2693 	pkt->rm_len = m_head->m_pkthdr.len;
2694 	pkt->rm_dataoffset = 0;
2695 	pkt->rm_datalen = m_head->m_pkthdr.len;
2696 	pkt->rm_oobdataoffset = 0;
2697 	pkt->rm_oobdatalen = 0;
2698 	pkt->rm_oobdataelements = 0;
2699 	pkt->rm_pktinfooffset = sizeof(*pkt);
2700 	pkt->rm_pktinfolen = 0;
2701 	pkt->rm_vchandle = 0;
2702 	pkt->rm_reserved = 0;
2703 
2704 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2705 		/*
2706 		 * Set the hash value for this packet, so that the host could
2707 		 * dispatch the TX done event for this packet back to this TX
2708 		 * ring's channel.
2709 		 */
2710 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2711 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2712 		*pi_data = txr->hn_tx_idx;
2713 	}
2714 
2715 	if (m_head->m_flags & M_VLANTAG) {
2716 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2717 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2718 		*pi_data = NDIS_VLAN_INFO_MAKE(
2719 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2720 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2721 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2722 	}
2723 
2724 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2725 #if defined(INET6) || defined(INET)
2726 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2727 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2728 #ifdef INET
2729 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2730 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2731 			    m_head->m_pkthdr.tso_segsz);
2732 		}
2733 #endif
2734 #if defined(INET6) && defined(INET)
2735 		else
2736 #endif
2737 #ifdef INET6
2738 		{
2739 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2740 			    m_head->m_pkthdr.tso_segsz);
2741 		}
2742 #endif
2743 #endif	/* INET6 || INET */
2744 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2745 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2746 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2747 		if (m_head->m_pkthdr.csum_flags &
2748 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2749 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
2750 		} else {
2751 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
2752 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2753 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
2754 		}
2755 
2756 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2757 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2758 		else if (m_head->m_pkthdr.csum_flags &
2759 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
2760 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2761 	}
2762 
2763 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2764 	/* Fixup RNDIS packet message total length */
2765 	pkt->rm_len += pkt_hlen;
2766 	/* Convert RNDIS packet message offsets */
2767 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2768 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2769 
2770 	/*
2771 	 * Fast path: Chimney sending.
2772 	 */
2773 	if (chim != NULL) {
2774 		struct hn_txdesc *tgt_txd = txd;
2775 
2776 		if (txr->hn_agg_txd != NULL) {
2777 			tgt_txd = txr->hn_agg_txd;
2778 #ifdef INVARIANTS
2779 			*m_head0 = NULL;
2780 #endif
2781 		}
2782 
2783 		KASSERT(pkt == chim,
2784 		    ("RNDIS pkt not in chimney sending buffer"));
2785 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2786 		    ("chimney sending buffer is not used"));
2787 		tgt_txd->chim_size += pkt->rm_len;
2788 
2789 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2790 		    ((uint8_t *)chim) + pkt_hlen);
2791 
2792 		txr->hn_gpa_cnt = 0;
2793 		txr->hn_sendpkt = hn_txpkt_chim;
2794 		goto done;
2795 	}
2796 
2797 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2798 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2799 	    ("chimney buffer is used"));
2800 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2801 
2802 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2803 	if (__predict_false(error)) {
2804 		int freed;
2805 
2806 		/*
2807 		 * This mbuf is not linked w/ the txd yet, so free it now.
2808 		 */
2809 		m_freem(m_head);
2810 		*m_head0 = NULL;
2811 
2812 		freed = hn_txdesc_put(txr, txd);
2813 		KASSERT(freed != 0,
2814 		    ("fail to free txd upon txdma error"));
2815 
2816 		txr->hn_txdma_failed++;
2817 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2818 		return error;
2819 	}
2820 	*m_head0 = m_head;
2821 
2822 	/* +1 RNDIS packet message */
2823 	txr->hn_gpa_cnt = nsegs + 1;
2824 
2825 	/* send packet with page buffer */
2826 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2827 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2828 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2829 
2830 	/*
2831 	 * Fill the page buffers with mbuf info after the page
2832 	 * buffer for RNDIS packet message.
2833 	 */
2834 	for (i = 0; i < nsegs; ++i) {
2835 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2836 
2837 		gpa->gpa_page = atop(segs[i].ds_addr);
2838 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2839 		gpa->gpa_len = segs[i].ds_len;
2840 	}
2841 
2842 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2843 	txd->chim_size = 0;
2844 	txr->hn_sendpkt = hn_txpkt_sglist;
2845 done:
2846 	txd->m = m_head;
2847 
2848 	/* Set the completion routine */
2849 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2850 
2851 	/* Update temporary stats for later use. */
2852 	txr->hn_stat_pkts++;
2853 	txr->hn_stat_size += m_head->m_pkthdr.len;
2854 	if (m_head->m_flags & M_MCAST)
2855 		txr->hn_stat_mcasts++;
2856 
2857 	return 0;
2858 }
2859 
2860 /*
2861  * NOTE:
2862  * If this function fails, then txd will be freed, but the mbuf
2863  * associated w/ the txd will _not_ be freed.
2864  */
2865 static int
2866 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2867 {
2868 	int error, send_failed = 0, has_bpf;
2869 
2870 again:
2871 	has_bpf = bpf_peers_present(ifp->if_bpf);
2872 	if (has_bpf) {
2873 		/*
2874 		 * Make sure that this txd and any aggregated txds are not
2875 		 * freed before ETHER_BPF_MTAP.
2876 		 */
2877 		hn_txdesc_hold(txd);
2878 	}
2879 	error = txr->hn_sendpkt(txr, txd);
2880 	if (!error) {
2881 		if (has_bpf) {
2882 			const struct hn_txdesc *tmp_txd;
2883 
2884 			ETHER_BPF_MTAP(ifp, txd->m);
2885 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2886 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2887 		}
2888 
2889 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2890 #ifdef HN_IFSTART_SUPPORT
2891 		if (!hn_use_if_start)
2892 #endif
2893 		{
2894 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2895 			    txr->hn_stat_size);
2896 			if (txr->hn_stat_mcasts != 0) {
2897 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2898 				    txr->hn_stat_mcasts);
2899 			}
2900 		}
2901 		txr->hn_pkts += txr->hn_stat_pkts;
2902 		txr->hn_sends++;
2903 	}
2904 	if (has_bpf)
2905 		hn_txdesc_put(txr, txd);
2906 
2907 	if (__predict_false(error)) {
2908 		int freed;
2909 
2910 		/*
2911 		 * This should "really rarely" happen.
2912 		 *
2913 		 * XXX Too many RX to be acked or too many sideband
2914 		 * commands to run?  Ask netvsc_channel_rollup()
2915 		 * to kick start later.
2916 		 */
2917 		txr->hn_has_txeof = 1;
2918 		if (!send_failed) {
2919 			txr->hn_send_failed++;
2920 			send_failed = 1;
2921 			/*
2922 			 * Try sending again after set hn_has_txeof;
2923 			 * in case that we missed the last
2924 			 * netvsc_channel_rollup().
2925 			 */
2926 			goto again;
2927 		}
2928 		if_printf(ifp, "send failed\n");
2929 
2930 		/*
2931 		 * Caller will perform further processing on the
2932 		 * associated mbuf, so don't free it in hn_txdesc_put();
2933 		 * only unload it from the DMA map in hn_txdesc_put(),
2934 		 * if it was loaded.
2935 		 */
2936 		txd->m = NULL;
2937 		freed = hn_txdesc_put(txr, txd);
2938 		KASSERT(freed != 0,
2939 		    ("fail to free txd upon send error"));
2940 
2941 		txr->hn_send_failed++;
2942 	}
2943 
2944 	/* Reset temporary stats, after this sending is done. */
2945 	txr->hn_stat_size = 0;
2946 	txr->hn_stat_pkts = 0;
2947 	txr->hn_stat_mcasts = 0;
2948 
2949 	return (error);
2950 }
2951 
2952 /*
2953  * Append the specified data to the indicated mbuf chain,
2954  * Extend the mbuf chain if the new data does not fit in
2955  * existing space.
2956  *
2957  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2958  * There should be an equivalent in the kernel mbuf code,
2959  * but there does not appear to be one yet.
2960  *
2961  * Differs from m_append() in that additional mbufs are
2962  * allocated with cluster size MJUMPAGESIZE, and filled
2963  * accordingly.
2964  *
2965  * Return 1 if able to complete the job; otherwise 0.
2966  */
2967 static int
2968 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2969 {
2970 	struct mbuf *m, *n;
2971 	int remainder, space;
2972 
2973 	for (m = m0; m->m_next != NULL; m = m->m_next)
2974 		;
2975 	remainder = len;
2976 	space = M_TRAILINGSPACE(m);
2977 	if (space > 0) {
2978 		/*
2979 		 * Copy into available space.
2980 		 */
2981 		if (space > remainder)
2982 			space = remainder;
2983 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2984 		m->m_len += space;
2985 		cp += space;
2986 		remainder -= space;
2987 	}
2988 	while (remainder > 0) {
2989 		/*
2990 		 * Allocate a new mbuf; could check space
2991 		 * and allocate a cluster instead.
2992 		 */
2993 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2994 		if (n == NULL)
2995 			break;
2996 		n->m_len = min(MJUMPAGESIZE, remainder);
2997 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2998 		cp += n->m_len;
2999 		remainder -= n->m_len;
3000 		m->m_next = n;
3001 		m = n;
3002 	}
3003 	if (m0->m_flags & M_PKTHDR)
3004 		m0->m_pkthdr.len += len - remainder;
3005 
3006 	return (remainder == 0);
3007 }
3008 
3009 #if defined(INET) || defined(INET6)
3010 static __inline int
3011 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3012 {
3013 #if __FreeBSD_version >= 1100095
3014 	if (hn_lro_mbufq_depth) {
3015 		tcp_lro_queue_mbuf(lc, m);
3016 		return 0;
3017 	}
3018 #endif
3019 	return tcp_lro_rx(lc, m, 0);
3020 }
3021 #endif
3022 
3023 static int
3024 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3025     const struct hn_rxinfo *info)
3026 {
3027 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3028 	struct mbuf *m_new;
3029 	int size, do_lro = 0, do_csum = 1;
3030 	int hash_type;
3031 
3032 	/*
3033 	 * If the non-transparent mode VF is active, inject this packet
3034 	 * into the VF.
3035 	 */
3036 	ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : hn_ifp;
3037 
3038 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3039 		/*
3040 		 * NOTE:
3041 		 * See the NOTE of hn_rndis_init_fixat().  This
3042 		 * function can be reached, immediately after the
3043 		 * RNDIS is initialized but before the ifnet is
3044 		 * setup on the hn_attach() path; drop the unexpected
3045 		 * packets.
3046 		 */
3047 		return (0);
3048 	}
3049 
3050 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3051 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3052 		return (0);
3053 	}
3054 
3055 	if (dlen <= MHLEN) {
3056 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3057 		if (m_new == NULL) {
3058 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3059 			return (0);
3060 		}
3061 		memcpy(mtod(m_new, void *), data, dlen);
3062 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3063 		rxr->hn_small_pkts++;
3064 	} else {
3065 		/*
3066 		 * Get an mbuf with a cluster.  For packets 2K or less,
3067 		 * get a standard 2K cluster.  For anything larger, get a
3068 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3069 		 * if looped around to the Hyper-V TX channel, so avoid them.
3070 		 */
3071 		size = MCLBYTES;
3072 		if (dlen > MCLBYTES) {
3073 			/* 4096 */
3074 			size = MJUMPAGESIZE;
3075 		}
3076 
3077 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3078 		if (m_new == NULL) {
3079 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3080 			return (0);
3081 		}
3082 
3083 		hv_m_append(m_new, dlen, data);
3084 	}
3085 	m_new->m_pkthdr.rcvif = ifp;
3086 
3087 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3088 		do_csum = 0;
3089 
3090 	/* receive side checksum offload */
3091 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3092 		/* IP csum offload */
3093 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3094 			m_new->m_pkthdr.csum_flags |=
3095 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3096 			rxr->hn_csum_ip++;
3097 		}
3098 
3099 		/* TCP/UDP csum offload */
3100 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3101 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3102 			m_new->m_pkthdr.csum_flags |=
3103 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3104 			m_new->m_pkthdr.csum_data = 0xffff;
3105 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3106 				rxr->hn_csum_tcp++;
3107 			else
3108 				rxr->hn_csum_udp++;
3109 		}
3110 
3111 		/*
3112 		 * XXX
3113 		 * As of this write (Oct 28th, 2016), host side will turn
3114 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3115 		 * the do_lro setting here is actually _not_ accurate.  We
3116 		 * depend on the RSS hash type check to reset do_lro.
3117 		 */
3118 		if ((info->csum_info &
3119 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3120 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3121 			do_lro = 1;
3122 	} else {
3123 		const struct ether_header *eh;
3124 		uint16_t etype;
3125 		int hoff;
3126 
3127 		hoff = sizeof(*eh);
3128 		/* Checked at the beginning of this function. */
3129 		KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3130 
3131 		eh = mtod(m_new, struct ether_header *);
3132 		etype = ntohs(eh->ether_type);
3133 		if (etype == ETHERTYPE_VLAN) {
3134 			const struct ether_vlan_header *evl;
3135 
3136 			hoff = sizeof(*evl);
3137 			if (m_new->m_len < hoff)
3138 				goto skip;
3139 			evl = mtod(m_new, struct ether_vlan_header *);
3140 			etype = ntohs(evl->evl_proto);
3141 		}
3142 
3143 		if (etype == ETHERTYPE_IP) {
3144 			int pr;
3145 
3146 			pr = hn_check_iplen(m_new, hoff);
3147 			if (pr == IPPROTO_TCP) {
3148 				if (do_csum &&
3149 				    (rxr->hn_trust_hcsum &
3150 				     HN_TRUST_HCSUM_TCP)) {
3151 					rxr->hn_csum_trusted++;
3152 					m_new->m_pkthdr.csum_flags |=
3153 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3154 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3155 					m_new->m_pkthdr.csum_data = 0xffff;
3156 				}
3157 				do_lro = 1;
3158 			} else if (pr == IPPROTO_UDP) {
3159 				if (do_csum &&
3160 				    (rxr->hn_trust_hcsum &
3161 				     HN_TRUST_HCSUM_UDP)) {
3162 					rxr->hn_csum_trusted++;
3163 					m_new->m_pkthdr.csum_flags |=
3164 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3165 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3166 					m_new->m_pkthdr.csum_data = 0xffff;
3167 				}
3168 			} else if (pr != IPPROTO_DONE && do_csum &&
3169 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3170 				rxr->hn_csum_trusted++;
3171 				m_new->m_pkthdr.csum_flags |=
3172 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3173 			}
3174 		}
3175 	}
3176 skip:
3177 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3178 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3179 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3180 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3181 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3182 		m_new->m_flags |= M_VLANTAG;
3183 	}
3184 
3185 	/*
3186 	 * If VF is activated (tranparent/non-transparent mode does not
3187 	 * matter here).
3188 	 *
3189 	 * - Don't setup mbuf hash, if 'options RSS' is set.
3190 	 *
3191 	 *   In Azure, when VF is activated, TCP SYN and SYN|ACK go
3192 	 *   through hn(4) while the rest of segments and ACKs belonging
3193 	 *   to the same TCP 4-tuple go through the VF.  So don't setup
3194 	 *   mbuf hash, if a VF is activated and 'options RSS' is not
3195 	 *   enabled.  hn(4) and the VF may use neither the same RSS
3196 	 *   hash key nor the same RSS hash function, so the hash value
3197 	 *   for packets belonging to the same flow could be different!
3198 	 *
3199 	 * - Disable LRO
3200 	 *
3201 	 *   hn(4) will only receive broadcast packets, multicast packets,
3202 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3203 	 *   packet types.
3204 	 *
3205 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3206 	 *   all, since the LRO flush will use hn(4) as the receiving
3207 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3208 	 */
3209 	if (hn_ifp != ifp || (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF)) {
3210 		do_lro = 0;	/* disable LRO. */
3211 #ifndef RSS
3212 		goto skip_hash;	/* skip mbuf hash setup */
3213 #endif
3214 	}
3215 
3216 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3217 		rxr->hn_rss_pkts++;
3218 		m_new->m_pkthdr.flowid = info->hash_value;
3219 		hash_type = M_HASHTYPE_OPAQUE_HASH;
3220 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3221 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3222 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
3223 
3224 			/*
3225 			 * NOTE:
3226 			 * do_lro is resetted, if the hash types are not TCP
3227 			 * related.  See the comment in the above csum_flags
3228 			 * setup section.
3229 			 */
3230 			switch (type) {
3231 			case NDIS_HASH_IPV4:
3232 				hash_type = M_HASHTYPE_RSS_IPV4;
3233 				do_lro = 0;
3234 				break;
3235 
3236 			case NDIS_HASH_TCP_IPV4:
3237 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3238 				break;
3239 
3240 			case NDIS_HASH_IPV6:
3241 				hash_type = M_HASHTYPE_RSS_IPV6;
3242 				do_lro = 0;
3243 				break;
3244 
3245 			case NDIS_HASH_IPV6_EX:
3246 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3247 				do_lro = 0;
3248 				break;
3249 
3250 			case NDIS_HASH_TCP_IPV6:
3251 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3252 				break;
3253 
3254 			case NDIS_HASH_TCP_IPV6_EX:
3255 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3256 				break;
3257 			}
3258 		}
3259 	} else {
3260 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3261 		hash_type = M_HASHTYPE_OPAQUE;
3262 	}
3263 	M_HASHTYPE_SET(m_new, hash_type);
3264 
3265 #ifndef RSS
3266 skip_hash:
3267 #endif
3268 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3269 	if (hn_ifp != ifp) {
3270 		const struct ether_header *eh;
3271 
3272 		/*
3273 		 * Non-transparent mode VF is activated.
3274 		 */
3275 
3276 		/*
3277 		 * Allow tapping on hn(4).
3278 		 */
3279 		ETHER_BPF_MTAP(hn_ifp, m_new);
3280 
3281 		/*
3282 		 * Update hn(4)'s stats.
3283 		 */
3284 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3285 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3286 		/* Checked at the beginning of this function. */
3287 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3288 		eh = mtod(m_new, struct ether_header *);
3289 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3290 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3291 	}
3292 	rxr->hn_pkts++;
3293 
3294 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3295 #if defined(INET) || defined(INET6)
3296 		struct lro_ctrl *lro = &rxr->hn_lro;
3297 
3298 		if (lro->lro_cnt) {
3299 			rxr->hn_lro_tried++;
3300 			if (hn_lro_rx(lro, m_new) == 0) {
3301 				/* DONE! */
3302 				return 0;
3303 			}
3304 		}
3305 #endif
3306 	}
3307 	ifp->if_input(ifp, m_new);
3308 
3309 	return (0);
3310 }
3311 
3312 static int
3313 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3314 {
3315 	struct hn_softc *sc = ifp->if_softc;
3316 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3317 	struct ifnet *vf_ifp;
3318 	int mask, error = 0;
3319 	struct ifrsskey *ifrk;
3320 	struct ifrsshash *ifrh;
3321 
3322 	switch (cmd) {
3323 	case SIOCSIFMTU:
3324 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3325 			error = EINVAL;
3326 			break;
3327 		}
3328 
3329 		HN_LOCK(sc);
3330 
3331 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3332 			HN_UNLOCK(sc);
3333 			break;
3334 		}
3335 
3336 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3337 			/* Can't change MTU */
3338 			HN_UNLOCK(sc);
3339 			error = EOPNOTSUPP;
3340 			break;
3341 		}
3342 
3343 		if (ifp->if_mtu == ifr->ifr_mtu) {
3344 			HN_UNLOCK(sc);
3345 			break;
3346 		}
3347 
3348 		if (hn_xpnt_vf_isready(sc)) {
3349 			vf_ifp = sc->hn_vf_ifp;
3350 			ifr_vf = *ifr;
3351 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3352 			    sizeof(ifr_vf.ifr_name));
3353 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3354 			    (caddr_t)&ifr_vf);
3355 			if (error) {
3356 				HN_UNLOCK(sc);
3357 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3358 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3359 				break;
3360 			}
3361 		}
3362 
3363 		/*
3364 		 * Suspend this interface before the synthetic parts
3365 		 * are ripped.
3366 		 */
3367 		hn_suspend(sc);
3368 
3369 		/*
3370 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3371 		 */
3372 		hn_synth_detach(sc);
3373 
3374 		/*
3375 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3376 		 * with the new MTU setting.
3377 		 */
3378 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3379 		if (error) {
3380 			HN_UNLOCK(sc);
3381 			break;
3382 		}
3383 
3384 		/*
3385 		 * Commit the requested MTU, after the synthetic parts
3386 		 * have been successfully attached.
3387 		 */
3388 		ifp->if_mtu = ifr->ifr_mtu;
3389 
3390 		/*
3391 		 * Synthetic parts' reattach may change the chimney
3392 		 * sending size; update it.
3393 		 */
3394 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3395 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3396 
3397 		/*
3398 		 * Make sure that various parameters based on MTU are
3399 		 * still valid, after the MTU change.
3400 		 */
3401 		hn_mtu_change_fixup(sc);
3402 
3403 		/*
3404 		 * All done!  Resume the interface now.
3405 		 */
3406 		hn_resume(sc);
3407 
3408 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3409 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3410 			/*
3411 			 * Since we have reattached the NVS part,
3412 			 * change the datapath to VF again; in case
3413 			 * that it is lost, after the NVS was detached.
3414 			 */
3415 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3416 		}
3417 
3418 		HN_UNLOCK(sc);
3419 		break;
3420 
3421 	case SIOCSIFFLAGS:
3422 		HN_LOCK(sc);
3423 
3424 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3425 			HN_UNLOCK(sc);
3426 			break;
3427 		}
3428 
3429 		if (hn_xpnt_vf_isready(sc))
3430 			hn_xpnt_vf_saveifflags(sc);
3431 
3432 		if (ifp->if_flags & IFF_UP) {
3433 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3434 				/*
3435 				 * Caller meight hold mutex, e.g.
3436 				 * bpf; use busy-wait for the RNDIS
3437 				 * reply.
3438 				 */
3439 				HN_NO_SLEEPING(sc);
3440 				hn_rxfilter_config(sc);
3441 				HN_SLEEPING_OK(sc);
3442 
3443 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3444 					error = hn_xpnt_vf_iocsetflags(sc);
3445 			} else {
3446 				hn_init_locked(sc);
3447 			}
3448 		} else {
3449 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3450 				hn_stop(sc, false);
3451 		}
3452 		sc->hn_if_flags = ifp->if_flags;
3453 
3454 		HN_UNLOCK(sc);
3455 		break;
3456 
3457 	case SIOCSIFCAP:
3458 		HN_LOCK(sc);
3459 
3460 		if (hn_xpnt_vf_isready(sc)) {
3461 			ifr_vf = *ifr;
3462 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3463 			    sizeof(ifr_vf.ifr_name));
3464 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3465 			HN_UNLOCK(sc);
3466 			break;
3467 		}
3468 
3469 		/*
3470 		 * Fix up requested capabilities w/ supported capabilities,
3471 		 * since the supported capabilities could have been changed.
3472 		 */
3473 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3474 		    ifp->if_capenable;
3475 
3476 		if (mask & IFCAP_TXCSUM) {
3477 			ifp->if_capenable ^= IFCAP_TXCSUM;
3478 			if (ifp->if_capenable & IFCAP_TXCSUM)
3479 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3480 			else
3481 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3482 		}
3483 		if (mask & IFCAP_TXCSUM_IPV6) {
3484 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3485 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3486 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3487 			else
3488 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3489 		}
3490 
3491 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3492 		if (mask & IFCAP_RXCSUM)
3493 			ifp->if_capenable ^= IFCAP_RXCSUM;
3494 #ifdef foo
3495 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3496 		if (mask & IFCAP_RXCSUM_IPV6)
3497 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3498 #endif
3499 
3500 		if (mask & IFCAP_LRO)
3501 			ifp->if_capenable ^= IFCAP_LRO;
3502 
3503 		if (mask & IFCAP_TSO4) {
3504 			ifp->if_capenable ^= IFCAP_TSO4;
3505 			if (ifp->if_capenable & IFCAP_TSO4)
3506 				ifp->if_hwassist |= CSUM_IP_TSO;
3507 			else
3508 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3509 		}
3510 		if (mask & IFCAP_TSO6) {
3511 			ifp->if_capenable ^= IFCAP_TSO6;
3512 			if (ifp->if_capenable & IFCAP_TSO6)
3513 				ifp->if_hwassist |= CSUM_IP6_TSO;
3514 			else
3515 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3516 		}
3517 
3518 		HN_UNLOCK(sc);
3519 		break;
3520 
3521 	case SIOCADDMULTI:
3522 	case SIOCDELMULTI:
3523 		HN_LOCK(sc);
3524 
3525 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3526 			HN_UNLOCK(sc);
3527 			break;
3528 		}
3529 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3530 			/*
3531 			 * Multicast uses mutex; use busy-wait for
3532 			 * the RNDIS reply.
3533 			 */
3534 			HN_NO_SLEEPING(sc);
3535 			hn_rxfilter_config(sc);
3536 			HN_SLEEPING_OK(sc);
3537 		}
3538 
3539 		/* XXX vlan(4) style mcast addr maintenance */
3540 		if (hn_xpnt_vf_isready(sc)) {
3541 			int old_if_flags;
3542 
3543 			old_if_flags = sc->hn_vf_ifp->if_flags;
3544 			hn_xpnt_vf_saveifflags(sc);
3545 
3546 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3547 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3548 			     IFF_ALLMULTI))
3549 				error = hn_xpnt_vf_iocsetflags(sc);
3550 		}
3551 
3552 		HN_UNLOCK(sc);
3553 		break;
3554 
3555 	case SIOCSIFMEDIA:
3556 	case SIOCGIFMEDIA:
3557 		HN_LOCK(sc);
3558 		if (hn_xpnt_vf_isready(sc)) {
3559 			/*
3560 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3561 			 * create and pass ifr_vf to the VF here; just
3562 			 * replace the ifr_name.
3563 			 */
3564 			vf_ifp = sc->hn_vf_ifp;
3565 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3566 			    sizeof(ifr->ifr_name));
3567 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3568 			/* Restore the ifr_name. */
3569 			strlcpy(ifr->ifr_name, ifp->if_xname,
3570 			    sizeof(ifr->ifr_name));
3571 			HN_UNLOCK(sc);
3572 			break;
3573 		}
3574 		HN_UNLOCK(sc);
3575 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3576 		break;
3577 
3578 	case SIOCGIFRSSHASH:
3579 		ifrh = (struct ifrsshash *)data;
3580 		HN_LOCK(sc);
3581 		if (sc->hn_rx_ring_inuse == 1) {
3582 			HN_UNLOCK(sc);
3583 			ifrh->ifrh_func = RSS_FUNC_NONE;
3584 			ifrh->ifrh_types = 0;
3585 			break;
3586 		}
3587 
3588 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3589 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3590 		else
3591 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3592 
3593 		ifrh->ifrh_types = 0;
3594 		if (sc->hn_rss_hash & NDIS_HASH_IPV4)
3595 			ifrh->ifrh_types |= RSS_TYPE_IPV4;
3596 		if (sc->hn_rss_hash & NDIS_HASH_TCP_IPV4)
3597 			ifrh->ifrh_types |= RSS_TYPE_TCP_IPV4;
3598 		if (sc->hn_rss_hash & NDIS_HASH_IPV6)
3599 			ifrh->ifrh_types |= RSS_TYPE_IPV6;
3600 		if (sc->hn_rss_hash & NDIS_HASH_IPV6_EX)
3601 			ifrh->ifrh_types |= RSS_TYPE_IPV6_EX;
3602 		if (sc->hn_rss_hash & NDIS_HASH_TCP_IPV6)
3603 			ifrh->ifrh_types |= RSS_TYPE_TCP_IPV6;
3604 		if (sc->hn_rss_hash & NDIS_HASH_TCP_IPV6_EX)
3605 			ifrh->ifrh_types |= RSS_TYPE_TCP_IPV6_EX;
3606 		HN_UNLOCK(sc);
3607 		break;
3608 
3609 	case SIOCGIFRSSKEY:
3610 		ifrk = (struct ifrsskey *)data;
3611 		HN_LOCK(sc);
3612 		if (sc->hn_rx_ring_inuse == 1) {
3613 			HN_UNLOCK(sc);
3614 			ifrk->ifrk_func = RSS_FUNC_NONE;
3615 			ifrk->ifrk_keylen = 0;
3616 			break;
3617 		}
3618 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3619 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3620 		else
3621 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3622 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3623 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3624 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
3625 		HN_UNLOCK(sc);
3626 		break;
3627 
3628 	default:
3629 		error = ether_ioctl(ifp, cmd, data);
3630 		break;
3631 	}
3632 	return (error);
3633 }
3634 
3635 static void
3636 hn_stop(struct hn_softc *sc, bool detaching)
3637 {
3638 	struct ifnet *ifp = sc->hn_ifp;
3639 	int i;
3640 
3641 	HN_LOCK_ASSERT(sc);
3642 
3643 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3644 	    ("synthetic parts were not attached"));
3645 
3646 	/* Clear RUNNING bit ASAP. */
3647 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3648 
3649 	/* Disable polling. */
3650 	hn_polling(sc, 0);
3651 
3652 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3653 		KASSERT(sc->hn_vf_ifp != NULL,
3654 		    ("%s: VF is not attached", ifp->if_xname));
3655 
3656 		/* Mark transparent mode VF as disabled. */
3657 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3658 
3659 		/*
3660 		 * NOTE:
3661 		 * Datapath setting must happen _before_ bringing
3662 		 * the VF down.
3663 		 */
3664 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3665 
3666 		/*
3667 		 * Bring the VF down.
3668 		 */
3669 		hn_xpnt_vf_saveifflags(sc);
3670 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3671 		hn_xpnt_vf_iocsetflags(sc);
3672 	}
3673 
3674 	/* Suspend data transfers. */
3675 	hn_suspend_data(sc);
3676 
3677 	/* Clear OACTIVE bit. */
3678 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3679 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3680 		sc->hn_tx_ring[i].hn_oactive = 0;
3681 
3682 	/*
3683 	 * If the non-transparent mode VF is active, make sure
3684 	 * that the RX filter still allows packet reception.
3685 	 */
3686 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3687 		hn_rxfilter_config(sc);
3688 }
3689 
3690 static void
3691 hn_init_locked(struct hn_softc *sc)
3692 {
3693 	struct ifnet *ifp = sc->hn_ifp;
3694 	int i;
3695 
3696 	HN_LOCK_ASSERT(sc);
3697 
3698 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3699 		return;
3700 
3701 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3702 		return;
3703 
3704 	/* Configure RX filter */
3705 	hn_rxfilter_config(sc);
3706 
3707 	/* Clear OACTIVE bit. */
3708 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3709 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3710 		sc->hn_tx_ring[i].hn_oactive = 0;
3711 
3712 	/* Clear TX 'suspended' bit. */
3713 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3714 
3715 	if (hn_xpnt_vf_isready(sc)) {
3716 		/* Initialize transparent VF. */
3717 		hn_xpnt_vf_init(sc);
3718 	}
3719 
3720 	/* Everything is ready; unleash! */
3721 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3722 
3723 	/* Re-enable polling if requested. */
3724 	if (sc->hn_pollhz > 0)
3725 		hn_polling(sc, sc->hn_pollhz);
3726 }
3727 
3728 static void
3729 hn_init(void *xsc)
3730 {
3731 	struct hn_softc *sc = xsc;
3732 
3733 	HN_LOCK(sc);
3734 	hn_init_locked(sc);
3735 	HN_UNLOCK(sc);
3736 }
3737 
3738 #if __FreeBSD_version >= 1100099
3739 
3740 static int
3741 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
3742 {
3743 	struct hn_softc *sc = arg1;
3744 	unsigned int lenlim;
3745 	int error;
3746 
3747 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
3748 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
3749 	if (error || req->newptr == NULL)
3750 		return error;
3751 
3752 	HN_LOCK(sc);
3753 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
3754 	    lenlim > TCP_LRO_LENGTH_MAX) {
3755 		HN_UNLOCK(sc);
3756 		return EINVAL;
3757 	}
3758 	hn_set_lro_lenlim(sc, lenlim);
3759 	HN_UNLOCK(sc);
3760 
3761 	return 0;
3762 }
3763 
3764 static int
3765 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
3766 {
3767 	struct hn_softc *sc = arg1;
3768 	int ackcnt, error, i;
3769 
3770 	/*
3771 	 * lro_ackcnt_lim is append count limit,
3772 	 * +1 to turn it into aggregation limit.
3773 	 */
3774 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
3775 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
3776 	if (error || req->newptr == NULL)
3777 		return error;
3778 
3779 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
3780 		return EINVAL;
3781 
3782 	/*
3783 	 * Convert aggregation limit back to append
3784 	 * count limit.
3785 	 */
3786 	--ackcnt;
3787 	HN_LOCK(sc);
3788 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3789 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
3790 	HN_UNLOCK(sc);
3791 	return 0;
3792 }
3793 
3794 #endif
3795 
3796 static int
3797 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3798 {
3799 	struct hn_softc *sc = arg1;
3800 	int hcsum = arg2;
3801 	int on, error, i;
3802 
3803 	on = 0;
3804 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3805 		on = 1;
3806 
3807 	error = sysctl_handle_int(oidp, &on, 0, req);
3808 	if (error || req->newptr == NULL)
3809 		return error;
3810 
3811 	HN_LOCK(sc);
3812 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3813 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3814 
3815 		if (on)
3816 			rxr->hn_trust_hcsum |= hcsum;
3817 		else
3818 			rxr->hn_trust_hcsum &= ~hcsum;
3819 	}
3820 	HN_UNLOCK(sc);
3821 	return 0;
3822 }
3823 
3824 static int
3825 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3826 {
3827 	struct hn_softc *sc = arg1;
3828 	int chim_size, error;
3829 
3830 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
3831 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
3832 	if (error || req->newptr == NULL)
3833 		return error;
3834 
3835 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3836 		return EINVAL;
3837 
3838 	HN_LOCK(sc);
3839 	hn_set_chim_size(sc, chim_size);
3840 	HN_UNLOCK(sc);
3841 	return 0;
3842 }
3843 
3844 #if __FreeBSD_version < 1100095
3845 static int
3846 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3847 {
3848 	struct hn_softc *sc = arg1;
3849 	int ofs = arg2, i, error;
3850 	struct hn_rx_ring *rxr;
3851 	uint64_t stat;
3852 
3853 	stat = 0;
3854 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3855 		rxr = &sc->hn_rx_ring[i];
3856 		stat += *((int *)((uint8_t *)rxr + ofs));
3857 	}
3858 
3859 	error = sysctl_handle_64(oidp, &stat, 0, req);
3860 	if (error || req->newptr == NULL)
3861 		return error;
3862 
3863 	/* Zero out this stat. */
3864 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3865 		rxr = &sc->hn_rx_ring[i];
3866 		*((int *)((uint8_t *)rxr + ofs)) = 0;
3867 	}
3868 	return 0;
3869 }
3870 #else
3871 static int
3872 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3873 {
3874 	struct hn_softc *sc = arg1;
3875 	int ofs = arg2, i, error;
3876 	struct hn_rx_ring *rxr;
3877 	uint64_t stat;
3878 
3879 	stat = 0;
3880 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3881 		rxr = &sc->hn_rx_ring[i];
3882 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3883 	}
3884 
3885 	error = sysctl_handle_64(oidp, &stat, 0, req);
3886 	if (error || req->newptr == NULL)
3887 		return error;
3888 
3889 	/* Zero out this stat. */
3890 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3891 		rxr = &sc->hn_rx_ring[i];
3892 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3893 	}
3894 	return 0;
3895 }
3896 
3897 #endif
3898 
3899 static int
3900 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3901 {
3902 	struct hn_softc *sc = arg1;
3903 	int ofs = arg2, i, error;
3904 	struct hn_rx_ring *rxr;
3905 	u_long stat;
3906 
3907 	stat = 0;
3908 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3909 		rxr = &sc->hn_rx_ring[i];
3910 		stat += *((u_long *)((uint8_t *)rxr + ofs));
3911 	}
3912 
3913 	error = sysctl_handle_long(oidp, &stat, 0, req);
3914 	if (error || req->newptr == NULL)
3915 		return error;
3916 
3917 	/* Zero out this stat. */
3918 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3919 		rxr = &sc->hn_rx_ring[i];
3920 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
3921 	}
3922 	return 0;
3923 }
3924 
3925 static int
3926 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3927 {
3928 	struct hn_softc *sc = arg1;
3929 	int ofs = arg2, i, error;
3930 	struct hn_tx_ring *txr;
3931 	u_long stat;
3932 
3933 	stat = 0;
3934 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3935 		txr = &sc->hn_tx_ring[i];
3936 		stat += *((u_long *)((uint8_t *)txr + ofs));
3937 	}
3938 
3939 	error = sysctl_handle_long(oidp, &stat, 0, req);
3940 	if (error || req->newptr == NULL)
3941 		return error;
3942 
3943 	/* Zero out this stat. */
3944 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3945 		txr = &sc->hn_tx_ring[i];
3946 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
3947 	}
3948 	return 0;
3949 }
3950 
3951 static int
3952 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3953 {
3954 	struct hn_softc *sc = arg1;
3955 	int ofs = arg2, i, error, conf;
3956 	struct hn_tx_ring *txr;
3957 
3958 	txr = &sc->hn_tx_ring[0];
3959 	conf = *((int *)((uint8_t *)txr + ofs));
3960 
3961 	error = sysctl_handle_int(oidp, &conf, 0, req);
3962 	if (error || req->newptr == NULL)
3963 		return error;
3964 
3965 	HN_LOCK(sc);
3966 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3967 		txr = &sc->hn_tx_ring[i];
3968 		*((int *)((uint8_t *)txr + ofs)) = conf;
3969 	}
3970 	HN_UNLOCK(sc);
3971 
3972 	return 0;
3973 }
3974 
3975 static int
3976 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3977 {
3978 	struct hn_softc *sc = arg1;
3979 	int error, size;
3980 
3981 	size = sc->hn_agg_size;
3982 	error = sysctl_handle_int(oidp, &size, 0, req);
3983 	if (error || req->newptr == NULL)
3984 		return (error);
3985 
3986 	HN_LOCK(sc);
3987 	sc->hn_agg_size = size;
3988 	hn_set_txagg(sc);
3989 	HN_UNLOCK(sc);
3990 
3991 	return (0);
3992 }
3993 
3994 static int
3995 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3996 {
3997 	struct hn_softc *sc = arg1;
3998 	int error, pkts;
3999 
4000 	pkts = sc->hn_agg_pkts;
4001 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4002 	if (error || req->newptr == NULL)
4003 		return (error);
4004 
4005 	HN_LOCK(sc);
4006 	sc->hn_agg_pkts = pkts;
4007 	hn_set_txagg(sc);
4008 	HN_UNLOCK(sc);
4009 
4010 	return (0);
4011 }
4012 
4013 static int
4014 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4015 {
4016 	struct hn_softc *sc = arg1;
4017 	int pkts;
4018 
4019 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4020 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4021 }
4022 
4023 static int
4024 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4025 {
4026 	struct hn_softc *sc = arg1;
4027 	int align;
4028 
4029 	align = sc->hn_tx_ring[0].hn_agg_align;
4030 	return (sysctl_handle_int(oidp, &align, 0, req));
4031 }
4032 
4033 static void
4034 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4035 {
4036 	if (pollhz == 0)
4037 		vmbus_chan_poll_disable(chan);
4038 	else
4039 		vmbus_chan_poll_enable(chan, pollhz);
4040 }
4041 
4042 static void
4043 hn_polling(struct hn_softc *sc, u_int pollhz)
4044 {
4045 	int nsubch = sc->hn_rx_ring_inuse - 1;
4046 
4047 	HN_LOCK_ASSERT(sc);
4048 
4049 	if (nsubch > 0) {
4050 		struct vmbus_channel **subch;
4051 		int i;
4052 
4053 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4054 		for (i = 0; i < nsubch; ++i)
4055 			hn_chan_polling(subch[i], pollhz);
4056 		vmbus_subchan_rel(subch, nsubch);
4057 	}
4058 	hn_chan_polling(sc->hn_prichan, pollhz);
4059 }
4060 
4061 static int
4062 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4063 {
4064 	struct hn_softc *sc = arg1;
4065 	int pollhz, error;
4066 
4067 	pollhz = sc->hn_pollhz;
4068 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4069 	if (error || req->newptr == NULL)
4070 		return (error);
4071 
4072 	if (pollhz != 0 &&
4073 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4074 		return (EINVAL);
4075 
4076 	HN_LOCK(sc);
4077 	if (sc->hn_pollhz != pollhz) {
4078 		sc->hn_pollhz = pollhz;
4079 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4080 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4081 			hn_polling(sc, sc->hn_pollhz);
4082 	}
4083 	HN_UNLOCK(sc);
4084 
4085 	return (0);
4086 }
4087 
4088 static int
4089 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4090 {
4091 	struct hn_softc *sc = arg1;
4092 	char verstr[16];
4093 
4094 	snprintf(verstr, sizeof(verstr), "%u.%u",
4095 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4096 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4097 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4098 }
4099 
4100 static int
4101 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4102 {
4103 	struct hn_softc *sc = arg1;
4104 	char caps_str[128];
4105 	uint32_t caps;
4106 
4107 	HN_LOCK(sc);
4108 	caps = sc->hn_caps;
4109 	HN_UNLOCK(sc);
4110 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4111 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4112 }
4113 
4114 static int
4115 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4116 {
4117 	struct hn_softc *sc = arg1;
4118 	char assist_str[128];
4119 	uint32_t hwassist;
4120 
4121 	HN_LOCK(sc);
4122 	hwassist = sc->hn_ifp->if_hwassist;
4123 	HN_UNLOCK(sc);
4124 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4125 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4126 }
4127 
4128 static int
4129 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4130 {
4131 	struct hn_softc *sc = arg1;
4132 	char filter_str[128];
4133 	uint32_t filter;
4134 
4135 	HN_LOCK(sc);
4136 	filter = sc->hn_rx_filter;
4137 	HN_UNLOCK(sc);
4138 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4139 	    NDIS_PACKET_TYPES);
4140 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4141 }
4142 
4143 #ifndef RSS
4144 
4145 static int
4146 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4147 {
4148 	struct hn_softc *sc = arg1;
4149 	int error;
4150 
4151 	HN_LOCK(sc);
4152 
4153 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4154 	if (error || req->newptr == NULL)
4155 		goto back;
4156 
4157 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4158 	if (error)
4159 		goto back;
4160 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4161 
4162 	if (sc->hn_rx_ring_inuse > 1) {
4163 		error = hn_rss_reconfig(sc);
4164 	} else {
4165 		/* Not RSS capable, at least for now; just save the RSS key. */
4166 		error = 0;
4167 	}
4168 back:
4169 	HN_UNLOCK(sc);
4170 	return (error);
4171 }
4172 
4173 static int
4174 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4175 {
4176 	struct hn_softc *sc = arg1;
4177 	int error;
4178 
4179 	HN_LOCK(sc);
4180 
4181 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4182 	if (error || req->newptr == NULL)
4183 		goto back;
4184 
4185 	/*
4186 	 * Don't allow RSS indirect table change, if this interface is not
4187 	 * RSS capable currently.
4188 	 */
4189 	if (sc->hn_rx_ring_inuse == 1) {
4190 		error = EOPNOTSUPP;
4191 		goto back;
4192 	}
4193 
4194 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4195 	if (error)
4196 		goto back;
4197 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4198 
4199 	hn_rss_ind_fixup(sc);
4200 	error = hn_rss_reconfig(sc);
4201 back:
4202 	HN_UNLOCK(sc);
4203 	return (error);
4204 }
4205 
4206 #endif	/* !RSS */
4207 
4208 static int
4209 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4210 {
4211 	struct hn_softc *sc = arg1;
4212 	char hash_str[128];
4213 	uint32_t hash;
4214 
4215 	HN_LOCK(sc);
4216 	hash = sc->hn_rss_hash;
4217 	HN_UNLOCK(sc);
4218 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4219 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4220 }
4221 
4222 static int
4223 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4224 {
4225 	struct hn_softc *sc = arg1;
4226 	char vf_name[IFNAMSIZ + 1];
4227 	struct ifnet *vf_ifp;
4228 
4229 	HN_LOCK(sc);
4230 	vf_name[0] = '\0';
4231 	vf_ifp = sc->hn_vf_ifp;
4232 	if (vf_ifp != NULL)
4233 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4234 	HN_UNLOCK(sc);
4235 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4236 }
4237 
4238 static int
4239 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4240 {
4241 	struct hn_softc *sc = arg1;
4242 	char vf_name[IFNAMSIZ + 1];
4243 	struct ifnet *vf_ifp;
4244 
4245 	HN_LOCK(sc);
4246 	vf_name[0] = '\0';
4247 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4248 	if (vf_ifp != NULL)
4249 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4250 	HN_UNLOCK(sc);
4251 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4252 }
4253 
4254 static int
4255 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4256 {
4257 	struct rm_priotracker pt;
4258 	struct sbuf *sb;
4259 	int error, i;
4260 	bool first;
4261 
4262 	error = sysctl_wire_old_buffer(req, 0);
4263 	if (error != 0)
4264 		return (error);
4265 
4266 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4267 	if (sb == NULL)
4268 		return (ENOMEM);
4269 
4270 	rm_rlock(&hn_vfmap_lock, &pt);
4271 
4272 	first = true;
4273 	for (i = 0; i < hn_vfmap_size; ++i) {
4274 		struct ifnet *ifp;
4275 
4276 		if (hn_vfmap[i] == NULL)
4277 			continue;
4278 
4279 		ifp = ifnet_byindex(i);
4280 		if (ifp != NULL) {
4281 			if (first)
4282 				sbuf_printf(sb, "%s", ifp->if_xname);
4283 			else
4284 				sbuf_printf(sb, " %s", ifp->if_xname);
4285 			first = false;
4286 		}
4287 	}
4288 
4289 	rm_runlock(&hn_vfmap_lock, &pt);
4290 
4291 	error = sbuf_finish(sb);
4292 	sbuf_delete(sb);
4293 	return (error);
4294 }
4295 
4296 static int
4297 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4298 {
4299 	struct rm_priotracker pt;
4300 	struct sbuf *sb;
4301 	int error, i;
4302 	bool first;
4303 
4304 	error = sysctl_wire_old_buffer(req, 0);
4305 	if (error != 0)
4306 		return (error);
4307 
4308 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4309 	if (sb == NULL)
4310 		return (ENOMEM);
4311 
4312 	rm_rlock(&hn_vfmap_lock, &pt);
4313 
4314 	first = true;
4315 	for (i = 0; i < hn_vfmap_size; ++i) {
4316 		struct ifnet *ifp, *hn_ifp;
4317 
4318 		hn_ifp = hn_vfmap[i];
4319 		if (hn_ifp == NULL)
4320 			continue;
4321 
4322 		ifp = ifnet_byindex(i);
4323 		if (ifp != NULL) {
4324 			if (first) {
4325 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4326 				    hn_ifp->if_xname);
4327 			} else {
4328 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4329 				    hn_ifp->if_xname);
4330 			}
4331 			first = false;
4332 		}
4333 	}
4334 
4335 	rm_runlock(&hn_vfmap_lock, &pt);
4336 
4337 	error = sbuf_finish(sb);
4338 	sbuf_delete(sb);
4339 	return (error);
4340 }
4341 
4342 static int
4343 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4344 {
4345 	struct hn_softc *sc = arg1;
4346 	int error, onoff = 0;
4347 
4348 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4349 		onoff = 1;
4350 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4351 	if (error || req->newptr == NULL)
4352 		return (error);
4353 
4354 	HN_LOCK(sc);
4355 	/* NOTE: hn_vf_lock for hn_transmit() */
4356 	rm_wlock(&sc->hn_vf_lock);
4357 	if (onoff)
4358 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4359 	else
4360 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4361 	rm_wunlock(&sc->hn_vf_lock);
4362 	HN_UNLOCK(sc);
4363 
4364 	return (0);
4365 }
4366 
4367 static int
4368 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4369 {
4370 	struct hn_softc *sc = arg1;
4371 	int enabled = 0;
4372 
4373 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4374 		enabled = 1;
4375 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4376 }
4377 
4378 static int
4379 hn_check_iplen(const struct mbuf *m, int hoff)
4380 {
4381 	const struct ip *ip;
4382 	int len, iphlen, iplen;
4383 	const struct tcphdr *th;
4384 	int thoff;				/* TCP data offset */
4385 
4386 	len = hoff + sizeof(struct ip);
4387 
4388 	/* The packet must be at least the size of an IP header. */
4389 	if (m->m_pkthdr.len < len)
4390 		return IPPROTO_DONE;
4391 
4392 	/* The fixed IP header must reside completely in the first mbuf. */
4393 	if (m->m_len < len)
4394 		return IPPROTO_DONE;
4395 
4396 	ip = mtodo(m, hoff);
4397 
4398 	/* Bound check the packet's stated IP header length. */
4399 	iphlen = ip->ip_hl << 2;
4400 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4401 		return IPPROTO_DONE;
4402 
4403 	/* The full IP header must reside completely in the one mbuf. */
4404 	if (m->m_len < hoff + iphlen)
4405 		return IPPROTO_DONE;
4406 
4407 	iplen = ntohs(ip->ip_len);
4408 
4409 	/*
4410 	 * Check that the amount of data in the buffers is as
4411 	 * at least much as the IP header would have us expect.
4412 	 */
4413 	if (m->m_pkthdr.len < hoff + iplen)
4414 		return IPPROTO_DONE;
4415 
4416 	/*
4417 	 * Ignore IP fragments.
4418 	 */
4419 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4420 		return IPPROTO_DONE;
4421 
4422 	/*
4423 	 * The TCP/IP or UDP/IP header must be entirely contained within
4424 	 * the first fragment of a packet.
4425 	 */
4426 	switch (ip->ip_p) {
4427 	case IPPROTO_TCP:
4428 		if (iplen < iphlen + sizeof(struct tcphdr))
4429 			return IPPROTO_DONE;
4430 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4431 			return IPPROTO_DONE;
4432 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4433 		thoff = th->th_off << 2;
4434 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4435 			return IPPROTO_DONE;
4436 		if (m->m_len < hoff + iphlen + thoff)
4437 			return IPPROTO_DONE;
4438 		break;
4439 	case IPPROTO_UDP:
4440 		if (iplen < iphlen + sizeof(struct udphdr))
4441 			return IPPROTO_DONE;
4442 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4443 			return IPPROTO_DONE;
4444 		break;
4445 	default:
4446 		if (iplen < iphlen)
4447 			return IPPROTO_DONE;
4448 		break;
4449 	}
4450 	return ip->ip_p;
4451 }
4452 
4453 static int
4454 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4455 {
4456 	struct sysctl_oid_list *child;
4457 	struct sysctl_ctx_list *ctx;
4458 	device_t dev = sc->hn_dev;
4459 #if defined(INET) || defined(INET6)
4460 #if __FreeBSD_version >= 1100095
4461 	int lroent_cnt;
4462 #endif
4463 #endif
4464 	int i;
4465 
4466 	/*
4467 	 * Create RXBUF for reception.
4468 	 *
4469 	 * NOTE:
4470 	 * - It is shared by all channels.
4471 	 * - A large enough buffer is allocated, certain version of NVSes
4472 	 *   may further limit the usable space.
4473 	 */
4474 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4475 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4476 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4477 	if (sc->hn_rxbuf == NULL) {
4478 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4479 		return (ENOMEM);
4480 	}
4481 
4482 	sc->hn_rx_ring_cnt = ring_cnt;
4483 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4484 
4485 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4486 	    M_DEVBUF, M_WAITOK | M_ZERO);
4487 
4488 #if defined(INET) || defined(INET6)
4489 #if __FreeBSD_version >= 1100095
4490 	lroent_cnt = hn_lro_entry_count;
4491 	if (lroent_cnt < TCP_LRO_ENTRIES)
4492 		lroent_cnt = TCP_LRO_ENTRIES;
4493 	if (bootverbose)
4494 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4495 #endif
4496 #endif	/* INET || INET6 */
4497 
4498 	ctx = device_get_sysctl_ctx(dev);
4499 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4500 
4501 	/* Create dev.hn.UNIT.rx sysctl tree */
4502 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4503 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4504 
4505 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4506 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4507 
4508 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4509 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4510 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4511 		if (rxr->hn_br == NULL) {
4512 			device_printf(dev, "allocate bufring failed\n");
4513 			return (ENOMEM);
4514 		}
4515 
4516 		if (hn_trust_hosttcp)
4517 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4518 		if (hn_trust_hostudp)
4519 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4520 		if (hn_trust_hostip)
4521 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4522 		rxr->hn_ifp = sc->hn_ifp;
4523 		if (i < sc->hn_tx_ring_cnt)
4524 			rxr->hn_txr = &sc->hn_tx_ring[i];
4525 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4526 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4527 		rxr->hn_rx_idx = i;
4528 		rxr->hn_rxbuf = sc->hn_rxbuf;
4529 
4530 		/*
4531 		 * Initialize LRO.
4532 		 */
4533 #if defined(INET) || defined(INET6)
4534 #if __FreeBSD_version >= 1100095
4535 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4536 		    hn_lro_mbufq_depth);
4537 #else
4538 		tcp_lro_init(&rxr->hn_lro);
4539 		rxr->hn_lro.ifp = sc->hn_ifp;
4540 #endif
4541 #if __FreeBSD_version >= 1100099
4542 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4543 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4544 #endif
4545 #endif	/* INET || INET6 */
4546 
4547 		if (sc->hn_rx_sysctl_tree != NULL) {
4548 			char name[16];
4549 
4550 			/*
4551 			 * Create per RX ring sysctl tree:
4552 			 * dev.hn.UNIT.rx.RINGID
4553 			 */
4554 			snprintf(name, sizeof(name), "%d", i);
4555 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4556 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4557 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4558 
4559 			if (rxr->hn_rx_sysctl_tree != NULL) {
4560 				SYSCTL_ADD_ULONG(ctx,
4561 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4562 				    OID_AUTO, "packets", CTLFLAG_RW,
4563 				    &rxr->hn_pkts, "# of packets received");
4564 				SYSCTL_ADD_ULONG(ctx,
4565 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4566 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
4567 				    &rxr->hn_rss_pkts,
4568 				    "# of packets w/ RSS info received");
4569 				SYSCTL_ADD_INT(ctx,
4570 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4571 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4572 				    &rxr->hn_pktbuf_len, 0,
4573 				    "Temporary channel packet buffer length");
4574 			}
4575 		}
4576 	}
4577 
4578 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4579 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4580 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4581 #if __FreeBSD_version < 1100095
4582 	    hn_rx_stat_int_sysctl,
4583 #else
4584 	    hn_rx_stat_u64_sysctl,
4585 #endif
4586 	    "LU", "LRO queued");
4587 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4588 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4589 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4590 #if __FreeBSD_version < 1100095
4591 	    hn_rx_stat_int_sysctl,
4592 #else
4593 	    hn_rx_stat_u64_sysctl,
4594 #endif
4595 	    "LU", "LRO flushed");
4596 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4597 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4598 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
4599 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4600 #if __FreeBSD_version >= 1100099
4601 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4602 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4603 	    hn_lro_lenlim_sysctl, "IU",
4604 	    "Max # of data bytes to be aggregated by LRO");
4605 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4606 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4607 	    hn_lro_ackcnt_sysctl, "I",
4608 	    "Max # of ACKs to be aggregated by LRO");
4609 #endif
4610 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4611 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4612 	    hn_trust_hcsum_sysctl, "I",
4613 	    "Trust tcp segement verification on host side, "
4614 	    "when csum info is missing");
4615 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4616 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4617 	    hn_trust_hcsum_sysctl, "I",
4618 	    "Trust udp datagram verification on host side, "
4619 	    "when csum info is missing");
4620 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4621 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4622 	    hn_trust_hcsum_sysctl, "I",
4623 	    "Trust ip packet verification on host side, "
4624 	    "when csum info is missing");
4625 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4626 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4627 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
4628 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4629 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4630 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4631 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
4632 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4633 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4634 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4635 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
4636 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4637 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4638 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4639 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
4640 	    hn_rx_stat_ulong_sysctl, "LU",
4641 	    "# of packets that we trust host's csum verification");
4642 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4643 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4644 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
4645 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4646 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4647 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4648 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
4649 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4650 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4651 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4652 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4653 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
4654 
4655 	return (0);
4656 }
4657 
4658 static void
4659 hn_destroy_rx_data(struct hn_softc *sc)
4660 {
4661 	int i;
4662 
4663 	if (sc->hn_rxbuf != NULL) {
4664 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4665 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4666 		else
4667 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
4668 		sc->hn_rxbuf = NULL;
4669 	}
4670 
4671 	if (sc->hn_rx_ring_cnt == 0)
4672 		return;
4673 
4674 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4675 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4676 
4677 		if (rxr->hn_br == NULL)
4678 			continue;
4679 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4680 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4681 		} else {
4682 			device_printf(sc->hn_dev,
4683 			    "%dth channel bufring is referenced", i);
4684 		}
4685 		rxr->hn_br = NULL;
4686 
4687 #if defined(INET) || defined(INET6)
4688 		tcp_lro_free(&rxr->hn_lro);
4689 #endif
4690 		free(rxr->hn_pktbuf, M_DEVBUF);
4691 	}
4692 	free(sc->hn_rx_ring, M_DEVBUF);
4693 	sc->hn_rx_ring = NULL;
4694 
4695 	sc->hn_rx_ring_cnt = 0;
4696 	sc->hn_rx_ring_inuse = 0;
4697 }
4698 
4699 static int
4700 hn_tx_ring_create(struct hn_softc *sc, int id)
4701 {
4702 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
4703 	device_t dev = sc->hn_dev;
4704 	bus_dma_tag_t parent_dtag;
4705 	int error, i;
4706 
4707 	txr->hn_sc = sc;
4708 	txr->hn_tx_idx = id;
4709 
4710 #ifndef HN_USE_TXDESC_BUFRING
4711 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
4712 #endif
4713 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
4714 
4715 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
4716 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
4717 	    M_DEVBUF, M_WAITOK | M_ZERO);
4718 #ifndef HN_USE_TXDESC_BUFRING
4719 	SLIST_INIT(&txr->hn_txlist);
4720 #else
4721 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
4722 	    M_WAITOK, &txr->hn_tx_lock);
4723 #endif
4724 
4725 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
4726 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
4727 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
4728 	} else {
4729 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
4730 	}
4731 
4732 #ifdef HN_IFSTART_SUPPORT
4733 	if (hn_use_if_start) {
4734 		txr->hn_txeof = hn_start_txeof;
4735 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
4736 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
4737 	} else
4738 #endif
4739 	{
4740 		int br_depth;
4741 
4742 		txr->hn_txeof = hn_xmit_txeof;
4743 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
4744 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
4745 
4746 		br_depth = hn_get_txswq_depth(txr);
4747 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
4748 		    M_WAITOK, &txr->hn_tx_lock);
4749 	}
4750 
4751 	txr->hn_direct_tx_size = hn_direct_tx_size;
4752 
4753 	/*
4754 	 * Always schedule transmission instead of trying to do direct
4755 	 * transmission.  This one gives the best performance so far.
4756 	 */
4757 	txr->hn_sched_tx = 1;
4758 
4759 	parent_dtag = bus_get_dma_tag(dev);
4760 
4761 	/* DMA tag for RNDIS packet messages. */
4762 	error = bus_dma_tag_create(parent_dtag, /* parent */
4763 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
4764 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
4765 	    BUS_SPACE_MAXADDR,		/* lowaddr */
4766 	    BUS_SPACE_MAXADDR,		/* highaddr */
4767 	    NULL, NULL,			/* filter, filterarg */
4768 	    HN_RNDIS_PKT_LEN,		/* maxsize */
4769 	    1,				/* nsegments */
4770 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
4771 	    0,				/* flags */
4772 	    NULL,			/* lockfunc */
4773 	    NULL,			/* lockfuncarg */
4774 	    &txr->hn_tx_rndis_dtag);
4775 	if (error) {
4776 		device_printf(dev, "failed to create rndis dmatag\n");
4777 		return error;
4778 	}
4779 
4780 	/* DMA tag for data. */
4781 	error = bus_dma_tag_create(parent_dtag, /* parent */
4782 	    1,				/* alignment */
4783 	    HN_TX_DATA_BOUNDARY,	/* boundary */
4784 	    BUS_SPACE_MAXADDR,		/* lowaddr */
4785 	    BUS_SPACE_MAXADDR,		/* highaddr */
4786 	    NULL, NULL,			/* filter, filterarg */
4787 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
4788 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
4789 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
4790 	    0,				/* flags */
4791 	    NULL,			/* lockfunc */
4792 	    NULL,			/* lockfuncarg */
4793 	    &txr->hn_tx_data_dtag);
4794 	if (error) {
4795 		device_printf(dev, "failed to create data dmatag\n");
4796 		return error;
4797 	}
4798 
4799 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
4800 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
4801 
4802 		txd->txr = txr;
4803 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
4804 		STAILQ_INIT(&txd->agg_list);
4805 
4806 		/*
4807 		 * Allocate and load RNDIS packet message.
4808 		 */
4809         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
4810 		    (void **)&txd->rndis_pkt,
4811 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
4812 		    &txd->rndis_pkt_dmap);
4813 		if (error) {
4814 			device_printf(dev,
4815 			    "failed to allocate rndis_packet_msg, %d\n", i);
4816 			return error;
4817 		}
4818 
4819 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
4820 		    txd->rndis_pkt_dmap,
4821 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
4822 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4823 		    BUS_DMA_NOWAIT);
4824 		if (error) {
4825 			device_printf(dev,
4826 			    "failed to load rndis_packet_msg, %d\n", i);
4827 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4828 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4829 			return error;
4830 		}
4831 
4832 		/* DMA map for TX data. */
4833 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4834 		    &txd->data_dmap);
4835 		if (error) {
4836 			device_printf(dev,
4837 			    "failed to allocate tx data dmamap\n");
4838 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4839 			    txd->rndis_pkt_dmap);
4840 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4841 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4842 			return error;
4843 		}
4844 
4845 		/* All set, put it to list */
4846 		txd->flags |= HN_TXD_FLAG_ONLIST;
4847 #ifndef HN_USE_TXDESC_BUFRING
4848 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4849 #else
4850 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
4851 #endif
4852 	}
4853 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4854 
4855 	if (sc->hn_tx_sysctl_tree != NULL) {
4856 		struct sysctl_oid_list *child;
4857 		struct sysctl_ctx_list *ctx;
4858 		char name[16];
4859 
4860 		/*
4861 		 * Create per TX ring sysctl tree:
4862 		 * dev.hn.UNIT.tx.RINGID
4863 		 */
4864 		ctx = device_get_sysctl_ctx(dev);
4865 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4866 
4867 		snprintf(name, sizeof(name), "%d", id);
4868 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4869 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4870 
4871 		if (txr->hn_tx_sysctl_tree != NULL) {
4872 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4873 
4874 #ifdef HN_DEBUG
4875 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4876 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4877 			    "# of available TX descs");
4878 #endif
4879 #ifdef HN_IFSTART_SUPPORT
4880 			if (!hn_use_if_start)
4881 #endif
4882 			{
4883 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4884 				    CTLFLAG_RD, &txr->hn_oactive, 0,
4885 				    "over active");
4886 			}
4887 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4888 			    CTLFLAG_RW, &txr->hn_pkts,
4889 			    "# of packets transmitted");
4890 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4891 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
4892 		}
4893 	}
4894 
4895 	return 0;
4896 }
4897 
4898 static void
4899 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4900 {
4901 	struct hn_tx_ring *txr = txd->txr;
4902 
4903 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
4904 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4905 
4906 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4907 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4908 	    txd->rndis_pkt_dmap);
4909 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4910 }
4911 
4912 static void
4913 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4914 {
4915 
4916 	KASSERT(txd->refs == 0 || txd->refs == 1,
4917 	    ("invalid txd refs %d", txd->refs));
4918 
4919 	/* Aggregated txds will be freed by their aggregating txd. */
4920 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4921 		int freed;
4922 
4923 		freed = hn_txdesc_put(txr, txd);
4924 		KASSERT(freed, ("can't free txdesc"));
4925 	}
4926 }
4927 
4928 static void
4929 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4930 {
4931 	int i;
4932 
4933 	if (txr->hn_txdesc == NULL)
4934 		return;
4935 
4936 	/*
4937 	 * NOTE:
4938 	 * Because the freeing of aggregated txds will be deferred
4939 	 * to the aggregating txd, two passes are used here:
4940 	 * - The first pass GCes any pending txds.  This GC is necessary,
4941 	 *   since if the channels are revoked, hypervisor will not
4942 	 *   deliver send-done for all pending txds.
4943 	 * - The second pass frees the busdma stuffs, i.e. after all txds
4944 	 *   were freed.
4945 	 */
4946 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4947 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4948 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4949 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4950 
4951 	if (txr->hn_tx_data_dtag != NULL)
4952 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4953 	if (txr->hn_tx_rndis_dtag != NULL)
4954 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4955 
4956 #ifdef HN_USE_TXDESC_BUFRING
4957 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4958 #endif
4959 
4960 	free(txr->hn_txdesc, M_DEVBUF);
4961 	txr->hn_txdesc = NULL;
4962 
4963 	if (txr->hn_mbuf_br != NULL)
4964 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4965 
4966 #ifndef HN_USE_TXDESC_BUFRING
4967 	mtx_destroy(&txr->hn_txlist_spin);
4968 #endif
4969 	mtx_destroy(&txr->hn_tx_lock);
4970 }
4971 
4972 static int
4973 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4974 {
4975 	struct sysctl_oid_list *child;
4976 	struct sysctl_ctx_list *ctx;
4977 	int i;
4978 
4979 	/*
4980 	 * Create TXBUF for chimney sending.
4981 	 *
4982 	 * NOTE: It is shared by all channels.
4983 	 */
4984 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4985 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4986 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4987 	if (sc->hn_chim == NULL) {
4988 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
4989 		return (ENOMEM);
4990 	}
4991 
4992 	sc->hn_tx_ring_cnt = ring_cnt;
4993 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4994 
4995 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4996 	    M_DEVBUF, M_WAITOK | M_ZERO);
4997 
4998 	ctx = device_get_sysctl_ctx(sc->hn_dev);
4999 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5000 
5001 	/* Create dev.hn.UNIT.tx sysctl tree */
5002 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5003 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5004 
5005 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5006 		int error;
5007 
5008 		error = hn_tx_ring_create(sc, i);
5009 		if (error)
5010 			return error;
5011 	}
5012 
5013 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5014 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5015 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5016 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5017 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5018 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5019 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5020 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5021 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5022 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5023 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5024 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5025 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5026 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5027 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5028 	    hn_tx_stat_ulong_sysctl, "LU",
5029 	    "# of packet transmission aggregation flush failure");
5030 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5031 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5032 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5033 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5034 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5035 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5036 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5037 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5038 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5039 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5040 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5041 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5042 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5043 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5044 	    "# of total TX descs");
5045 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5046 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5047 	    "Chimney send packet size upper boundary");
5048 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5049 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5050 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5051 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5052 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5053 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5054 	    hn_tx_conf_int_sysctl, "I",
5055 	    "Size of the packet for direct transmission");
5056 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5057 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5058 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5059 	    hn_tx_conf_int_sysctl, "I",
5060 	    "Always schedule transmission "
5061 	    "instead of doing direct transmission");
5062 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5063 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5064 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5065 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5066 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5067 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5068 	    "Applied packet transmission aggregation size");
5069 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5070 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5071 	    hn_txagg_pktmax_sysctl, "I",
5072 	    "Applied packet transmission aggregation packets");
5073 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5074 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5075 	    hn_txagg_align_sysctl, "I",
5076 	    "Applied packet transmission aggregation alignment");
5077 
5078 	return 0;
5079 }
5080 
5081 static void
5082 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5083 {
5084 	int i;
5085 
5086 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5087 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5088 }
5089 
5090 static void
5091 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5092 {
5093 	struct ifnet *ifp = sc->hn_ifp;
5094 	u_int hw_tsomax;
5095 	int tso_minlen;
5096 
5097 	HN_LOCK_ASSERT(sc);
5098 
5099 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5100 		return;
5101 
5102 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5103 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5104 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5105 
5106 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5107 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5108 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5109 
5110 	if (tso_maxlen < tso_minlen)
5111 		tso_maxlen = tso_minlen;
5112 	else if (tso_maxlen > IP_MAXPACKET)
5113 		tso_maxlen = IP_MAXPACKET;
5114 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5115 		tso_maxlen = sc->hn_ndis_tso_szmax;
5116 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5117 
5118 	if (hn_xpnt_vf_isready(sc)) {
5119 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5120 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5121 	}
5122 	ifp->if_hw_tsomax = hw_tsomax;
5123 	if (bootverbose)
5124 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5125 }
5126 
5127 static void
5128 hn_fixup_tx_data(struct hn_softc *sc)
5129 {
5130 	uint64_t csum_assist;
5131 	int i;
5132 
5133 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5134 	if (hn_tx_chimney_size > 0 &&
5135 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5136 		hn_set_chim_size(sc, hn_tx_chimney_size);
5137 
5138 	csum_assist = 0;
5139 	if (sc->hn_caps & HN_CAP_IPCS)
5140 		csum_assist |= CSUM_IP;
5141 	if (sc->hn_caps & HN_CAP_TCP4CS)
5142 		csum_assist |= CSUM_IP_TCP;
5143 	if (sc->hn_caps & HN_CAP_UDP4CS)
5144 		csum_assist |= CSUM_IP_UDP;
5145 	if (sc->hn_caps & HN_CAP_TCP6CS)
5146 		csum_assist |= CSUM_IP6_TCP;
5147 	if (sc->hn_caps & HN_CAP_UDP6CS)
5148 		csum_assist |= CSUM_IP6_UDP;
5149 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5150 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5151 
5152 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5153 		/*
5154 		 * Support HASHVAL pktinfo on TX path.
5155 		 */
5156 		if (bootverbose)
5157 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5158 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5159 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5160 	}
5161 }
5162 
5163 static void
5164 hn_destroy_tx_data(struct hn_softc *sc)
5165 {
5166 	int i;
5167 
5168 	if (sc->hn_chim != NULL) {
5169 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5170 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5171 		} else {
5172 			device_printf(sc->hn_dev,
5173 			    "chimney sending buffer is referenced");
5174 		}
5175 		sc->hn_chim = NULL;
5176 	}
5177 
5178 	if (sc->hn_tx_ring_cnt == 0)
5179 		return;
5180 
5181 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5182 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5183 
5184 	free(sc->hn_tx_ring, M_DEVBUF);
5185 	sc->hn_tx_ring = NULL;
5186 
5187 	sc->hn_tx_ring_cnt = 0;
5188 	sc->hn_tx_ring_inuse = 0;
5189 }
5190 
5191 #ifdef HN_IFSTART_SUPPORT
5192 
5193 static void
5194 hn_start_taskfunc(void *xtxr, int pending __unused)
5195 {
5196 	struct hn_tx_ring *txr = xtxr;
5197 
5198 	mtx_lock(&txr->hn_tx_lock);
5199 	hn_start_locked(txr, 0);
5200 	mtx_unlock(&txr->hn_tx_lock);
5201 }
5202 
5203 static int
5204 hn_start_locked(struct hn_tx_ring *txr, int len)
5205 {
5206 	struct hn_softc *sc = txr->hn_sc;
5207 	struct ifnet *ifp = sc->hn_ifp;
5208 	int sched = 0;
5209 
5210 	KASSERT(hn_use_if_start,
5211 	    ("hn_start_locked is called, when if_start is disabled"));
5212 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5213 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5214 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5215 
5216 	if (__predict_false(txr->hn_suspended))
5217 		return (0);
5218 
5219 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5220 	    IFF_DRV_RUNNING)
5221 		return (0);
5222 
5223 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5224 		struct hn_txdesc *txd;
5225 		struct mbuf *m_head;
5226 		int error;
5227 
5228 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5229 		if (m_head == NULL)
5230 			break;
5231 
5232 		if (len > 0 && m_head->m_pkthdr.len > len) {
5233 			/*
5234 			 * This sending could be time consuming; let callers
5235 			 * dispatch this packet sending (and sending of any
5236 			 * following up packets) to tx taskqueue.
5237 			 */
5238 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5239 			sched = 1;
5240 			break;
5241 		}
5242 
5243 #if defined(INET6) || defined(INET)
5244 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5245 			m_head = hn_tso_fixup(m_head);
5246 			if (__predict_false(m_head == NULL)) {
5247 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5248 				continue;
5249 			}
5250 		}
5251 #endif
5252 
5253 		txd = hn_txdesc_get(txr);
5254 		if (txd == NULL) {
5255 			txr->hn_no_txdescs++;
5256 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5257 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5258 			break;
5259 		}
5260 
5261 		error = hn_encap(ifp, txr, txd, &m_head);
5262 		if (error) {
5263 			/* Both txd and m_head are freed */
5264 			KASSERT(txr->hn_agg_txd == NULL,
5265 			    ("encap failed w/ pending aggregating txdesc"));
5266 			continue;
5267 		}
5268 
5269 		if (txr->hn_agg_pktleft == 0) {
5270 			if (txr->hn_agg_txd != NULL) {
5271 				KASSERT(m_head == NULL,
5272 				    ("pending mbuf for aggregating txdesc"));
5273 				error = hn_flush_txagg(ifp, txr);
5274 				if (__predict_false(error)) {
5275 					atomic_set_int(&ifp->if_drv_flags,
5276 					    IFF_DRV_OACTIVE);
5277 					break;
5278 				}
5279 			} else {
5280 				KASSERT(m_head != NULL, ("mbuf was freed"));
5281 				error = hn_txpkt(ifp, txr, txd);
5282 				if (__predict_false(error)) {
5283 					/* txd is freed, but m_head is not */
5284 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5285 					atomic_set_int(&ifp->if_drv_flags,
5286 					    IFF_DRV_OACTIVE);
5287 					break;
5288 				}
5289 			}
5290 		}
5291 #ifdef INVARIANTS
5292 		else {
5293 			KASSERT(txr->hn_agg_txd != NULL,
5294 			    ("no aggregating txdesc"));
5295 			KASSERT(m_head == NULL,
5296 			    ("pending mbuf for aggregating txdesc"));
5297 		}
5298 #endif
5299 	}
5300 
5301 	/* Flush pending aggerated transmission. */
5302 	if (txr->hn_agg_txd != NULL)
5303 		hn_flush_txagg(ifp, txr);
5304 	return (sched);
5305 }
5306 
5307 static void
5308 hn_start(struct ifnet *ifp)
5309 {
5310 	struct hn_softc *sc = ifp->if_softc;
5311 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5312 
5313 	if (txr->hn_sched_tx)
5314 		goto do_sched;
5315 
5316 	if (mtx_trylock(&txr->hn_tx_lock)) {
5317 		int sched;
5318 
5319 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5320 		mtx_unlock(&txr->hn_tx_lock);
5321 		if (!sched)
5322 			return;
5323 	}
5324 do_sched:
5325 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5326 }
5327 
5328 static void
5329 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5330 {
5331 	struct hn_tx_ring *txr = xtxr;
5332 
5333 	mtx_lock(&txr->hn_tx_lock);
5334 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5335 	hn_start_locked(txr, 0);
5336 	mtx_unlock(&txr->hn_tx_lock);
5337 }
5338 
5339 static void
5340 hn_start_txeof(struct hn_tx_ring *txr)
5341 {
5342 	struct hn_softc *sc = txr->hn_sc;
5343 	struct ifnet *ifp = sc->hn_ifp;
5344 
5345 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5346 
5347 	if (txr->hn_sched_tx)
5348 		goto do_sched;
5349 
5350 	if (mtx_trylock(&txr->hn_tx_lock)) {
5351 		int sched;
5352 
5353 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5354 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5355 		mtx_unlock(&txr->hn_tx_lock);
5356 		if (sched) {
5357 			taskqueue_enqueue(txr->hn_tx_taskq,
5358 			    &txr->hn_tx_task);
5359 		}
5360 	} else {
5361 do_sched:
5362 		/*
5363 		 * Release the OACTIVE earlier, with the hope, that
5364 		 * others could catch up.  The task will clear the
5365 		 * flag again with the hn_tx_lock to avoid possible
5366 		 * races.
5367 		 */
5368 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5369 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5370 	}
5371 }
5372 
5373 #endif	/* HN_IFSTART_SUPPORT */
5374 
5375 static int
5376 hn_xmit(struct hn_tx_ring *txr, int len)
5377 {
5378 	struct hn_softc *sc = txr->hn_sc;
5379 	struct ifnet *ifp = sc->hn_ifp;
5380 	struct mbuf *m_head;
5381 	int sched = 0;
5382 
5383 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5384 #ifdef HN_IFSTART_SUPPORT
5385 	KASSERT(hn_use_if_start == 0,
5386 	    ("hn_xmit is called, when if_start is enabled"));
5387 #endif
5388 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5389 
5390 	if (__predict_false(txr->hn_suspended))
5391 		return (0);
5392 
5393 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5394 		return (0);
5395 
5396 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5397 		struct hn_txdesc *txd;
5398 		int error;
5399 
5400 		if (len > 0 && m_head->m_pkthdr.len > len) {
5401 			/*
5402 			 * This sending could be time consuming; let callers
5403 			 * dispatch this packet sending (and sending of any
5404 			 * following up packets) to tx taskqueue.
5405 			 */
5406 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5407 			sched = 1;
5408 			break;
5409 		}
5410 
5411 		txd = hn_txdesc_get(txr);
5412 		if (txd == NULL) {
5413 			txr->hn_no_txdescs++;
5414 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5415 			txr->hn_oactive = 1;
5416 			break;
5417 		}
5418 
5419 		error = hn_encap(ifp, txr, txd, &m_head);
5420 		if (error) {
5421 			/* Both txd and m_head are freed; discard */
5422 			KASSERT(txr->hn_agg_txd == NULL,
5423 			    ("encap failed w/ pending aggregating txdesc"));
5424 			drbr_advance(ifp, txr->hn_mbuf_br);
5425 			continue;
5426 		}
5427 
5428 		if (txr->hn_agg_pktleft == 0) {
5429 			if (txr->hn_agg_txd != NULL) {
5430 				KASSERT(m_head == NULL,
5431 				    ("pending mbuf for aggregating txdesc"));
5432 				error = hn_flush_txagg(ifp, txr);
5433 				if (__predict_false(error)) {
5434 					txr->hn_oactive = 1;
5435 					break;
5436 				}
5437 			} else {
5438 				KASSERT(m_head != NULL, ("mbuf was freed"));
5439 				error = hn_txpkt(ifp, txr, txd);
5440 				if (__predict_false(error)) {
5441 					/* txd is freed, but m_head is not */
5442 					drbr_putback(ifp, txr->hn_mbuf_br,
5443 					    m_head);
5444 					txr->hn_oactive = 1;
5445 					break;
5446 				}
5447 			}
5448 		}
5449 #ifdef INVARIANTS
5450 		else {
5451 			KASSERT(txr->hn_agg_txd != NULL,
5452 			    ("no aggregating txdesc"));
5453 			KASSERT(m_head == NULL,
5454 			    ("pending mbuf for aggregating txdesc"));
5455 		}
5456 #endif
5457 
5458 		/* Sent */
5459 		drbr_advance(ifp, txr->hn_mbuf_br);
5460 	}
5461 
5462 	/* Flush pending aggerated transmission. */
5463 	if (txr->hn_agg_txd != NULL)
5464 		hn_flush_txagg(ifp, txr);
5465 	return (sched);
5466 }
5467 
5468 static int
5469 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5470 {
5471 	struct hn_softc *sc = ifp->if_softc;
5472 	struct hn_tx_ring *txr;
5473 	int error, idx = 0;
5474 
5475 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5476 		struct rm_priotracker pt;
5477 
5478 		rm_rlock(&sc->hn_vf_lock, &pt);
5479 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5480 			struct mbuf *m_bpf = NULL;
5481 			int obytes, omcast;
5482 
5483 			obytes = m->m_pkthdr.len;
5484 			if (m->m_flags & M_MCAST)
5485 				omcast = 1;
5486 
5487 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5488 				if (bpf_peers_present(ifp->if_bpf)) {
5489 					m_bpf = m_copypacket(m, M_NOWAIT);
5490 					if (m_bpf == NULL) {
5491 						/*
5492 						 * Failed to grab a shallow
5493 						 * copy; tap now.
5494 						 */
5495 						ETHER_BPF_MTAP(ifp, m);
5496 					}
5497 				}
5498 			} else {
5499 				ETHER_BPF_MTAP(ifp, m);
5500 			}
5501 
5502 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5503 			rm_runlock(&sc->hn_vf_lock, &pt);
5504 
5505 			if (m_bpf != NULL) {
5506 				if (!error)
5507 					ETHER_BPF_MTAP(ifp, m_bpf);
5508 				m_freem(m_bpf);
5509 			}
5510 
5511 			if (error == ENOBUFS) {
5512 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5513 			} else if (error) {
5514 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5515 			} else {
5516 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5517 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5518 				if (omcast) {
5519 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5520 					    omcast);
5521 				}
5522 			}
5523 			return (error);
5524 		}
5525 		rm_runlock(&sc->hn_vf_lock, &pt);
5526 	}
5527 
5528 #if defined(INET6) || defined(INET)
5529 	/*
5530 	 * Perform TSO packet header fixup now, since the TSO
5531 	 * packet header should be cache-hot.
5532 	 */
5533 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5534 		m = hn_tso_fixup(m);
5535 		if (__predict_false(m == NULL)) {
5536 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5537 			return EIO;
5538 		}
5539 	}
5540 #endif
5541 
5542 	/*
5543 	 * Select the TX ring based on flowid
5544 	 */
5545 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5546 #ifdef RSS
5547 		uint32_t bid;
5548 
5549 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
5550 		    &bid) == 0)
5551 			idx = bid % sc->hn_tx_ring_inuse;
5552 		else
5553 #endif
5554 		{
5555 #if defined(INET6) || defined(INET)
5556 			int tcpsyn = 0;
5557 
5558 			if (m->m_pkthdr.len < 128 &&
5559 			    (m->m_pkthdr.csum_flags &
5560 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5561 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5562 				m = hn_check_tcpsyn(m, &tcpsyn);
5563 				if (__predict_false(m == NULL)) {
5564 					if_inc_counter(ifp,
5565 					    IFCOUNTER_OERRORS, 1);
5566 					return (EIO);
5567 				}
5568 			}
5569 #else
5570 			const int tcpsyn = 0;
5571 #endif
5572 			if (tcpsyn)
5573 				idx = 0;
5574 			else
5575 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5576 		}
5577 	}
5578 	txr = &sc->hn_tx_ring[idx];
5579 
5580 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5581 	if (error) {
5582 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5583 		return error;
5584 	}
5585 
5586 	if (txr->hn_oactive)
5587 		return 0;
5588 
5589 	if (txr->hn_sched_tx)
5590 		goto do_sched;
5591 
5592 	if (mtx_trylock(&txr->hn_tx_lock)) {
5593 		int sched;
5594 
5595 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5596 		mtx_unlock(&txr->hn_tx_lock);
5597 		if (!sched)
5598 			return 0;
5599 	}
5600 do_sched:
5601 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5602 	return 0;
5603 }
5604 
5605 static void
5606 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5607 {
5608 	struct mbuf *m;
5609 
5610 	mtx_lock(&txr->hn_tx_lock);
5611 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5612 		m_freem(m);
5613 	mtx_unlock(&txr->hn_tx_lock);
5614 }
5615 
5616 static void
5617 hn_xmit_qflush(struct ifnet *ifp)
5618 {
5619 	struct hn_softc *sc = ifp->if_softc;
5620 	struct rm_priotracker pt;
5621 	int i;
5622 
5623 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5624 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5625 	if_qflush(ifp);
5626 
5627 	rm_rlock(&sc->hn_vf_lock, &pt);
5628 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5629 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5630 	rm_runlock(&sc->hn_vf_lock, &pt);
5631 }
5632 
5633 static void
5634 hn_xmit_txeof(struct hn_tx_ring *txr)
5635 {
5636 
5637 	if (txr->hn_sched_tx)
5638 		goto do_sched;
5639 
5640 	if (mtx_trylock(&txr->hn_tx_lock)) {
5641 		int sched;
5642 
5643 		txr->hn_oactive = 0;
5644 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5645 		mtx_unlock(&txr->hn_tx_lock);
5646 		if (sched) {
5647 			taskqueue_enqueue(txr->hn_tx_taskq,
5648 			    &txr->hn_tx_task);
5649 		}
5650 	} else {
5651 do_sched:
5652 		/*
5653 		 * Release the oactive earlier, with the hope, that
5654 		 * others could catch up.  The task will clear the
5655 		 * oactive again with the hn_tx_lock to avoid possible
5656 		 * races.
5657 		 */
5658 		txr->hn_oactive = 0;
5659 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5660 	}
5661 }
5662 
5663 static void
5664 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5665 {
5666 	struct hn_tx_ring *txr = xtxr;
5667 
5668 	mtx_lock(&txr->hn_tx_lock);
5669 	hn_xmit(txr, 0);
5670 	mtx_unlock(&txr->hn_tx_lock);
5671 }
5672 
5673 static void
5674 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5675 {
5676 	struct hn_tx_ring *txr = xtxr;
5677 
5678 	mtx_lock(&txr->hn_tx_lock);
5679 	txr->hn_oactive = 0;
5680 	hn_xmit(txr, 0);
5681 	mtx_unlock(&txr->hn_tx_lock);
5682 }
5683 
5684 static int
5685 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5686 {
5687 	struct vmbus_chan_br cbr;
5688 	struct hn_rx_ring *rxr;
5689 	struct hn_tx_ring *txr = NULL;
5690 	int idx, error;
5691 
5692 	idx = vmbus_chan_subidx(chan);
5693 
5694 	/*
5695 	 * Link this channel to RX/TX ring.
5696 	 */
5697 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5698 	    ("invalid channel index %d, should > 0 && < %d",
5699 	     idx, sc->hn_rx_ring_inuse));
5700 	rxr = &sc->hn_rx_ring[idx];
5701 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5702 	    ("RX ring %d already attached", idx));
5703 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5704 	rxr->hn_chan = chan;
5705 
5706 	if (bootverbose) {
5707 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
5708 		    idx, vmbus_chan_id(chan));
5709 	}
5710 
5711 	if (idx < sc->hn_tx_ring_inuse) {
5712 		txr = &sc->hn_tx_ring[idx];
5713 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
5714 		    ("TX ring %d already attached", idx));
5715 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
5716 
5717 		txr->hn_chan = chan;
5718 		if (bootverbose) {
5719 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
5720 			    idx, vmbus_chan_id(chan));
5721 		}
5722 	}
5723 
5724 	/* Bind this channel to a proper CPU. */
5725 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
5726 
5727 	/*
5728 	 * Open this channel
5729 	 */
5730 	cbr.cbr = rxr->hn_br;
5731 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
5732 	cbr.cbr_txsz = HN_TXBR_SIZE;
5733 	cbr.cbr_rxsz = HN_RXBR_SIZE;
5734 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
5735 	if (error) {
5736 		if (error == EISCONN) {
5737 			if_printf(sc->hn_ifp, "bufring is connected after "
5738 			    "chan%u open failure\n", vmbus_chan_id(chan));
5739 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5740 		} else {
5741 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
5742 			    vmbus_chan_id(chan), error);
5743 		}
5744 	}
5745 	return (error);
5746 }
5747 
5748 static void
5749 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
5750 {
5751 	struct hn_rx_ring *rxr;
5752 	int idx, error;
5753 
5754 	idx = vmbus_chan_subidx(chan);
5755 
5756 	/*
5757 	 * Link this channel to RX/TX ring.
5758 	 */
5759 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5760 	    ("invalid channel index %d, should > 0 && < %d",
5761 	     idx, sc->hn_rx_ring_inuse));
5762 	rxr = &sc->hn_rx_ring[idx];
5763 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
5764 	    ("RX ring %d is not attached", idx));
5765 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
5766 
5767 	if (idx < sc->hn_tx_ring_inuse) {
5768 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
5769 
5770 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
5771 		    ("TX ring %d is not attached attached", idx));
5772 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
5773 	}
5774 
5775 	/*
5776 	 * Close this channel.
5777 	 *
5778 	 * NOTE:
5779 	 * Channel closing does _not_ destroy the target channel.
5780 	 */
5781 	error = vmbus_chan_close_direct(chan);
5782 	if (error == EISCONN) {
5783 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
5784 		    "after being closed\n", vmbus_chan_id(chan));
5785 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5786 	} else if (error) {
5787 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
5788 		    vmbus_chan_id(chan), error);
5789 	}
5790 }
5791 
5792 static int
5793 hn_attach_subchans(struct hn_softc *sc)
5794 {
5795 	struct vmbus_channel **subchans;
5796 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5797 	int i, error = 0;
5798 
5799 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
5800 
5801 	/* Attach the sub-channels. */
5802 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5803 	for (i = 0; i < subchan_cnt; ++i) {
5804 		int error1;
5805 
5806 		error1 = hn_chan_attach(sc, subchans[i]);
5807 		if (error1) {
5808 			error = error1;
5809 			/* Move on; all channels will be detached later. */
5810 		}
5811 	}
5812 	vmbus_subchan_rel(subchans, subchan_cnt);
5813 
5814 	if (error) {
5815 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
5816 	} else {
5817 		if (bootverbose) {
5818 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
5819 			    subchan_cnt);
5820 		}
5821 	}
5822 	return (error);
5823 }
5824 
5825 static void
5826 hn_detach_allchans(struct hn_softc *sc)
5827 {
5828 	struct vmbus_channel **subchans;
5829 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5830 	int i;
5831 
5832 	if (subchan_cnt == 0)
5833 		goto back;
5834 
5835 	/* Detach the sub-channels. */
5836 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5837 	for (i = 0; i < subchan_cnt; ++i)
5838 		hn_chan_detach(sc, subchans[i]);
5839 	vmbus_subchan_rel(subchans, subchan_cnt);
5840 
5841 back:
5842 	/*
5843 	 * Detach the primary channel, _after_ all sub-channels
5844 	 * are detached.
5845 	 */
5846 	hn_chan_detach(sc, sc->hn_prichan);
5847 
5848 	/* Wait for sub-channels to be destroyed, if any. */
5849 	vmbus_subchan_drain(sc->hn_prichan);
5850 
5851 #ifdef INVARIANTS
5852 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5853 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
5854 		    HN_RX_FLAG_ATTACHED) == 0,
5855 		    ("%dth RX ring is still attached", i));
5856 	}
5857 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5858 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
5859 		    HN_TX_FLAG_ATTACHED) == 0,
5860 		    ("%dth TX ring is still attached", i));
5861 	}
5862 #endif
5863 }
5864 
5865 static int
5866 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
5867 {
5868 	struct vmbus_channel **subchans;
5869 	int nchan, rxr_cnt, error;
5870 
5871 	nchan = *nsubch + 1;
5872 	if (nchan == 1) {
5873 		/*
5874 		 * Multiple RX/TX rings are not requested.
5875 		 */
5876 		*nsubch = 0;
5877 		return (0);
5878 	}
5879 
5880 	/*
5881 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
5882 	 * table entries.
5883 	 */
5884 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
5885 	if (error) {
5886 		/* No RSS; this is benign. */
5887 		*nsubch = 0;
5888 		return (0);
5889 	}
5890 	if (bootverbose) {
5891 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
5892 		    rxr_cnt, nchan);
5893 	}
5894 
5895 	if (nchan > rxr_cnt)
5896 		nchan = rxr_cnt;
5897 	if (nchan == 1) {
5898 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
5899 		*nsubch = 0;
5900 		return (0);
5901 	}
5902 
5903 	/*
5904 	 * Allocate sub-channels from NVS.
5905 	 */
5906 	*nsubch = nchan - 1;
5907 	error = hn_nvs_alloc_subchans(sc, nsubch);
5908 	if (error || *nsubch == 0) {
5909 		/* Failed to allocate sub-channels. */
5910 		*nsubch = 0;
5911 		return (0);
5912 	}
5913 
5914 	/*
5915 	 * Wait for all sub-channels to become ready before moving on.
5916 	 */
5917 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5918 	vmbus_subchan_rel(subchans, *nsubch);
5919 	return (0);
5920 }
5921 
5922 static bool
5923 hn_synth_attachable(const struct hn_softc *sc)
5924 {
5925 	int i;
5926 
5927 	if (sc->hn_flags & HN_FLAG_ERRORS)
5928 		return (false);
5929 
5930 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5931 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5932 
5933 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5934 			return (false);
5935 	}
5936 	return (true);
5937 }
5938 
5939 /*
5940  * Make sure that the RX filter is zero after the successful
5941  * RNDIS initialization.
5942  *
5943  * NOTE:
5944  * Under certain conditions on certain versions of Hyper-V,
5945  * the RNDIS rxfilter is _not_ zero on the hypervisor side
5946  * after the successful RNDIS initialization, which breaks
5947  * the assumption of any following code (well, it breaks the
5948  * RNDIS API contract actually).  Clear the RNDIS rxfilter
5949  * explicitly, drain packets sneaking through, and drain the
5950  * interrupt taskqueues scheduled due to the stealth packets.
5951  */
5952 static void
5953 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5954 {
5955 
5956 	hn_disable_rx(sc);
5957 	hn_drain_rxtx(sc, nchan);
5958 }
5959 
5960 static int
5961 hn_synth_attach(struct hn_softc *sc, int mtu)
5962 {
5963 #define ATTACHED_NVS		0x0002
5964 #define ATTACHED_RNDIS		0x0004
5965 
5966 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5967 	int error, nsubch, nchan = 1, i, rndis_inited;
5968 	uint32_t old_caps, attached = 0;
5969 
5970 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5971 	    ("synthetic parts were attached"));
5972 
5973 	if (!hn_synth_attachable(sc))
5974 		return (ENXIO);
5975 
5976 	/* Save capabilities for later verification. */
5977 	old_caps = sc->hn_caps;
5978 	sc->hn_caps = 0;
5979 
5980 	/* Clear RSS stuffs. */
5981 	sc->hn_rss_ind_size = 0;
5982 	sc->hn_rss_hash = 0;
5983 
5984 	/*
5985 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
5986 	 */
5987 	error = hn_chan_attach(sc, sc->hn_prichan);
5988 	if (error)
5989 		goto failed;
5990 
5991 	/*
5992 	 * Attach NVS.
5993 	 */
5994 	error = hn_nvs_attach(sc, mtu);
5995 	if (error)
5996 		goto failed;
5997 	attached |= ATTACHED_NVS;
5998 
5999 	/*
6000 	 * Attach RNDIS _after_ NVS is attached.
6001 	 */
6002 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6003 	if (rndis_inited)
6004 		attached |= ATTACHED_RNDIS;
6005 	if (error)
6006 		goto failed;
6007 
6008 	/*
6009 	 * Make sure capabilities are not changed.
6010 	 */
6011 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6012 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6013 		    old_caps, sc->hn_caps);
6014 		error = ENXIO;
6015 		goto failed;
6016 	}
6017 
6018 	/*
6019 	 * Allocate sub-channels for multi-TX/RX rings.
6020 	 *
6021 	 * NOTE:
6022 	 * The # of RX rings that can be used is equivalent to the # of
6023 	 * channels to be requested.
6024 	 */
6025 	nsubch = sc->hn_rx_ring_cnt - 1;
6026 	error = hn_synth_alloc_subchans(sc, &nsubch);
6027 	if (error)
6028 		goto failed;
6029 	/* NOTE: _Full_ synthetic parts detach is required now. */
6030 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6031 
6032 	/*
6033 	 * Set the # of TX/RX rings that could be used according to
6034 	 * the # of channels that NVS offered.
6035 	 */
6036 	nchan = nsubch + 1;
6037 	hn_set_ring_inuse(sc, nchan);
6038 	if (nchan == 1) {
6039 		/* Only the primary channel can be used; done */
6040 		goto back;
6041 	}
6042 
6043 	/*
6044 	 * Attach the sub-channels.
6045 	 *
6046 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6047 	 */
6048 	error = hn_attach_subchans(sc);
6049 	if (error)
6050 		goto failed;
6051 
6052 	/*
6053 	 * Configure RSS key and indirect table _after_ all sub-channels
6054 	 * are attached.
6055 	 */
6056 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6057 		/*
6058 		 * RSS key is not set yet; set it to the default RSS key.
6059 		 */
6060 		if (bootverbose)
6061 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6062 #ifdef RSS
6063 		rss_getkey(rss->rss_key);
6064 #else
6065 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6066 #endif
6067 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6068 	}
6069 
6070 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6071 		/*
6072 		 * RSS indirect table is not set yet; set it up in round-
6073 		 * robin fashion.
6074 		 */
6075 		if (bootverbose) {
6076 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6077 			    "table\n");
6078 		}
6079 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6080 			uint32_t subidx;
6081 
6082 #ifdef RSS
6083 			subidx = rss_get_indirection_to_bucket(i);
6084 #else
6085 			subidx = i;
6086 #endif
6087 			rss->rss_ind[i] = subidx % nchan;
6088 		}
6089 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6090 	} else {
6091 		/*
6092 		 * # of usable channels may be changed, so we have to
6093 		 * make sure that all entries in RSS indirect table
6094 		 * are valid.
6095 		 *
6096 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6097 		 */
6098 		hn_rss_ind_fixup(sc);
6099 	}
6100 
6101 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6102 	if (error)
6103 		goto failed;
6104 back:
6105 	/*
6106 	 * Fixup transmission aggregation setup.
6107 	 */
6108 	hn_set_txagg(sc);
6109 	hn_rndis_init_fixat(sc, nchan);
6110 	return (0);
6111 
6112 failed:
6113 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6114 		hn_rndis_init_fixat(sc, nchan);
6115 		hn_synth_detach(sc);
6116 	} else {
6117 		if (attached & ATTACHED_RNDIS) {
6118 			hn_rndis_init_fixat(sc, nchan);
6119 			hn_rndis_detach(sc);
6120 		}
6121 		if (attached & ATTACHED_NVS)
6122 			hn_nvs_detach(sc);
6123 		hn_chan_detach(sc, sc->hn_prichan);
6124 		/* Restore old capabilities. */
6125 		sc->hn_caps = old_caps;
6126 	}
6127 	return (error);
6128 
6129 #undef ATTACHED_RNDIS
6130 #undef ATTACHED_NVS
6131 }
6132 
6133 /*
6134  * NOTE:
6135  * The interface must have been suspended though hn_suspend(), before
6136  * this function get called.
6137  */
6138 static void
6139 hn_synth_detach(struct hn_softc *sc)
6140 {
6141 
6142 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6143 	    ("synthetic parts were not attached"));
6144 
6145 	/* Detach the RNDIS first. */
6146 	hn_rndis_detach(sc);
6147 
6148 	/* Detach NVS. */
6149 	hn_nvs_detach(sc);
6150 
6151 	/* Detach all of the channels. */
6152 	hn_detach_allchans(sc);
6153 
6154 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6155 }
6156 
6157 static void
6158 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6159 {
6160 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6161 	    ("invalid ring count %d", ring_cnt));
6162 
6163 	if (sc->hn_tx_ring_cnt > ring_cnt)
6164 		sc->hn_tx_ring_inuse = ring_cnt;
6165 	else
6166 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6167 	sc->hn_rx_ring_inuse = ring_cnt;
6168 
6169 #ifdef RSS
6170 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6171 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6172 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6173 		    rss_getnumbuckets());
6174 	}
6175 #endif
6176 
6177 	if (bootverbose) {
6178 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6179 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6180 	}
6181 }
6182 
6183 static void
6184 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6185 {
6186 
6187 	/*
6188 	 * NOTE:
6189 	 * The TX bufring will not be drained by the hypervisor,
6190 	 * if the primary channel is revoked.
6191 	 */
6192 	while (!vmbus_chan_rx_empty(chan) ||
6193 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6194 	     !vmbus_chan_tx_empty(chan)))
6195 		pause("waitch", 1);
6196 	vmbus_chan_intr_drain(chan);
6197 }
6198 
6199 static void
6200 hn_disable_rx(struct hn_softc *sc)
6201 {
6202 
6203 	/*
6204 	 * Disable RX by clearing RX filter forcefully.
6205 	 */
6206 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6207 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6208 
6209 	/*
6210 	 * Give RNDIS enough time to flush all pending data packets.
6211 	 */
6212 	pause("waitrx", (200 * hz) / 1000);
6213 }
6214 
6215 /*
6216  * NOTE:
6217  * RX/TX _must_ have been suspended/disabled, before this function
6218  * is called.
6219  */
6220 static void
6221 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6222 {
6223 	struct vmbus_channel **subch = NULL;
6224 	int nsubch;
6225 
6226 	/*
6227 	 * Drain RX/TX bufrings and interrupts.
6228 	 */
6229 	nsubch = nchan - 1;
6230 	if (nsubch > 0)
6231 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6232 
6233 	if (subch != NULL) {
6234 		int i;
6235 
6236 		for (i = 0; i < nsubch; ++i)
6237 			hn_chan_drain(sc, subch[i]);
6238 	}
6239 	hn_chan_drain(sc, sc->hn_prichan);
6240 
6241 	if (subch != NULL)
6242 		vmbus_subchan_rel(subch, nsubch);
6243 }
6244 
6245 static void
6246 hn_suspend_data(struct hn_softc *sc)
6247 {
6248 	struct hn_tx_ring *txr;
6249 	int i;
6250 
6251 	HN_LOCK_ASSERT(sc);
6252 
6253 	/*
6254 	 * Suspend TX.
6255 	 */
6256 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6257 		txr = &sc->hn_tx_ring[i];
6258 
6259 		mtx_lock(&txr->hn_tx_lock);
6260 		txr->hn_suspended = 1;
6261 		mtx_unlock(&txr->hn_tx_lock);
6262 		/* No one is able send more packets now. */
6263 
6264 		/*
6265 		 * Wait for all pending sends to finish.
6266 		 *
6267 		 * NOTE:
6268 		 * We will _not_ receive all pending send-done, if the
6269 		 * primary channel is revoked.
6270 		 */
6271 		while (hn_tx_ring_pending(txr) &&
6272 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6273 			pause("hnwtx", 1 /* 1 tick */);
6274 	}
6275 
6276 	/*
6277 	 * Disable RX.
6278 	 */
6279 	hn_disable_rx(sc);
6280 
6281 	/*
6282 	 * Drain RX/TX.
6283 	 */
6284 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6285 
6286 	/*
6287 	 * Drain any pending TX tasks.
6288 	 *
6289 	 * NOTE:
6290 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6291 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6292 	 */
6293 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6294 		txr = &sc->hn_tx_ring[i];
6295 
6296 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6297 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6298 	}
6299 }
6300 
6301 static void
6302 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6303 {
6304 
6305 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6306 }
6307 
6308 static void
6309 hn_suspend_mgmt(struct hn_softc *sc)
6310 {
6311 	struct task task;
6312 
6313 	HN_LOCK_ASSERT(sc);
6314 
6315 	/*
6316 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6317 	 * through hn_mgmt_taskq.
6318 	 */
6319 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6320 	vmbus_chan_run_task(sc->hn_prichan, &task);
6321 
6322 	/*
6323 	 * Make sure that all pending management tasks are completed.
6324 	 */
6325 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6326 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6327 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6328 }
6329 
6330 static void
6331 hn_suspend(struct hn_softc *sc)
6332 {
6333 
6334 	/* Disable polling. */
6335 	hn_polling(sc, 0);
6336 
6337 	/*
6338 	 * If the non-transparent mode VF is activated, the synthetic
6339 	 * device is receiving packets, so the data path of the
6340 	 * synthetic device must be suspended.
6341 	 */
6342 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6343 	    (sc->hn_flags & HN_FLAG_RXVF))
6344 		hn_suspend_data(sc);
6345 	hn_suspend_mgmt(sc);
6346 }
6347 
6348 static void
6349 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6350 {
6351 	int i;
6352 
6353 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6354 	    ("invalid TX ring count %d", tx_ring_cnt));
6355 
6356 	for (i = 0; i < tx_ring_cnt; ++i) {
6357 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6358 
6359 		mtx_lock(&txr->hn_tx_lock);
6360 		txr->hn_suspended = 0;
6361 		mtx_unlock(&txr->hn_tx_lock);
6362 	}
6363 }
6364 
6365 static void
6366 hn_resume_data(struct hn_softc *sc)
6367 {
6368 	int i;
6369 
6370 	HN_LOCK_ASSERT(sc);
6371 
6372 	/*
6373 	 * Re-enable RX.
6374 	 */
6375 	hn_rxfilter_config(sc);
6376 
6377 	/*
6378 	 * Make sure to clear suspend status on "all" TX rings,
6379 	 * since hn_tx_ring_inuse can be changed after
6380 	 * hn_suspend_data().
6381 	 */
6382 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6383 
6384 #ifdef HN_IFSTART_SUPPORT
6385 	if (!hn_use_if_start)
6386 #endif
6387 	{
6388 		/*
6389 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6390 		 * reduced.
6391 		 */
6392 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6393 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6394 	}
6395 
6396 	/*
6397 	 * Kick start TX.
6398 	 */
6399 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6400 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6401 
6402 		/*
6403 		 * Use txeof task, so that any pending oactive can be
6404 		 * cleared properly.
6405 		 */
6406 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6407 	}
6408 }
6409 
6410 static void
6411 hn_resume_mgmt(struct hn_softc *sc)
6412 {
6413 
6414 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6415 
6416 	/*
6417 	 * Kick off network change detection, if it was pending.
6418 	 * If no network change was pending, start link status
6419 	 * checks, which is more lightweight than network change
6420 	 * detection.
6421 	 */
6422 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6423 		hn_change_network(sc);
6424 	else
6425 		hn_update_link_status(sc);
6426 }
6427 
6428 static void
6429 hn_resume(struct hn_softc *sc)
6430 {
6431 
6432 	/*
6433 	 * If the non-transparent mode VF is activated, the synthetic
6434 	 * device have to receive packets, so the data path of the
6435 	 * synthetic device must be resumed.
6436 	 */
6437 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6438 	    (sc->hn_flags & HN_FLAG_RXVF))
6439 		hn_resume_data(sc);
6440 
6441 	/*
6442 	 * Don't resume link status change if VF is attached/activated.
6443 	 * - In the non-transparent VF mode, the synthetic device marks
6444 	 *   link down until the VF is deactivated; i.e. VF is down.
6445 	 * - In transparent VF mode, VF's media status is used until
6446 	 *   the VF is detached.
6447 	 */
6448 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6449 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6450 		hn_resume_mgmt(sc);
6451 
6452 	/*
6453 	 * Re-enable polling if this interface is running and
6454 	 * the polling is requested.
6455 	 */
6456 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6457 		hn_polling(sc, sc->hn_pollhz);
6458 }
6459 
6460 static void
6461 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6462 {
6463 	const struct rndis_status_msg *msg;
6464 	int ofs;
6465 
6466 	if (dlen < sizeof(*msg)) {
6467 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6468 		return;
6469 	}
6470 	msg = data;
6471 
6472 	switch (msg->rm_status) {
6473 	case RNDIS_STATUS_MEDIA_CONNECT:
6474 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6475 		hn_update_link_status(sc);
6476 		break;
6477 
6478 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6479 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6480 		/* Not really useful; ignore. */
6481 		break;
6482 
6483 	case RNDIS_STATUS_NETWORK_CHANGE:
6484 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6485 		if (dlen < ofs + msg->rm_stbuflen ||
6486 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6487 			if_printf(sc->hn_ifp, "network changed\n");
6488 		} else {
6489 			uint32_t change;
6490 
6491 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6492 			    sizeof(change));
6493 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6494 			    change);
6495 		}
6496 		hn_change_network(sc);
6497 		break;
6498 
6499 	default:
6500 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6501 		    msg->rm_status);
6502 		break;
6503 	}
6504 }
6505 
6506 static int
6507 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6508 {
6509 	const struct rndis_pktinfo *pi = info_data;
6510 	uint32_t mask = 0;
6511 
6512 	while (info_dlen != 0) {
6513 		const void *data;
6514 		uint32_t dlen;
6515 
6516 		if (__predict_false(info_dlen < sizeof(*pi)))
6517 			return (EINVAL);
6518 		if (__predict_false(info_dlen < pi->rm_size))
6519 			return (EINVAL);
6520 		info_dlen -= pi->rm_size;
6521 
6522 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6523 			return (EINVAL);
6524 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6525 			return (EINVAL);
6526 		dlen = pi->rm_size - pi->rm_pktinfooffset;
6527 		data = pi->rm_data;
6528 
6529 		switch (pi->rm_type) {
6530 		case NDIS_PKTINFO_TYPE_VLAN:
6531 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6532 				return (EINVAL);
6533 			info->vlan_info = *((const uint32_t *)data);
6534 			mask |= HN_RXINFO_VLAN;
6535 			break;
6536 
6537 		case NDIS_PKTINFO_TYPE_CSUM:
6538 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6539 				return (EINVAL);
6540 			info->csum_info = *((const uint32_t *)data);
6541 			mask |= HN_RXINFO_CSUM;
6542 			break;
6543 
6544 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6545 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6546 				return (EINVAL);
6547 			info->hash_value = *((const uint32_t *)data);
6548 			mask |= HN_RXINFO_HASHVAL;
6549 			break;
6550 
6551 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
6552 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6553 				return (EINVAL);
6554 			info->hash_info = *((const uint32_t *)data);
6555 			mask |= HN_RXINFO_HASHINF;
6556 			break;
6557 
6558 		default:
6559 			goto next;
6560 		}
6561 
6562 		if (mask == HN_RXINFO_ALL) {
6563 			/* All found; done */
6564 			break;
6565 		}
6566 next:
6567 		pi = (const struct rndis_pktinfo *)
6568 		    ((const uint8_t *)pi + pi->rm_size);
6569 	}
6570 
6571 	/*
6572 	 * Final fixup.
6573 	 * - If there is no hash value, invalidate the hash info.
6574 	 */
6575 	if ((mask & HN_RXINFO_HASHVAL) == 0)
6576 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6577 	return (0);
6578 }
6579 
6580 static __inline bool
6581 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6582 {
6583 
6584 	if (off < check_off) {
6585 		if (__predict_true(off + len <= check_off))
6586 			return (false);
6587 	} else if (off > check_off) {
6588 		if (__predict_true(check_off + check_len <= off))
6589 			return (false);
6590 	}
6591 	return (true);
6592 }
6593 
6594 static void
6595 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6596 {
6597 	const struct rndis_packet_msg *pkt;
6598 	struct hn_rxinfo info;
6599 	int data_off, pktinfo_off, data_len, pktinfo_len;
6600 
6601 	/*
6602 	 * Check length.
6603 	 */
6604 	if (__predict_false(dlen < sizeof(*pkt))) {
6605 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6606 		return;
6607 	}
6608 	pkt = data;
6609 
6610 	if (__predict_false(dlen < pkt->rm_len)) {
6611 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6612 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6613 		return;
6614 	}
6615 	if (__predict_false(pkt->rm_len <
6616 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6617 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6618 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
6619 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6620 		    pkt->rm_pktinfolen);
6621 		return;
6622 	}
6623 	if (__predict_false(pkt->rm_datalen == 0)) {
6624 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6625 		return;
6626 	}
6627 
6628 	/*
6629 	 * Check offests.
6630 	 */
6631 #define IS_OFFSET_INVALID(ofs)			\
6632 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
6633 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6634 
6635 	/* XXX Hyper-V does not meet data offset alignment requirement */
6636 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6637 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6638 		    "data offset %u\n", pkt->rm_dataoffset);
6639 		return;
6640 	}
6641 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6642 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6643 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6644 		    "oob offset %u\n", pkt->rm_oobdataoffset);
6645 		return;
6646 	}
6647 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6648 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6649 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6650 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6651 		return;
6652 	}
6653 
6654 #undef IS_OFFSET_INVALID
6655 
6656 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6657 	data_len = pkt->rm_datalen;
6658 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6659 	pktinfo_len = pkt->rm_pktinfolen;
6660 
6661 	/*
6662 	 * Check OOB coverage.
6663 	 */
6664 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
6665 		int oob_off, oob_len;
6666 
6667 		if_printf(rxr->hn_ifp, "got oobdata\n");
6668 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6669 		oob_len = pkt->rm_oobdatalen;
6670 
6671 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6672 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6673 			    "oob overflow, msglen %u, oob abs %d len %d\n",
6674 			    pkt->rm_len, oob_off, oob_len);
6675 			return;
6676 		}
6677 
6678 		/*
6679 		 * Check against data.
6680 		 */
6681 		if (hn_rndis_check_overlap(oob_off, oob_len,
6682 		    data_off, data_len)) {
6683 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6684 			    "oob overlaps data, oob abs %d len %d, "
6685 			    "data abs %d len %d\n",
6686 			    oob_off, oob_len, data_off, data_len);
6687 			return;
6688 		}
6689 
6690 		/*
6691 		 * Check against pktinfo.
6692 		 */
6693 		if (pktinfo_len != 0 &&
6694 		    hn_rndis_check_overlap(oob_off, oob_len,
6695 		    pktinfo_off, pktinfo_len)) {
6696 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6697 			    "oob overlaps pktinfo, oob abs %d len %d, "
6698 			    "pktinfo abs %d len %d\n",
6699 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
6700 			return;
6701 		}
6702 	}
6703 
6704 	/*
6705 	 * Check per-packet-info coverage and find useful per-packet-info.
6706 	 */
6707 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
6708 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
6709 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
6710 	if (__predict_true(pktinfo_len != 0)) {
6711 		bool overlap;
6712 		int error;
6713 
6714 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
6715 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6716 			    "pktinfo overflow, msglen %u, "
6717 			    "pktinfo abs %d len %d\n",
6718 			    pkt->rm_len, pktinfo_off, pktinfo_len);
6719 			return;
6720 		}
6721 
6722 		/*
6723 		 * Check packet info coverage.
6724 		 */
6725 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
6726 		    data_off, data_len);
6727 		if (__predict_false(overlap)) {
6728 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6729 			    "pktinfo overlap data, pktinfo abs %d len %d, "
6730 			    "data abs %d len %d\n",
6731 			    pktinfo_off, pktinfo_len, data_off, data_len);
6732 			return;
6733 		}
6734 
6735 		/*
6736 		 * Find useful per-packet-info.
6737 		 */
6738 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
6739 		    pktinfo_len, &info);
6740 		if (__predict_false(error)) {
6741 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
6742 			    "pktinfo\n");
6743 			return;
6744 		}
6745 	}
6746 
6747 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
6748 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6749 		    "data overflow, msglen %u, data abs %d len %d\n",
6750 		    pkt->rm_len, data_off, data_len);
6751 		return;
6752 	}
6753 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
6754 }
6755 
6756 static __inline void
6757 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
6758 {
6759 	const struct rndis_msghdr *hdr;
6760 
6761 	if (__predict_false(dlen < sizeof(*hdr))) {
6762 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
6763 		return;
6764 	}
6765 	hdr = data;
6766 
6767 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
6768 		/* Hot data path. */
6769 		hn_rndis_rx_data(rxr, data, dlen);
6770 		/* Done! */
6771 		return;
6772 	}
6773 
6774 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
6775 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
6776 	else
6777 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
6778 }
6779 
6780 static void
6781 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
6782 {
6783 	const struct hn_nvs_hdr *hdr;
6784 
6785 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
6786 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
6787 		return;
6788 	}
6789 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
6790 
6791 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
6792 		/* Useless; ignore */
6793 		return;
6794 	}
6795 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
6796 }
6797 
6798 static void
6799 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
6800     const struct vmbus_chanpkt_hdr *pkt)
6801 {
6802 	struct hn_nvs_sendctx *sndc;
6803 
6804 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
6805 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
6806 	    VMBUS_CHANPKT_DATALEN(pkt));
6807 	/*
6808 	 * NOTE:
6809 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
6810 	 * its callback.
6811 	 */
6812 }
6813 
6814 static void
6815 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6816     const struct vmbus_chanpkt_hdr *pkthdr)
6817 {
6818 	const struct vmbus_chanpkt_rxbuf *pkt;
6819 	const struct hn_nvs_hdr *nvs_hdr;
6820 	int count, i, hlen;
6821 
6822 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
6823 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
6824 		return;
6825 	}
6826 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
6827 
6828 	/* Make sure that this is a RNDIS message. */
6829 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
6830 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
6831 		    nvs_hdr->nvs_type);
6832 		return;
6833 	}
6834 
6835 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
6836 	if (__predict_false(hlen < sizeof(*pkt))) {
6837 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
6838 		return;
6839 	}
6840 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
6841 
6842 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
6843 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
6844 		    pkt->cp_rxbuf_id);
6845 		return;
6846 	}
6847 
6848 	count = pkt->cp_rxbuf_cnt;
6849 	if (__predict_false(hlen <
6850 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
6851 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
6852 		return;
6853 	}
6854 
6855 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
6856 	for (i = 0; i < count; ++i) {
6857 		int ofs, len;
6858 
6859 		ofs = pkt->cp_rxbuf[i].rb_ofs;
6860 		len = pkt->cp_rxbuf[i].rb_len;
6861 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
6862 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
6863 			    "ofs %d, len %d\n", i, ofs, len);
6864 			continue;
6865 		}
6866 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
6867 	}
6868 
6869 	/*
6870 	 * Ack the consumed RXBUF associated w/ this channel packet,
6871 	 * so that this RXBUF can be recycled by the hypervisor.
6872 	 */
6873 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
6874 }
6875 
6876 static void
6877 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6878     uint64_t tid)
6879 {
6880 	struct hn_nvs_rndis_ack ack;
6881 	int retries, error;
6882 
6883 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
6884 	ack.nvs_status = HN_NVS_STATUS_OK;
6885 
6886 	retries = 0;
6887 again:
6888 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
6889 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
6890 	if (__predict_false(error == EAGAIN)) {
6891 		/*
6892 		 * NOTE:
6893 		 * This should _not_ happen in real world, since the
6894 		 * consumption of the TX bufring from the TX path is
6895 		 * controlled.
6896 		 */
6897 		if (rxr->hn_ack_failed == 0)
6898 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
6899 		rxr->hn_ack_failed++;
6900 		retries++;
6901 		if (retries < 10) {
6902 			DELAY(100);
6903 			goto again;
6904 		}
6905 		/* RXBUF leaks! */
6906 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
6907 	}
6908 }
6909 
6910 static void
6911 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
6912 {
6913 	struct hn_rx_ring *rxr = xrxr;
6914 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
6915 
6916 	for (;;) {
6917 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
6918 		int error, pktlen;
6919 
6920 		pktlen = rxr->hn_pktbuf_len;
6921 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
6922 		if (__predict_false(error == ENOBUFS)) {
6923 			void *nbuf;
6924 			int nlen;
6925 
6926 			/*
6927 			 * Expand channel packet buffer.
6928 			 *
6929 			 * XXX
6930 			 * Use M_WAITOK here, since allocation failure
6931 			 * is fatal.
6932 			 */
6933 			nlen = rxr->hn_pktbuf_len * 2;
6934 			while (nlen < pktlen)
6935 				nlen *= 2;
6936 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6937 
6938 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6939 			    rxr->hn_pktbuf_len, nlen);
6940 
6941 			free(rxr->hn_pktbuf, M_DEVBUF);
6942 			rxr->hn_pktbuf = nbuf;
6943 			rxr->hn_pktbuf_len = nlen;
6944 			/* Retry! */
6945 			continue;
6946 		} else if (__predict_false(error == EAGAIN)) {
6947 			/* No more channel packets; done! */
6948 			break;
6949 		}
6950 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6951 
6952 		switch (pkt->cph_type) {
6953 		case VMBUS_CHANPKT_TYPE_COMP:
6954 			hn_nvs_handle_comp(sc, chan, pkt);
6955 			break;
6956 
6957 		case VMBUS_CHANPKT_TYPE_RXBUF:
6958 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
6959 			break;
6960 
6961 		case VMBUS_CHANPKT_TYPE_INBAND:
6962 			hn_nvs_handle_notify(sc, pkt);
6963 			break;
6964 
6965 		default:
6966 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6967 			    pkt->cph_type);
6968 			break;
6969 		}
6970 	}
6971 	hn_chan_rollup(rxr, rxr->hn_txr);
6972 }
6973 
6974 static void
6975 hn_sysinit(void *arg __unused)
6976 {
6977 	int i;
6978 
6979 #ifdef HN_IFSTART_SUPPORT
6980 	/*
6981 	 * Don't use ifnet.if_start if transparent VF mode is requested;
6982 	 * mainly due to the IFF_DRV_OACTIVE flag.
6983 	 */
6984 	if (hn_xpnt_vf && hn_use_if_start) {
6985 		hn_use_if_start = 0;
6986 		printf("hn: tranparent VF mode, if_transmit will be used, "
6987 		    "instead of if_start\n");
6988 	}
6989 #endif
6990 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
6991 		printf("hn: invalid transparent VF attach routing "
6992 		    "wait timeout %d, reset to %d\n",
6993 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
6994 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
6995 	}
6996 
6997 	/*
6998 	 * Initialize VF map.
6999 	 */
7000 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7001 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7002 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7003 	    M_WAITOK | M_ZERO);
7004 
7005 	/*
7006 	 * Fix the # of TX taskqueues.
7007 	 */
7008 	if (hn_tx_taskq_cnt <= 0)
7009 		hn_tx_taskq_cnt = 1;
7010 	else if (hn_tx_taskq_cnt > mp_ncpus)
7011 		hn_tx_taskq_cnt = mp_ncpus;
7012 
7013 	/*
7014 	 * Fix the TX taskqueue mode.
7015 	 */
7016 	switch (hn_tx_taskq_mode) {
7017 	case HN_TX_TASKQ_M_INDEP:
7018 	case HN_TX_TASKQ_M_GLOBAL:
7019 	case HN_TX_TASKQ_M_EVTTQ:
7020 		break;
7021 	default:
7022 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7023 		break;
7024 	}
7025 
7026 	if (vm_guest != VM_GUEST_HV)
7027 		return;
7028 
7029 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7030 		return;
7031 
7032 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7033 	    M_DEVBUF, M_WAITOK);
7034 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7035 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7036 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7037 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7038 		    "hn tx%d", i);
7039 	}
7040 }
7041 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7042 
7043 static void
7044 hn_sysuninit(void *arg __unused)
7045 {
7046 
7047 	if (hn_tx_taskque != NULL) {
7048 		int i;
7049 
7050 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7051 			taskqueue_free(hn_tx_taskque[i]);
7052 		free(hn_tx_taskque, M_DEVBUF);
7053 	}
7054 
7055 	if (hn_vfmap != NULL)
7056 		free(hn_vfmap, M_DEVBUF);
7057 	rm_destroy(&hn_vfmap_lock);
7058 }
7059 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7060