xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 52f72944b8f5abb2386eae924357dee8aea17d5b)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
/*
 * Build the if_start based TX path (hn_start() and friends) alongside
 * hn_transmit(); see the hw.hn.use_if_start tunable.
 */
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

/*
 * Size of the RNDIS packet message plus room for the per-packet-info
 * entries: hash value, VLAN, LSO2 and TX checksum.
 */
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

/*
 * Softc lock; an sx(9) lock named after the device.
 *
 * HN_LOCK() acquires it exclusively by polling sx_try_xlock() and
 * calling DELAY(1000) between attempts, instead of blocking in
 * sx_xlock().
 * NOTE(review): the reason for polling is not visible in this part of
 * the file -- confirm before converting it to a blocking acquire.
 */
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

/*
 * Checksum offload bits handled for IPv4 and IPv6; the *_HWASSIST()
 * variants report which of them are set in the first TX ring's
 * hn_csum_assist.
 */
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

/*
 * Packet sizes including the RNDIS packet message, rounded up to
 * 'align' (roundup2() requires a power of 2): the smallest VLAN tagged
 * Ethernet frame without CRC, and the size needed by mbuf chain 'm'.
 */
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

/*
 * Map a ring index to a CPU: through the RSS buckets if the kernel is
 * built with RSS, else round-robin starting from sc->hn_cpu.
 */
#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
191 
/*
 * TX descriptor; tracks one packet (or one aggregation of packets)
 * handed to the host.
 */
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	/* Only present if buf_ring is not used for TX descriptors. */
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	/* Passed to the NVS send routines along with the packet. */
	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index and size.  They are
	 * HN_NVS_CHIM_IDX_INVALID and 0, if the packet is sent through
	 * the GPA list; see hn_txpkt_sglist() and hn_txpkt_chim().
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

/* Values for hn_txdesc.flags. */
#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
220 
/*
 * Per-packet information of a received packet; filled in by
 * hn_rndis_rxinfo() and consumed by hn_rxpkt() (see their prototypes).
 */
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};
227 
/*
 * An RX ring paired with a VF ifnet.
 * NOTE(review): presumably the argument of hn_rxvf_set_task() --
 * confirm against its definition.
 */
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};
232 
/*
 * One bit per struct hn_rxinfo field; HN_RXINFO_ALL is all of them.
 * NOTE(review): presumably a bit is set once the corresponding field
 * has been extracted -- confirm in hn_rndis_rxinfo().
 */
#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

/* Markers for hn_rxinfo fields, which carry no valid information. */
#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
246 
/*
 * Forward declarations of the file-local functions.
 *
 * hn_probe(), hn_attach(), hn_detach() and hn_shutdown() are the
 * device interface methods listed in hn_methods[].  The functions
 * taking SYSCTL_HANDLER_ARGS are sysctl handlers.  The if_start
 * related ones exist only if HN_IFSTART_SUPPORT is defined.
 */
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
439 
/*
 * Global knobs under the hw.hn sysctl node.  The CTLFLAG_RDTUN ones
 * are read-only tunables; the CTLFLAG_RWTUN ones can also be changed
 * at runtime.
 */
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segments verification on host side. */
static int			hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segement verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagrams verification on host side. */
static int			hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packets verification on host side. */
static int			hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int			hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int			hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t		hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int			hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int			hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int			hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* # of TX taskqueues */
static int			hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

/* Values for hw.hn.tx_taskq_mode. */
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

/* Reports whether the driver is compiled with HN_USE_TXDESC_BUFRING. */
#ifndef HN_USE_TXDESC_BUFRING
static int			hn_use_txdesc_bufring = 0;
#else
static int			hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int			hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int			hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int			hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int			hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int			hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int			hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int			hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int			hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mod");

/* Accurate BPF support for Transparent VF */
static int			hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routing; unit seconds. */
static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routing; unit: seconds");
601 
/* State shared by all hn(4) instances. */
static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */

/*
 * VF map: an array of hn_vfmap_size ifnet pointers.
 * NOTE(review): presumably protected by hn_vfmap_lock -- confirm at
 * the places where the map is accessed.
 */
static struct rmlock		hn_vfmap_lock;
static int			hn_vfmap_size;
static struct ifnet		**hn_vfmap;

#ifndef RSS
/* Default Toeplitz hash key; only built if the kernel lacks RSS. */
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

/*
 * NOTE(review): presumably the VMBus device type GUID matched by
 * hn_probe() -- confirm there.
 */
static const struct hyperv_guid	hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

/* The "hn" driver attaches to the vmbus bus, and depends on vmbus. */
DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
#if __FreeBSD_version >= 1100099
/*
 * Apply the LRO aggregation length limit 'lenlim' to all of the
 * RX rings of 'sc'.
 */
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int ring = sc->hn_rx_ring_cnt;

	/* The rings are independent; the walking order does not matter. */
	while (ring-- > 0)
		sc->hn_rx_ring[ring].hn_lro.lro_length_lim = lenlim;
}
#endif
657 
658 static int
659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
660 {
661 
662 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
663 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
664 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
665 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
666 }
667 
668 static int
669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
670 {
671 	struct hn_nvs_rndis rndis;
672 
673 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size > 0, ("invalid rndis chim txd"));
675 
676 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
677 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
678 	rndis.nvs_chim_idx = txd->chim_index;
679 	rndis.nvs_chim_sz = txd->chim_size;
680 
681 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
682 	    &rndis, sizeof(rndis), &txd->send_ctx));
683 }
684 
685 static __inline uint32_t
686 hn_chim_alloc(struct hn_softc *sc)
687 {
688 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
689 	u_long *bmap = sc->hn_chim_bmap;
690 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
691 
692 	for (i = 0; i < bmap_cnt; ++i) {
693 		int idx;
694 
695 		idx = ffsl(~bmap[i]);
696 		if (idx == 0)
697 			continue;
698 
699 		--idx; /* ffsl is 1-based */
700 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
701 		    ("invalid i %d and idx %d", i, idx));
702 
703 		if (atomic_testandset_long(&bmap[i], idx))
704 			continue;
705 
706 		ret = i * LONG_BIT + idx;
707 		break;
708 	}
709 	return (ret);
710 }
711 
712 static __inline void
713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
714 {
715 	u_long mask;
716 	uint32_t idx;
717 
718 	idx = chim_idx / LONG_BIT;
719 	KASSERT(idx < sc->hn_chim_bmap_cnt,
720 	    ("invalid chimney index 0x%x", chim_idx));
721 
722 	mask = 1UL << (chim_idx % LONG_BIT);
723 	KASSERT(sc->hn_chim_bmap[idx] & mask,
724 	    ("index bitmap 0x%lx, chimney index %u, "
725 	     "bitmap idx %d, bitmask 0x%lx",
726 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
727 
728 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
729 }
730 
731 #if defined(INET6) || defined(INET)
732 
/*
 * Make sure that the first 'len' bytes of the mbuf chain 'm' are
 * contiguous in the first mbuf.
 *
 * NOTE:
 * - 'm' is reassigned; pointers obtained from the old 'm' must be
 *   fetched again after this macro.
 * - If m_pullup() fails, the _enclosing function_ returns NULL.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
741 
742 /*
743  * NOTE: If this function failed, the m_head would be freed.
744  */
745 static __inline struct mbuf *
746 hn_tso_fixup(struct mbuf *m_head)
747 {
748 	struct ether_vlan_header *evl;
749 	struct tcphdr *th;
750 	int ehlen;
751 
752 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
753 
754 	PULLUP_HDR(m_head, sizeof(*evl));
755 	evl = mtod(m_head, struct ether_vlan_header *);
756 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
757 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
758 	else
759 		ehlen = ETHER_HDR_LEN;
760 	m_head->m_pkthdr.l2hlen = ehlen;
761 
762 #ifdef INET
763 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
764 		struct ip *ip;
765 		int iphlen;
766 
767 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
768 		ip = mtodo(m_head, ehlen);
769 		iphlen = ip->ip_hl << 2;
770 		m_head->m_pkthdr.l3hlen = iphlen;
771 
772 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
773 		th = mtodo(m_head, ehlen + iphlen);
774 
775 		ip->ip_len = 0;
776 		ip->ip_sum = 0;
777 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
778 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
779 	}
780 #endif
781 #if defined(INET6) && defined(INET)
782 	else
783 #endif
784 #ifdef INET6
785 	{
786 		struct ip6_hdr *ip6;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
789 		ip6 = mtodo(m_head, ehlen);
790 		if (ip6->ip6_nxt != IPPROTO_TCP) {
791 			m_freem(m_head);
792 			return (NULL);
793 		}
794 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
795 
796 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
797 		th = mtodo(m_head, ehlen + sizeof(*ip6));
798 
799 		ip6->ip6_plen = 0;
800 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
801 	}
802 #endif
803 	return (m_head);
804 }
805 
806 /*
807  * NOTE: If this function failed, the m_head would be freed.
808  */
809 static __inline struct mbuf *
810 hn_set_hlen(struct mbuf *m_head)
811 {
812 	const struct ether_vlan_header *evl;
813 	int ehlen;
814 
815 	PULLUP_HDR(m_head, sizeof(*evl));
816 	evl = mtod(m_head, const struct ether_vlan_header *);
817 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
818 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
819 	else
820 		ehlen = ETHER_HDR_LEN;
821 	m_head->m_pkthdr.l2hlen = ehlen;
822 
823 #ifdef INET
824 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
825 		const struct ip *ip;
826 		int iphlen;
827 
828 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
829 		ip = mtodo(m_head, ehlen);
830 		iphlen = ip->ip_hl << 2;
831 		m_head->m_pkthdr.l3hlen = iphlen;
832 
833 		/*
834 		 * UDP checksum offload does not work in Azure, if the
835 		 * following conditions meet:
836 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
837 		 * - IP_DF is not set in the IP hdr.
838 		 *
839 		 * Fallback to software checksum for these UDP datagrams.
840 		 */
841 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
842 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
843 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
844 			uint16_t off = ehlen + iphlen;
845 
846 			counter_u64_add(hn_udpcs_fixup, 1);
847 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
848 			*(uint16_t *)(m_head->m_data + off +
849                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
850 			    m_head, m_head->m_pkthdr.len, off);
851 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
852 		}
853 	}
854 #endif
855 #if defined(INET6) && defined(INET)
856 	else
857 #endif
858 #ifdef INET6
859 	{
860 		const struct ip6_hdr *ip6;
861 
862 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
863 		ip6 = mtodo(m_head, ehlen);
864 		if (ip6->ip6_nxt != IPPROTO_TCP) {
865 			m_freem(m_head);
866 			return (NULL);
867 		}
868 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
869 	}
870 #endif
871 	return (m_head);
872 }
873 
874 /*
875  * NOTE: If this function failed, the m_head would be freed.
876  */
877 static __inline struct mbuf *
878 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
879 {
880 	const struct tcphdr *th;
881 	int ehlen, iphlen;
882 
883 	*tcpsyn = 0;
884 	ehlen = m_head->m_pkthdr.l2hlen;
885 	iphlen = m_head->m_pkthdr.l3hlen;
886 
887 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
888 	th = mtodo(m_head, ehlen + iphlen);
889 	if (th->th_flags & TH_SYN)
890 		*tcpsyn = 1;
891 	return (m_head);
892 }
893 
894 #undef PULLUP_HDR
895 
896 #endif	/* INET6 || INET */
897 
898 static int
899 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
900 {
901 	int error = 0;
902 
903 	HN_LOCK_ASSERT(sc);
904 
905 	if (sc->hn_rx_filter != filter) {
906 		error = hn_rndis_set_rxfilter(sc, filter);
907 		if (!error)
908 			sc->hn_rx_filter = filter;
909 	}
910 	return (error);
911 }
912 
913 static int
914 hn_rxfilter_config(struct hn_softc *sc)
915 {
916 	struct ifnet *ifp = sc->hn_ifp;
917 	uint32_t filter;
918 
919 	HN_LOCK_ASSERT(sc);
920 
921 	/*
922 	 * If the non-transparent mode VF is activated, we don't know how
923 	 * its RX filter is configured, so stick the synthetic device in
924 	 * the promiscous mode.
925 	 */
926 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
927 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
928 	} else {
929 		filter = NDIS_PACKET_TYPE_DIRECTED;
930 		if (ifp->if_flags & IFF_BROADCAST)
931 			filter |= NDIS_PACKET_TYPE_BROADCAST;
932 		/* TODO: support multicast list */
933 		if ((ifp->if_flags & IFF_ALLMULTI) ||
934 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
935 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
936 	}
937 	return (hn_set_rxfilter(sc, filter));
938 }
939 
940 static void
941 hn_set_txagg(struct hn_softc *sc)
942 {
943 	uint32_t size, pkts;
944 	int i;
945 
946 	/*
947 	 * Setup aggregation size.
948 	 */
949 	if (sc->hn_agg_size < 0)
950 		size = UINT32_MAX;
951 	else
952 		size = sc->hn_agg_size;
953 
954 	if (sc->hn_rndis_agg_size < size)
955 		size = sc->hn_rndis_agg_size;
956 
957 	/* NOTE: We only aggregate packets using chimney sending buffers. */
958 	if (size > (uint32_t)sc->hn_chim_szmax)
959 		size = sc->hn_chim_szmax;
960 
961 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
962 		/* Disable */
963 		size = 0;
964 		pkts = 0;
965 		goto done;
966 	}
967 
968 	/* NOTE: Type of the per TX ring setting is 'int'. */
969 	if (size > INT_MAX)
970 		size = INT_MAX;
971 
972 	/*
973 	 * Setup aggregation packet count.
974 	 */
975 	if (sc->hn_agg_pkts < 0)
976 		pkts = UINT32_MAX;
977 	else
978 		pkts = sc->hn_agg_pkts;
979 
980 	if (sc->hn_rndis_agg_pkts < pkts)
981 		pkts = sc->hn_rndis_agg_pkts;
982 
983 	if (pkts <= 1) {
984 		/* Disable */
985 		size = 0;
986 		pkts = 0;
987 		goto done;
988 	}
989 
990 	/* NOTE: Type of the per TX ring setting is 'short'. */
991 	if (pkts > SHRT_MAX)
992 		pkts = SHRT_MAX;
993 
994 done:
995 	/* NOTE: Type of the per TX ring setting is 'short'. */
996 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
997 		/* Disable */
998 		size = 0;
999 		pkts = 0;
1000 	}
1001 
1002 	if (bootverbose) {
1003 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1004 		    size, pkts, sc->hn_rndis_agg_align);
1005 	}
1006 
1007 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1008 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1009 
1010 		mtx_lock(&txr->hn_tx_lock);
1011 		txr->hn_agg_szmax = size;
1012 		txr->hn_agg_pktmax = pkts;
1013 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1014 		mtx_unlock(&txr->hn_tx_lock);
1015 	}
1016 }
1017 
1018 static int
1019 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1020 {
1021 
1022 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1023 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1024 		return txr->hn_txdesc_cnt;
1025 	return hn_tx_swq_depth;
1026 }
1027 
1028 static int
1029 hn_rss_reconfig(struct hn_softc *sc)
1030 {
1031 	int error;
1032 
1033 	HN_LOCK_ASSERT(sc);
1034 
1035 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1036 		return (ENXIO);
1037 
1038 	/*
1039 	 * Disable RSS first.
1040 	 *
1041 	 * NOTE:
1042 	 * Direct reconfiguration by setting the UNCHG flags does
1043 	 * _not_ work properly.
1044 	 */
1045 	if (bootverbose)
1046 		if_printf(sc->hn_ifp, "disable RSS\n");
1047 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1048 	if (error) {
1049 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1050 		return (error);
1051 	}
1052 
1053 	/*
1054 	 * Reenable the RSS w/ the updated RSS key or indirect
1055 	 * table.
1056 	 */
1057 	if (bootverbose)
1058 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1059 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1060 	if (error) {
1061 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1062 		return (error);
1063 	}
1064 	return (0);
1065 }
1066 
1067 static void
1068 hn_rss_ind_fixup(struct hn_softc *sc)
1069 {
1070 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1071 	int i, nchan;
1072 
1073 	nchan = sc->hn_rx_ring_inuse;
1074 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1075 
1076 	/*
1077 	 * Check indirect table to make sure that all channels in it
1078 	 * can be used.
1079 	 */
1080 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1081 		if (rss->rss_ind[i] >= nchan) {
1082 			if_printf(sc->hn_ifp,
1083 			    "RSS indirect table %d fixup: %u -> %d\n",
1084 			    i, rss->rss_ind[i], nchan - 1);
1085 			rss->rss_ind[i] = nchan - 1;
1086 		}
1087 	}
1088 }
1089 
1090 static int
1091 hn_ifmedia_upd(struct ifnet *ifp __unused)
1092 {
1093 
1094 	return EOPNOTSUPP;
1095 }
1096 
1097 static void
1098 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1099 {
1100 	struct hn_softc *sc = ifp->if_softc;
1101 
1102 	ifmr->ifm_status = IFM_AVALID;
1103 	ifmr->ifm_active = IFM_ETHER;
1104 
1105 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1106 		ifmr->ifm_active |= IFM_NONE;
1107 		return;
1108 	}
1109 	ifmr->ifm_status |= IFM_ACTIVE;
1110 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1111 }
1112 
1113 static void
1114 hn_rxvf_set_task(void *xarg, int pending __unused)
1115 {
1116 	struct hn_rxvf_setarg *arg = xarg;
1117 
1118 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1119 }
1120 
1121 static void
1122 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1123 {
1124 	struct hn_rx_ring *rxr;
1125 	struct hn_rxvf_setarg arg;
1126 	struct task task;
1127 	int i;
1128 
1129 	HN_LOCK_ASSERT(sc);
1130 
1131 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1132 
1133 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1134 		rxr = &sc->hn_rx_ring[i];
1135 
1136 		if (i < sc->hn_rx_ring_inuse) {
1137 			arg.rxr = rxr;
1138 			arg.vf_ifp = vf_ifp;
1139 			vmbus_chan_run_task(rxr->hn_chan, &task);
1140 		} else {
1141 			rxr->hn_rxvf_ifp = vf_ifp;
1142 		}
1143 	}
1144 }
1145 
1146 static bool
1147 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1148 {
1149 	const struct ifnet *hn_ifp;
1150 
1151 	hn_ifp = sc->hn_ifp;
1152 
1153 	if (ifp == hn_ifp)
1154 		return (false);
1155 
1156 	if (ifp->if_alloctype != IFT_ETHER)
1157 		return (false);
1158 
1159 	/* Ignore lagg/vlan interfaces */
1160 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1161 	    strcmp(ifp->if_dname, "vlan") == 0)
1162 		return (false);
1163 
1164 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1165 		return (false);
1166 
1167 	return (true);
1168 }
1169 
1170 static void
1171 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1172 {
1173 	struct ifnet *hn_ifp;
1174 
1175 	HN_LOCK(sc);
1176 
1177 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1178 		goto out;
1179 
1180 	if (!hn_ismyvf(sc, ifp))
1181 		goto out;
1182 	hn_ifp = sc->hn_ifp;
1183 
1184 	if (rxvf) {
1185 		if (sc->hn_flags & HN_FLAG_RXVF)
1186 			goto out;
1187 
1188 		sc->hn_flags |= HN_FLAG_RXVF;
1189 		hn_rxfilter_config(sc);
1190 	} else {
1191 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1192 			goto out;
1193 
1194 		sc->hn_flags &= ~HN_FLAG_RXVF;
1195 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1196 			hn_rxfilter_config(sc);
1197 		else
1198 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1199 	}
1200 
1201 	hn_nvs_set_datapath(sc,
1202 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1203 
1204 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1205 
1206 	if (rxvf) {
1207 		hn_vf_rss_fixup(sc, true);
1208 		hn_suspend_mgmt(sc);
1209 		sc->hn_link_flags &=
1210 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1211 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1212 	} else {
1213 		hn_vf_rss_restore(sc);
1214 		hn_resume_mgmt(sc);
1215 	}
1216 
1217 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1218 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1219 
1220 	if (bootverbose) {
1221 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1222 		    rxvf ? "to" : "from", ifp->if_xname);
1223 	}
1224 out:
1225 	HN_UNLOCK(sc);
1226 }
1227 
1228 static void
1229 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1230 {
1231 
1232 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1233 		return;
1234 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1235 }
1236 
1237 static void
1238 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1239 {
1240 
1241 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1242 }
1243 
1244 static int
1245 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1246 {
1247 	struct ifnet *ifp, *vf_ifp;
1248 	uint64_t tmp;
1249 	int error;
1250 
1251 	HN_LOCK_ASSERT(sc);
1252 	ifp = sc->hn_ifp;
1253 	vf_ifp = sc->hn_vf_ifp;
1254 
1255 	/*
1256 	 * Fix up requested capabilities w/ supported capabilities,
1257 	 * since the supported capabilities could have been changed.
1258 	 */
1259 	ifr->ifr_reqcap &= ifp->if_capabilities;
1260 	/* Pass SIOCSIFCAP to VF. */
1261 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1262 
1263 	/*
1264 	 * NOTE:
1265 	 * The error will be propagated to the callers, however, it
1266 	 * is _not_ useful here.
1267 	 */
1268 
1269 	/*
1270 	 * Merge VF's enabled capabilities.
1271 	 */
1272 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1273 
1274 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1275 	if (ifp->if_capenable & IFCAP_TXCSUM)
1276 		ifp->if_hwassist |= tmp;
1277 	else
1278 		ifp->if_hwassist &= ~tmp;
1279 
1280 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1281 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1282 		ifp->if_hwassist |= tmp;
1283 	else
1284 		ifp->if_hwassist &= ~tmp;
1285 
1286 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1287 	if (ifp->if_capenable & IFCAP_TSO4)
1288 		ifp->if_hwassist |= tmp;
1289 	else
1290 		ifp->if_hwassist &= ~tmp;
1291 
1292 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1293 	if (ifp->if_capenable & IFCAP_TSO6)
1294 		ifp->if_hwassist |= tmp;
1295 	else
1296 		ifp->if_hwassist &= ~tmp;
1297 
1298 	return (error);
1299 }
1300 
1301 static int
1302 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1303 {
1304 	struct ifnet *vf_ifp;
1305 	struct ifreq ifr;
1306 
1307 	HN_LOCK_ASSERT(sc);
1308 	vf_ifp = sc->hn_vf_ifp;
1309 
1310 	memset(&ifr, 0, sizeof(ifr));
1311 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1312 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1313 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1314 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1315 }
1316 
1317 static void
1318 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1319 {
1320 	struct ifnet *ifp = sc->hn_ifp;
1321 	int allmulti = 0;
1322 
1323 	HN_LOCK_ASSERT(sc);
1324 
1325 	/* XXX vlan(4) style mcast addr maintenance */
1326 	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1327 		allmulti = IFF_ALLMULTI;
1328 
1329 	/* Always set the VF's if_flags */
1330 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1331 }
1332 
1333 static void
1334 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1335 {
1336 	struct rm_priotracker pt;
1337 	struct ifnet *hn_ifp = NULL;
1338 	struct mbuf *mn;
1339 
1340 	/*
1341 	 * XXX racy, if hn(4) ever detached.
1342 	 */
1343 	rm_rlock(&hn_vfmap_lock, &pt);
1344 	if (vf_ifp->if_index < hn_vfmap_size)
1345 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1346 	rm_runlock(&hn_vfmap_lock, &pt);
1347 
1348 	if (hn_ifp != NULL) {
1349 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1350 			/*
1351 			 * Allow tapping on the VF.
1352 			 */
1353 			ETHER_BPF_MTAP(vf_ifp, mn);
1354 
1355 			/*
1356 			 * Update VF stats.
1357 			 */
1358 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1359 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1360 				    mn->m_pkthdr.len);
1361 			}
1362 			/*
1363 			 * XXX IFCOUNTER_IMCAST
1364 			 * This stat updating is kinda invasive, since it
1365 			 * requires two checks on the mbuf: the length check
1366 			 * and the ethernet header check.  As of this write,
1367 			 * all multicast packets go directly to hn(4), which
1368 			 * makes imcast stat updating in the VF a try in vian.
1369 			 */
1370 
1371 			/*
1372 			 * Fix up rcvif and increase hn(4)'s ipackets.
1373 			 */
1374 			mn->m_pkthdr.rcvif = hn_ifp;
1375 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1376 		}
1377 		/*
1378 		 * Go through hn(4)'s if_input.
1379 		 */
1380 		hn_ifp->if_input(hn_ifp, m);
1381 	} else {
1382 		/*
1383 		 * In the middle of the transition; free this
1384 		 * mbuf chain.
1385 		 */
1386 		while (m != NULL) {
1387 			mn = m->m_nextpkt;
1388 			m->m_nextpkt = NULL;
1389 			m_freem(m);
1390 			m = mn;
1391 		}
1392 	}
1393 }
1394 
1395 static void
1396 hn_mtu_change_fixup(struct hn_softc *sc)
1397 {
1398 	struct ifnet *ifp;
1399 
1400 	HN_LOCK_ASSERT(sc);
1401 	ifp = sc->hn_ifp;
1402 
1403 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1404 #if __FreeBSD_version >= 1100099
1405 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1406 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1407 #endif
1408 }
1409 
1410 static uint32_t
1411 hn_rss_type_fromndis(uint32_t rss_hash)
1412 {
1413 	uint32_t types = 0;
1414 
1415 	if (rss_hash & NDIS_HASH_IPV4)
1416 		types |= RSS_TYPE_IPV4;
1417 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1418 		types |= RSS_TYPE_TCP_IPV4;
1419 	if (rss_hash & NDIS_HASH_IPV6)
1420 		types |= RSS_TYPE_IPV6;
1421 	if (rss_hash & NDIS_HASH_IPV6_EX)
1422 		types |= RSS_TYPE_IPV6_EX;
1423 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1424 		types |= RSS_TYPE_TCP_IPV6;
1425 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1426 		types |= RSS_TYPE_TCP_IPV6_EX;
1427 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1428 		types |= RSS_TYPE_UDP_IPV4;
1429 	return (types);
1430 }
1431 
1432 static uint32_t
1433 hn_rss_type_tondis(uint32_t types)
1434 {
1435 	uint32_t rss_hash = 0;
1436 
1437 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1438 	    ("UDP6 and UDP6EX are not supported"));
1439 
1440 	if (types & RSS_TYPE_IPV4)
1441 		rss_hash |= NDIS_HASH_IPV4;
1442 	if (types & RSS_TYPE_TCP_IPV4)
1443 		rss_hash |= NDIS_HASH_TCP_IPV4;
1444 	if (types & RSS_TYPE_IPV6)
1445 		rss_hash |= NDIS_HASH_IPV6;
1446 	if (types & RSS_TYPE_IPV6_EX)
1447 		rss_hash |= NDIS_HASH_IPV6_EX;
1448 	if (types & RSS_TYPE_TCP_IPV6)
1449 		rss_hash |= NDIS_HASH_TCP_IPV6;
1450 	if (types & RSS_TYPE_TCP_IPV6_EX)
1451 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1452 	if (types & RSS_TYPE_UDP_IPV4)
1453 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1454 	return (rss_hash);
1455 }
1456 
1457 static void
1458 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1459 {
1460 	int i;
1461 
1462 	HN_LOCK_ASSERT(sc);
1463 
1464 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1465 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1466 }
1467 
1468 static void
1469 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1470 {
1471 	struct ifnet *ifp, *vf_ifp;
1472 	struct ifrsshash ifrh;
1473 	struct ifrsskey ifrk;
1474 	int error;
1475 	uint32_t my_types, diff_types, mbuf_types = 0;
1476 
1477 	HN_LOCK_ASSERT(sc);
1478 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1479 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1480 
1481 	if (sc->hn_rx_ring_inuse == 1) {
1482 		/* No RSS on synthetic parts; done. */
1483 		return;
1484 	}
1485 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1486 		/* Synthetic parts do not support Toeplitz; done. */
1487 		return;
1488 	}
1489 
1490 	ifp = sc->hn_ifp;
1491 	vf_ifp = sc->hn_vf_ifp;
1492 
1493 	/*
1494 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1495 	 * supported.
1496 	 */
1497 	memset(&ifrk, 0, sizeof(ifrk));
1498 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1499 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1500 	if (error) {
1501 		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
1502 		    vf_ifp->if_xname, error);
1503 		goto done;
1504 	}
1505 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1506 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1507 		    vf_ifp->if_xname, ifrk.ifrk_func);
1508 		goto done;
1509 	}
1510 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1511 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1512 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1513 		goto done;
1514 	}
1515 
1516 	/*
1517 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1518 	 */
1519 	memset(&ifrh, 0, sizeof(ifrh));
1520 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1521 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1522 	if (error) {
1523 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1524 		    vf_ifp->if_xname, error);
1525 		goto done;
1526 	}
1527 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1528 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1529 		    vf_ifp->if_xname, ifrh.ifrh_func);
1530 		goto done;
1531 	}
1532 
1533 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1534 	if ((ifrh.ifrh_types & my_types) == 0) {
1535 		/* This disables RSS; ignore it then */
1536 		if_printf(ifp, "%s intersection of RSS types failed.  "
1537 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1538 		    ifrh.ifrh_types, my_types);
1539 		goto done;
1540 	}
1541 
1542 	diff_types = my_types ^ ifrh.ifrh_types;
1543 	my_types &= ifrh.ifrh_types;
1544 	mbuf_types = my_types;
1545 
1546 	/*
1547 	 * Detect RSS hash value/type confliction.
1548 	 *
1549 	 * NOTE:
1550 	 * We don't disable the hash type, but stop delivery the hash
1551 	 * value/type through mbufs on RX path.
1552 	 *
1553 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1554 	 * hash is delivered with type of TCP_IPV4.  This means if
1555 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1556 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1557 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1558 	 * here.
1559 	 */
1560 	if ((my_types & RSS_TYPE_IPV4) &&
1561 	    (diff_types & ifrh.ifrh_types &
1562 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1563 		/* Conflict; disable IPV4 hash type/value delivery. */
1564 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1565 		mbuf_types &= ~RSS_TYPE_IPV4;
1566 	}
1567 	if ((my_types & RSS_TYPE_IPV6) &&
1568 	    (diff_types & ifrh.ifrh_types &
1569 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1570 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1571 	      RSS_TYPE_IPV6_EX))) {
1572 		/* Conflict; disable IPV6 hash type/value delivery. */
1573 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1574 		mbuf_types &= ~RSS_TYPE_IPV6;
1575 	}
1576 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1577 	    (diff_types & ifrh.ifrh_types &
1578 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1579 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1580 	      RSS_TYPE_IPV6))) {
1581 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1582 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1583 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1584 	}
1585 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1586 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1587 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1588 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1589 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1590 	}
1591 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1592 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1593 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1594 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1595 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1596 	}
1597 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1598 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1599 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1600 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1601 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1602 	}
1603 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1604 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1605 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1606 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1607 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1608 	}
1609 
1610 	/*
1611 	 * Indirect table does not matter.
1612 	 */
1613 
1614 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1615 	    hn_rss_type_tondis(my_types);
1616 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1617 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1618 
1619 	if (reconf) {
1620 		error = hn_rss_reconfig(sc);
1621 		if (error) {
1622 			/* XXX roll-back? */
1623 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1624 			/* XXX keep going. */
1625 		}
1626 	}
1627 done:
1628 	/* Hash deliverability for mbufs. */
1629 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1630 }
1631 
1632 static void
1633 hn_vf_rss_restore(struct hn_softc *sc)
1634 {
1635 
1636 	HN_LOCK_ASSERT(sc);
1637 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1638 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1639 
1640 	if (sc->hn_rx_ring_inuse == 1)
1641 		goto done;
1642 
1643 	/*
1644 	 * Restore hash types.  Key does _not_ matter.
1645 	 */
1646 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1647 		int error;
1648 
1649 		sc->hn_rss_hash = sc->hn_rss_hcap;
1650 		error = hn_rss_reconfig(sc);
1651 		if (error) {
1652 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1653 			    error);
1654 			/* XXX keep going. */
1655 		}
1656 	}
1657 done:
1658 	/* Hash deliverability for mbufs. */
1659 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1660 }
1661 
1662 static void
1663 hn_xpnt_vf_setready(struct hn_softc *sc)
1664 {
1665 	struct ifnet *ifp, *vf_ifp;
1666 	struct ifreq ifr;
1667 
1668 	HN_LOCK_ASSERT(sc);
1669 	ifp = sc->hn_ifp;
1670 	vf_ifp = sc->hn_vf_ifp;
1671 
1672 	/*
1673 	 * Mark the VF ready.
1674 	 */
1675 	sc->hn_vf_rdytick = 0;
1676 
1677 	/*
1678 	 * Save information for restoration.
1679 	 */
1680 	sc->hn_saved_caps = ifp->if_capabilities;
1681 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1682 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1683 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1684 
1685 	/*
1686 	 * Intersect supported/enabled capabilities.
1687 	 *
1688 	 * NOTE:
1689 	 * if_hwassist is not changed here.
1690 	 */
1691 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1692 	ifp->if_capenable &= ifp->if_capabilities;
1693 
1694 	/*
1695 	 * Fix TSO settings.
1696 	 */
1697 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1698 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1699 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1700 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1701 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1702 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1703 
1704 	/*
1705 	 * Change VF's enabled capabilities.
1706 	 */
1707 	memset(&ifr, 0, sizeof(ifr));
1708 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1709 	ifr.ifr_reqcap = ifp->if_capenable;
1710 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1711 
1712 	if (ifp->if_mtu != ETHERMTU) {
1713 		int error;
1714 
1715 		/*
1716 		 * Change VF's MTU.
1717 		 */
1718 		memset(&ifr, 0, sizeof(ifr));
1719 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1720 		ifr.ifr_mtu = ifp->if_mtu;
1721 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1722 		if (error) {
1723 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1724 			    vf_ifp->if_xname, ifp->if_mtu);
1725 			if (ifp->if_mtu > ETHERMTU) {
1726 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1727 
1728 				/*
1729 				 * XXX
1730 				 * No need to adjust the synthetic parts' MTU;
1731 				 * failure of the adjustment will cause us
1732 				 * infinite headache.
1733 				 */
1734 				ifp->if_mtu = ETHERMTU;
1735 				hn_mtu_change_fixup(sc);
1736 			}
1737 		}
1738 	}
1739 }
1740 
1741 static bool
1742 hn_xpnt_vf_isready(struct hn_softc *sc)
1743 {
1744 
1745 	HN_LOCK_ASSERT(sc);
1746 
1747 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1748 		return (false);
1749 
1750 	if (sc->hn_vf_rdytick == 0)
1751 		return (true);
1752 
1753 	if (sc->hn_vf_rdytick > ticks)
1754 		return (false);
1755 
1756 	/* Mark VF as ready. */
1757 	hn_xpnt_vf_setready(sc);
1758 	return (true);
1759 }
1760 
1761 static void
1762 hn_xpnt_vf_setenable(struct hn_softc *sc)
1763 {
1764 	int i;
1765 
1766 	HN_LOCK_ASSERT(sc);
1767 
1768 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1769 	rm_wlock(&sc->hn_vf_lock);
1770 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1771 	rm_wunlock(&sc->hn_vf_lock);
1772 
1773 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1774 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1775 }
1776 
1777 static void
1778 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1779 {
1780 	int i;
1781 
1782 	HN_LOCK_ASSERT(sc);
1783 
1784 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1785 	rm_wlock(&sc->hn_vf_lock);
1786 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1787 	if (clear_vf)
1788 		sc->hn_vf_ifp = NULL;
1789 	rm_wunlock(&sc->hn_vf_lock);
1790 
1791 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1792 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1793 }
1794 
1795 static void
1796 hn_xpnt_vf_init(struct hn_softc *sc)
1797 {
1798 	int error;
1799 
1800 	HN_LOCK_ASSERT(sc);
1801 
1802 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1803 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1804 
1805 	if (bootverbose) {
1806 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1807 		    sc->hn_vf_ifp->if_xname);
1808 	}
1809 
1810 	/*
1811 	 * Bring the VF up.
1812 	 */
1813 	hn_xpnt_vf_saveifflags(sc);
1814 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1815 	error = hn_xpnt_vf_iocsetflags(sc);
1816 	if (error) {
1817 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1818 		    sc->hn_vf_ifp->if_xname, error);
1819 		return;
1820 	}
1821 
1822 	/*
1823 	 * NOTE:
1824 	 * Datapath setting must happen _after_ bringing the VF up.
1825 	 */
1826 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1827 
1828 	/*
1829 	 * NOTE:
1830 	 * Fixup RSS related bits _after_ the VF is brought up, since
1831 	 * many VFs generate RSS key during it's initialization.
1832 	 */
1833 	hn_vf_rss_fixup(sc, true);
1834 
1835 	/* Mark transparent mode VF as enabled. */
1836 	hn_xpnt_vf_setenable(sc);
1837 }
1838 
1839 static void
1840 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1841 {
1842 	struct hn_softc *sc = xsc;
1843 
1844 	HN_LOCK(sc);
1845 
1846 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1847 		goto done;
1848 	if (sc->hn_vf_ifp == NULL)
1849 		goto done;
1850 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1851 		goto done;
1852 
1853 	if (sc->hn_vf_rdytick != 0) {
1854 		/* Mark VF as ready. */
1855 		hn_xpnt_vf_setready(sc);
1856 	}
1857 
1858 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1859 		/*
1860 		 * Delayed VF initialization.
1861 		 */
1862 		if (bootverbose) {
1863 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1864 			    sc->hn_vf_ifp->if_xname);
1865 		}
1866 		hn_xpnt_vf_init(sc);
1867 	}
1868 done:
1869 	HN_UNLOCK(sc);
1870 }
1871 
1872 static void
1873 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1874 {
1875 	struct hn_softc *sc = xsc;
1876 
1877 	HN_LOCK(sc);
1878 
1879 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1880 		goto done;
1881 
1882 	if (!hn_ismyvf(sc, ifp))
1883 		goto done;
1884 
1885 	if (sc->hn_vf_ifp != NULL) {
1886 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1887 		    sc->hn_vf_ifp->if_xname);
1888 		goto done;
1889 	}
1890 
1891 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1892 		/*
1893 		 * ifnet.if_start is _not_ supported by transparent
1894 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1895 		 */
1896 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1897 		    "in transparent VF mode.\n", ifp->if_xname);
1898 		goto done;
1899 	}
1900 
1901 	rm_wlock(&hn_vfmap_lock);
1902 
1903 	if (ifp->if_index >= hn_vfmap_size) {
1904 		struct ifnet **newmap;
1905 		int newsize;
1906 
1907 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1908 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1909 		    M_WAITOK | M_ZERO);
1910 
1911 		memcpy(newmap, hn_vfmap,
1912 		    sizeof(struct ifnet *) * hn_vfmap_size);
1913 		free(hn_vfmap, M_DEVBUF);
1914 		hn_vfmap = newmap;
1915 		hn_vfmap_size = newsize;
1916 	}
1917 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1918 	    ("%s: ifindex %d was mapped to %s",
1919 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1920 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1921 
1922 	rm_wunlock(&hn_vfmap_lock);
1923 
1924 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1925 	rm_wlock(&sc->hn_vf_lock);
1926 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1927 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1928 	sc->hn_vf_ifp = ifp;
1929 	rm_wunlock(&sc->hn_vf_lock);
1930 
1931 	if (hn_xpnt_vf) {
1932 		int wait_ticks;
1933 
1934 		/*
1935 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1936 		 * Save vf_ifp's current if_input for later restoration.
1937 		 */
1938 		sc->hn_vf_input = ifp->if_input;
1939 		ifp->if_input = hn_xpnt_vf_input;
1940 
1941 		/*
1942 		 * Stop link status management; use the VF's.
1943 		 */
1944 		hn_suspend_mgmt(sc);
1945 
1946 		/*
1947 		 * Give VF sometime to complete its attach routing.
1948 		 */
1949 		wait_ticks = hn_xpnt_vf_attwait * hz;
1950 		sc->hn_vf_rdytick = ticks + wait_ticks;
1951 
1952 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1953 		    wait_ticks);
1954 	}
1955 done:
1956 	HN_UNLOCK(sc);
1957 }
1958 
1959 static void
1960 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1961 {
1962 	struct hn_softc *sc = xsc;
1963 
1964 	HN_LOCK(sc);
1965 
1966 	if (sc->hn_vf_ifp == NULL)
1967 		goto done;
1968 
1969 	if (!hn_ismyvf(sc, ifp))
1970 		goto done;
1971 
1972 	if (hn_xpnt_vf) {
1973 		/*
1974 		 * Make sure that the delayed initialization is not running.
1975 		 *
1976 		 * NOTE:
1977 		 * - This lock _must_ be released, since the hn_vf_init task
1978 		 *   will try holding this lock.
1979 		 * - It is safe to release this lock here, since the
1980 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1981 		 *
1982 		 * XXX racy, if hn(4) ever detached.
1983 		 */
1984 		HN_UNLOCK(sc);
1985 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1986 		HN_LOCK(sc);
1987 
1988 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1989 		    sc->hn_ifp->if_xname));
1990 		ifp->if_input = sc->hn_vf_input;
1991 		sc->hn_vf_input = NULL;
1992 
1993 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1994 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1995 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1996 
1997 		if (sc->hn_vf_rdytick == 0) {
1998 			/*
1999 			 * The VF was ready; restore some settings.
2000 			 */
2001 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2002 			/*
2003 			 * NOTE:
2004 			 * There is _no_ need to fixup if_capenable and
2005 			 * if_hwassist, since the if_capabilities before
2006 			 * restoration was an intersection of the VF's
2007 			 * if_capabilites and the synthetic device's
2008 			 * if_capabilites.
2009 			 */
2010 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2011 			sc->hn_ifp->if_hw_tsomaxsegcount =
2012 			    sc->hn_saved_tsosegcnt;
2013 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2014 		}
2015 
2016 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2017 			/*
2018 			 * Restore RSS settings.
2019 			 */
2020 			hn_vf_rss_restore(sc);
2021 
2022 			/*
2023 			 * Resume link status management, which was suspended
2024 			 * by hn_ifnet_attevent().
2025 			 */
2026 			hn_resume_mgmt(sc);
2027 		}
2028 	}
2029 
2030 	/* Mark transparent mode VF as disabled. */
2031 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2032 
2033 	rm_wlock(&hn_vfmap_lock);
2034 
2035 	KASSERT(ifp->if_index < hn_vfmap_size,
2036 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2037 	if (hn_vfmap[ifp->if_index] != NULL) {
2038 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2039 		    ("%s: ifindex %d was mapped to %s",
2040 		     ifp->if_xname, ifp->if_index,
2041 		     hn_vfmap[ifp->if_index]->if_xname));
2042 		hn_vfmap[ifp->if_index] = NULL;
2043 	}
2044 
2045 	rm_wunlock(&hn_vfmap_lock);
2046 done:
2047 	HN_UNLOCK(sc);
2048 }
2049 
2050 static void
2051 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2052 {
2053 	struct hn_softc *sc = xsc;
2054 
2055 	if (sc->hn_vf_ifp == ifp)
2056 		if_link_state_change(sc->hn_ifp, link_state);
2057 }
2058 
2059 static int
2060 hn_probe(device_t dev)
2061 {
2062 
2063 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2064 		device_set_desc(dev, "Hyper-V Network Interface");
2065 		return BUS_PROBE_DEFAULT;
2066 	}
2067 	return ENXIO;
2068 }
2069 
2070 static int
2071 hn_attach(device_t dev)
2072 {
2073 	struct hn_softc *sc = device_get_softc(dev);
2074 	struct sysctl_oid_list *child;
2075 	struct sysctl_ctx_list *ctx;
2076 	uint8_t eaddr[ETHER_ADDR_LEN];
2077 	struct ifnet *ifp = NULL;
2078 	int error, ring_cnt, tx_ring_cnt;
2079 	uint32_t mtu;
2080 
2081 	sc->hn_dev = dev;
2082 	sc->hn_prichan = vmbus_get_channel(dev);
2083 	HN_LOCK_INIT(sc);
2084 	rm_init(&sc->hn_vf_lock, "hnvf");
2085 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2086 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2087 
2088 	/*
2089 	 * Initialize these tunables once.
2090 	 */
2091 	sc->hn_agg_size = hn_tx_agg_size;
2092 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2093 
2094 	/*
2095 	 * Setup taskqueue for transmission.
2096 	 */
2097 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2098 		int i;
2099 
2100 		sc->hn_tx_taskqs =
2101 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2102 		    M_DEVBUF, M_WAITOK);
2103 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2104 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2105 			    M_WAITOK, taskqueue_thread_enqueue,
2106 			    &sc->hn_tx_taskqs[i]);
2107 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2108 			    "%s tx%d", device_get_nameunit(dev), i);
2109 		}
2110 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2111 		sc->hn_tx_taskqs = hn_tx_taskque;
2112 	}
2113 
2114 	/*
2115 	 * Setup taskqueue for mangement tasks, e.g. link status.
2116 	 */
2117 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2118 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2119 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2120 	    device_get_nameunit(dev));
2121 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2122 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2123 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2124 	    hn_netchg_status_taskfunc, sc);
2125 
2126 	if (hn_xpnt_vf) {
2127 		/*
2128 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2129 		 */
2130 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2131 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2132 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2133 		    device_get_nameunit(dev));
2134 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2135 		    hn_xpnt_vf_init_taskfunc, sc);
2136 	}
2137 
2138 	/*
2139 	 * Allocate ifnet and setup its name earlier, so that if_printf
2140 	 * can be used by functions, which will be called after
2141 	 * ether_ifattach().
2142 	 */
2143 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2144 	ifp->if_softc = sc;
2145 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2146 
2147 	/*
2148 	 * Initialize ifmedia earlier so that it can be unconditionally
2149 	 * destroyed, if error happened later on.
2150 	 */
2151 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2152 
2153 	/*
2154 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2155 	 * to use (tx_ring_cnt).
2156 	 *
2157 	 * NOTE:
2158 	 * The # of RX rings to use is same as the # of channels to use.
2159 	 */
2160 	ring_cnt = hn_chan_cnt;
2161 	if (ring_cnt <= 0) {
2162 		/* Default */
2163 		ring_cnt = mp_ncpus;
2164 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2165 			ring_cnt = HN_RING_CNT_DEF_MAX;
2166 	} else if (ring_cnt > mp_ncpus) {
2167 		ring_cnt = mp_ncpus;
2168 	}
2169 #ifdef RSS
2170 	if (ring_cnt > rss_getnumbuckets())
2171 		ring_cnt = rss_getnumbuckets();
2172 #endif
2173 
2174 	tx_ring_cnt = hn_tx_ring_cnt;
2175 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2176 		tx_ring_cnt = ring_cnt;
2177 #ifdef HN_IFSTART_SUPPORT
2178 	if (hn_use_if_start) {
2179 		/* ifnet.if_start only needs one TX ring. */
2180 		tx_ring_cnt = 1;
2181 	}
2182 #endif
2183 
2184 	/*
2185 	 * Set the leader CPU for channels.
2186 	 */
2187 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2188 
2189 	/*
2190 	 * Create enough TX/RX rings, even if only limited number of
2191 	 * channels can be allocated.
2192 	 */
2193 	error = hn_create_tx_data(sc, tx_ring_cnt);
2194 	if (error)
2195 		goto failed;
2196 	error = hn_create_rx_data(sc, ring_cnt);
2197 	if (error)
2198 		goto failed;
2199 
2200 	/*
2201 	 * Create transaction context for NVS and RNDIS transactions.
2202 	 */
2203 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2204 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2205 	if (sc->hn_xact == NULL) {
2206 		error = ENXIO;
2207 		goto failed;
2208 	}
2209 
2210 	/*
2211 	 * Install orphan handler for the revocation of this device's
2212 	 * primary channel.
2213 	 *
2214 	 * NOTE:
2215 	 * The processing order is critical here:
2216 	 * Install the orphan handler, _before_ testing whether this
2217 	 * device's primary channel has been revoked or not.
2218 	 */
2219 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2220 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2221 		error = ENXIO;
2222 		goto failed;
2223 	}
2224 
2225 	/*
2226 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2227 	 */
2228 	error = hn_synth_attach(sc, ETHERMTU);
2229 	if (error)
2230 		goto failed;
2231 
2232 	error = hn_rndis_get_eaddr(sc, eaddr);
2233 	if (error)
2234 		goto failed;
2235 
2236 	error = hn_rndis_get_mtu(sc, &mtu);
2237 	if (error)
2238 		mtu = ETHERMTU;
2239 	else if (bootverbose)
2240 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2241 
2242 #if __FreeBSD_version >= 1100099
2243 	if (sc->hn_rx_ring_inuse > 1) {
2244 		/*
2245 		 * Reduce TCP segment aggregation limit for multiple
2246 		 * RX rings to increase ACK timeliness.
2247 		 */
2248 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2249 	}
2250 #endif
2251 
2252 	/*
2253 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2254 	 */
2255 	hn_fixup_tx_data(sc);
2256 	hn_fixup_rx_data(sc);
2257 
2258 	ctx = device_get_sysctl_ctx(dev);
2259 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2260 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2261 	    &sc->hn_nvs_ver, 0, "NVS version");
2262 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2263 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2264 	    hn_ndis_version_sysctl, "A", "NDIS version");
2265 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2266 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2267 	    hn_caps_sysctl, "A", "capabilities");
2268 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2269 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2270 	    hn_hwassist_sysctl, "A", "hwassist");
2271 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2272 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2273 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2274 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2275 	    "max # of TSO segments");
2276 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2277 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2278 	    "max size of TSO segment");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_rxfilter_sysctl, "A", "rxfilter");
2282 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2283 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284 	    hn_rss_hash_sysctl, "A", "RSS hash");
2285 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2286 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2288 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2289 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2290 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2291 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2292 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2293 #ifndef RSS
2294 	/*
2295 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2296 	 */
2297 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2298 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2299 	    hn_rss_key_sysctl, "IU", "RSS key");
2300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2301 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2302 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2303 #endif
2304 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2305 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2306 	    "RNDIS offered packet transmission aggregation size limit");
2307 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2308 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2309 	    "RNDIS offered packet transmission aggregation count limit");
2310 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2311 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2312 	    "RNDIS packet transmission aggregation alignment");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2314 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_txagg_size_sysctl, "I",
2316 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2318 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_txagg_pkts_sysctl, "I",
2320 	    "Packet transmission aggregation packets, "
2321 	    "0 -- disable, -1 -- auto");
2322 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2323 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2324 	    hn_polling_sysctl, "I",
2325 	    "Polling frequency: [100,1000000], 0 disable polling");
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2327 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2328 	    hn_vf_sysctl, "A", "Virtual Function's name");
2329 	if (!hn_xpnt_vf) {
2330 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2331 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2332 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2333 	} else {
2334 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2335 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2336 		    hn_xpnt_vf_enabled_sysctl, "I",
2337 		    "Transparent VF enabled");
2338 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2339 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2340 		    hn_xpnt_vf_accbpf_sysctl, "I",
2341 		    "Accurate BPF for transparent VF");
2342 	}
2343 
2344 	/*
2345 	 * Setup the ifmedia, which has been initialized earlier.
2346 	 */
2347 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2348 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2349 	/* XXX ifmedia_set really should do this for us */
2350 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2351 
2352 	/*
2353 	 * Setup the ifnet for this interface.
2354 	 */
2355 
2356 	ifp->if_baudrate = IF_Gbps(10);
2357 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2358 	ifp->if_ioctl = hn_ioctl;
2359 	ifp->if_init = hn_init;
2360 #ifdef HN_IFSTART_SUPPORT
2361 	if (hn_use_if_start) {
2362 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2363 
2364 		ifp->if_start = hn_start;
2365 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2366 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2367 		IFQ_SET_READY(&ifp->if_snd);
2368 	} else
2369 #endif
2370 	{
2371 		ifp->if_transmit = hn_transmit;
2372 		ifp->if_qflush = hn_xmit_qflush;
2373 	}
2374 
2375 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2376 #ifdef foo
2377 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2378 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2379 #endif
2380 	if (sc->hn_caps & HN_CAP_VLAN) {
2381 		/* XXX not sure about VLAN_MTU. */
2382 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2383 	}
2384 
2385 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2386 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2387 		ifp->if_capabilities |= IFCAP_TXCSUM;
2388 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2389 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2390 	if (sc->hn_caps & HN_CAP_TSO4) {
2391 		ifp->if_capabilities |= IFCAP_TSO4;
2392 		ifp->if_hwassist |= CSUM_IP_TSO;
2393 	}
2394 	if (sc->hn_caps & HN_CAP_TSO6) {
2395 		ifp->if_capabilities |= IFCAP_TSO6;
2396 		ifp->if_hwassist |= CSUM_IP6_TSO;
2397 	}
2398 
2399 	/* Enable all available capabilities by default. */
2400 	ifp->if_capenable = ifp->if_capabilities;
2401 
2402 	/*
2403 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2404 	 * be enabled through SIOCSIFCAP.
2405 	 */
2406 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2407 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2408 
2409 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2410 		/*
2411 		 * Lock hn_set_tso_maxsize() to simplify its
2412 		 * internal logic.
2413 		 */
2414 		HN_LOCK(sc);
2415 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2416 		HN_UNLOCK(sc);
2417 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2418 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2419 	}
2420 
2421 	ether_ifattach(ifp, eaddr);
2422 
2423 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2424 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2425 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2426 	}
2427 	if (mtu < ETHERMTU) {
2428 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2429 		ifp->if_mtu = mtu;
2430 	}
2431 
2432 	/* Inform the upper layer about the long frame support. */
2433 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2434 
2435 	/*
2436 	 * Kick off link status check.
2437 	 */
2438 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2439 	hn_update_link_status(sc);
2440 
2441 	if (!hn_xpnt_vf) {
2442 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2443 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2444 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2445 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2446 	} else {
2447 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2448 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2449 	}
2450 
2451 	/*
2452 	 * NOTE:
2453 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2454 	 * since interface's LLADDR is needed; interface LLADDR is not
2455 	 * available when ifnet_arrival event is triggered.
2456 	 */
2457 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2458 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2459 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2460 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2461 
2462 	return (0);
2463 failed:
2464 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2465 		hn_synth_detach(sc);
2466 	hn_detach(dev);
2467 	return (error);
2468 }
2469 
2470 static int
2471 hn_detach(device_t dev)
2472 {
2473 	struct hn_softc *sc = device_get_softc(dev);
2474 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2475 
2476 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2477 		/*
2478 		 * In case that the vmbus missed the orphan handler
2479 		 * installation.
2480 		 */
2481 		vmbus_xact_ctx_orphan(sc->hn_xact);
2482 	}
2483 
2484 	if (sc->hn_ifaddr_evthand != NULL)
2485 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2486 	if (sc->hn_ifnet_evthand != NULL)
2487 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2488 	if (sc->hn_ifnet_atthand != NULL) {
2489 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2490 		    sc->hn_ifnet_atthand);
2491 	}
2492 	if (sc->hn_ifnet_dethand != NULL) {
2493 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2494 		    sc->hn_ifnet_dethand);
2495 	}
2496 	if (sc->hn_ifnet_lnkhand != NULL)
2497 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2498 
2499 	vf_ifp = sc->hn_vf_ifp;
2500 	__compiler_membar();
2501 	if (vf_ifp != NULL)
2502 		hn_ifnet_detevent(sc, vf_ifp);
2503 
2504 	if (device_is_attached(dev)) {
2505 		HN_LOCK(sc);
2506 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2507 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2508 				hn_stop(sc, true);
2509 			/*
2510 			 * NOTE:
2511 			 * hn_stop() only suspends data, so managment
2512 			 * stuffs have to be suspended manually here.
2513 			 */
2514 			hn_suspend_mgmt(sc);
2515 			hn_synth_detach(sc);
2516 		}
2517 		HN_UNLOCK(sc);
2518 		ether_ifdetach(ifp);
2519 	}
2520 
2521 	ifmedia_removeall(&sc->hn_media);
2522 	hn_destroy_rx_data(sc);
2523 	hn_destroy_tx_data(sc);
2524 
2525 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2526 		int i;
2527 
2528 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2529 			taskqueue_free(sc->hn_tx_taskqs[i]);
2530 		free(sc->hn_tx_taskqs, M_DEVBUF);
2531 	}
2532 	taskqueue_free(sc->hn_mgmt_taskq0);
2533 	if (sc->hn_vf_taskq != NULL)
2534 		taskqueue_free(sc->hn_vf_taskq);
2535 
2536 	if (sc->hn_xact != NULL) {
2537 		/*
2538 		 * Uninstall the orphan handler _before_ the xact is
2539 		 * destructed.
2540 		 */
2541 		vmbus_chan_unset_orphan(sc->hn_prichan);
2542 		vmbus_xact_ctx_destroy(sc->hn_xact);
2543 	}
2544 
2545 	if_free(ifp);
2546 
2547 	HN_LOCK_DESTROY(sc);
2548 	rm_destroy(&sc->hn_vf_lock);
2549 	return (0);
2550 }
2551 
2552 static int
2553 hn_shutdown(device_t dev)
2554 {
2555 
2556 	return (0);
2557 }
2558 
2559 static void
2560 hn_link_status(struct hn_softc *sc)
2561 {
2562 	uint32_t link_status;
2563 	int error;
2564 
2565 	error = hn_rndis_get_linkstatus(sc, &link_status);
2566 	if (error) {
2567 		/* XXX what to do? */
2568 		return;
2569 	}
2570 
2571 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2572 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2573 	else
2574 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2575 	if_link_state_change(sc->hn_ifp,
2576 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2577 	    LINK_STATE_UP : LINK_STATE_DOWN);
2578 }
2579 
2580 static void
2581 hn_link_taskfunc(void *xsc, int pending __unused)
2582 {
2583 	struct hn_softc *sc = xsc;
2584 
2585 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2586 		return;
2587 	hn_link_status(sc);
2588 }
2589 
2590 static void
2591 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2592 {
2593 	struct hn_softc *sc = xsc;
2594 
2595 	/* Prevent any link status checks from running. */
2596 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2597 
2598 	/*
2599 	 * Fake up a [link down --> link up] state change; 5 seconds
2600 	 * delay is used, which closely simulates miibus reaction
2601 	 * upon link down event.
2602 	 */
2603 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2604 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2605 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2606 	    &sc->hn_netchg_status, 5 * hz);
2607 }
2608 
2609 static void
2610 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2611 {
2612 	struct hn_softc *sc = xsc;
2613 
2614 	/* Re-allow link status checks. */
2615 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2616 	hn_link_status(sc);
2617 }
2618 
2619 static void
2620 hn_update_link_status(struct hn_softc *sc)
2621 {
2622 
2623 	if (sc->hn_mgmt_taskq != NULL)
2624 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2625 }
2626 
2627 static void
2628 hn_change_network(struct hn_softc *sc)
2629 {
2630 
2631 	if (sc->hn_mgmt_taskq != NULL)
2632 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2633 }
2634 
2635 static __inline int
2636 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2637     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2638 {
2639 	struct mbuf *m = *m_head;
2640 	int error;
2641 
2642 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2643 
2644 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2645 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2646 	if (error == EFBIG) {
2647 		struct mbuf *m_new;
2648 
2649 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2650 		if (m_new == NULL)
2651 			return ENOBUFS;
2652 		else
2653 			*m_head = m = m_new;
2654 		txr->hn_tx_collapsed++;
2655 
2656 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2657 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2658 	}
2659 	if (!error) {
2660 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2661 		    BUS_DMASYNC_PREWRITE);
2662 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2663 	}
2664 	return error;
2665 }
2666 
2667 static __inline int
2668 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2669 {
2670 
2671 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2672 	    ("put an onlist txd %#x", txd->flags));
2673 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2674 	    ("put an onagg txd %#x", txd->flags));
2675 
2676 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2677 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2678 		return 0;
2679 
2680 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2681 		struct hn_txdesc *tmp_txd;
2682 
2683 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2684 			int freed;
2685 
2686 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2687 			    ("resursive aggregation on aggregated txdesc"));
2688 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2689 			    ("not aggregated txdesc"));
2690 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2691 			    ("aggregated txdesc uses dmamap"));
2692 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2693 			    ("aggregated txdesc consumes "
2694 			     "chimney sending buffer"));
2695 			KASSERT(tmp_txd->chim_size == 0,
2696 			    ("aggregated txdesc has non-zero "
2697 			     "chimney sending size"));
2698 
2699 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2700 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2701 			freed = hn_txdesc_put(txr, tmp_txd);
2702 			KASSERT(freed, ("failed to free aggregated txdesc"));
2703 		}
2704 	}
2705 
2706 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2707 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708 		    ("chim txd uses dmamap"));
2709 		hn_chim_free(txr->hn_sc, txd->chim_index);
2710 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2711 		txd->chim_size = 0;
2712 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2713 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2714 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2715 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2716 		    txd->data_dmap);
2717 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2718 	}
2719 
2720 	if (txd->m != NULL) {
2721 		m_freem(txd->m);
2722 		txd->m = NULL;
2723 	}
2724 
2725 	txd->flags |= HN_TXD_FLAG_ONLIST;
2726 #ifndef HN_USE_TXDESC_BUFRING
2727 	mtx_lock_spin(&txr->hn_txlist_spin);
2728 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2729 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2730 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2731 	txr->hn_txdesc_avail++;
2732 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2733 	mtx_unlock_spin(&txr->hn_txlist_spin);
2734 #else	/* HN_USE_TXDESC_BUFRING */
2735 #ifdef HN_DEBUG
2736 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2737 #endif
2738 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2739 #endif	/* !HN_USE_TXDESC_BUFRING */
2740 
2741 	return 1;
2742 }
2743 
2744 static __inline struct hn_txdesc *
2745 hn_txdesc_get(struct hn_tx_ring *txr)
2746 {
2747 	struct hn_txdesc *txd;
2748 
2749 #ifndef HN_USE_TXDESC_BUFRING
2750 	mtx_lock_spin(&txr->hn_txlist_spin);
2751 	txd = SLIST_FIRST(&txr->hn_txlist);
2752 	if (txd != NULL) {
2753 		KASSERT(txr->hn_txdesc_avail > 0,
2754 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2755 		txr->hn_txdesc_avail--;
2756 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2757 	}
2758 	mtx_unlock_spin(&txr->hn_txlist_spin);
2759 #else
2760 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2761 #endif
2762 
2763 	if (txd != NULL) {
2764 #ifdef HN_USE_TXDESC_BUFRING
2765 #ifdef HN_DEBUG
2766 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2767 #endif
2768 #endif	/* HN_USE_TXDESC_BUFRING */
2769 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2770 		    STAILQ_EMPTY(&txd->agg_list) &&
2771 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2772 		    txd->chim_size == 0 &&
2773 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2774 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2775 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2776 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2777 		txd->refs = 1;
2778 	}
2779 	return txd;
2780 }
2781 
2782 static __inline void
2783 hn_txdesc_hold(struct hn_txdesc *txd)
2784 {
2785 
2786 	/* 0->1 transition will never work */
2787 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2788 	atomic_add_int(&txd->refs, 1);
2789 }
2790 
2791 static __inline void
2792 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2793 {
2794 
2795 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2796 	    ("recursive aggregation on aggregating txdesc"));
2797 
2798 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2799 	    ("already aggregated"));
2800 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2801 	    ("recursive aggregation on to-be-aggregated txdesc"));
2802 
2803 	txd->flags |= HN_TXD_FLAG_ONAGG;
2804 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2805 }
2806 
2807 static bool
2808 hn_tx_ring_pending(struct hn_tx_ring *txr)
2809 {
2810 	bool pending = false;
2811 
2812 #ifndef HN_USE_TXDESC_BUFRING
2813 	mtx_lock_spin(&txr->hn_txlist_spin);
2814 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2815 		pending = true;
2816 	mtx_unlock_spin(&txr->hn_txlist_spin);
2817 #else
2818 	if (!buf_ring_full(txr->hn_txdesc_br))
2819 		pending = true;
2820 #endif
2821 	return (pending);
2822 }
2823 
2824 static __inline void
2825 hn_txeof(struct hn_tx_ring *txr)
2826 {
2827 	txr->hn_has_txeof = 0;
2828 	txr->hn_txeof(txr);
2829 }
2830 
2831 static void
2832 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2833     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2834 {
2835 	struct hn_txdesc *txd = sndc->hn_cbarg;
2836 	struct hn_tx_ring *txr;
2837 
2838 	txr = txd->txr;
2839 	KASSERT(txr->hn_chan == chan,
2840 	    ("channel mismatch, on chan%u, should be chan%u",
2841 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2842 
2843 	txr->hn_has_txeof = 1;
2844 	hn_txdesc_put(txr, txd);
2845 
2846 	++txr->hn_txdone_cnt;
2847 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2848 		txr->hn_txdone_cnt = 0;
2849 		if (txr->hn_oactive)
2850 			hn_txeof(txr);
2851 	}
2852 }
2853 
2854 static void
2855 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2856 {
2857 #if defined(INET) || defined(INET6)
2858 	tcp_lro_flush_all(&rxr->hn_lro);
2859 #endif
2860 
2861 	/*
2862 	 * NOTE:
2863 	 * 'txr' could be NULL, if multiple channels and
2864 	 * ifnet.if_start method are enabled.
2865 	 */
2866 	if (txr == NULL || !txr->hn_has_txeof)
2867 		return;
2868 
2869 	txr->hn_txdone_cnt = 0;
2870 	hn_txeof(txr);
2871 }
2872 
2873 static __inline uint32_t
2874 hn_rndis_pktmsg_offset(uint32_t ofs)
2875 {
2876 
2877 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2878 	    ("invalid RNDIS packet msg offset %u", ofs));
2879 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2880 }
2881 
2882 static __inline void *
2883 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2884     size_t pi_dlen, uint32_t pi_type)
2885 {
2886 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2887 	struct rndis_pktinfo *pi;
2888 
2889 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2890 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2891 
2892 	/*
2893 	 * Per-packet-info does not move; it only grows.
2894 	 *
2895 	 * NOTE:
2896 	 * rm_pktinfooffset in this phase counts from the beginning
2897 	 * of rndis_packet_msg.
2898 	 */
2899 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2900 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2901 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2902 	    pkt->rm_pktinfolen);
2903 	pkt->rm_pktinfolen += pi_size;
2904 
2905 	pi->rm_size = pi_size;
2906 	pi->rm_type = pi_type;
2907 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2908 
2909 	return (pi->rm_data);
2910 }
2911 
2912 static __inline int
2913 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2914 {
2915 	struct hn_txdesc *txd;
2916 	struct mbuf *m;
2917 	int error, pkts;
2918 
2919 	txd = txr->hn_agg_txd;
2920 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2921 
2922 	/*
2923 	 * Since hn_txpkt() will reset this temporary stat, save
2924 	 * it now, so that oerrors can be updated properly, if
2925 	 * hn_txpkt() ever fails.
2926 	 */
2927 	pkts = txr->hn_stat_pkts;
2928 
2929 	/*
2930 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2931 	 * failure, save it for later freeing, if hn_txpkt() ever
2932 	 * fails.
2933 	 */
2934 	m = txd->m;
2935 	error = hn_txpkt(ifp, txr, txd);
2936 	if (__predict_false(error)) {
2937 		/* txd is freed, but m is not. */
2938 		m_freem(m);
2939 
2940 		txr->hn_flush_failed++;
2941 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2942 	}
2943 
2944 	/* Reset all aggregation states. */
2945 	txr->hn_agg_txd = NULL;
2946 	txr->hn_agg_szleft = 0;
2947 	txr->hn_agg_pktleft = 0;
2948 	txr->hn_agg_prevpkt = NULL;
2949 
2950 	return (error);
2951 }
2952 
2953 static void *
2954 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2955     int pktsize)
2956 {
2957 	void *chim;
2958 
2959 	if (txr->hn_agg_txd != NULL) {
2960 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2961 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2962 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2963 			int olen;
2964 
2965 			/*
2966 			 * Update the previous RNDIS packet's total length,
2967 			 * it can be increased due to the mandatory alignment
2968 			 * padding for this RNDIS packet.  And update the
2969 			 * aggregating txdesc's chimney sending buffer size
2970 			 * accordingly.
2971 			 *
2972 			 * XXX
2973 			 * Zero-out the padding, as required by the RNDIS spec.
2974 			 */
2975 			olen = pkt->rm_len;
2976 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2977 			agg_txd->chim_size += pkt->rm_len - olen;
2978 
2979 			/* Link this txdesc to the parent. */
2980 			hn_txdesc_agg(agg_txd, txd);
2981 
2982 			chim = (uint8_t *)pkt + pkt->rm_len;
2983 			/* Save the current packet for later fixup. */
2984 			txr->hn_agg_prevpkt = chim;
2985 
2986 			txr->hn_agg_pktleft--;
2987 			txr->hn_agg_szleft -= pktsize;
2988 			if (txr->hn_agg_szleft <=
2989 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2990 				/*
2991 				 * Probably can't aggregate more packets,
2992 				 * flush this aggregating txdesc proactively.
2993 				 */
2994 				txr->hn_agg_pktleft = 0;
2995 			}
2996 			/* Done! */
2997 			return (chim);
2998 		}
2999 		hn_flush_txagg(ifp, txr);
3000 	}
3001 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3002 
3003 	txr->hn_tx_chimney_tried++;
3004 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3005 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3006 		return (NULL);
3007 	txr->hn_tx_chimney++;
3008 
3009 	chim = txr->hn_sc->hn_chim +
3010 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3011 
3012 	if (txr->hn_agg_pktmax > 1 &&
3013 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3014 		txr->hn_agg_txd = txd;
3015 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3016 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3017 		txr->hn_agg_prevpkt = chim;
3018 	}
3019 	return (chim);
3020 }
3021 
3022 /*
3023  * NOTE:
3024  * If this function fails, then both txd and m_head0 will be freed.
3025  */
3026 static int
3027 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3028     struct mbuf **m_head0)
3029 {
3030 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3031 	int error, nsegs, i;
3032 	struct mbuf *m_head = *m_head0;
3033 	struct rndis_packet_msg *pkt;
3034 	uint32_t *pi_data;
3035 	void *chim = NULL;
3036 	int pkt_hlen, pkt_size;
3037 
3038 	pkt = txd->rndis_pkt;
3039 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3040 	if (pkt_size < txr->hn_chim_size) {
3041 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3042 		if (chim != NULL)
3043 			pkt = chim;
3044 	} else {
3045 		if (txr->hn_agg_txd != NULL)
3046 			hn_flush_txagg(ifp, txr);
3047 	}
3048 
3049 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3050 	pkt->rm_len = m_head->m_pkthdr.len;
3051 	pkt->rm_dataoffset = 0;
3052 	pkt->rm_datalen = m_head->m_pkthdr.len;
3053 	pkt->rm_oobdataoffset = 0;
3054 	pkt->rm_oobdatalen = 0;
3055 	pkt->rm_oobdataelements = 0;
3056 	pkt->rm_pktinfooffset = sizeof(*pkt);
3057 	pkt->rm_pktinfolen = 0;
3058 	pkt->rm_vchandle = 0;
3059 	pkt->rm_reserved = 0;
3060 
3061 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3062 		/*
3063 		 * Set the hash value for this packet, so that the host could
3064 		 * dispatch the TX done event for this packet back to this TX
3065 		 * ring's channel.
3066 		 */
3067 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3068 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3069 		*pi_data = txr->hn_tx_idx;
3070 	}
3071 
3072 	if (m_head->m_flags & M_VLANTAG) {
3073 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3074 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3075 		*pi_data = NDIS_VLAN_INFO_MAKE(
3076 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3077 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3078 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3079 	}
3080 
3081 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3082 #if defined(INET6) || defined(INET)
3083 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3084 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3085 #ifdef INET
3086 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3087 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3088 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3089 			    m_head->m_pkthdr.tso_segsz);
3090 		}
3091 #endif
3092 #if defined(INET6) && defined(INET)
3093 		else
3094 #endif
3095 #ifdef INET6
3096 		{
3097 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3098 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3099 			    m_head->m_pkthdr.tso_segsz);
3100 		}
3101 #endif
3102 #endif	/* INET6 || INET */
3103 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3104 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3105 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3106 		if (m_head->m_pkthdr.csum_flags &
3107 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3108 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3109 		} else {
3110 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3111 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3112 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3113 		}
3114 
3115 		if (m_head->m_pkthdr.csum_flags &
3116 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3117 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3118 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3119 		} else if (m_head->m_pkthdr.csum_flags &
3120 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3121 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3122 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3123 		}
3124 	}
3125 
3126 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3127 	/* Fixup RNDIS packet message total length */
3128 	pkt->rm_len += pkt_hlen;
3129 	/* Convert RNDIS packet message offsets */
3130 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3131 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3132 
3133 	/*
3134 	 * Fast path: Chimney sending.
3135 	 */
3136 	if (chim != NULL) {
3137 		struct hn_txdesc *tgt_txd = txd;
3138 
3139 		if (txr->hn_agg_txd != NULL) {
3140 			tgt_txd = txr->hn_agg_txd;
3141 #ifdef INVARIANTS
3142 			*m_head0 = NULL;
3143 #endif
3144 		}
3145 
3146 		KASSERT(pkt == chim,
3147 		    ("RNDIS pkt not in chimney sending buffer"));
3148 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3149 		    ("chimney sending buffer is not used"));
3150 		tgt_txd->chim_size += pkt->rm_len;
3151 
3152 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3153 		    ((uint8_t *)chim) + pkt_hlen);
3154 
3155 		txr->hn_gpa_cnt = 0;
3156 		txr->hn_sendpkt = hn_txpkt_chim;
3157 		goto done;
3158 	}
3159 
3160 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3161 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3162 	    ("chimney buffer is used"));
3163 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3164 
3165 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3166 	if (__predict_false(error)) {
3167 		int freed;
3168 
3169 		/*
3170 		 * This mbuf is not linked w/ the txd yet, so free it now.
3171 		 */
3172 		m_freem(m_head);
3173 		*m_head0 = NULL;
3174 
3175 		freed = hn_txdesc_put(txr, txd);
3176 		KASSERT(freed != 0,
3177 		    ("fail to free txd upon txdma error"));
3178 
3179 		txr->hn_txdma_failed++;
3180 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3181 		return error;
3182 	}
3183 	*m_head0 = m_head;
3184 
3185 	/* +1 RNDIS packet message */
3186 	txr->hn_gpa_cnt = nsegs + 1;
3187 
3188 	/* send packet with page buffer */
3189 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3190 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3191 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3192 
3193 	/*
3194 	 * Fill the page buffers with mbuf info after the page
3195 	 * buffer for RNDIS packet message.
3196 	 */
3197 	for (i = 0; i < nsegs; ++i) {
3198 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3199 
3200 		gpa->gpa_page = atop(segs[i].ds_addr);
3201 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3202 		gpa->gpa_len = segs[i].ds_len;
3203 	}
3204 
3205 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3206 	txd->chim_size = 0;
3207 	txr->hn_sendpkt = hn_txpkt_sglist;
3208 done:
3209 	txd->m = m_head;
3210 
3211 	/* Set the completion routine */
3212 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3213 
3214 	/* Update temporary stats for later use. */
3215 	txr->hn_stat_pkts++;
3216 	txr->hn_stat_size += m_head->m_pkthdr.len;
3217 	if (m_head->m_flags & M_MCAST)
3218 		txr->hn_stat_mcasts++;
3219 
3220 	return 0;
3221 }
3222 
3223 /*
3224  * NOTE:
3225  * If this function fails, then txd will be freed, but the mbuf
3226  * associated w/ the txd will _not_ be freed.
3227  */
3228 static int
3229 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3230 {
3231 	int error, send_failed = 0, has_bpf;
3232 
3233 again:
3234 	has_bpf = bpf_peers_present(ifp->if_bpf);
3235 	if (has_bpf) {
3236 		/*
3237 		 * Make sure that this txd and any aggregated txds are not
3238 		 * freed before ETHER_BPF_MTAP.
3239 		 */
3240 		hn_txdesc_hold(txd);
3241 	}
3242 	error = txr->hn_sendpkt(txr, txd);
3243 	if (!error) {
3244 		if (has_bpf) {
3245 			const struct hn_txdesc *tmp_txd;
3246 
3247 			ETHER_BPF_MTAP(ifp, txd->m);
3248 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3249 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3250 		}
3251 
3252 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3253 #ifdef HN_IFSTART_SUPPORT
3254 		if (!hn_use_if_start)
3255 #endif
3256 		{
3257 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3258 			    txr->hn_stat_size);
3259 			if (txr->hn_stat_mcasts != 0) {
3260 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3261 				    txr->hn_stat_mcasts);
3262 			}
3263 		}
3264 		txr->hn_pkts += txr->hn_stat_pkts;
3265 		txr->hn_sends++;
3266 	}
3267 	if (has_bpf)
3268 		hn_txdesc_put(txr, txd);
3269 
3270 	if (__predict_false(error)) {
3271 		int freed;
3272 
3273 		/*
3274 		 * This should "really rarely" happen.
3275 		 *
3276 		 * XXX Too many RX to be acked or too many sideband
3277 		 * commands to run?  Ask netvsc_channel_rollup()
3278 		 * to kick start later.
3279 		 */
3280 		txr->hn_has_txeof = 1;
3281 		if (!send_failed) {
3282 			txr->hn_send_failed++;
3283 			send_failed = 1;
3284 			/*
3285 			 * Try sending again after set hn_has_txeof;
3286 			 * in case that we missed the last
3287 			 * netvsc_channel_rollup().
3288 			 */
3289 			goto again;
3290 		}
3291 		if_printf(ifp, "send failed\n");
3292 
3293 		/*
3294 		 * Caller will perform further processing on the
3295 		 * associated mbuf, so don't free it in hn_txdesc_put();
3296 		 * only unload it from the DMA map in hn_txdesc_put(),
3297 		 * if it was loaded.
3298 		 */
3299 		txd->m = NULL;
3300 		freed = hn_txdesc_put(txr, txd);
3301 		KASSERT(freed != 0,
3302 		    ("fail to free txd upon send error"));
3303 
3304 		txr->hn_send_failed++;
3305 	}
3306 
3307 	/* Reset temporary stats, after this sending is done. */
3308 	txr->hn_stat_size = 0;
3309 	txr->hn_stat_pkts = 0;
3310 	txr->hn_stat_mcasts = 0;
3311 
3312 	return (error);
3313 }
3314 
3315 /*
3316  * Append the specified data to the indicated mbuf chain,
3317  * Extend the mbuf chain if the new data does not fit in
3318  * existing space.
3319  *
3320  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3321  * There should be an equivalent in the kernel mbuf code,
3322  * but there does not appear to be one yet.
3323  *
3324  * Differs from m_append() in that additional mbufs are
3325  * allocated with cluster size MJUMPAGESIZE, and filled
3326  * accordingly.
3327  *
3328  * Return 1 if able to complete the job; otherwise 0.
3329  */
3330 static int
3331 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3332 {
3333 	struct mbuf *m, *n;
3334 	int remainder, space;
3335 
3336 	for (m = m0; m->m_next != NULL; m = m->m_next)
3337 		;
3338 	remainder = len;
3339 	space = M_TRAILINGSPACE(m);
3340 	if (space > 0) {
3341 		/*
3342 		 * Copy into available space.
3343 		 */
3344 		if (space > remainder)
3345 			space = remainder;
3346 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3347 		m->m_len += space;
3348 		cp += space;
3349 		remainder -= space;
3350 	}
3351 	while (remainder > 0) {
3352 		/*
3353 		 * Allocate a new mbuf; could check space
3354 		 * and allocate a cluster instead.
3355 		 */
3356 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3357 		if (n == NULL)
3358 			break;
3359 		n->m_len = min(MJUMPAGESIZE, remainder);
3360 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3361 		cp += n->m_len;
3362 		remainder -= n->m_len;
3363 		m->m_next = n;
3364 		m = n;
3365 	}
3366 	if (m0->m_flags & M_PKTHDR)
3367 		m0->m_pkthdr.len += len - remainder;
3368 
3369 	return (remainder == 0);
3370 }
3371 
#if defined(INET) || defined(INET6)
/*
 * Feed a received mbuf to LRO.
 *
 * On FreeBSD versions providing tcp_lro_queue_mbuf(), the mbuf is only
 * queued, and 0 is returned, if hn_lro_mbufq_depth is non-zero.
 * Otherwise the mbuf is passed to tcp_lro_rx() and its result is
 * returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth != 0) {
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	return (tcp_lro_rx(lc, m, 0));
}
#endif
3385 
3386 static int
3387 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3388     const struct hn_rxinfo *info)
3389 {
3390 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3391 	struct mbuf *m_new;
3392 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3393 	int hash_type = M_HASHTYPE_NONE;
3394 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3395 
3396 	ifp = hn_ifp;
3397 	if (rxr->hn_rxvf_ifp != NULL) {
3398 		/*
3399 		 * Non-transparent mode VF; pretend this packet is from
3400 		 * the VF.
3401 		 */
3402 		ifp = rxr->hn_rxvf_ifp;
3403 		is_vf = 1;
3404 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3405 		/* Transparent mode VF. */
3406 		is_vf = 1;
3407 	}
3408 
3409 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3410 		/*
3411 		 * NOTE:
3412 		 * See the NOTE of hn_rndis_init_fixat().  This
3413 		 * function can be reached, immediately after the
3414 		 * RNDIS is initialized but before the ifnet is
3415 		 * setup on the hn_attach() path; drop the unexpected
3416 		 * packets.
3417 		 */
3418 		return (0);
3419 	}
3420 
3421 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3422 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3423 		return (0);
3424 	}
3425 
3426 	if (dlen <= MHLEN) {
3427 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3428 		if (m_new == NULL) {
3429 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3430 			return (0);
3431 		}
3432 		memcpy(mtod(m_new, void *), data, dlen);
3433 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3434 		rxr->hn_small_pkts++;
3435 	} else {
3436 		/*
3437 		 * Get an mbuf with a cluster.  For packets 2K or less,
3438 		 * get a standard 2K cluster.  For anything larger, get a
3439 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3440 		 * if looped around to the Hyper-V TX channel, so avoid them.
3441 		 */
3442 		size = MCLBYTES;
3443 		if (dlen > MCLBYTES) {
3444 			/* 4096 */
3445 			size = MJUMPAGESIZE;
3446 		}
3447 
3448 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3449 		if (m_new == NULL) {
3450 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3451 			return (0);
3452 		}
3453 
3454 		hv_m_append(m_new, dlen, data);
3455 	}
3456 	m_new->m_pkthdr.rcvif = ifp;
3457 
3458 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3459 		do_csum = 0;
3460 
3461 	/* receive side checksum offload */
3462 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3463 		/* IP csum offload */
3464 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3465 			m_new->m_pkthdr.csum_flags |=
3466 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3467 			rxr->hn_csum_ip++;
3468 		}
3469 
3470 		/* TCP/UDP csum offload */
3471 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3472 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3473 			m_new->m_pkthdr.csum_flags |=
3474 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3475 			m_new->m_pkthdr.csum_data = 0xffff;
3476 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3477 				rxr->hn_csum_tcp++;
3478 			else
3479 				rxr->hn_csum_udp++;
3480 		}
3481 
3482 		/*
3483 		 * XXX
3484 		 * As of this write (Oct 28th, 2016), host side will turn
3485 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3486 		 * the do_lro setting here is actually _not_ accurate.  We
3487 		 * depend on the RSS hash type check to reset do_lro.
3488 		 */
3489 		if ((info->csum_info &
3490 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3491 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3492 			do_lro = 1;
3493 	} else {
3494 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3495 		if (l3proto == ETHERTYPE_IP) {
3496 			if (l4proto == IPPROTO_TCP) {
3497 				if (do_csum &&
3498 				    (rxr->hn_trust_hcsum &
3499 				     HN_TRUST_HCSUM_TCP)) {
3500 					rxr->hn_csum_trusted++;
3501 					m_new->m_pkthdr.csum_flags |=
3502 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3503 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3504 					m_new->m_pkthdr.csum_data = 0xffff;
3505 				}
3506 				do_lro = 1;
3507 			} else if (l4proto == IPPROTO_UDP) {
3508 				if (do_csum &&
3509 				    (rxr->hn_trust_hcsum &
3510 				     HN_TRUST_HCSUM_UDP)) {
3511 					rxr->hn_csum_trusted++;
3512 					m_new->m_pkthdr.csum_flags |=
3513 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3514 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3515 					m_new->m_pkthdr.csum_data = 0xffff;
3516 				}
3517 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3518 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3519 				rxr->hn_csum_trusted++;
3520 				m_new->m_pkthdr.csum_flags |=
3521 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3522 			}
3523 		}
3524 	}
3525 
3526 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3527 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3528 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3529 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3530 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3531 		m_new->m_flags |= M_VLANTAG;
3532 	}
3533 
3534 	/*
3535 	 * If VF is activated (tranparent/non-transparent mode does not
3536 	 * matter here).
3537 	 *
3538 	 * - Disable LRO
3539 	 *
3540 	 *   hn(4) will only receive broadcast packets, multicast packets,
3541 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3542 	 *   packet types.
3543 	 *
3544 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3545 	 *   all, since the LRO flush will use hn(4) as the receiving
3546 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3547 	 */
3548 	if (is_vf)
3549 		do_lro = 0;
3550 
3551 	/*
3552 	 * If VF is activated (tranparent/non-transparent mode does not
3553 	 * matter here), do _not_ mess with unsupported hash types or
3554 	 * functions.
3555 	 */
3556 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3557 		rxr->hn_rss_pkts++;
3558 		m_new->m_pkthdr.flowid = info->hash_value;
3559 		if (!is_vf)
3560 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3561 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3562 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3563 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3564 			    rxr->hn_mbuf_hash);
3565 
3566 			/*
3567 			 * NOTE:
3568 			 * do_lro is resetted, if the hash types are not TCP
3569 			 * related.  See the comment in the above csum_flags
3570 			 * setup section.
3571 			 */
3572 			switch (type) {
3573 			case NDIS_HASH_IPV4:
3574 				hash_type = M_HASHTYPE_RSS_IPV4;
3575 				do_lro = 0;
3576 				break;
3577 
3578 			case NDIS_HASH_TCP_IPV4:
3579 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3580 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3581 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3582 
3583 					if (is_vf)
3584 						def_htype = M_HASHTYPE_NONE;
3585 
3586 					/*
3587 					 * UDP 4-tuple hash is delivered as
3588 					 * TCP 4-tuple hash.
3589 					 */
3590 					if (l3proto == ETHERTYPE_MAX) {
3591 						hn_rxpkt_proto(m_new,
3592 						    &l3proto, &l4proto);
3593 					}
3594 					if (l3proto == ETHERTYPE_IP) {
3595 						if (l4proto == IPPROTO_UDP &&
3596 						    (rxr->hn_mbuf_hash &
3597 						     NDIS_HASH_UDP_IPV4_X)) {
3598 							hash_type =
3599 							M_HASHTYPE_RSS_UDP_IPV4;
3600 							do_lro = 0;
3601 						} else if (l4proto !=
3602 						    IPPROTO_TCP) {
3603 							hash_type = def_htype;
3604 							do_lro = 0;
3605 						}
3606 					} else {
3607 						hash_type = def_htype;
3608 						do_lro = 0;
3609 					}
3610 				}
3611 				break;
3612 
3613 			case NDIS_HASH_IPV6:
3614 				hash_type = M_HASHTYPE_RSS_IPV6;
3615 				do_lro = 0;
3616 				break;
3617 
3618 			case NDIS_HASH_IPV6_EX:
3619 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3620 				do_lro = 0;
3621 				break;
3622 
3623 			case NDIS_HASH_TCP_IPV6:
3624 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3625 				break;
3626 
3627 			case NDIS_HASH_TCP_IPV6_EX:
3628 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3629 				break;
3630 			}
3631 		}
3632 	} else if (!is_vf) {
3633 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3634 		hash_type = M_HASHTYPE_OPAQUE;
3635 	}
3636 	M_HASHTYPE_SET(m_new, hash_type);
3637 
3638 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3639 	if (hn_ifp != ifp) {
3640 		const struct ether_header *eh;
3641 
3642 		/*
3643 		 * Non-transparent mode VF is activated.
3644 		 */
3645 
3646 		/*
3647 		 * Allow tapping on hn(4).
3648 		 */
3649 		ETHER_BPF_MTAP(hn_ifp, m_new);
3650 
3651 		/*
3652 		 * Update hn(4)'s stats.
3653 		 */
3654 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3655 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3656 		/* Checked at the beginning of this function. */
3657 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3658 		eh = mtod(m_new, struct ether_header *);
3659 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3660 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3661 	}
3662 	rxr->hn_pkts++;
3663 
3664 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3665 #if defined(INET) || defined(INET6)
3666 		struct lro_ctrl *lro = &rxr->hn_lro;
3667 
3668 		if (lro->lro_cnt) {
3669 			rxr->hn_lro_tried++;
3670 			if (hn_lro_rx(lro, m_new) == 0) {
3671 				/* DONE! */
3672 				return 0;
3673 			}
3674 		}
3675 #endif
3676 	}
3677 	ifp->if_input(ifp, m_new);
3678 
3679 	return (0);
3680 }
3681 
3682 static int
3683 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3684 {
3685 	struct hn_softc *sc = ifp->if_softc;
3686 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3687 	struct ifnet *vf_ifp;
3688 	int mask, error = 0;
3689 	struct ifrsskey *ifrk;
3690 	struct ifrsshash *ifrh;
3691 	uint32_t mtu;
3692 
3693 	switch (cmd) {
3694 	case SIOCSIFMTU:
3695 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3696 			error = EINVAL;
3697 			break;
3698 		}
3699 
3700 		HN_LOCK(sc);
3701 
3702 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3703 			HN_UNLOCK(sc);
3704 			break;
3705 		}
3706 
3707 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3708 			/* Can't change MTU */
3709 			HN_UNLOCK(sc);
3710 			error = EOPNOTSUPP;
3711 			break;
3712 		}
3713 
3714 		if (ifp->if_mtu == ifr->ifr_mtu) {
3715 			HN_UNLOCK(sc);
3716 			break;
3717 		}
3718 
3719 		if (hn_xpnt_vf_isready(sc)) {
3720 			vf_ifp = sc->hn_vf_ifp;
3721 			ifr_vf = *ifr;
3722 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3723 			    sizeof(ifr_vf.ifr_name));
3724 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3725 			    (caddr_t)&ifr_vf);
3726 			if (error) {
3727 				HN_UNLOCK(sc);
3728 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3729 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3730 				break;
3731 			}
3732 		}
3733 
3734 		/*
3735 		 * Suspend this interface before the synthetic parts
3736 		 * are ripped.
3737 		 */
3738 		hn_suspend(sc);
3739 
3740 		/*
3741 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3742 		 */
3743 		hn_synth_detach(sc);
3744 
3745 		/*
3746 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3747 		 * with the new MTU setting.
3748 		 */
3749 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3750 		if (error) {
3751 			HN_UNLOCK(sc);
3752 			break;
3753 		}
3754 
3755 		error = hn_rndis_get_mtu(sc, &mtu);
3756 		if (error)
3757 			mtu = ifr->ifr_mtu;
3758 		else if (bootverbose)
3759 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3760 
3761 		/*
3762 		 * Commit the requested MTU, after the synthetic parts
3763 		 * have been successfully attached.
3764 		 */
3765 		if (mtu >= ifr->ifr_mtu) {
3766 			mtu = ifr->ifr_mtu;
3767 		} else {
3768 			if_printf(ifp, "fixup mtu %d -> %u\n",
3769 			    ifr->ifr_mtu, mtu);
3770 		}
3771 		ifp->if_mtu = mtu;
3772 
3773 		/*
3774 		 * Synthetic parts' reattach may change the chimney
3775 		 * sending size; update it.
3776 		 */
3777 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3778 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3779 
3780 		/*
3781 		 * Make sure that various parameters based on MTU are
3782 		 * still valid, after the MTU change.
3783 		 */
3784 		hn_mtu_change_fixup(sc);
3785 
3786 		/*
3787 		 * All done!  Resume the interface now.
3788 		 */
3789 		hn_resume(sc);
3790 
3791 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3792 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3793 			/*
3794 			 * Since we have reattached the NVS part,
3795 			 * change the datapath to VF again; in case
3796 			 * that it is lost, after the NVS was detached.
3797 			 */
3798 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3799 		}
3800 
3801 		HN_UNLOCK(sc);
3802 		break;
3803 
3804 	case SIOCSIFFLAGS:
3805 		HN_LOCK(sc);
3806 
3807 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3808 			HN_UNLOCK(sc);
3809 			break;
3810 		}
3811 
3812 		if (hn_xpnt_vf_isready(sc))
3813 			hn_xpnt_vf_saveifflags(sc);
3814 
3815 		if (ifp->if_flags & IFF_UP) {
3816 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3817 				/*
3818 				 * Caller meight hold mutex, e.g.
3819 				 * bpf; use busy-wait for the RNDIS
3820 				 * reply.
3821 				 */
3822 				HN_NO_SLEEPING(sc);
3823 				hn_rxfilter_config(sc);
3824 				HN_SLEEPING_OK(sc);
3825 
3826 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3827 					error = hn_xpnt_vf_iocsetflags(sc);
3828 			} else {
3829 				hn_init_locked(sc);
3830 			}
3831 		} else {
3832 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3833 				hn_stop(sc, false);
3834 		}
3835 		sc->hn_if_flags = ifp->if_flags;
3836 
3837 		HN_UNLOCK(sc);
3838 		break;
3839 
3840 	case SIOCSIFCAP:
3841 		HN_LOCK(sc);
3842 
3843 		if (hn_xpnt_vf_isready(sc)) {
3844 			ifr_vf = *ifr;
3845 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3846 			    sizeof(ifr_vf.ifr_name));
3847 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3848 			HN_UNLOCK(sc);
3849 			break;
3850 		}
3851 
3852 		/*
3853 		 * Fix up requested capabilities w/ supported capabilities,
3854 		 * since the supported capabilities could have been changed.
3855 		 */
3856 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3857 		    ifp->if_capenable;
3858 
3859 		if (mask & IFCAP_TXCSUM) {
3860 			ifp->if_capenable ^= IFCAP_TXCSUM;
3861 			if (ifp->if_capenable & IFCAP_TXCSUM)
3862 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3863 			else
3864 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3865 		}
3866 		if (mask & IFCAP_TXCSUM_IPV6) {
3867 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3868 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3869 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3870 			else
3871 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3872 		}
3873 
3874 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3875 		if (mask & IFCAP_RXCSUM)
3876 			ifp->if_capenable ^= IFCAP_RXCSUM;
3877 #ifdef foo
3878 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3879 		if (mask & IFCAP_RXCSUM_IPV6)
3880 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3881 #endif
3882 
3883 		if (mask & IFCAP_LRO)
3884 			ifp->if_capenable ^= IFCAP_LRO;
3885 
3886 		if (mask & IFCAP_TSO4) {
3887 			ifp->if_capenable ^= IFCAP_TSO4;
3888 			if (ifp->if_capenable & IFCAP_TSO4)
3889 				ifp->if_hwassist |= CSUM_IP_TSO;
3890 			else
3891 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3892 		}
3893 		if (mask & IFCAP_TSO6) {
3894 			ifp->if_capenable ^= IFCAP_TSO6;
3895 			if (ifp->if_capenable & IFCAP_TSO6)
3896 				ifp->if_hwassist |= CSUM_IP6_TSO;
3897 			else
3898 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3899 		}
3900 
3901 		HN_UNLOCK(sc);
3902 		break;
3903 
3904 	case SIOCADDMULTI:
3905 	case SIOCDELMULTI:
3906 		HN_LOCK(sc);
3907 
3908 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3909 			HN_UNLOCK(sc);
3910 			break;
3911 		}
3912 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3913 			/*
3914 			 * Multicast uses mutex; use busy-wait for
3915 			 * the RNDIS reply.
3916 			 */
3917 			HN_NO_SLEEPING(sc);
3918 			hn_rxfilter_config(sc);
3919 			HN_SLEEPING_OK(sc);
3920 		}
3921 
3922 		/* XXX vlan(4) style mcast addr maintenance */
3923 		if (hn_xpnt_vf_isready(sc)) {
3924 			int old_if_flags;
3925 
3926 			old_if_flags = sc->hn_vf_ifp->if_flags;
3927 			hn_xpnt_vf_saveifflags(sc);
3928 
3929 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3930 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3931 			     IFF_ALLMULTI))
3932 				error = hn_xpnt_vf_iocsetflags(sc);
3933 		}
3934 
3935 		HN_UNLOCK(sc);
3936 		break;
3937 
3938 	case SIOCSIFMEDIA:
3939 	case SIOCGIFMEDIA:
3940 		HN_LOCK(sc);
3941 		if (hn_xpnt_vf_isready(sc)) {
3942 			/*
3943 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3944 			 * create and pass ifr_vf to the VF here; just
3945 			 * replace the ifr_name.
3946 			 */
3947 			vf_ifp = sc->hn_vf_ifp;
3948 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3949 			    sizeof(ifr->ifr_name));
3950 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3951 			/* Restore the ifr_name. */
3952 			strlcpy(ifr->ifr_name, ifp->if_xname,
3953 			    sizeof(ifr->ifr_name));
3954 			HN_UNLOCK(sc);
3955 			break;
3956 		}
3957 		HN_UNLOCK(sc);
3958 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3959 		break;
3960 
3961 	case SIOCGIFRSSHASH:
3962 		ifrh = (struct ifrsshash *)data;
3963 		HN_LOCK(sc);
3964 		if (sc->hn_rx_ring_inuse == 1) {
3965 			HN_UNLOCK(sc);
3966 			ifrh->ifrh_func = RSS_FUNC_NONE;
3967 			ifrh->ifrh_types = 0;
3968 			break;
3969 		}
3970 
3971 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3972 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3973 		else
3974 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3975 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3976 		HN_UNLOCK(sc);
3977 		break;
3978 
3979 	case SIOCGIFRSSKEY:
3980 		ifrk = (struct ifrsskey *)data;
3981 		HN_LOCK(sc);
3982 		if (sc->hn_rx_ring_inuse == 1) {
3983 			HN_UNLOCK(sc);
3984 			ifrk->ifrk_func = RSS_FUNC_NONE;
3985 			ifrk->ifrk_keylen = 0;
3986 			break;
3987 		}
3988 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3989 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3990 		else
3991 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3992 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3993 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3994 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
3995 		HN_UNLOCK(sc);
3996 		break;
3997 
3998 	default:
3999 		error = ether_ioctl(ifp, cmd, data);
4000 		break;
4001 	}
4002 	return (error);
4003 }
4004 
4005 static void
4006 hn_stop(struct hn_softc *sc, bool detaching)
4007 {
4008 	struct ifnet *ifp = sc->hn_ifp;
4009 	int i;
4010 
4011 	HN_LOCK_ASSERT(sc);
4012 
4013 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4014 	    ("synthetic parts were not attached"));
4015 
4016 	/* Clear RUNNING bit ASAP. */
4017 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4018 
4019 	/* Disable polling. */
4020 	hn_polling(sc, 0);
4021 
4022 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4023 		KASSERT(sc->hn_vf_ifp != NULL,
4024 		    ("%s: VF is not attached", ifp->if_xname));
4025 
4026 		/* Mark transparent mode VF as disabled. */
4027 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4028 
4029 		/*
4030 		 * NOTE:
4031 		 * Datapath setting must happen _before_ bringing
4032 		 * the VF down.
4033 		 */
4034 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4035 
4036 		/*
4037 		 * Bring the VF down.
4038 		 */
4039 		hn_xpnt_vf_saveifflags(sc);
4040 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4041 		hn_xpnt_vf_iocsetflags(sc);
4042 	}
4043 
4044 	/* Suspend data transfers. */
4045 	hn_suspend_data(sc);
4046 
4047 	/* Clear OACTIVE bit. */
4048 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4049 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4050 		sc->hn_tx_ring[i].hn_oactive = 0;
4051 
4052 	/*
4053 	 * If the non-transparent mode VF is active, make sure
4054 	 * that the RX filter still allows packet reception.
4055 	 */
4056 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4057 		hn_rxfilter_config(sc);
4058 }
4059 
4060 static void
4061 hn_init_locked(struct hn_softc *sc)
4062 {
4063 	struct ifnet *ifp = sc->hn_ifp;
4064 	int i;
4065 
4066 	HN_LOCK_ASSERT(sc);
4067 
4068 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4069 		return;
4070 
4071 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4072 		return;
4073 
4074 	/* Configure RX filter */
4075 	hn_rxfilter_config(sc);
4076 
4077 	/* Clear OACTIVE bit. */
4078 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4079 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4080 		sc->hn_tx_ring[i].hn_oactive = 0;
4081 
4082 	/* Clear TX 'suspended' bit. */
4083 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4084 
4085 	if (hn_xpnt_vf_isready(sc)) {
4086 		/* Initialize transparent VF. */
4087 		hn_xpnt_vf_init(sc);
4088 	}
4089 
4090 	/* Everything is ready; unleash! */
4091 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4092 
4093 	/* Re-enable polling if requested. */
4094 	if (sc->hn_pollhz > 0)
4095 		hn_polling(sc, sc->hn_pollhz);
4096 }
4097 
/*
 * Unlocked entry point for interface initialization: takes the softc
 * lock around hn_init_locked().  'xsc' is the softc pointer.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *sc;

	sc = (struct hn_softc *)xsc;

	HN_LOCK(sc);
	hn_init_locked(sc);
	HN_UNLOCK(sc);
}
4107 
4108 #if __FreeBSD_version >= 1100099
4109 
4110 static int
4111 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4112 {
4113 	struct hn_softc *sc = arg1;
4114 	unsigned int lenlim;
4115 	int error;
4116 
4117 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4118 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4119 	if (error || req->newptr == NULL)
4120 		return error;
4121 
4122 	HN_LOCK(sc);
4123 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4124 	    lenlim > TCP_LRO_LENGTH_MAX) {
4125 		HN_UNLOCK(sc);
4126 		return EINVAL;
4127 	}
4128 	hn_set_lro_lenlim(sc, lenlim);
4129 	HN_UNLOCK(sc);
4130 
4131 	return 0;
4132 }
4133 
4134 static int
4135 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4136 {
4137 	struct hn_softc *sc = arg1;
4138 	int ackcnt, error, i;
4139 
4140 	/*
4141 	 * lro_ackcnt_lim is append count limit,
4142 	 * +1 to turn it into aggregation limit.
4143 	 */
4144 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4145 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4146 	if (error || req->newptr == NULL)
4147 		return error;
4148 
4149 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4150 		return EINVAL;
4151 
4152 	/*
4153 	 * Convert aggregation limit back to append
4154 	 * count limit.
4155 	 */
4156 	--ackcnt;
4157 	HN_LOCK(sc);
4158 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4159 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4160 	HN_UNLOCK(sc);
4161 	return 0;
4162 }
4163 
4164 #endif
4165 
4166 static int
4167 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4168 {
4169 	struct hn_softc *sc = arg1;
4170 	int hcsum = arg2;
4171 	int on, error, i;
4172 
4173 	on = 0;
4174 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4175 		on = 1;
4176 
4177 	error = sysctl_handle_int(oidp, &on, 0, req);
4178 	if (error || req->newptr == NULL)
4179 		return error;
4180 
4181 	HN_LOCK(sc);
4182 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4183 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4184 
4185 		if (on)
4186 			rxr->hn_trust_hcsum |= hcsum;
4187 		else
4188 			rxr->hn_trust_hcsum &= ~hcsum;
4189 	}
4190 	HN_UNLOCK(sc);
4191 	return 0;
4192 }
4193 
4194 static int
4195 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4196 {
4197 	struct hn_softc *sc = arg1;
4198 	int chim_size, error;
4199 
4200 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4201 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4202 	if (error || req->newptr == NULL)
4203 		return error;
4204 
4205 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4206 		return EINVAL;
4207 
4208 	HN_LOCK(sc);
4209 	hn_set_chim_size(sc, chim_size);
4210 	HN_UNLOCK(sc);
4211 	return 0;
4212 }
4213 
4214 #if __FreeBSD_version < 1100095
4215 static int
4216 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4217 {
4218 	struct hn_softc *sc = arg1;
4219 	int ofs = arg2, i, error;
4220 	struct hn_rx_ring *rxr;
4221 	uint64_t stat;
4222 
4223 	stat = 0;
4224 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4225 		rxr = &sc->hn_rx_ring[i];
4226 		stat += *((int *)((uint8_t *)rxr + ofs));
4227 	}
4228 
4229 	error = sysctl_handle_64(oidp, &stat, 0, req);
4230 	if (error || req->newptr == NULL)
4231 		return error;
4232 
4233 	/* Zero out this stat. */
4234 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4235 		rxr = &sc->hn_rx_ring[i];
4236 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4237 	}
4238 	return 0;
4239 }
4240 #else
4241 static int
4242 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4243 {
4244 	struct hn_softc *sc = arg1;
4245 	int ofs = arg2, i, error;
4246 	struct hn_rx_ring *rxr;
4247 	uint64_t stat;
4248 
4249 	stat = 0;
4250 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4251 		rxr = &sc->hn_rx_ring[i];
4252 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4253 	}
4254 
4255 	error = sysctl_handle_64(oidp, &stat, 0, req);
4256 	if (error || req->newptr == NULL)
4257 		return error;
4258 
4259 	/* Zero out this stat. */
4260 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4261 		rxr = &sc->hn_rx_ring[i];
4262 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4263 	}
4264 	return 0;
4265 }
4266 
4267 #endif
4268 
4269 static int
4270 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4271 {
4272 	struct hn_softc *sc = arg1;
4273 	int ofs = arg2, i, error;
4274 	struct hn_rx_ring *rxr;
4275 	u_long stat;
4276 
4277 	stat = 0;
4278 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4279 		rxr = &sc->hn_rx_ring[i];
4280 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4281 	}
4282 
4283 	error = sysctl_handle_long(oidp, &stat, 0, req);
4284 	if (error || req->newptr == NULL)
4285 		return error;
4286 
4287 	/* Zero out this stat. */
4288 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4289 		rxr = &sc->hn_rx_ring[i];
4290 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4291 	}
4292 	return 0;
4293 }
4294 
4295 static int
4296 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4297 {
4298 	struct hn_softc *sc = arg1;
4299 	int ofs = arg2, i, error;
4300 	struct hn_tx_ring *txr;
4301 	u_long stat;
4302 
4303 	stat = 0;
4304 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4305 		txr = &sc->hn_tx_ring[i];
4306 		stat += *((u_long *)((uint8_t *)txr + ofs));
4307 	}
4308 
4309 	error = sysctl_handle_long(oidp, &stat, 0, req);
4310 	if (error || req->newptr == NULL)
4311 		return error;
4312 
4313 	/* Zero out this stat. */
4314 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4315 		txr = &sc->hn_tx_ring[i];
4316 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4317 	}
4318 	return 0;
4319 }
4320 
4321 static int
4322 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4323 {
4324 	struct hn_softc *sc = arg1;
4325 	int ofs = arg2, i, error, conf;
4326 	struct hn_tx_ring *txr;
4327 
4328 	txr = &sc->hn_tx_ring[0];
4329 	conf = *((int *)((uint8_t *)txr + ofs));
4330 
4331 	error = sysctl_handle_int(oidp, &conf, 0, req);
4332 	if (error || req->newptr == NULL)
4333 		return error;
4334 
4335 	HN_LOCK(sc);
4336 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4337 		txr = &sc->hn_tx_ring[i];
4338 		*((int *)((uint8_t *)txr + ofs)) = conf;
4339 	}
4340 	HN_UNLOCK(sc);
4341 
4342 	return 0;
4343 }
4344 
4345 static int
4346 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4347 {
4348 	struct hn_softc *sc = arg1;
4349 	int error, size;
4350 
4351 	size = sc->hn_agg_size;
4352 	error = sysctl_handle_int(oidp, &size, 0, req);
4353 	if (error || req->newptr == NULL)
4354 		return (error);
4355 
4356 	HN_LOCK(sc);
4357 	sc->hn_agg_size = size;
4358 	hn_set_txagg(sc);
4359 	HN_UNLOCK(sc);
4360 
4361 	return (0);
4362 }
4363 
4364 static int
4365 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4366 {
4367 	struct hn_softc *sc = arg1;
4368 	int error, pkts;
4369 
4370 	pkts = sc->hn_agg_pkts;
4371 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4372 	if (error || req->newptr == NULL)
4373 		return (error);
4374 
4375 	HN_LOCK(sc);
4376 	sc->hn_agg_pkts = pkts;
4377 	hn_set_txagg(sc);
4378 	HN_UNLOCK(sc);
4379 
4380 	return (0);
4381 }
4382 
4383 static int
4384 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4385 {
4386 	struct hn_softc *sc = arg1;
4387 	int pkts;
4388 
4389 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4390 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4391 }
4392 
4393 static int
4394 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4395 {
4396 	struct hn_softc *sc = arg1;
4397 	int align;
4398 
4399 	align = sc->hn_tx_ring[0].hn_agg_align;
4400 	return (sysctl_handle_int(oidp, &align, 0, req));
4401 }
4402 
4403 static void
4404 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4405 {
4406 	if (pollhz == 0)
4407 		vmbus_chan_poll_disable(chan);
4408 	else
4409 		vmbus_chan_poll_enable(chan, pollhz);
4410 }
4411 
4412 static void
4413 hn_polling(struct hn_softc *sc, u_int pollhz)
4414 {
4415 	int nsubch = sc->hn_rx_ring_inuse - 1;
4416 
4417 	HN_LOCK_ASSERT(sc);
4418 
4419 	if (nsubch > 0) {
4420 		struct vmbus_channel **subch;
4421 		int i;
4422 
4423 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4424 		for (i = 0; i < nsubch; ++i)
4425 			hn_chan_polling(subch[i], pollhz);
4426 		vmbus_subchan_rel(subch, nsubch);
4427 	}
4428 	hn_chan_polling(sc->hn_prichan, pollhz);
4429 }
4430 
4431 static int
4432 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4433 {
4434 	struct hn_softc *sc = arg1;
4435 	int pollhz, error;
4436 
4437 	pollhz = sc->hn_pollhz;
4438 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4439 	if (error || req->newptr == NULL)
4440 		return (error);
4441 
4442 	if (pollhz != 0 &&
4443 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4444 		return (EINVAL);
4445 
4446 	HN_LOCK(sc);
4447 	if (sc->hn_pollhz != pollhz) {
4448 		sc->hn_pollhz = pollhz;
4449 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4450 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4451 			hn_polling(sc, sc->hn_pollhz);
4452 	}
4453 	HN_UNLOCK(sc);
4454 
4455 	return (0);
4456 }
4457 
4458 static int
4459 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4460 {
4461 	struct hn_softc *sc = arg1;
4462 	char verstr[16];
4463 
4464 	snprintf(verstr, sizeof(verstr), "%u.%u",
4465 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4466 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4467 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4468 }
4469 
4470 static int
4471 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4472 {
4473 	struct hn_softc *sc = arg1;
4474 	char caps_str[128];
4475 	uint32_t caps;
4476 
4477 	HN_LOCK(sc);
4478 	caps = sc->hn_caps;
4479 	HN_UNLOCK(sc);
4480 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4481 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4482 }
4483 
4484 static int
4485 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4486 {
4487 	struct hn_softc *sc = arg1;
4488 	char assist_str[128];
4489 	uint32_t hwassist;
4490 
4491 	HN_LOCK(sc);
4492 	hwassist = sc->hn_ifp->if_hwassist;
4493 	HN_UNLOCK(sc);
4494 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4495 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4496 }
4497 
4498 static int
4499 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4500 {
4501 	struct hn_softc *sc = arg1;
4502 	char filter_str[128];
4503 	uint32_t filter;
4504 
4505 	HN_LOCK(sc);
4506 	filter = sc->hn_rx_filter;
4507 	HN_UNLOCK(sc);
4508 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4509 	    NDIS_PACKET_TYPES);
4510 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4511 }
4512 
4513 #ifndef RSS
4514 
4515 static int
4516 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4517 {
4518 	struct hn_softc *sc = arg1;
4519 	int error;
4520 
4521 	HN_LOCK(sc);
4522 
4523 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4524 	if (error || req->newptr == NULL)
4525 		goto back;
4526 
4527 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4528 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4529 		/*
4530 		 * RSS key is synchronized w/ VF's, don't allow users
4531 		 * to change it.
4532 		 */
4533 		error = EBUSY;
4534 		goto back;
4535 	}
4536 
4537 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4538 	if (error)
4539 		goto back;
4540 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4541 
4542 	if (sc->hn_rx_ring_inuse > 1) {
4543 		error = hn_rss_reconfig(sc);
4544 	} else {
4545 		/* Not RSS capable, at least for now; just save the RSS key. */
4546 		error = 0;
4547 	}
4548 back:
4549 	HN_UNLOCK(sc);
4550 	return (error);
4551 }
4552 
4553 static int
4554 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4555 {
4556 	struct hn_softc *sc = arg1;
4557 	int error;
4558 
4559 	HN_LOCK(sc);
4560 
4561 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4562 	if (error || req->newptr == NULL)
4563 		goto back;
4564 
4565 	/*
4566 	 * Don't allow RSS indirect table change, if this interface is not
4567 	 * RSS capable currently.
4568 	 */
4569 	if (sc->hn_rx_ring_inuse == 1) {
4570 		error = EOPNOTSUPP;
4571 		goto back;
4572 	}
4573 
4574 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4575 	if (error)
4576 		goto back;
4577 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4578 
4579 	hn_rss_ind_fixup(sc);
4580 	error = hn_rss_reconfig(sc);
4581 back:
4582 	HN_UNLOCK(sc);
4583 	return (error);
4584 }
4585 
4586 #endif	/* !RSS */
4587 
4588 static int
4589 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4590 {
4591 	struct hn_softc *sc = arg1;
4592 	char hash_str[128];
4593 	uint32_t hash;
4594 
4595 	HN_LOCK(sc);
4596 	hash = sc->hn_rss_hash;
4597 	HN_UNLOCK(sc);
4598 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4599 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4600 }
4601 
4602 static int
4603 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4604 {
4605 	struct hn_softc *sc = arg1;
4606 	char hash_str[128];
4607 	uint32_t hash;
4608 
4609 	HN_LOCK(sc);
4610 	hash = sc->hn_rss_hcap;
4611 	HN_UNLOCK(sc);
4612 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4613 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4614 }
4615 
4616 static int
4617 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4618 {
4619 	struct hn_softc *sc = arg1;
4620 	char hash_str[128];
4621 	uint32_t hash;
4622 
4623 	HN_LOCK(sc);
4624 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4625 	HN_UNLOCK(sc);
4626 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4627 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4628 }
4629 
4630 static int
4631 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4632 {
4633 	struct hn_softc *sc = arg1;
4634 	char vf_name[IFNAMSIZ + 1];
4635 	struct ifnet *vf_ifp;
4636 
4637 	HN_LOCK(sc);
4638 	vf_name[0] = '\0';
4639 	vf_ifp = sc->hn_vf_ifp;
4640 	if (vf_ifp != NULL)
4641 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4642 	HN_UNLOCK(sc);
4643 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4644 }
4645 
4646 static int
4647 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4648 {
4649 	struct hn_softc *sc = arg1;
4650 	char vf_name[IFNAMSIZ + 1];
4651 	struct ifnet *vf_ifp;
4652 
4653 	HN_LOCK(sc);
4654 	vf_name[0] = '\0';
4655 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4656 	if (vf_ifp != NULL)
4657 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4658 	HN_UNLOCK(sc);
4659 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4660 }
4661 
4662 static int
4663 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4664 {
4665 	struct rm_priotracker pt;
4666 	struct sbuf *sb;
4667 	int error, i;
4668 	bool first;
4669 
4670 	error = sysctl_wire_old_buffer(req, 0);
4671 	if (error != 0)
4672 		return (error);
4673 
4674 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4675 	if (sb == NULL)
4676 		return (ENOMEM);
4677 
4678 	rm_rlock(&hn_vfmap_lock, &pt);
4679 
4680 	first = true;
4681 	for (i = 0; i < hn_vfmap_size; ++i) {
4682 		struct ifnet *ifp;
4683 
4684 		if (hn_vfmap[i] == NULL)
4685 			continue;
4686 
4687 		ifp = ifnet_byindex(i);
4688 		if (ifp != NULL) {
4689 			if (first)
4690 				sbuf_printf(sb, "%s", ifp->if_xname);
4691 			else
4692 				sbuf_printf(sb, " %s", ifp->if_xname);
4693 			first = false;
4694 		}
4695 	}
4696 
4697 	rm_runlock(&hn_vfmap_lock, &pt);
4698 
4699 	error = sbuf_finish(sb);
4700 	sbuf_delete(sb);
4701 	return (error);
4702 }
4703 
4704 static int
4705 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4706 {
4707 	struct rm_priotracker pt;
4708 	struct sbuf *sb;
4709 	int error, i;
4710 	bool first;
4711 
4712 	error = sysctl_wire_old_buffer(req, 0);
4713 	if (error != 0)
4714 		return (error);
4715 
4716 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4717 	if (sb == NULL)
4718 		return (ENOMEM);
4719 
4720 	rm_rlock(&hn_vfmap_lock, &pt);
4721 
4722 	first = true;
4723 	for (i = 0; i < hn_vfmap_size; ++i) {
4724 		struct ifnet *ifp, *hn_ifp;
4725 
4726 		hn_ifp = hn_vfmap[i];
4727 		if (hn_ifp == NULL)
4728 			continue;
4729 
4730 		ifp = ifnet_byindex(i);
4731 		if (ifp != NULL) {
4732 			if (first) {
4733 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4734 				    hn_ifp->if_xname);
4735 			} else {
4736 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4737 				    hn_ifp->if_xname);
4738 			}
4739 			first = false;
4740 		}
4741 	}
4742 
4743 	rm_runlock(&hn_vfmap_lock, &pt);
4744 
4745 	error = sbuf_finish(sb);
4746 	sbuf_delete(sb);
4747 	return (error);
4748 }
4749 
4750 static int
4751 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4752 {
4753 	struct hn_softc *sc = arg1;
4754 	int error, onoff = 0;
4755 
4756 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4757 		onoff = 1;
4758 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4759 	if (error || req->newptr == NULL)
4760 		return (error);
4761 
4762 	HN_LOCK(sc);
4763 	/* NOTE: hn_vf_lock for hn_transmit() */
4764 	rm_wlock(&sc->hn_vf_lock);
4765 	if (onoff)
4766 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4767 	else
4768 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4769 	rm_wunlock(&sc->hn_vf_lock);
4770 	HN_UNLOCK(sc);
4771 
4772 	return (0);
4773 }
4774 
4775 static int
4776 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4777 {
4778 	struct hn_softc *sc = arg1;
4779 	int enabled = 0;
4780 
4781 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4782 		enabled = 1;
4783 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4784 }
4785 
4786 static int
4787 hn_check_iplen(const struct mbuf *m, int hoff)
4788 {
4789 	const struct ip *ip;
4790 	int len, iphlen, iplen;
4791 	const struct tcphdr *th;
4792 	int thoff;				/* TCP data offset */
4793 
4794 	len = hoff + sizeof(struct ip);
4795 
4796 	/* The packet must be at least the size of an IP header. */
4797 	if (m->m_pkthdr.len < len)
4798 		return IPPROTO_DONE;
4799 
4800 	/* The fixed IP header must reside completely in the first mbuf. */
4801 	if (m->m_len < len)
4802 		return IPPROTO_DONE;
4803 
4804 	ip = mtodo(m, hoff);
4805 
4806 	/* Bound check the packet's stated IP header length. */
4807 	iphlen = ip->ip_hl << 2;
4808 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4809 		return IPPROTO_DONE;
4810 
4811 	/* The full IP header must reside completely in the one mbuf. */
4812 	if (m->m_len < hoff + iphlen)
4813 		return IPPROTO_DONE;
4814 
4815 	iplen = ntohs(ip->ip_len);
4816 
4817 	/*
4818 	 * Check that the amount of data in the buffers is as
4819 	 * at least much as the IP header would have us expect.
4820 	 */
4821 	if (m->m_pkthdr.len < hoff + iplen)
4822 		return IPPROTO_DONE;
4823 
4824 	/*
4825 	 * Ignore IP fragments.
4826 	 */
4827 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4828 		return IPPROTO_DONE;
4829 
4830 	/*
4831 	 * The TCP/IP or UDP/IP header must be entirely contained within
4832 	 * the first fragment of a packet.
4833 	 */
4834 	switch (ip->ip_p) {
4835 	case IPPROTO_TCP:
4836 		if (iplen < iphlen + sizeof(struct tcphdr))
4837 			return IPPROTO_DONE;
4838 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4839 			return IPPROTO_DONE;
4840 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4841 		thoff = th->th_off << 2;
4842 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4843 			return IPPROTO_DONE;
4844 		if (m->m_len < hoff + iphlen + thoff)
4845 			return IPPROTO_DONE;
4846 		break;
4847 	case IPPROTO_UDP:
4848 		if (iplen < iphlen + sizeof(struct udphdr))
4849 			return IPPROTO_DONE;
4850 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4851 			return IPPROTO_DONE;
4852 		break;
4853 	default:
4854 		if (iplen < iphlen)
4855 			return IPPROTO_DONE;
4856 		break;
4857 	}
4858 	return ip->ip_p;
4859 }
4860 
4861 static void
4862 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4863 {
4864 	const struct ether_header *eh;
4865 	uint16_t etype;
4866 	int hoff;
4867 
4868 	hoff = sizeof(*eh);
4869 	/* Checked at the beginning of this function. */
4870 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4871 
4872 	eh = mtod(m_new, const struct ether_header *);
4873 	etype = ntohs(eh->ether_type);
4874 	if (etype == ETHERTYPE_VLAN) {
4875 		const struct ether_vlan_header *evl;
4876 
4877 		hoff = sizeof(*evl);
4878 		if (m_new->m_len < hoff)
4879 			return;
4880 		evl = mtod(m_new, const struct ether_vlan_header *);
4881 		etype = ntohs(evl->evl_proto);
4882 	}
4883 	*l3proto = etype;
4884 
4885 	if (etype == ETHERTYPE_IP)
4886 		*l4proto = hn_check_iplen(m_new, hoff);
4887 	else
4888 		*l4proto = IPPROTO_DONE;
4889 }
4890 
/*
 * Allocate and initialize the RX side of the softc for 'ring_cnt' RX
 * rings: the shared RXBUF, the RX ring array, and for each ring its
 * bufring memory, packet buffer, LRO state and per-ring sysctl nodes.
 * Device-wide RX sysctl nodes are registered afterwards.
 *
 * Returns 0 on success and ENOMEM if a DMA memory allocation fails.
 * On failure whatever was allocated so far is left in place here.
 * NOTE(review): cleanup is presumably done by the caller (e.g. through
 * hn_destroy_rx_data()) -- confirm.
 */
static int
hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	device_t dev = sc->hn_dev;
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
	int lroent_cnt;
#endif
#endif
	int i;

	/*
	 * Create RXBUF for reception.
	 *
	 * NOTE:
	 * - It is shared by all channels.
	 * - A large enough buffer is allocated, certain version of NVSes
	 *   may further limit the usable space.
	 */
	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_rxbuf == NULL) {
		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
		return (ENOMEM);
	}

	/* Initially all created RX rings are considered in use. */
	sc->hn_rx_ring_cnt = ring_cnt;
	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;

	/* The ring array is zeroed (M_ZERO), so unset fields read as 0/NULL. */
	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
	/* Use the configured LRO entry count, but at least TCP_LRO_ENTRIES. */
	lroent_cnt = hn_lro_entry_count;
	if (lroent_cnt < TCP_LRO_ENTRIES)
		lroent_cnt = TCP_LRO_ENTRIES;
	if (bootverbose)
		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
#endif
#endif	/* INET || INET6 */

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));

	/* Create dev.hn.UNIT.rx sysctl tree */
	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		/* One allocation sized for both the TX and the RX bufring. */
		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
		if (rxr->hn_br == NULL) {
			device_printf(dev, "allocate bufring failed\n");
			return (ENOMEM);
		}

		/* Seed the per-ring "trust host checksum" bits from tunables. */
		if (hn_trust_hosttcp)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
		if (hn_trust_hostudp)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
		if (hn_trust_hostip)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
		rxr->hn_ifp = sc->hn_ifp;
		/* Pair this RX ring with the TX ring of the same index, if any. */
		if (i < sc->hn_tx_ring_cnt)
			rxr->hn_txr = &sc->hn_tx_ring[i];
		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
		rxr->hn_rx_idx = i;
		rxr->hn_rxbuf = sc->hn_rxbuf;

		/*
		 * Initialize LRO.
		 */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
		    hn_lro_mbufq_depth);
#else
		tcp_lro_init(&rxr->hn_lro);
		rxr->hn_lro.ifp = sc->hn_ifp;
#endif
#if __FreeBSD_version >= 1100099
		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
#endif
#endif	/* INET || INET6 */

		/* Per-ring sysctl nodes are only added if the parent exists. */
		if (sc->hn_rx_sysctl_tree != NULL) {
			char name[16];

			/*
			 * Create per RX ring sysctl tree:
			 * dev.hn.UNIT.rx.RINGID
			 */
			snprintf(name, sizeof(name), "%d", i);
			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

			if (rxr->hn_rx_sysctl_tree != NULL) {
				SYSCTL_ADD_ULONG(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "packets", CTLFLAG_RW,
				    &rxr->hn_pkts, "# of packets received");
				SYSCTL_ADD_ULONG(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
				    &rxr->hn_rss_pkts,
				    "# of packets w/ RSS info received");
				SYSCTL_ADD_INT(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
				    &rxr->hn_pktbuf_len, 0,
				    "Temporary channel packet buffer length");
			}
		}
	}

	/*
	 * Device-wide RX sysctl nodes.  The statistic handlers take the
	 * byte offset of a struct hn_rx_ring field as their second
	 * argument; they sum that field over all RX rings.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
#if __FreeBSD_version < 1100095
	    hn_rx_stat_int_sysctl,
#else
	    hn_rx_stat_u64_sysctl,
#endif
	    "LU", "LRO queued");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
#if __FreeBSD_version < 1100095
	    hn_rx_stat_int_sysctl,
#else
	    hn_rx_stat_u64_sysctl,
#endif
	    "LU", "LRO flushed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro_tried),
	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
#if __FreeBSD_version >= 1100099
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_lro_lenlim_sysctl, "IU",
	    "Max # of data bytes to be aggregated by LRO");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_lro_ackcnt_sysctl, "I",
	    "Max # of ACKs to be aggregated by LRO");
#endif
	/* For the trust_host* nodes the second argument is the flag bit. */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust tcp segement verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust udp datagram verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust ip packet verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_ip),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_udp),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
	    hn_rx_stat_ulong_sysctl, "LU",
	    "# of packets that we trust host's csum verification");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_small_pkts),
	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_ack_failed),
	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");

	return (0);
}
5096 
5097 static void
5098 hn_destroy_rx_data(struct hn_softc *sc)
5099 {
5100 	int i;
5101 
5102 	if (sc->hn_rxbuf != NULL) {
5103 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5104 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5105 		else
5106 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5107 		sc->hn_rxbuf = NULL;
5108 	}
5109 
5110 	if (sc->hn_rx_ring_cnt == 0)
5111 		return;
5112 
5113 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5114 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5115 
5116 		if (rxr->hn_br == NULL)
5117 			continue;
5118 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5119 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5120 		} else {
5121 			device_printf(sc->hn_dev,
5122 			    "%dth channel bufring is referenced", i);
5123 		}
5124 		rxr->hn_br = NULL;
5125 
5126 #if defined(INET) || defined(INET6)
5127 		tcp_lro_free(&rxr->hn_lro);
5128 #endif
5129 		free(rxr->hn_pktbuf, M_DEVBUF);
5130 	}
5131 	free(sc->hn_rx_ring, M_DEVBUF);
5132 	sc->hn_rx_ring = NULL;
5133 
5134 	sc->hn_rx_ring_cnt = 0;
5135 	sc->hn_rx_ring_inuse = 0;
5136 }
5137 
/*
 * Set up TX ring 'id' of 'sc': its locks, the TX descriptor array with
 * its free list (or buf_ring), the taskqueue and tasks used for
 * transmission, the busdma tags, the RNDIS packet message and data DMA
 * map of every TX descriptor, and the per-ring sysctl nodes.
 *
 * Returns 0 on success, or the error reported by the failing busdma
 * call.
 *
 * NOTE(review): the error returns below only undo the allocations made
 * for the TX descriptor being set up; the locks, descriptor array,
 * buf_rings, DMA tags and the descriptors that were already set up are
 * left in place.  Presumably the caller tears the ring down through
 * hn_tx_ring_destroy() -- confirm.
 */
static int
hn_tx_ring_create(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	/*
	 * TX descriptor array, plus the container that tracks the free
	 * descriptors: an SLIST or a buf_ring, depending on
	 * HN_USE_TXDESC_BUFRING.
	 */
	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	/*
	 * Select the taskqueue for this ring's TX tasks: either the vmbus
	 * event taskqueue of the CPU this ring maps to, or one of the
	 * driver's own TX taskqueues, picked round-robin by ring index.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
	} else {
		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
	}

	/*
	 * Install the txeof handler and the tasks.  If HN_IFSTART_SUPPORT
	 * is not defined, the braced block below is unconditional;
	 * otherwise it is the else-branch of the hn_use_if_start test.
	 */
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else
#endif
	{
		int br_depth;

		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		/* This mode additionally needs a per-ring mbuf buf_ring. */
		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;

	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}

	/*
	 * Set up every TX descriptor and put it onto the free list
	 * (or buf_ring).
	 */
	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		STAILQ_INIT(&txd->agg_list);

		/*
		 * Allocate and load RNDIS packet message.
		 */
        	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		/*
		 * The physical address of the message is stored into
		 * txd->rndis_pkt_paddr by the load callback.
		 */
		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			/* Undo this descriptor's allocation only. */
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			/* Undo this descriptor's load and allocation only. */
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	/* Every descriptor was just queued as free. */
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

	/* The per-ring sysctl nodes hang off the softc's "tx" node. */
	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

#ifdef HN_DEBUG
			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
#endif
			/*
			 * "oactive" is not exported when if_start is in
			 * use.
			 */
#ifdef HN_IFSTART_SUPPORT
			if (!hn_use_if_start)
#endif
			{
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW, &txr->hn_pkts,
			    "# of packets transmitted");
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
		}
	}

	return 0;
}
5336 
5337 static void
5338 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5339 {
5340 	struct hn_tx_ring *txr = txd->txr;
5341 
5342 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5343 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5344 
5345 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5346 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5347 	    txd->rndis_pkt_dmap);
5348 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5349 }
5350 
5351 static void
5352 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5353 {
5354 
5355 	KASSERT(txd->refs == 0 || txd->refs == 1,
5356 	    ("invalid txd refs %d", txd->refs));
5357 
5358 	/* Aggregated txds will be freed by their aggregating txd. */
5359 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5360 		int freed;
5361 
5362 		freed = hn_txdesc_put(txr, txd);
5363 		KASSERT(freed, ("can't free txdesc"));
5364 	}
5365 }
5366 
5367 static void
5368 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5369 {
5370 	int i;
5371 
5372 	if (txr->hn_txdesc == NULL)
5373 		return;
5374 
5375 	/*
5376 	 * NOTE:
5377 	 * Because the freeing of aggregated txds will be deferred
5378 	 * to the aggregating txd, two passes are used here:
5379 	 * - The first pass GCes any pending txds.  This GC is necessary,
5380 	 *   since if the channels are revoked, hypervisor will not
5381 	 *   deliver send-done for all pending txds.
5382 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5383 	 *   were freed.
5384 	 */
5385 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5386 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5387 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5388 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5389 
5390 	if (txr->hn_tx_data_dtag != NULL)
5391 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5392 	if (txr->hn_tx_rndis_dtag != NULL)
5393 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5394 
5395 #ifdef HN_USE_TXDESC_BUFRING
5396 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5397 #endif
5398 
5399 	free(txr->hn_txdesc, M_DEVBUF);
5400 	txr->hn_txdesc = NULL;
5401 
5402 	if (txr->hn_mbuf_br != NULL)
5403 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5404 
5405 #ifndef HN_USE_TXDESC_BUFRING
5406 	mtx_destroy(&txr->hn_txlist_spin);
5407 #endif
5408 	mtx_destroy(&txr->hn_tx_lock);
5409 }
5410 
/*
 * Allocate the chimney sending buffer, create 'ring_cnt' TX rings and
 * register the dev.hn.UNIT.tx sysctl nodes.
 *
 * Returns 0 on success, ENOMEM if the chimney sending buffer can not be
 * allocated, or the error returned by hn_tx_ring_create().
 *
 * NOTE(review): nothing is released here on failure; presumably the
 * caller cleans up through hn_destroy_tx_data() -- confirm.
 */
static int
hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	int i;

	/*
	 * Create TXBUF for chimney sending.
	 *
	 * NOTE: It is shared by all channels.
	 */
	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_chim == NULL) {
		device_printf(sc->hn_dev, "allocate txbuf failed\n");
		return (ENOMEM);
	}

	/* All created rings are initially marked as in use. */
	sc->hn_tx_ring_cnt = ring_cnt;
	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;

	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);

	ctx = device_get_sysctl_ctx(sc->hn_dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

	/* Create dev.hn.UNIT.tx sysctl tree */
	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		int error;

		error = hn_tx_ring_create(sc, i);
		if (error)
			return error;
	}

	/*
	 * Device level sysctl nodes; 'child' still refers to the device's
	 * own sysctl tree here, not to the "tx" node created above.
	 *
	 * The handlers of the SYSCTL_ADD_PROC nodes below are handed the
	 * softc and, where one is given, the offset of a field within
	 * struct hn_tx_ring.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_send_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_flush_failed),
	    hn_tx_stat_ulong_sysctl, "LU",
	    "# of packet transmission aggregation flush failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
	/* Exported from the first TX ring. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
	    "# of total TX descs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
	    "Chimney send packet size upper boundary");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
	    hn_tx_conf_int_sysctl, "I",
	    "Size of the packet for direct transmission");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_sched_tx),
	    hn_tx_conf_int_sysctl, "I",
	    "Always schedule transmission "
	    "instead of doing direct transmission");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
	/* Exported from the first TX ring. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
	    "Applied packet transmission aggregation size");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pktmax_sysctl, "I",
	    "Applied packet transmission aggregation packets");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_align_sysctl, "I",
	    "Applied packet transmission aggregation alignment");

	return 0;
}
5519 
5520 static void
5521 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5522 {
5523 	int i;
5524 
5525 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5526 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5527 }
5528 
5529 static void
5530 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5531 {
5532 	struct ifnet *ifp = sc->hn_ifp;
5533 	u_int hw_tsomax;
5534 	int tso_minlen;
5535 
5536 	HN_LOCK_ASSERT(sc);
5537 
5538 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5539 		return;
5540 
5541 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5542 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5543 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5544 
5545 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5546 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5547 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5548 
5549 	if (tso_maxlen < tso_minlen)
5550 		tso_maxlen = tso_minlen;
5551 	else if (tso_maxlen > IP_MAXPACKET)
5552 		tso_maxlen = IP_MAXPACKET;
5553 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5554 		tso_maxlen = sc->hn_ndis_tso_szmax;
5555 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5556 
5557 	if (hn_xpnt_vf_isready(sc)) {
5558 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5559 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5560 	}
5561 	ifp->if_hw_tsomax = hw_tsomax;
5562 	if (bootverbose)
5563 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5564 }
5565 
5566 static void
5567 hn_fixup_tx_data(struct hn_softc *sc)
5568 {
5569 	uint64_t csum_assist;
5570 	int i;
5571 
5572 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5573 	if (hn_tx_chimney_size > 0 &&
5574 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5575 		hn_set_chim_size(sc, hn_tx_chimney_size);
5576 
5577 	csum_assist = 0;
5578 	if (sc->hn_caps & HN_CAP_IPCS)
5579 		csum_assist |= CSUM_IP;
5580 	if (sc->hn_caps & HN_CAP_TCP4CS)
5581 		csum_assist |= CSUM_IP_TCP;
5582 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5583 		csum_assist |= CSUM_IP_UDP;
5584 	if (sc->hn_caps & HN_CAP_TCP6CS)
5585 		csum_assist |= CSUM_IP6_TCP;
5586 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5587 		csum_assist |= CSUM_IP6_UDP;
5588 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5589 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5590 
5591 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5592 		/*
5593 		 * Support HASHVAL pktinfo on TX path.
5594 		 */
5595 		if (bootverbose)
5596 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5597 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5598 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5599 	}
5600 }
5601 
5602 static void
5603 hn_fixup_rx_data(struct hn_softc *sc)
5604 {
5605 
5606 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5607 		int i;
5608 
5609 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5610 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5611 	}
5612 }
5613 
5614 static void
5615 hn_destroy_tx_data(struct hn_softc *sc)
5616 {
5617 	int i;
5618 
5619 	if (sc->hn_chim != NULL) {
5620 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5621 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5622 		} else {
5623 			device_printf(sc->hn_dev,
5624 			    "chimney sending buffer is referenced");
5625 		}
5626 		sc->hn_chim = NULL;
5627 	}
5628 
5629 	if (sc->hn_tx_ring_cnt == 0)
5630 		return;
5631 
5632 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5633 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5634 
5635 	free(sc->hn_tx_ring, M_DEVBUF);
5636 	sc->hn_tx_ring = NULL;
5637 
5638 	sc->hn_tx_ring_cnt = 0;
5639 	sc->hn_tx_ring_inuse = 0;
5640 }
5641 
5642 #ifdef HN_IFSTART_SUPPORT
5643 
5644 static void
5645 hn_start_taskfunc(void *xtxr, int pending __unused)
5646 {
5647 	struct hn_tx_ring *txr = xtxr;
5648 
5649 	mtx_lock(&txr->hn_tx_lock);
5650 	hn_start_locked(txr, 0);
5651 	mtx_unlock(&txr->hn_tx_lock);
5652 }
5653 
/*
 * Drain ifp->if_snd through the first TX ring.  The caller must hold
 * txr->hn_tx_lock.
 *
 * If 'len' is greater than 0, the first dequeued packet that is longer
 * than 'len' is put back onto if_snd and the draining stops.
 *
 * Returns 1 if such a packet was put back, i.e. the caller should
 * dispatch the sending to the TX taskqueue; 0 otherwise.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Proceed only if the interface is RUNNING and not OACTIVE. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		/*
		 * Header fixup for TSO and checksum offloading packets;
		 * a NULL return is counted as an output error and the
		 * packet is skipped.
		 */
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
			m_head = hn_set_hlen(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/*
			 * Out of TX descriptors: requeue the packet and
			 * mark the interface OACTIVE.
			 */
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		/*
		 * hn_agg_pktleft is 0: flush the aggregating txdesc if
		 * one is pending, otherwise transmit this txdesc directly.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5764 
5765 static void
5766 hn_start(struct ifnet *ifp)
5767 {
5768 	struct hn_softc *sc = ifp->if_softc;
5769 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5770 
5771 	if (txr->hn_sched_tx)
5772 		goto do_sched;
5773 
5774 	if (mtx_trylock(&txr->hn_tx_lock)) {
5775 		int sched;
5776 
5777 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5778 		mtx_unlock(&txr->hn_tx_lock);
5779 		if (!sched)
5780 			return;
5781 	}
5782 do_sched:
5783 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5784 }
5785 
5786 static void
5787 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5788 {
5789 	struct hn_tx_ring *txr = xtxr;
5790 
5791 	mtx_lock(&txr->hn_tx_lock);
5792 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5793 	hn_start_locked(txr, 0);
5794 	mtx_unlock(&txr->hn_tx_lock);
5795 }
5796 
5797 static void
5798 hn_start_txeof(struct hn_tx_ring *txr)
5799 {
5800 	struct hn_softc *sc = txr->hn_sc;
5801 	struct ifnet *ifp = sc->hn_ifp;
5802 
5803 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5804 
5805 	if (txr->hn_sched_tx)
5806 		goto do_sched;
5807 
5808 	if (mtx_trylock(&txr->hn_tx_lock)) {
5809 		int sched;
5810 
5811 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5812 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5813 		mtx_unlock(&txr->hn_tx_lock);
5814 		if (sched) {
5815 			taskqueue_enqueue(txr->hn_tx_taskq,
5816 			    &txr->hn_tx_task);
5817 		}
5818 	} else {
5819 do_sched:
5820 		/*
5821 		 * Release the OACTIVE earlier, with the hope, that
5822 		 * others could catch up.  The task will clear the
5823 		 * flag again with the hn_tx_lock to avoid possible
5824 		 * races.
5825 		 */
5826 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5827 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5828 	}
5829 }
5830 
5831 #endif	/* HN_IFSTART_SUPPORT */
5832 
/*
 * Drain the mbuf buf_ring of 'txr' (txr->hn_mbuf_br).  The caller must
 * hold txr->hn_tx_lock.
 *
 * If 'len' is greater than 0, the first packet that is longer than
 * 'len' is left on the buf_ring and the draining stops.
 *
 * Returns 1 if such a packet was left behind, i.e. the caller should
 * dispatch the sending to the TX taskqueue; 0 otherwise.
 */
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Proceed only if the interface is RUNNING and the ring is not oactive. */
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	/*
	 * Each packet is peeked first; it is either put back (to be
	 * retried later) or the buf_ring is advanced past it (sent or
	 * discarded).
	 */
	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/*
			 * Out of TX descriptors: keep the packet on the
			 * buf_ring and mark the ring oactive.
			 */
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		/*
		 * hn_agg_pktleft is 0: flush the aggregating txdesc if
		 * one is pending, otherwise transmit this txdesc directly.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5925 
5926 static int
5927 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5928 {
5929 	struct hn_softc *sc = ifp->if_softc;
5930 	struct hn_tx_ring *txr;
5931 	int error, idx = 0;
5932 
5933 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5934 		struct rm_priotracker pt;
5935 
5936 		rm_rlock(&sc->hn_vf_lock, &pt);
5937 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5938 			struct mbuf *m_bpf = NULL;
5939 			int obytes, omcast;
5940 
5941 			obytes = m->m_pkthdr.len;
5942 			if (m->m_flags & M_MCAST)
5943 				omcast = 1;
5944 
5945 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5946 				if (bpf_peers_present(ifp->if_bpf)) {
5947 					m_bpf = m_copypacket(m, M_NOWAIT);
5948 					if (m_bpf == NULL) {
5949 						/*
5950 						 * Failed to grab a shallow
5951 						 * copy; tap now.
5952 						 */
5953 						ETHER_BPF_MTAP(ifp, m);
5954 					}
5955 				}
5956 			} else {
5957 				ETHER_BPF_MTAP(ifp, m);
5958 			}
5959 
5960 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5961 			rm_runlock(&sc->hn_vf_lock, &pt);
5962 
5963 			if (m_bpf != NULL) {
5964 				if (!error)
5965 					ETHER_BPF_MTAP(ifp, m_bpf);
5966 				m_freem(m_bpf);
5967 			}
5968 
5969 			if (error == ENOBUFS) {
5970 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5971 			} else if (error) {
5972 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5973 			} else {
5974 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5975 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5976 				if (omcast) {
5977 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5978 					    omcast);
5979 				}
5980 			}
5981 			return (error);
5982 		}
5983 		rm_runlock(&sc->hn_vf_lock, &pt);
5984 	}
5985 
5986 #if defined(INET6) || defined(INET)
5987 	/*
5988 	 * Perform TSO packet header fixup or get l2/l3 header length now,
5989 	 * since packet headers should be cache-hot.
5990 	 */
5991 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5992 		m = hn_tso_fixup(m);
5993 		if (__predict_false(m == NULL)) {
5994 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5995 			return EIO;
5996 		}
5997 	} else if (m->m_pkthdr.csum_flags &
5998 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5999 		m = hn_set_hlen(m);
6000 		if (__predict_false(m == NULL)) {
6001 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6002 			return EIO;
6003 		}
6004 	}
6005 #endif
6006 
6007 	/*
6008 	 * Select the TX ring based on flowid
6009 	 */
6010 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6011 #ifdef RSS
6012 		uint32_t bid;
6013 
6014 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6015 		    &bid) == 0)
6016 			idx = bid % sc->hn_tx_ring_inuse;
6017 		else
6018 #endif
6019 		{
6020 #if defined(INET6) || defined(INET)
6021 			int tcpsyn = 0;
6022 
6023 			if (m->m_pkthdr.len < 128 &&
6024 			    (m->m_pkthdr.csum_flags &
6025 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6026 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6027 				m = hn_check_tcpsyn(m, &tcpsyn);
6028 				if (__predict_false(m == NULL)) {
6029 					if_inc_counter(ifp,
6030 					    IFCOUNTER_OERRORS, 1);
6031 					return (EIO);
6032 				}
6033 			}
6034 #else
6035 			const int tcpsyn = 0;
6036 #endif
6037 			if (tcpsyn)
6038 				idx = 0;
6039 			else
6040 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6041 		}
6042 	}
6043 	txr = &sc->hn_tx_ring[idx];
6044 
6045 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6046 	if (error) {
6047 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6048 		return error;
6049 	}
6050 
6051 	if (txr->hn_oactive)
6052 		return 0;
6053 
6054 	if (txr->hn_sched_tx)
6055 		goto do_sched;
6056 
6057 	if (mtx_trylock(&txr->hn_tx_lock)) {
6058 		int sched;
6059 
6060 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6061 		mtx_unlock(&txr->hn_tx_lock);
6062 		if (!sched)
6063 			return 0;
6064 	}
6065 do_sched:
6066 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6067 	return 0;
6068 }
6069 
6070 static void
6071 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6072 {
6073 	struct mbuf *m;
6074 
6075 	mtx_lock(&txr->hn_tx_lock);
6076 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6077 		m_freem(m);
6078 	mtx_unlock(&txr->hn_tx_lock);
6079 }
6080 
6081 static void
6082 hn_xmit_qflush(struct ifnet *ifp)
6083 {
6084 	struct hn_softc *sc = ifp->if_softc;
6085 	struct rm_priotracker pt;
6086 	int i;
6087 
6088 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6089 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6090 	if_qflush(ifp);
6091 
6092 	rm_rlock(&sc->hn_vf_lock, &pt);
6093 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6094 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6095 	rm_runlock(&sc->hn_vf_lock, &pt);
6096 }
6097 
6098 static void
6099 hn_xmit_txeof(struct hn_tx_ring *txr)
6100 {
6101 
6102 	if (txr->hn_sched_tx)
6103 		goto do_sched;
6104 
6105 	if (mtx_trylock(&txr->hn_tx_lock)) {
6106 		int sched;
6107 
6108 		txr->hn_oactive = 0;
6109 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6110 		mtx_unlock(&txr->hn_tx_lock);
6111 		if (sched) {
6112 			taskqueue_enqueue(txr->hn_tx_taskq,
6113 			    &txr->hn_tx_task);
6114 		}
6115 	} else {
6116 do_sched:
6117 		/*
6118 		 * Release the oactive earlier, with the hope, that
6119 		 * others could catch up.  The task will clear the
6120 		 * oactive again with the hn_tx_lock to avoid possible
6121 		 * races.
6122 		 */
6123 		txr->hn_oactive = 0;
6124 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6125 	}
6126 }
6127 
6128 static void
6129 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6130 {
6131 	struct hn_tx_ring *txr = xtxr;
6132 
6133 	mtx_lock(&txr->hn_tx_lock);
6134 	hn_xmit(txr, 0);
6135 	mtx_unlock(&txr->hn_tx_lock);
6136 }
6137 
6138 static void
6139 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6140 {
6141 	struct hn_tx_ring *txr = xtxr;
6142 
6143 	mtx_lock(&txr->hn_tx_lock);
6144 	txr->hn_oactive = 0;
6145 	hn_xmit(txr, 0);
6146 	mtx_unlock(&txr->hn_tx_lock);
6147 }
6148 
6149 static int
6150 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6151 {
6152 	struct vmbus_chan_br cbr;
6153 	struct hn_rx_ring *rxr;
6154 	struct hn_tx_ring *txr = NULL;
6155 	int idx, error;
6156 
6157 	idx = vmbus_chan_subidx(chan);
6158 
6159 	/*
6160 	 * Link this channel to RX/TX ring.
6161 	 */
6162 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6163 	    ("invalid channel index %d, should > 0 && < %d",
6164 	     idx, sc->hn_rx_ring_inuse));
6165 	rxr = &sc->hn_rx_ring[idx];
6166 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6167 	    ("RX ring %d already attached", idx));
6168 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6169 	rxr->hn_chan = chan;
6170 
6171 	if (bootverbose) {
6172 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6173 		    idx, vmbus_chan_id(chan));
6174 	}
6175 
6176 	if (idx < sc->hn_tx_ring_inuse) {
6177 		txr = &sc->hn_tx_ring[idx];
6178 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6179 		    ("TX ring %d already attached", idx));
6180 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6181 
6182 		txr->hn_chan = chan;
6183 		if (bootverbose) {
6184 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6185 			    idx, vmbus_chan_id(chan));
6186 		}
6187 	}
6188 
6189 	/* Bind this channel to a proper CPU. */
6190 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6191 
6192 	/*
6193 	 * Open this channel
6194 	 */
6195 	cbr.cbr = rxr->hn_br;
6196 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6197 	cbr.cbr_txsz = HN_TXBR_SIZE;
6198 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6199 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6200 	if (error) {
6201 		if (error == EISCONN) {
6202 			if_printf(sc->hn_ifp, "bufring is connected after "
6203 			    "chan%u open failure\n", vmbus_chan_id(chan));
6204 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6205 		} else {
6206 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6207 			    vmbus_chan_id(chan), error);
6208 		}
6209 	}
6210 	return (error);
6211 }
6212 
6213 static void
6214 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6215 {
6216 	struct hn_rx_ring *rxr;
6217 	int idx, error;
6218 
6219 	idx = vmbus_chan_subidx(chan);
6220 
6221 	/*
6222 	 * Link this channel to RX/TX ring.
6223 	 */
6224 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6225 	    ("invalid channel index %d, should > 0 && < %d",
6226 	     idx, sc->hn_rx_ring_inuse));
6227 	rxr = &sc->hn_rx_ring[idx];
6228 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6229 	    ("RX ring %d is not attached", idx));
6230 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6231 
6232 	if (idx < sc->hn_tx_ring_inuse) {
6233 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6234 
6235 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6236 		    ("TX ring %d is not attached attached", idx));
6237 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6238 	}
6239 
6240 	/*
6241 	 * Close this channel.
6242 	 *
6243 	 * NOTE:
6244 	 * Channel closing does _not_ destroy the target channel.
6245 	 */
6246 	error = vmbus_chan_close_direct(chan);
6247 	if (error == EISCONN) {
6248 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6249 		    "after being closed\n", vmbus_chan_id(chan));
6250 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6251 	} else if (error) {
6252 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6253 		    vmbus_chan_id(chan), error);
6254 	}
6255 }
6256 
6257 static int
6258 hn_attach_subchans(struct hn_softc *sc)
6259 {
6260 	struct vmbus_channel **subchans;
6261 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6262 	int i, error = 0;
6263 
6264 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6265 
6266 	/* Attach the sub-channels. */
6267 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6268 	for (i = 0; i < subchan_cnt; ++i) {
6269 		int error1;
6270 
6271 		error1 = hn_chan_attach(sc, subchans[i]);
6272 		if (error1) {
6273 			error = error1;
6274 			/* Move on; all channels will be detached later. */
6275 		}
6276 	}
6277 	vmbus_subchan_rel(subchans, subchan_cnt);
6278 
6279 	if (error) {
6280 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6281 	} else {
6282 		if (bootverbose) {
6283 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6284 			    subchan_cnt);
6285 		}
6286 	}
6287 	return (error);
6288 }
6289 
6290 static void
6291 hn_detach_allchans(struct hn_softc *sc)
6292 {
6293 	struct vmbus_channel **subchans;
6294 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6295 	int i;
6296 
6297 	if (subchan_cnt == 0)
6298 		goto back;
6299 
6300 	/* Detach the sub-channels. */
6301 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6302 	for (i = 0; i < subchan_cnt; ++i)
6303 		hn_chan_detach(sc, subchans[i]);
6304 	vmbus_subchan_rel(subchans, subchan_cnt);
6305 
6306 back:
6307 	/*
6308 	 * Detach the primary channel, _after_ all sub-channels
6309 	 * are detached.
6310 	 */
6311 	hn_chan_detach(sc, sc->hn_prichan);
6312 
6313 	/* Wait for sub-channels to be destroyed, if any. */
6314 	vmbus_subchan_drain(sc->hn_prichan);
6315 
6316 #ifdef INVARIANTS
6317 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6318 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6319 		    HN_RX_FLAG_ATTACHED) == 0,
6320 		    ("%dth RX ring is still attached", i));
6321 	}
6322 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6323 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6324 		    HN_TX_FLAG_ATTACHED) == 0,
6325 		    ("%dth TX ring is still attached", i));
6326 	}
6327 #endif
6328 }
6329 
6330 static int
6331 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6332 {
6333 	struct vmbus_channel **subchans;
6334 	int nchan, rxr_cnt, error;
6335 
6336 	nchan = *nsubch + 1;
6337 	if (nchan == 1) {
6338 		/*
6339 		 * Multiple RX/TX rings are not requested.
6340 		 */
6341 		*nsubch = 0;
6342 		return (0);
6343 	}
6344 
6345 	/*
6346 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6347 	 * table entries.
6348 	 */
6349 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6350 	if (error) {
6351 		/* No RSS; this is benign. */
6352 		*nsubch = 0;
6353 		return (0);
6354 	}
6355 	if (bootverbose) {
6356 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6357 		    rxr_cnt, nchan);
6358 	}
6359 
6360 	if (nchan > rxr_cnt)
6361 		nchan = rxr_cnt;
6362 	if (nchan == 1) {
6363 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6364 		*nsubch = 0;
6365 		return (0);
6366 	}
6367 
6368 	/*
6369 	 * Allocate sub-channels from NVS.
6370 	 */
6371 	*nsubch = nchan - 1;
6372 	error = hn_nvs_alloc_subchans(sc, nsubch);
6373 	if (error || *nsubch == 0) {
6374 		/* Failed to allocate sub-channels. */
6375 		*nsubch = 0;
6376 		return (0);
6377 	}
6378 
6379 	/*
6380 	 * Wait for all sub-channels to become ready before moving on.
6381 	 */
6382 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6383 	vmbus_subchan_rel(subchans, *nsubch);
6384 	return (0);
6385 }
6386 
6387 static bool
6388 hn_synth_attachable(const struct hn_softc *sc)
6389 {
6390 	int i;
6391 
6392 	if (sc->hn_flags & HN_FLAG_ERRORS)
6393 		return (false);
6394 
6395 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6396 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6397 
6398 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6399 			return (false);
6400 	}
6401 	return (true);
6402 }
6403 
/*
 * Force the RX filter to zero once RNDIS has been initialized
 * successfully.
 *
 * NOTE:
 * On certain versions of Hyper-V, and under certain conditions, the
 * hypervisor side RNDIS rxfilter is _not_ zero after a successful
 * RNDIS initialization.  This violates the RNDIS API contract and
 * breaks the assumptions made by the code that follows.  So clear the
 * RNDIS rxfilter explicitly, drain the packets that sneaked through,
 * and drain the interrupt taskqueues that were scheduled because of
 * those stealth packets.
 *
 * 'nchan' is the number of channels to drain.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the rxfilter first, then flush whatever got through. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
6424 
6425 static int
6426 hn_synth_attach(struct hn_softc *sc, int mtu)
6427 {
6428 #define ATTACHED_NVS		0x0002
6429 #define ATTACHED_RNDIS		0x0004
6430 
6431 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6432 	int error, nsubch, nchan = 1, i, rndis_inited;
6433 	uint32_t old_caps, attached = 0;
6434 
6435 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6436 	    ("synthetic parts were attached"));
6437 
6438 	if (!hn_synth_attachable(sc))
6439 		return (ENXIO);
6440 
6441 	/* Save capabilities for later verification. */
6442 	old_caps = sc->hn_caps;
6443 	sc->hn_caps = 0;
6444 
6445 	/* Clear RSS stuffs. */
6446 	sc->hn_rss_ind_size = 0;
6447 	sc->hn_rss_hash = 0;
6448 	sc->hn_rss_hcap = 0;
6449 
6450 	/*
6451 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6452 	 */
6453 	error = hn_chan_attach(sc, sc->hn_prichan);
6454 	if (error)
6455 		goto failed;
6456 
6457 	/*
6458 	 * Attach NVS.
6459 	 */
6460 	error = hn_nvs_attach(sc, mtu);
6461 	if (error)
6462 		goto failed;
6463 	attached |= ATTACHED_NVS;
6464 
6465 	/*
6466 	 * Attach RNDIS _after_ NVS is attached.
6467 	 */
6468 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6469 	if (rndis_inited)
6470 		attached |= ATTACHED_RNDIS;
6471 	if (error)
6472 		goto failed;
6473 
6474 	/*
6475 	 * Make sure capabilities are not changed.
6476 	 */
6477 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6478 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6479 		    old_caps, sc->hn_caps);
6480 		error = ENXIO;
6481 		goto failed;
6482 	}
6483 
6484 	/*
6485 	 * Allocate sub-channels for multi-TX/RX rings.
6486 	 *
6487 	 * NOTE:
6488 	 * The # of RX rings that can be used is equivalent to the # of
6489 	 * channels to be requested.
6490 	 */
6491 	nsubch = sc->hn_rx_ring_cnt - 1;
6492 	error = hn_synth_alloc_subchans(sc, &nsubch);
6493 	if (error)
6494 		goto failed;
6495 	/* NOTE: _Full_ synthetic parts detach is required now. */
6496 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6497 
6498 	/*
6499 	 * Set the # of TX/RX rings that could be used according to
6500 	 * the # of channels that NVS offered.
6501 	 */
6502 	nchan = nsubch + 1;
6503 	hn_set_ring_inuse(sc, nchan);
6504 	if (nchan == 1) {
6505 		/* Only the primary channel can be used; done */
6506 		goto back;
6507 	}
6508 
6509 	/*
6510 	 * Attach the sub-channels.
6511 	 *
6512 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6513 	 */
6514 	error = hn_attach_subchans(sc);
6515 	if (error)
6516 		goto failed;
6517 
6518 	/*
6519 	 * Configure RSS key and indirect table _after_ all sub-channels
6520 	 * are attached.
6521 	 */
6522 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6523 		/*
6524 		 * RSS key is not set yet; set it to the default RSS key.
6525 		 */
6526 		if (bootverbose)
6527 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6528 #ifdef RSS
6529 		rss_getkey(rss->rss_key);
6530 #else
6531 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6532 #endif
6533 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6534 	}
6535 
6536 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6537 		/*
6538 		 * RSS indirect table is not set yet; set it up in round-
6539 		 * robin fashion.
6540 		 */
6541 		if (bootverbose) {
6542 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6543 			    "table\n");
6544 		}
6545 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6546 			uint32_t subidx;
6547 
6548 #ifdef RSS
6549 			subidx = rss_get_indirection_to_bucket(i);
6550 #else
6551 			subidx = i;
6552 #endif
6553 			rss->rss_ind[i] = subidx % nchan;
6554 		}
6555 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6556 	} else {
6557 		/*
6558 		 * # of usable channels may be changed, so we have to
6559 		 * make sure that all entries in RSS indirect table
6560 		 * are valid.
6561 		 *
6562 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6563 		 */
6564 		hn_rss_ind_fixup(sc);
6565 	}
6566 
6567 	sc->hn_rss_hash = sc->hn_rss_hcap;
6568 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6569 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6570 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6571 		hn_vf_rss_fixup(sc, false);
6572 	}
6573 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6574 	if (error)
6575 		goto failed;
6576 back:
6577 	/*
6578 	 * Fixup transmission aggregation setup.
6579 	 */
6580 	hn_set_txagg(sc);
6581 	hn_rndis_init_fixat(sc, nchan);
6582 	return (0);
6583 
6584 failed:
6585 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6586 		hn_rndis_init_fixat(sc, nchan);
6587 		hn_synth_detach(sc);
6588 	} else {
6589 		if (attached & ATTACHED_RNDIS) {
6590 			hn_rndis_init_fixat(sc, nchan);
6591 			hn_rndis_detach(sc);
6592 		}
6593 		if (attached & ATTACHED_NVS)
6594 			hn_nvs_detach(sc);
6595 		hn_chan_detach(sc, sc->hn_prichan);
6596 		/* Restore old capabilities. */
6597 		sc->hn_caps = old_caps;
6598 	}
6599 	return (error);
6600 
6601 #undef ATTACHED_RNDIS
6602 #undef ATTACHED_NVS
6603 }
6604 
6605 /*
6606  * NOTE:
6607  * The interface must have been suspended though hn_suspend(), before
6608  * this function get called.
6609  */
6610 static void
6611 hn_synth_detach(struct hn_softc *sc)
6612 {
6613 
6614 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6615 	    ("synthetic parts were not attached"));
6616 
6617 	/* Detach the RNDIS first. */
6618 	hn_rndis_detach(sc);
6619 
6620 	/* Detach NVS. */
6621 	hn_nvs_detach(sc);
6622 
6623 	/* Detach all of the channels. */
6624 	hn_detach_allchans(sc);
6625 
6626 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6627 }
6628 
6629 static void
6630 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6631 {
6632 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6633 	    ("invalid ring count %d", ring_cnt));
6634 
6635 	if (sc->hn_tx_ring_cnt > ring_cnt)
6636 		sc->hn_tx_ring_inuse = ring_cnt;
6637 	else
6638 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6639 	sc->hn_rx_ring_inuse = ring_cnt;
6640 
6641 #ifdef RSS
6642 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6643 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6644 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6645 		    rss_getnumbuckets());
6646 	}
6647 #endif
6648 
6649 	if (bootverbose) {
6650 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6651 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6652 	}
6653 }
6654 
6655 static void
6656 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6657 {
6658 
6659 	/*
6660 	 * NOTE:
6661 	 * The TX bufring will not be drained by the hypervisor,
6662 	 * if the primary channel is revoked.
6663 	 */
6664 	while (!vmbus_chan_rx_empty(chan) ||
6665 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6666 	     !vmbus_chan_tx_empty(chan)))
6667 		pause("waitch", 1);
6668 	vmbus_chan_intr_drain(chan);
6669 }
6670 
6671 static void
6672 hn_disable_rx(struct hn_softc *sc)
6673 {
6674 
6675 	/*
6676 	 * Disable RX by clearing RX filter forcefully.
6677 	 */
6678 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6679 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6680 
6681 	/*
6682 	 * Give RNDIS enough time to flush all pending data packets.
6683 	 */
6684 	pause("waitrx", (200 * hz) / 1000);
6685 }
6686 
6687 /*
6688  * NOTE:
6689  * RX/TX _must_ have been suspended/disabled, before this function
6690  * is called.
6691  */
6692 static void
6693 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6694 {
6695 	struct vmbus_channel **subch = NULL;
6696 	int nsubch;
6697 
6698 	/*
6699 	 * Drain RX/TX bufrings and interrupts.
6700 	 */
6701 	nsubch = nchan - 1;
6702 	if (nsubch > 0)
6703 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6704 
6705 	if (subch != NULL) {
6706 		int i;
6707 
6708 		for (i = 0; i < nsubch; ++i)
6709 			hn_chan_drain(sc, subch[i]);
6710 	}
6711 	hn_chan_drain(sc, sc->hn_prichan);
6712 
6713 	if (subch != NULL)
6714 		vmbus_subchan_rel(subch, nsubch);
6715 }
6716 
6717 static void
6718 hn_suspend_data(struct hn_softc *sc)
6719 {
6720 	struct hn_tx_ring *txr;
6721 	int i;
6722 
6723 	HN_LOCK_ASSERT(sc);
6724 
6725 	/*
6726 	 * Suspend TX.
6727 	 */
6728 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6729 		txr = &sc->hn_tx_ring[i];
6730 
6731 		mtx_lock(&txr->hn_tx_lock);
6732 		txr->hn_suspended = 1;
6733 		mtx_unlock(&txr->hn_tx_lock);
6734 		/* No one is able send more packets now. */
6735 
6736 		/*
6737 		 * Wait for all pending sends to finish.
6738 		 *
6739 		 * NOTE:
6740 		 * We will _not_ receive all pending send-done, if the
6741 		 * primary channel is revoked.
6742 		 */
6743 		while (hn_tx_ring_pending(txr) &&
6744 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6745 			pause("hnwtx", 1 /* 1 tick */);
6746 	}
6747 
6748 	/*
6749 	 * Disable RX.
6750 	 */
6751 	hn_disable_rx(sc);
6752 
6753 	/*
6754 	 * Drain RX/TX.
6755 	 */
6756 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6757 
6758 	/*
6759 	 * Drain any pending TX tasks.
6760 	 *
6761 	 * NOTE:
6762 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6763 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6764 	 */
6765 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6766 		txr = &sc->hn_tx_ring[i];
6767 
6768 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6769 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6770 	}
6771 }
6772 
6773 static void
6774 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6775 {
6776 
6777 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6778 }
6779 
6780 static void
6781 hn_suspend_mgmt(struct hn_softc *sc)
6782 {
6783 	struct task task;
6784 
6785 	HN_LOCK_ASSERT(sc);
6786 
6787 	/*
6788 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6789 	 * through hn_mgmt_taskq.
6790 	 */
6791 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6792 	vmbus_chan_run_task(sc->hn_prichan, &task);
6793 
6794 	/*
6795 	 * Make sure that all pending management tasks are completed.
6796 	 */
6797 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6798 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6799 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6800 }
6801 
6802 static void
6803 hn_suspend(struct hn_softc *sc)
6804 {
6805 
6806 	/* Disable polling. */
6807 	hn_polling(sc, 0);
6808 
6809 	/*
6810 	 * If the non-transparent mode VF is activated, the synthetic
6811 	 * device is receiving packets, so the data path of the
6812 	 * synthetic device must be suspended.
6813 	 */
6814 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6815 	    (sc->hn_flags & HN_FLAG_RXVF))
6816 		hn_suspend_data(sc);
6817 	hn_suspend_mgmt(sc);
6818 }
6819 
6820 static void
6821 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6822 {
6823 	int i;
6824 
6825 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6826 	    ("invalid TX ring count %d", tx_ring_cnt));
6827 
6828 	for (i = 0; i < tx_ring_cnt; ++i) {
6829 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6830 
6831 		mtx_lock(&txr->hn_tx_lock);
6832 		txr->hn_suspended = 0;
6833 		mtx_unlock(&txr->hn_tx_lock);
6834 	}
6835 }
6836 
6837 static void
6838 hn_resume_data(struct hn_softc *sc)
6839 {
6840 	int i;
6841 
6842 	HN_LOCK_ASSERT(sc);
6843 
6844 	/*
6845 	 * Re-enable RX.
6846 	 */
6847 	hn_rxfilter_config(sc);
6848 
6849 	/*
6850 	 * Make sure to clear suspend status on "all" TX rings,
6851 	 * since hn_tx_ring_inuse can be changed after
6852 	 * hn_suspend_data().
6853 	 */
6854 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6855 
6856 #ifdef HN_IFSTART_SUPPORT
6857 	if (!hn_use_if_start)
6858 #endif
6859 	{
6860 		/*
6861 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6862 		 * reduced.
6863 		 */
6864 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6865 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6866 	}
6867 
6868 	/*
6869 	 * Kick start TX.
6870 	 */
6871 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6872 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6873 
6874 		/*
6875 		 * Use txeof task, so that any pending oactive can be
6876 		 * cleared properly.
6877 		 */
6878 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6879 	}
6880 }
6881 
6882 static void
6883 hn_resume_mgmt(struct hn_softc *sc)
6884 {
6885 
6886 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6887 
6888 	/*
6889 	 * Kick off network change detection, if it was pending.
6890 	 * If no network change was pending, start link status
6891 	 * checks, which is more lightweight than network change
6892 	 * detection.
6893 	 */
6894 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6895 		hn_change_network(sc);
6896 	else
6897 		hn_update_link_status(sc);
6898 }
6899 
6900 static void
6901 hn_resume(struct hn_softc *sc)
6902 {
6903 
6904 	/*
6905 	 * If the non-transparent mode VF is activated, the synthetic
6906 	 * device have to receive packets, so the data path of the
6907 	 * synthetic device must be resumed.
6908 	 */
6909 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6910 	    (sc->hn_flags & HN_FLAG_RXVF))
6911 		hn_resume_data(sc);
6912 
6913 	/*
6914 	 * Don't resume link status change if VF is attached/activated.
6915 	 * - In the non-transparent VF mode, the synthetic device marks
6916 	 *   link down until the VF is deactivated; i.e. VF is down.
6917 	 * - In transparent VF mode, VF's media status is used until
6918 	 *   the VF is detached.
6919 	 */
6920 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6921 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6922 		hn_resume_mgmt(sc);
6923 
6924 	/*
6925 	 * Re-enable polling if this interface is running and
6926 	 * the polling is requested.
6927 	 */
6928 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6929 		hn_polling(sc, sc->hn_pollhz);
6930 }
6931 
6932 static void
6933 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6934 {
6935 	const struct rndis_status_msg *msg;
6936 	int ofs;
6937 
6938 	if (dlen < sizeof(*msg)) {
6939 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6940 		return;
6941 	}
6942 	msg = data;
6943 
6944 	switch (msg->rm_status) {
6945 	case RNDIS_STATUS_MEDIA_CONNECT:
6946 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6947 		hn_update_link_status(sc);
6948 		break;
6949 
6950 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6951 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6952 		/* Not really useful; ignore. */
6953 		break;
6954 
6955 	case RNDIS_STATUS_NETWORK_CHANGE:
6956 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6957 		if (dlen < ofs + msg->rm_stbuflen ||
6958 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6959 			if_printf(sc->hn_ifp, "network changed\n");
6960 		} else {
6961 			uint32_t change;
6962 
6963 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6964 			    sizeof(change));
6965 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6966 			    change);
6967 		}
6968 		hn_change_network(sc);
6969 		break;
6970 
6971 	default:
6972 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6973 		    msg->rm_status);
6974 		break;
6975 	}
6976 }
6977 
6978 static int
6979 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6980 {
6981 	const struct rndis_pktinfo *pi = info_data;
6982 	uint32_t mask = 0;
6983 
6984 	while (info_dlen != 0) {
6985 		const void *data;
6986 		uint32_t dlen;
6987 
6988 		if (__predict_false(info_dlen < sizeof(*pi)))
6989 			return (EINVAL);
6990 		if (__predict_false(info_dlen < pi->rm_size))
6991 			return (EINVAL);
6992 		info_dlen -= pi->rm_size;
6993 
6994 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6995 			return (EINVAL);
6996 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6997 			return (EINVAL);
6998 		dlen = pi->rm_size - pi->rm_pktinfooffset;
6999 		data = pi->rm_data;
7000 
7001 		switch (pi->rm_type) {
7002 		case NDIS_PKTINFO_TYPE_VLAN:
7003 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7004 				return (EINVAL);
7005 			info->vlan_info = *((const uint32_t *)data);
7006 			mask |= HN_RXINFO_VLAN;
7007 			break;
7008 
7009 		case NDIS_PKTINFO_TYPE_CSUM:
7010 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7011 				return (EINVAL);
7012 			info->csum_info = *((const uint32_t *)data);
7013 			mask |= HN_RXINFO_CSUM;
7014 			break;
7015 
7016 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7017 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7018 				return (EINVAL);
7019 			info->hash_value = *((const uint32_t *)data);
7020 			mask |= HN_RXINFO_HASHVAL;
7021 			break;
7022 
7023 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7024 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7025 				return (EINVAL);
7026 			info->hash_info = *((const uint32_t *)data);
7027 			mask |= HN_RXINFO_HASHINF;
7028 			break;
7029 
7030 		default:
7031 			goto next;
7032 		}
7033 
7034 		if (mask == HN_RXINFO_ALL) {
7035 			/* All found; done */
7036 			break;
7037 		}
7038 next:
7039 		pi = (const struct rndis_pktinfo *)
7040 		    ((const uint8_t *)pi + pi->rm_size);
7041 	}
7042 
7043 	/*
7044 	 * Final fixup.
7045 	 * - If there is no hash value, invalidate the hash info.
7046 	 */
7047 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7048 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7049 	return (0);
7050 }
7051 
/*
 * Tell whether the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len).
 *
 * Two regions that start at the same offset are always reported as
 * overlapping, regardless of their lengths.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off == check_off)
		return (true);

	if (off < check_off) {
		/* The first region must end before the second starts. */
		return (!__predict_true(off + len <= check_off));
	}

	/* The second region must end before the first starts. */
	return (!__predict_true(check_off + check_len <= off));
}
7065 
7066 static void
7067 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7068 {
7069 	const struct rndis_packet_msg *pkt;
7070 	struct hn_rxinfo info;
7071 	int data_off, pktinfo_off, data_len, pktinfo_len;
7072 
7073 	/*
7074 	 * Check length.
7075 	 */
7076 	if (__predict_false(dlen < sizeof(*pkt))) {
7077 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7078 		return;
7079 	}
7080 	pkt = data;
7081 
7082 	if (__predict_false(dlen < pkt->rm_len)) {
7083 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7084 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7085 		return;
7086 	}
7087 	if (__predict_false(pkt->rm_len <
7088 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7089 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7090 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7091 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7092 		    pkt->rm_pktinfolen);
7093 		return;
7094 	}
7095 	if (__predict_false(pkt->rm_datalen == 0)) {
7096 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7097 		return;
7098 	}
7099 
7100 	/*
7101 	 * Check offests.
7102 	 */
7103 #define IS_OFFSET_INVALID(ofs)			\
7104 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7105 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7106 
7107 	/* XXX Hyper-V does not meet data offset alignment requirement */
7108 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7109 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7110 		    "data offset %u\n", pkt->rm_dataoffset);
7111 		return;
7112 	}
7113 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7114 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7115 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7116 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7117 		return;
7118 	}
7119 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7120 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7121 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7122 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7123 		return;
7124 	}
7125 
7126 #undef IS_OFFSET_INVALID
7127 
7128 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7129 	data_len = pkt->rm_datalen;
7130 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7131 	pktinfo_len = pkt->rm_pktinfolen;
7132 
7133 	/*
7134 	 * Check OOB coverage.
7135 	 */
7136 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7137 		int oob_off, oob_len;
7138 
7139 		if_printf(rxr->hn_ifp, "got oobdata\n");
7140 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7141 		oob_len = pkt->rm_oobdatalen;
7142 
7143 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7144 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7145 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7146 			    pkt->rm_len, oob_off, oob_len);
7147 			return;
7148 		}
7149 
7150 		/*
7151 		 * Check against data.
7152 		 */
7153 		if (hn_rndis_check_overlap(oob_off, oob_len,
7154 		    data_off, data_len)) {
7155 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7156 			    "oob overlaps data, oob abs %d len %d, "
7157 			    "data abs %d len %d\n",
7158 			    oob_off, oob_len, data_off, data_len);
7159 			return;
7160 		}
7161 
7162 		/*
7163 		 * Check against pktinfo.
7164 		 */
7165 		if (pktinfo_len != 0 &&
7166 		    hn_rndis_check_overlap(oob_off, oob_len,
7167 		    pktinfo_off, pktinfo_len)) {
7168 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7169 			    "oob overlaps pktinfo, oob abs %d len %d, "
7170 			    "pktinfo abs %d len %d\n",
7171 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7172 			return;
7173 		}
7174 	}
7175 
7176 	/*
7177 	 * Check per-packet-info coverage and find useful per-packet-info.
7178 	 */
7179 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7180 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7181 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7182 	if (__predict_true(pktinfo_len != 0)) {
7183 		bool overlap;
7184 		int error;
7185 
7186 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7187 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7188 			    "pktinfo overflow, msglen %u, "
7189 			    "pktinfo abs %d len %d\n",
7190 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7191 			return;
7192 		}
7193 
7194 		/*
7195 		 * Check packet info coverage.
7196 		 */
7197 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7198 		    data_off, data_len);
7199 		if (__predict_false(overlap)) {
7200 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7201 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7202 			    "data abs %d len %d\n",
7203 			    pktinfo_off, pktinfo_len, data_off, data_len);
7204 			return;
7205 		}
7206 
7207 		/*
7208 		 * Find useful per-packet-info.
7209 		 */
7210 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7211 		    pktinfo_len, &info);
7212 		if (__predict_false(error)) {
7213 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7214 			    "pktinfo\n");
7215 			return;
7216 		}
7217 	}
7218 
7219 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7220 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7221 		    "data overflow, msglen %u, data abs %d len %d\n",
7222 		    pkt->rm_len, data_off, data_len);
7223 		return;
7224 	}
7225 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7226 }
7227 
7228 static __inline void
7229 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7230 {
7231 	const struct rndis_msghdr *hdr;
7232 
7233 	if (__predict_false(dlen < sizeof(*hdr))) {
7234 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7235 		return;
7236 	}
7237 	hdr = data;
7238 
7239 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7240 		/* Hot data path. */
7241 		hn_rndis_rx_data(rxr, data, dlen);
7242 		/* Done! */
7243 		return;
7244 	}
7245 
7246 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7247 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7248 	else
7249 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7250 }
7251 
7252 static void
7253 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7254 {
7255 	const struct hn_nvs_hdr *hdr;
7256 
7257 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7258 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7259 		return;
7260 	}
7261 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7262 
7263 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7264 		/* Useless; ignore */
7265 		return;
7266 	}
7267 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7268 }
7269 
7270 static void
7271 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7272     const struct vmbus_chanpkt_hdr *pkt)
7273 {
7274 	struct hn_nvs_sendctx *sndc;
7275 
7276 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7277 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7278 	    VMBUS_CHANPKT_DATALEN(pkt));
7279 	/*
7280 	 * NOTE:
7281 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7282 	 * its callback.
7283 	 */
7284 }
7285 
7286 static void
7287 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7288     const struct vmbus_chanpkt_hdr *pkthdr)
7289 {
7290 	const struct vmbus_chanpkt_rxbuf *pkt;
7291 	const struct hn_nvs_hdr *nvs_hdr;
7292 	int count, i, hlen;
7293 
7294 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7295 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7296 		return;
7297 	}
7298 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7299 
7300 	/* Make sure that this is a RNDIS message. */
7301 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7302 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7303 		    nvs_hdr->nvs_type);
7304 		return;
7305 	}
7306 
7307 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7308 	if (__predict_false(hlen < sizeof(*pkt))) {
7309 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7310 		return;
7311 	}
7312 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7313 
7314 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7315 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7316 		    pkt->cp_rxbuf_id);
7317 		return;
7318 	}
7319 
7320 	count = pkt->cp_rxbuf_cnt;
7321 	if (__predict_false(hlen <
7322 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7323 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7324 		return;
7325 	}
7326 
7327 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7328 	for (i = 0; i < count; ++i) {
7329 		int ofs, len;
7330 
7331 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7332 		len = pkt->cp_rxbuf[i].rb_len;
7333 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7334 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7335 			    "ofs %d, len %d\n", i, ofs, len);
7336 			continue;
7337 		}
7338 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7339 	}
7340 
7341 	/*
7342 	 * Ack the consumed RXBUF associated w/ this channel packet,
7343 	 * so that this RXBUF can be recycled by the hypervisor.
7344 	 */
7345 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7346 }
7347 
7348 static void
7349 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7350     uint64_t tid)
7351 {
7352 	struct hn_nvs_rndis_ack ack;
7353 	int retries, error;
7354 
7355 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7356 	ack.nvs_status = HN_NVS_STATUS_OK;
7357 
7358 	retries = 0;
7359 again:
7360 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7361 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7362 	if (__predict_false(error == EAGAIN)) {
7363 		/*
7364 		 * NOTE:
7365 		 * This should _not_ happen in real world, since the
7366 		 * consumption of the TX bufring from the TX path is
7367 		 * controlled.
7368 		 */
7369 		if (rxr->hn_ack_failed == 0)
7370 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7371 		rxr->hn_ack_failed++;
7372 		retries++;
7373 		if (retries < 10) {
7374 			DELAY(100);
7375 			goto again;
7376 		}
7377 		/* RXBUF leaks! */
7378 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7379 	}
7380 }
7381 
7382 static void
7383 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7384 {
7385 	struct hn_rx_ring *rxr = xrxr;
7386 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7387 
7388 	for (;;) {
7389 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7390 		int error, pktlen;
7391 
7392 		pktlen = rxr->hn_pktbuf_len;
7393 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7394 		if (__predict_false(error == ENOBUFS)) {
7395 			void *nbuf;
7396 			int nlen;
7397 
7398 			/*
7399 			 * Expand channel packet buffer.
7400 			 *
7401 			 * XXX
7402 			 * Use M_WAITOK here, since allocation failure
7403 			 * is fatal.
7404 			 */
7405 			nlen = rxr->hn_pktbuf_len * 2;
7406 			while (nlen < pktlen)
7407 				nlen *= 2;
7408 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7409 
7410 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7411 			    rxr->hn_pktbuf_len, nlen);
7412 
7413 			free(rxr->hn_pktbuf, M_DEVBUF);
7414 			rxr->hn_pktbuf = nbuf;
7415 			rxr->hn_pktbuf_len = nlen;
7416 			/* Retry! */
7417 			continue;
7418 		} else if (__predict_false(error == EAGAIN)) {
7419 			/* No more channel packets; done! */
7420 			break;
7421 		}
7422 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7423 
7424 		switch (pkt->cph_type) {
7425 		case VMBUS_CHANPKT_TYPE_COMP:
7426 			hn_nvs_handle_comp(sc, chan, pkt);
7427 			break;
7428 
7429 		case VMBUS_CHANPKT_TYPE_RXBUF:
7430 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7431 			break;
7432 
7433 		case VMBUS_CHANPKT_TYPE_INBAND:
7434 			hn_nvs_handle_notify(sc, pkt);
7435 			break;
7436 
7437 		default:
7438 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7439 			    pkt->cph_type);
7440 			break;
7441 		}
7442 	}
7443 	hn_chan_rollup(rxr, rxr->hn_txr);
7444 }
7445 
7446 static void
7447 hn_sysinit(void *arg __unused)
7448 {
7449 	int i;
7450 
7451 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7452 
7453 #ifdef HN_IFSTART_SUPPORT
7454 	/*
7455 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7456 	 * mainly due to the IFF_DRV_OACTIVE flag.
7457 	 */
7458 	if (hn_xpnt_vf && hn_use_if_start) {
7459 		hn_use_if_start = 0;
7460 		printf("hn: tranparent VF mode, if_transmit will be used, "
7461 		    "instead of if_start\n");
7462 	}
7463 #endif
7464 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7465 		printf("hn: invalid transparent VF attach routing "
7466 		    "wait timeout %d, reset to %d\n",
7467 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7468 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7469 	}
7470 
7471 	/*
7472 	 * Initialize VF map.
7473 	 */
7474 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7475 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7476 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7477 	    M_WAITOK | M_ZERO);
7478 
7479 	/*
7480 	 * Fix the # of TX taskqueues.
7481 	 */
7482 	if (hn_tx_taskq_cnt <= 0)
7483 		hn_tx_taskq_cnt = 1;
7484 	else if (hn_tx_taskq_cnt > mp_ncpus)
7485 		hn_tx_taskq_cnt = mp_ncpus;
7486 
7487 	/*
7488 	 * Fix the TX taskqueue mode.
7489 	 */
7490 	switch (hn_tx_taskq_mode) {
7491 	case HN_TX_TASKQ_M_INDEP:
7492 	case HN_TX_TASKQ_M_GLOBAL:
7493 	case HN_TX_TASKQ_M_EVTTQ:
7494 		break;
7495 	default:
7496 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7497 		break;
7498 	}
7499 
7500 	if (vm_guest != VM_GUEST_HV)
7501 		return;
7502 
7503 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7504 		return;
7505 
7506 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7507 	    M_DEVBUF, M_WAITOK);
7508 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7509 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7510 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7511 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7512 		    "hn tx%d", i);
7513 	}
7514 }
7515 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7516 
7517 static void
7518 hn_sysuninit(void *arg __unused)
7519 {
7520 
7521 	if (hn_tx_taskque != NULL) {
7522 		int i;
7523 
7524 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7525 			taskqueue_free(hn_tx_taskque[i]);
7526 		free(hn_tx_taskque, M_DEVBUF);
7527 	}
7528 
7529 	if (hn_vfmap != NULL)
7530 		free(hn_vfmap, M_DEVBUF);
7531 	rm_destroy(&hn_vfmap_lock);
7532 
7533 	counter_u64_free(hn_udpcs_fixup);
7534 }
7535 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7536