xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision f9fd7337f63698f33239c58c07bf430198235a22)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 
87 #include <machine/atomic.h>
88 #include <machine/in_cksum.h>
89 
90 #include <net/bpf.h>
91 #include <net/ethernet.h>
92 #include <net/if.h>
93 #include <net/if_dl.h>
94 #include <net/if_media.h>
95 #include <net/if_types.h>
96 #include <net/if_var.h>
97 #include <net/rndis.h>
98 #ifdef RSS
99 #include <net/rss_config.h>
100 #endif
101 
102 #include <netinet/in_systm.h>
103 #include <netinet/in.h>
104 #include <netinet/ip.h>
105 #include <netinet/ip6.h>
106 #include <netinet/tcp.h>
107 #include <netinet/tcp_lro.h>
108 #include <netinet/udp.h>
109 
110 #include <dev/hyperv/include/hyperv.h>
111 #include <dev/hyperv/include/hyperv_busdma.h>
112 #include <dev/hyperv/include/vmbus.h>
113 #include <dev/hyperv/include/vmbus_xact.h>
114 
115 #include <dev/hyperv/netvsc/ndis.h>
116 #include <dev/hyperv/netvsc/if_hnreg.h>
117 #include <dev/hyperv/netvsc/if_hnvar.h>
118 #include <dev/hyperv/netvsc/hn_nvs.h>
119 #include <dev/hyperv/netvsc/hn_rndis.h>
120 
121 #include "vmbus_if.h"
122 
/*
 * Compile in the if_start based transmit path.  The hn_start*()
 * prototypes and the hw.hn.use_if_start tunable in this file are
 * guarded by this macro.
 */
#define HN_IFSTART_SUPPORT

/* NOTE(review): presumably the default cap on the number of rings -- confirm at the use site. */
#define HN_RING_CNT_DEF_MAX		8

/* NOTE(review): presumably the initial size of the hn_vfmap array -- confirm at the use site. */
#define HN_VFMAP_SIZE_DEF		8

/* Initial value of the hw.hn.vf_xpnt_attwait tunable. */
#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

/*
 * Space reserved for the RNDIS packet message: the fixed message
 * header plus one per-packet-info record each for the hash value,
 * VLAN, LSOv2 and TX checksum information.
 */
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

/* Boundary, total size and per-segment size limits for TX packet data. */
#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

/* Initial value of the hw.hn.direct_tx_size tunable. */
#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

/* Initial value of the hw.hn.lro_entry_count tunable. */
#define HN_LROENT_CNT_DEF		128

/* LRO length limits, expressed as multiples of the Ethernet MTU. */
#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
163 
/* Create the softc sx lock; it is named after the device instance. */
#define HN_LOCK_INIT(sc)						\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
/* Tear down the softc sx lock. */
#define HN_LOCK_DESTROY(sc)						\
	sx_destroy(&(sc)->hn_lock)
/* Assert that the softc sx lock is held exclusively. */
#define HN_LOCK_ASSERT(sc)						\
	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
/*
 * Take the softc sx lock exclusively.  The lock is polled with
 * sx_try_xlock(); after each failed attempt the CPU is given up and
 * the caller delays for 1000us before trying again.
 */
#define HN_LOCK(sc)							\
do {									\
	for (;;) {							\
		if (sx_try_xlock(&(sc)->hn_lock) != 0)			\
			break;						\
		/* Relinquish cpu to avoid deadlock */			\
		sched_relinquish(curthread);				\
		DELAY(1000);						\
	}								\
} while (0)
/* Drop the exclusively held softc sx lock. */
#define HN_UNLOCK(sc)							\
	sx_xunlock(&(sc)->hn_lock)
177 
/* Checksum offload flags grouped per IP version. */
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
/*
 * Checksum offload bits currently enabled for IPv4 / IPv6.  Only TX
 * ring 0 is consulted.
 */
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

/*
 * Size of a minimum-length Ethernet frame (VLAN tag included, CRC
 * excluded) plus the RNDIS packet message, rounded up to 'align'.
 */
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
/* Size of mbuf chain 'm' plus the RNDIS packet message, rounded up to 'align'. */
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

/*
 * Map a ring index to a CPU id: through the RSS bucket table when the
 * kernel is built with RSS, otherwise by offsetting from the softc's
 * hn_cpu and wrapping at mp_ncpus.
 */
#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
196 
/*
 * TX descriptor: bookkeeping for one packet (or one aggregation of
 * packets) handed to a TX ring.
 */
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	/* List linkage; only present when buf_ring is not used for txdescs. */
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	/* Linkage on another txdesc's agg_list. */
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;	/* owning TX ring */
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index and size.  The sglist send path
	 * asserts HN_NVS_CHIM_IDX_INVALID/0 here; the chimney send path
	 * asserts a valid index and a size > 0.
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	/* RNDIS packet message: bus address, kernel pointer and DMA map. */
	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

/* Bits for hn_txdesc.flags. */
#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
225 
/*
 * Per-packet RX information: VLAN, checksum, hash info and hash value
 * words.  The HN_NDIS_*_INVALID values below mark a word as absent.
 */
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

/* Argument bundle pairing an RX ring with a VF ifnet. */
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

/* One bit per hn_rxinfo field; HN_RXINFO_ALL is the union of all four. */
#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

/* "Not present" markers for the corresponding hn_rxinfo fields. */
#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
251 
/*
 * Forward declarations of the file-local functions.  The group
 * headings below describe the apparent role of each group, inferred
 * from names and signatures.
 */

/* Device methods (see hn_methods) and the channel callback. */
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

/* Network interface and media entry points. */
static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

/* Interface event handlers. */
static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

/* VF (virtual function) handling, including transparent VF mode. */
static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

/* RNDIS message handling. */
static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

/* NVS channel packet handling. */
static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

/* Sysctl handlers. */
#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

/* Start/stop, channel and synthetic-device management, suspend/resume. */
static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

/* Link status and network change handling. */
static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

/* RX data path, RX filter and RSS. */
static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

/* TX data path. */
static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
/* if_start flavour of the TX path; only built with HN_IFSTART_SUPPORT. */
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
444 
445 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
446     "Hyper-V network interface");
447 
448 /* Trust tcp segements verification on host side. */
449 static int			hn_trust_hosttcp = 1;
450 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
451     &hn_trust_hosttcp, 0,
452     "Trust tcp segement verification on host side, "
453     "when csum info is missing (global setting)");
454 
455 /* Trust udp datagrams verification on host side. */
456 static int			hn_trust_hostudp = 1;
457 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
458     &hn_trust_hostudp, 0,
459     "Trust udp datagram verification on host side, "
460     "when csum info is missing (global setting)");
461 
462 /* Trust ip packets verification on host side. */
463 static int			hn_trust_hostip = 1;
464 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
465     &hn_trust_hostip, 0,
466     "Trust ip packet verification on host side, "
467     "when csum info is missing (global setting)");
468 
/*
 * Offload UDP/IPv4 checksum (boot-time tunable, enabled by default).
 */
static int			hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum (boot-time tunable, enabled by default).
 */
static int			hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/*
 * Stats: bumped by hn_set_hlen() each time it computes a UDP checksum
 * in software instead of leaving it to the host.
 */
static counter_u64_t		hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen(): UDP/IPv4 packets without IP_DF whose length
 * exceeds this value plus the Ethernet header length get a software
 * checksum.
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int			hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
497 
/* Limit TSO burst size; defaults to the largest IP packet. */
static int			hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int			hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/*
 * # of LRO entries per RX ring.  Only built when INET or INET6 is
 * configured and the kernel version is at least 1100095.
 */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif
521 
/* # of TX taskqueues (boot-time tunable). */
static int			hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

/* Values accepted by hw.hn.tx_taskq_mode; see its description string. */
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

/*
 * Read-only report of whether the driver was compiled with
 * HN_USE_TXDESC_BUFRING; the value cannot be changed at runtime.
 */
#ifndef HN_USE_TXDESC_BUFRING
static int			hn_use_txdesc_bufring = 0;
#else
static int			hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int			hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
549 
/* # of channels to use (boot-time tunable; initial value 0) */
static int			hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use (boot-time tunable; initial value 0) */
static int			hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int			hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int			hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit (initial value -1) */
static int			hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit (initial value -1) */
static int			hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
582 
583 /* VF list */
584 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
585     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
586     hn_vflist_sysctl, "A",
587     "VF list");
588 
589 /* VF mapping */
590 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
591     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
592     hn_vfmap_sysctl, "A",
593     "VF mapping");
594 
595 /* Transparent VF */
596 static int			hn_xpnt_vf = 1;
597 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
598     &hn_xpnt_vf, 0, "Transparent VF mod");
599 
600 /* Accurate BPF support for Transparent VF */
601 static int			hn_xpnt_vf_accbpf = 0;
602 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
603     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
604 
605 /* Extra wait for transparent VF attach routing; unit seconds. */
606 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
607 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
608     &hn_xpnt_vf_attwait, 0,
609     "Extra wait for transparent VF attach routing; unit: seconds");
610 
/* Driver-wide state shared by all hn(4) instances. */
static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */

/*
 * VF map: an array of ifnet pointers with hn_vfmap_size entries.
 * NOTE(review): presumably protected by hn_vfmap_lock -- confirm at
 * the use sites.
 */
static struct rmlock		hn_vfmap_lock;
static int			hn_vfmap_size;
static struct ifnet		**hn_vfmap;

#ifndef RSS
/* Built-in Toeplitz hash key, only compiled when the kernel lacks RSS. */
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

/*
 * GUID identifying this driver's device type.
 * NOTE(review): presumably matched against the channel in hn_probe().
 */
static const struct hyperv_guid	hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
634 
/* newbus method table: probe/attach/detach/shutdown entry points. */
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

/* Driver description: name "hn", methods above, softc is struct hn_softc. */
static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

/* Register "hn" as a child driver of vmbus and declare the dependency. */
DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
655 
656 #if __FreeBSD_version >= 1100099
657 static void
658 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
659 {
660 	int i;
661 
662 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
663 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
664 }
665 #endif
666 
667 static int
668 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
669 {
670 
671 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
673 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
674 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
675 }
676 
677 static int
678 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
679 {
680 	struct hn_nvs_rndis rndis;
681 
682 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
683 	    txd->chim_size > 0, ("invalid rndis chim txd"));
684 
685 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
686 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
687 	rndis.nvs_chim_idx = txd->chim_index;
688 	rndis.nvs_chim_sz = txd->chim_size;
689 
690 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
691 	    &rndis, sizeof(rndis), &txd->send_ctx));
692 }
693 
694 static __inline uint32_t
695 hn_chim_alloc(struct hn_softc *sc)
696 {
697 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
698 	u_long *bmap = sc->hn_chim_bmap;
699 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
700 
701 	for (i = 0; i < bmap_cnt; ++i) {
702 		int idx;
703 
704 		idx = ffsl(~bmap[i]);
705 		if (idx == 0)
706 			continue;
707 
708 		--idx; /* ffsl is 1-based */
709 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
710 		    ("invalid i %d and idx %d", i, idx));
711 
712 		if (atomic_testandset_long(&bmap[i], idx))
713 			continue;
714 
715 		ret = i * LONG_BIT + idx;
716 		break;
717 	}
718 	return (ret);
719 }
720 
721 static __inline void
722 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
723 {
724 	u_long mask;
725 	uint32_t idx;
726 
727 	idx = chim_idx / LONG_BIT;
728 	KASSERT(idx < sc->hn_chim_bmap_cnt,
729 	    ("invalid chimney index 0x%x", chim_idx));
730 
731 	mask = 1UL << (chim_idx % LONG_BIT);
732 	KASSERT(sc->hn_chim_bmap[idx] & mask,
733 	    ("index bitmap 0x%lx, chimney index %u, "
734 	     "bitmap idx %d, bitmask 0x%lx",
735 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
736 
737 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
738 }
739 
740 #if defined(INET6) || defined(INET)
741 
/*
 * Make sure the first 'len' bytes of mbuf chain 'm' are contiguous in
 * its leading mbuf, calling m_pullup() only when they are not.
 *
 * CAUTION: this macro has hidden control flow and side effects:
 * - 'm' is reassigned to the m_pullup() result, so pointers obtained
 *   from the chain before the call must be re-derived afterwards.
 * - If m_pullup() returns NULL, the *enclosing function* returns NULL.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
750 
751 /*
752  * NOTE: If this function failed, the m_head would be freed.
753  */
754 static __inline struct mbuf *
755 hn_tso_fixup(struct mbuf *m_head)
756 {
757 	struct ether_vlan_header *evl;
758 	struct tcphdr *th;
759 	int ehlen;
760 
761 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
762 
763 	PULLUP_HDR(m_head, sizeof(*evl));
764 	evl = mtod(m_head, struct ether_vlan_header *);
765 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
766 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
767 	else
768 		ehlen = ETHER_HDR_LEN;
769 	m_head->m_pkthdr.l2hlen = ehlen;
770 
771 #ifdef INET
772 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
773 		struct ip *ip;
774 		int iphlen;
775 
776 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
777 		ip = mtodo(m_head, ehlen);
778 		iphlen = ip->ip_hl << 2;
779 		m_head->m_pkthdr.l3hlen = iphlen;
780 
781 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
782 		th = mtodo(m_head, ehlen + iphlen);
783 
784 		ip->ip_len = 0;
785 		ip->ip_sum = 0;
786 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
787 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
788 	}
789 #endif
790 #if defined(INET6) && defined(INET)
791 	else
792 #endif
793 #ifdef INET6
794 	{
795 		struct ip6_hdr *ip6;
796 
797 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
798 		ip6 = mtodo(m_head, ehlen);
799 		if (ip6->ip6_nxt != IPPROTO_TCP) {
800 			m_freem(m_head);
801 			return (NULL);
802 		}
803 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
804 
805 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
806 		th = mtodo(m_head, ehlen + sizeof(*ip6));
807 
808 		ip6->ip6_plen = 0;
809 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
810 	}
811 #endif
812 	return (m_head);
813 }
814 
815 /*
816  * NOTE: If this function failed, the m_head would be freed.
817  */
818 static __inline struct mbuf *
819 hn_set_hlen(struct mbuf *m_head)
820 {
821 	const struct ether_vlan_header *evl;
822 	int ehlen;
823 
824 	PULLUP_HDR(m_head, sizeof(*evl));
825 	evl = mtod(m_head, const struct ether_vlan_header *);
826 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
827 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
828 	else
829 		ehlen = ETHER_HDR_LEN;
830 	m_head->m_pkthdr.l2hlen = ehlen;
831 
832 #ifdef INET
833 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
834 		const struct ip *ip;
835 		int iphlen;
836 
837 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
838 		ip = mtodo(m_head, ehlen);
839 		iphlen = ip->ip_hl << 2;
840 		m_head->m_pkthdr.l3hlen = iphlen;
841 
842 		/*
843 		 * UDP checksum offload does not work in Azure, if the
844 		 * following conditions meet:
845 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
846 		 * - IP_DF is not set in the IP hdr.
847 		 *
848 		 * Fallback to software checksum for these UDP datagrams.
849 		 */
850 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
851 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
852 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
853 			uint16_t off = ehlen + iphlen;
854 
855 			counter_u64_add(hn_udpcs_fixup, 1);
856 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
857 			*(uint16_t *)(m_head->m_data + off +
858                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
859 			    m_head, m_head->m_pkthdr.len, off);
860 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
861 		}
862 	}
863 #endif
864 #if defined(INET6) && defined(INET)
865 	else
866 #endif
867 #ifdef INET6
868 	{
869 		const struct ip6_hdr *ip6;
870 
871 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
872 		ip6 = mtodo(m_head, ehlen);
873 		if (ip6->ip6_nxt != IPPROTO_TCP &&
874 		    ip6->ip6_nxt != IPPROTO_UDP) {
875 			m_freem(m_head);
876 			return (NULL);
877 		}
878 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
879 	}
880 #endif
881 	return (m_head);
882 }
883 
884 /*
885  * NOTE: If this function failed, the m_head would be freed.
886  */
887 static __inline struct mbuf *
888 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
889 {
890 	const struct tcphdr *th;
891 	int ehlen, iphlen;
892 
893 	*tcpsyn = 0;
894 	ehlen = m_head->m_pkthdr.l2hlen;
895 	iphlen = m_head->m_pkthdr.l3hlen;
896 
897 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
898 	th = mtodo(m_head, ehlen + iphlen);
899 	if (th->th_flags & TH_SYN)
900 		*tcpsyn = 1;
901 	return (m_head);
902 }
903 
904 #undef PULLUP_HDR
905 
906 #endif	/* INET6 || INET */
907 
908 static int
909 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
910 {
911 	int error = 0;
912 
913 	HN_LOCK_ASSERT(sc);
914 
915 	if (sc->hn_rx_filter != filter) {
916 		error = hn_rndis_set_rxfilter(sc, filter);
917 		if (!error)
918 			sc->hn_rx_filter = filter;
919 	}
920 	return (error);
921 }
922 
923 static int
924 hn_rxfilter_config(struct hn_softc *sc)
925 {
926 	struct ifnet *ifp = sc->hn_ifp;
927 	uint32_t filter;
928 
929 	HN_LOCK_ASSERT(sc);
930 
931 	/*
932 	 * If the non-transparent mode VF is activated, we don't know how
933 	 * its RX filter is configured, so stick the synthetic device in
934 	 * the promiscous mode.
935 	 */
936 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
937 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
938 	} else {
939 		filter = NDIS_PACKET_TYPE_DIRECTED;
940 		if (ifp->if_flags & IFF_BROADCAST)
941 			filter |= NDIS_PACKET_TYPE_BROADCAST;
942 		/* TODO: support multicast list */
943 		if ((ifp->if_flags & IFF_ALLMULTI) ||
944 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
945 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
946 	}
947 	return (hn_set_rxfilter(sc, filter));
948 }
949 
950 static void
951 hn_set_txagg(struct hn_softc *sc)
952 {
953 	uint32_t size, pkts;
954 	int i;
955 
956 	/*
957 	 * Setup aggregation size.
958 	 */
959 	if (sc->hn_agg_size < 0)
960 		size = UINT32_MAX;
961 	else
962 		size = sc->hn_agg_size;
963 
964 	if (sc->hn_rndis_agg_size < size)
965 		size = sc->hn_rndis_agg_size;
966 
967 	/* NOTE: We only aggregate packets using chimney sending buffers. */
968 	if (size > (uint32_t)sc->hn_chim_szmax)
969 		size = sc->hn_chim_szmax;
970 
971 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
972 		/* Disable */
973 		size = 0;
974 		pkts = 0;
975 		goto done;
976 	}
977 
978 	/* NOTE: Type of the per TX ring setting is 'int'. */
979 	if (size > INT_MAX)
980 		size = INT_MAX;
981 
982 	/*
983 	 * Setup aggregation packet count.
984 	 */
985 	if (sc->hn_agg_pkts < 0)
986 		pkts = UINT32_MAX;
987 	else
988 		pkts = sc->hn_agg_pkts;
989 
990 	if (sc->hn_rndis_agg_pkts < pkts)
991 		pkts = sc->hn_rndis_agg_pkts;
992 
993 	if (pkts <= 1) {
994 		/* Disable */
995 		size = 0;
996 		pkts = 0;
997 		goto done;
998 	}
999 
1000 	/* NOTE: Type of the per TX ring setting is 'short'. */
1001 	if (pkts > SHRT_MAX)
1002 		pkts = SHRT_MAX;
1003 
1004 done:
1005 	/* NOTE: Type of the per TX ring setting is 'short'. */
1006 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1007 		/* Disable */
1008 		size = 0;
1009 		pkts = 0;
1010 	}
1011 
1012 	if (bootverbose) {
1013 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1014 		    size, pkts, sc->hn_rndis_agg_align);
1015 	}
1016 
1017 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1018 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1019 
1020 		mtx_lock(&txr->hn_tx_lock);
1021 		txr->hn_agg_szmax = size;
1022 		txr->hn_agg_pktmax = pkts;
1023 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1024 		mtx_unlock(&txr->hn_tx_lock);
1025 	}
1026 }
1027 
1028 static int
1029 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1030 {
1031 
1032 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1033 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1034 		return txr->hn_txdesc_cnt;
1035 	return hn_tx_swq_depth;
1036 }
1037 
1038 static int
1039 hn_rss_reconfig(struct hn_softc *sc)
1040 {
1041 	int error;
1042 
1043 	HN_LOCK_ASSERT(sc);
1044 
1045 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1046 		return (ENXIO);
1047 
1048 	/*
1049 	 * Disable RSS first.
1050 	 *
1051 	 * NOTE:
1052 	 * Direct reconfiguration by setting the UNCHG flags does
1053 	 * _not_ work properly.
1054 	 */
1055 	if (bootverbose)
1056 		if_printf(sc->hn_ifp, "disable RSS\n");
1057 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1058 	if (error) {
1059 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1060 		return (error);
1061 	}
1062 
1063 	/*
1064 	 * Reenable the RSS w/ the updated RSS key or indirect
1065 	 * table.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1072 		return (error);
1073 	}
1074 	return (0);
1075 }
1076 
1077 static void
1078 hn_rss_ind_fixup(struct hn_softc *sc)
1079 {
1080 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1081 	int i, nchan;
1082 
1083 	nchan = sc->hn_rx_ring_inuse;
1084 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1085 
1086 	/*
1087 	 * Check indirect table to make sure that all channels in it
1088 	 * can be used.
1089 	 */
1090 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1091 		if (rss->rss_ind[i] >= nchan) {
1092 			if_printf(sc->hn_ifp,
1093 			    "RSS indirect table %d fixup: %u -> %d\n",
1094 			    i, rss->rss_ind[i], nchan - 1);
1095 			rss->rss_ind[i] = nchan - 1;
1096 		}
1097 	}
1098 }
1099 
1100 static int
1101 hn_ifmedia_upd(struct ifnet *ifp __unused)
1102 {
1103 
1104 	return EOPNOTSUPP;
1105 }
1106 
1107 static void
1108 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1109 {
1110 	struct hn_softc *sc = ifp->if_softc;
1111 
1112 	ifmr->ifm_status = IFM_AVALID;
1113 	ifmr->ifm_active = IFM_ETHER;
1114 
1115 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1116 		ifmr->ifm_active |= IFM_NONE;
1117 		return;
1118 	}
1119 	ifmr->ifm_status |= IFM_ACTIVE;
1120 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1121 }
1122 
1123 static void
1124 hn_rxvf_set_task(void *xarg, int pending __unused)
1125 {
1126 	struct hn_rxvf_setarg *arg = xarg;
1127 
1128 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1129 }
1130 
1131 static void
1132 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1133 {
1134 	struct hn_rx_ring *rxr;
1135 	struct hn_rxvf_setarg arg;
1136 	struct task task;
1137 	int i;
1138 
1139 	HN_LOCK_ASSERT(sc);
1140 
1141 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1142 
1143 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1144 		rxr = &sc->hn_rx_ring[i];
1145 
1146 		if (i < sc->hn_rx_ring_inuse) {
1147 			arg.rxr = rxr;
1148 			arg.vf_ifp = vf_ifp;
1149 			vmbus_chan_run_task(rxr->hn_chan, &task);
1150 		} else {
1151 			rxr->hn_rxvf_ifp = vf_ifp;
1152 		}
1153 	}
1154 }
1155 
1156 static bool
1157 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1158 {
1159 	const struct ifnet *hn_ifp;
1160 
1161 	hn_ifp = sc->hn_ifp;
1162 
1163 	if (ifp == hn_ifp)
1164 		return (false);
1165 
1166 	if (ifp->if_alloctype != IFT_ETHER)
1167 		return (false);
1168 
1169 	/* Ignore lagg/vlan interfaces */
1170 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1171 	    strcmp(ifp->if_dname, "vlan") == 0)
1172 		return (false);
1173 
1174 	/*
1175 	 * During detach events ifp->if_addr might be NULL.
1176 	 * Make sure the bcmp() below doesn't panic on that:
1177 	 */
1178 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1179 		return (false);
1180 
1181 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1182 		return (false);
1183 
1184 	return (true);
1185 }
1186 
1187 static void
1188 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1189 {
1190 	struct ifnet *hn_ifp;
1191 
1192 	HN_LOCK(sc);
1193 
1194 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1195 		goto out;
1196 
1197 	if (!hn_ismyvf(sc, ifp))
1198 		goto out;
1199 	hn_ifp = sc->hn_ifp;
1200 
1201 	if (rxvf) {
1202 		if (sc->hn_flags & HN_FLAG_RXVF)
1203 			goto out;
1204 
1205 		sc->hn_flags |= HN_FLAG_RXVF;
1206 		hn_rxfilter_config(sc);
1207 	} else {
1208 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1209 			goto out;
1210 
1211 		sc->hn_flags &= ~HN_FLAG_RXVF;
1212 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1213 			hn_rxfilter_config(sc);
1214 		else
1215 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1216 	}
1217 
1218 	hn_nvs_set_datapath(sc,
1219 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1220 
1221 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1222 
1223 	if (rxvf) {
1224 		hn_vf_rss_fixup(sc, true);
1225 		hn_suspend_mgmt(sc);
1226 		sc->hn_link_flags &=
1227 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1228 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1229 	} else {
1230 		hn_vf_rss_restore(sc);
1231 		hn_resume_mgmt(sc);
1232 	}
1233 
1234 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1235 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1236 
1237 	if (bootverbose) {
1238 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1239 		    rxvf ? "to" : "from", ifp->if_xname);
1240 	}
1241 out:
1242 	HN_UNLOCK(sc);
1243 }
1244 
1245 static void
1246 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1247 {
1248 
1249 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1250 		return;
1251 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1252 }
1253 
1254 static void
1255 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1256 {
1257 
1258 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1259 }
1260 
1261 static int
1262 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1263 {
1264 	struct ifnet *ifp, *vf_ifp;
1265 	uint64_t tmp;
1266 	int error;
1267 
1268 	HN_LOCK_ASSERT(sc);
1269 	ifp = sc->hn_ifp;
1270 	vf_ifp = sc->hn_vf_ifp;
1271 
1272 	/*
1273 	 * Fix up requested capabilities w/ supported capabilities,
1274 	 * since the supported capabilities could have been changed.
1275 	 */
1276 	ifr->ifr_reqcap &= ifp->if_capabilities;
1277 	/* Pass SIOCSIFCAP to VF. */
1278 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1279 
1280 	/*
1281 	 * NOTE:
1282 	 * The error will be propagated to the callers, however, it
1283 	 * is _not_ useful here.
1284 	 */
1285 
1286 	/*
1287 	 * Merge VF's enabled capabilities.
1288 	 */
1289 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1290 
1291 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1292 	if (ifp->if_capenable & IFCAP_TXCSUM)
1293 		ifp->if_hwassist |= tmp;
1294 	else
1295 		ifp->if_hwassist &= ~tmp;
1296 
1297 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1298 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1299 		ifp->if_hwassist |= tmp;
1300 	else
1301 		ifp->if_hwassist &= ~tmp;
1302 
1303 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1304 	if (ifp->if_capenable & IFCAP_TSO4)
1305 		ifp->if_hwassist |= tmp;
1306 	else
1307 		ifp->if_hwassist &= ~tmp;
1308 
1309 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1310 	if (ifp->if_capenable & IFCAP_TSO6)
1311 		ifp->if_hwassist |= tmp;
1312 	else
1313 		ifp->if_hwassist &= ~tmp;
1314 
1315 	return (error);
1316 }
1317 
1318 static int
1319 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1320 {
1321 	struct ifnet *vf_ifp;
1322 	struct ifreq ifr;
1323 
1324 	HN_LOCK_ASSERT(sc);
1325 	vf_ifp = sc->hn_vf_ifp;
1326 
1327 	memset(&ifr, 0, sizeof(ifr));
1328 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1329 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1330 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1331 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1332 }
1333 
1334 static void
1335 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1336 {
1337 	struct ifnet *ifp = sc->hn_ifp;
1338 	int allmulti = 0;
1339 
1340 	HN_LOCK_ASSERT(sc);
1341 
1342 	/* XXX vlan(4) style mcast addr maintenance */
1343 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1344 		allmulti = IFF_ALLMULTI;
1345 
1346 	/* Always set the VF's if_flags */
1347 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1348 }
1349 
1350 static void
1351 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1352 {
1353 	struct rm_priotracker pt;
1354 	struct ifnet *hn_ifp = NULL;
1355 	struct mbuf *mn;
1356 
1357 	/*
1358 	 * XXX racy, if hn(4) ever detached.
1359 	 */
1360 	rm_rlock(&hn_vfmap_lock, &pt);
1361 	if (vf_ifp->if_index < hn_vfmap_size)
1362 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1363 	rm_runlock(&hn_vfmap_lock, &pt);
1364 
1365 	if (hn_ifp != NULL) {
1366 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1367 			/*
1368 			 * Allow tapping on the VF.
1369 			 */
1370 			ETHER_BPF_MTAP(vf_ifp, mn);
1371 
1372 			/*
1373 			 * Update VF stats.
1374 			 */
1375 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1376 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1377 				    mn->m_pkthdr.len);
1378 			}
1379 			/*
1380 			 * XXX IFCOUNTER_IMCAST
1381 			 * This stat updating is kinda invasive, since it
1382 			 * requires two checks on the mbuf: the length check
1383 			 * and the ethernet header check.  As of this write,
1384 			 * all multicast packets go directly to hn(4), which
1385 			 * makes imcast stat updating in the VF a try in vian.
1386 			 */
1387 
1388 			/*
1389 			 * Fix up rcvif and increase hn(4)'s ipackets.
1390 			 */
1391 			mn->m_pkthdr.rcvif = hn_ifp;
1392 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1393 		}
1394 		/*
1395 		 * Go through hn(4)'s if_input.
1396 		 */
1397 		hn_ifp->if_input(hn_ifp, m);
1398 	} else {
1399 		/*
1400 		 * In the middle of the transition; free this
1401 		 * mbuf chain.
1402 		 */
1403 		while (m != NULL) {
1404 			mn = m->m_nextpkt;
1405 			m->m_nextpkt = NULL;
1406 			m_freem(m);
1407 			m = mn;
1408 		}
1409 	}
1410 }
1411 
1412 static void
1413 hn_mtu_change_fixup(struct hn_softc *sc)
1414 {
1415 	struct ifnet *ifp;
1416 
1417 	HN_LOCK_ASSERT(sc);
1418 	ifp = sc->hn_ifp;
1419 
1420 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1421 #if __FreeBSD_version >= 1100099
1422 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1423 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1424 #endif
1425 }
1426 
1427 static uint32_t
1428 hn_rss_type_fromndis(uint32_t rss_hash)
1429 {
1430 	uint32_t types = 0;
1431 
1432 	if (rss_hash & NDIS_HASH_IPV4)
1433 		types |= RSS_TYPE_IPV4;
1434 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1435 		types |= RSS_TYPE_TCP_IPV4;
1436 	if (rss_hash & NDIS_HASH_IPV6)
1437 		types |= RSS_TYPE_IPV6;
1438 	if (rss_hash & NDIS_HASH_IPV6_EX)
1439 		types |= RSS_TYPE_IPV6_EX;
1440 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1441 		types |= RSS_TYPE_TCP_IPV6;
1442 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1443 		types |= RSS_TYPE_TCP_IPV6_EX;
1444 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1445 		types |= RSS_TYPE_UDP_IPV4;
1446 	return (types);
1447 }
1448 
1449 static uint32_t
1450 hn_rss_type_tondis(uint32_t types)
1451 {
1452 	uint32_t rss_hash = 0;
1453 
1454 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1455 	    ("UDP6 and UDP6EX are not supported"));
1456 
1457 	if (types & RSS_TYPE_IPV4)
1458 		rss_hash |= NDIS_HASH_IPV4;
1459 	if (types & RSS_TYPE_TCP_IPV4)
1460 		rss_hash |= NDIS_HASH_TCP_IPV4;
1461 	if (types & RSS_TYPE_IPV6)
1462 		rss_hash |= NDIS_HASH_IPV6;
1463 	if (types & RSS_TYPE_IPV6_EX)
1464 		rss_hash |= NDIS_HASH_IPV6_EX;
1465 	if (types & RSS_TYPE_TCP_IPV6)
1466 		rss_hash |= NDIS_HASH_TCP_IPV6;
1467 	if (types & RSS_TYPE_TCP_IPV6_EX)
1468 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1469 	if (types & RSS_TYPE_UDP_IPV4)
1470 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1471 	return (rss_hash);
1472 }
1473 
1474 static void
1475 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1476 {
1477 	int i;
1478 
1479 	HN_LOCK_ASSERT(sc);
1480 
1481 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1482 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1483 }
1484 
1485 static void
1486 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1487 {
1488 	struct ifnet *ifp, *vf_ifp;
1489 	struct ifrsshash ifrh;
1490 	struct ifrsskey ifrk;
1491 	int error;
1492 	uint32_t my_types, diff_types, mbuf_types = 0;
1493 
1494 	HN_LOCK_ASSERT(sc);
1495 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1496 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1497 
1498 	if (sc->hn_rx_ring_inuse == 1) {
1499 		/* No RSS on synthetic parts; done. */
1500 		return;
1501 	}
1502 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1503 		/* Synthetic parts do not support Toeplitz; done. */
1504 		return;
1505 	}
1506 
1507 	ifp = sc->hn_ifp;
1508 	vf_ifp = sc->hn_vf_ifp;
1509 
1510 	/*
1511 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1512 	 * supported.
1513 	 */
1514 	memset(&ifrk, 0, sizeof(ifrk));
1515 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1516 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1517 	if (error) {
1518 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1519 		    vf_ifp->if_xname, error);
1520 		goto done;
1521 	}
1522 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1523 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1524 		    vf_ifp->if_xname, ifrk.ifrk_func);
1525 		goto done;
1526 	}
1527 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1528 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1529 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1530 		goto done;
1531 	}
1532 
1533 	/*
1534 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1535 	 */
1536 	memset(&ifrh, 0, sizeof(ifrh));
1537 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1538 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1539 	if (error) {
1540 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1541 		    vf_ifp->if_xname, error);
1542 		goto done;
1543 	}
1544 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1545 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1546 		    vf_ifp->if_xname, ifrh.ifrh_func);
1547 		goto done;
1548 	}
1549 
1550 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1551 	if ((ifrh.ifrh_types & my_types) == 0) {
1552 		/* This disables RSS; ignore it then */
1553 		if_printf(ifp, "%s intersection of RSS types failed.  "
1554 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1555 		    ifrh.ifrh_types, my_types);
1556 		goto done;
1557 	}
1558 
1559 	diff_types = my_types ^ ifrh.ifrh_types;
1560 	my_types &= ifrh.ifrh_types;
1561 	mbuf_types = my_types;
1562 
1563 	/*
1564 	 * Detect RSS hash value/type confliction.
1565 	 *
1566 	 * NOTE:
1567 	 * We don't disable the hash type, but stop delivery the hash
1568 	 * value/type through mbufs on RX path.
1569 	 *
1570 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1571 	 * hash is delivered with type of TCP_IPV4.  This means if
1572 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1573 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1574 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1575 	 * here.
1576 	 */
1577 	if ((my_types & RSS_TYPE_IPV4) &&
1578 	    (diff_types & ifrh.ifrh_types &
1579 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1580 		/* Conflict; disable IPV4 hash type/value delivery. */
1581 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1582 		mbuf_types &= ~RSS_TYPE_IPV4;
1583 	}
1584 	if ((my_types & RSS_TYPE_IPV6) &&
1585 	    (diff_types & ifrh.ifrh_types &
1586 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1587 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1588 	      RSS_TYPE_IPV6_EX))) {
1589 		/* Conflict; disable IPV6 hash type/value delivery. */
1590 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1591 		mbuf_types &= ~RSS_TYPE_IPV6;
1592 	}
1593 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1594 	    (diff_types & ifrh.ifrh_types &
1595 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1596 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1597 	      RSS_TYPE_IPV6))) {
1598 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1599 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1600 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1601 	}
1602 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1603 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1604 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1605 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1606 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1607 	}
1608 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1609 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1610 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1611 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1612 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1613 	}
1614 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1615 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1616 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1617 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1618 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1619 	}
1620 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1621 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1622 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1623 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1624 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1625 	}
1626 
1627 	/*
1628 	 * Indirect table does not matter.
1629 	 */
1630 
1631 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1632 	    hn_rss_type_tondis(my_types);
1633 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1634 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1635 
1636 	if (reconf) {
1637 		error = hn_rss_reconfig(sc);
1638 		if (error) {
1639 			/* XXX roll-back? */
1640 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1641 			/* XXX keep going. */
1642 		}
1643 	}
1644 done:
1645 	/* Hash deliverability for mbufs. */
1646 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1647 }
1648 
1649 static void
1650 hn_vf_rss_restore(struct hn_softc *sc)
1651 {
1652 
1653 	HN_LOCK_ASSERT(sc);
1654 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1655 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1656 
1657 	if (sc->hn_rx_ring_inuse == 1)
1658 		goto done;
1659 
1660 	/*
1661 	 * Restore hash types.  Key does _not_ matter.
1662 	 */
1663 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1664 		int error;
1665 
1666 		sc->hn_rss_hash = sc->hn_rss_hcap;
1667 		error = hn_rss_reconfig(sc);
1668 		if (error) {
1669 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1670 			    error);
1671 			/* XXX keep going. */
1672 		}
1673 	}
1674 done:
1675 	/* Hash deliverability for mbufs. */
1676 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1677 }
1678 
1679 static void
1680 hn_xpnt_vf_setready(struct hn_softc *sc)
1681 {
1682 	struct ifnet *ifp, *vf_ifp;
1683 	struct ifreq ifr;
1684 
1685 	HN_LOCK_ASSERT(sc);
1686 	ifp = sc->hn_ifp;
1687 	vf_ifp = sc->hn_vf_ifp;
1688 
1689 	/*
1690 	 * Mark the VF ready.
1691 	 */
1692 	sc->hn_vf_rdytick = 0;
1693 
1694 	/*
1695 	 * Save information for restoration.
1696 	 */
1697 	sc->hn_saved_caps = ifp->if_capabilities;
1698 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1699 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1700 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1701 
1702 	/*
1703 	 * Intersect supported/enabled capabilities.
1704 	 *
1705 	 * NOTE:
1706 	 * if_hwassist is not changed here.
1707 	 */
1708 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1709 	ifp->if_capenable &= ifp->if_capabilities;
1710 
1711 	/*
1712 	 * Fix TSO settings.
1713 	 */
1714 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1715 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1716 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1717 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1718 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1719 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1720 
1721 	/*
1722 	 * Change VF's enabled capabilities.
1723 	 */
1724 	memset(&ifr, 0, sizeof(ifr));
1725 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1726 	ifr.ifr_reqcap = ifp->if_capenable;
1727 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1728 
1729 	if (ifp->if_mtu != ETHERMTU) {
1730 		int error;
1731 
1732 		/*
1733 		 * Change VF's MTU.
1734 		 */
1735 		memset(&ifr, 0, sizeof(ifr));
1736 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1737 		ifr.ifr_mtu = ifp->if_mtu;
1738 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1739 		if (error) {
1740 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1741 			    vf_ifp->if_xname, ifp->if_mtu);
1742 			if (ifp->if_mtu > ETHERMTU) {
1743 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1744 
1745 				/*
1746 				 * XXX
1747 				 * No need to adjust the synthetic parts' MTU;
1748 				 * failure of the adjustment will cause us
1749 				 * infinite headache.
1750 				 */
1751 				ifp->if_mtu = ETHERMTU;
1752 				hn_mtu_change_fixup(sc);
1753 			}
1754 		}
1755 	}
1756 }
1757 
1758 static bool
1759 hn_xpnt_vf_isready(struct hn_softc *sc)
1760 {
1761 
1762 	HN_LOCK_ASSERT(sc);
1763 
1764 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1765 		return (false);
1766 
1767 	if (sc->hn_vf_rdytick == 0)
1768 		return (true);
1769 
1770 	if (sc->hn_vf_rdytick > ticks)
1771 		return (false);
1772 
1773 	/* Mark VF as ready. */
1774 	hn_xpnt_vf_setready(sc);
1775 	return (true);
1776 }
1777 
1778 static void
1779 hn_xpnt_vf_setenable(struct hn_softc *sc)
1780 {
1781 	int i;
1782 
1783 	HN_LOCK_ASSERT(sc);
1784 
1785 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1786 	rm_wlock(&sc->hn_vf_lock);
1787 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1788 	rm_wunlock(&sc->hn_vf_lock);
1789 
1790 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1791 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1792 }
1793 
1794 static void
1795 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1796 {
1797 	int i;
1798 
1799 	HN_LOCK_ASSERT(sc);
1800 
1801 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1802 	rm_wlock(&sc->hn_vf_lock);
1803 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1804 	if (clear_vf)
1805 		sc->hn_vf_ifp = NULL;
1806 	rm_wunlock(&sc->hn_vf_lock);
1807 
1808 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1809 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1810 }
1811 
1812 static void
1813 hn_xpnt_vf_init(struct hn_softc *sc)
1814 {
1815 	int error;
1816 
1817 	HN_LOCK_ASSERT(sc);
1818 
1819 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1820 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1821 
1822 	if (bootverbose) {
1823 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1824 		    sc->hn_vf_ifp->if_xname);
1825 	}
1826 
1827 	/*
1828 	 * Bring the VF up.
1829 	 */
1830 	hn_xpnt_vf_saveifflags(sc);
1831 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1832 	error = hn_xpnt_vf_iocsetflags(sc);
1833 	if (error) {
1834 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1835 		    sc->hn_vf_ifp->if_xname, error);
1836 		return;
1837 	}
1838 
1839 	/*
1840 	 * NOTE:
1841 	 * Datapath setting must happen _after_ bringing the VF up.
1842 	 */
1843 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1844 
1845 	/*
1846 	 * NOTE:
1847 	 * Fixup RSS related bits _after_ the VF is brought up, since
1848 	 * many VFs generate RSS key during it's initialization.
1849 	 */
1850 	hn_vf_rss_fixup(sc, true);
1851 
1852 	/* Mark transparent mode VF as enabled. */
1853 	hn_xpnt_vf_setenable(sc);
1854 }
1855 
1856 static void
1857 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1858 {
1859 	struct hn_softc *sc = xsc;
1860 
1861 	HN_LOCK(sc);
1862 
1863 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1864 		goto done;
1865 	if (sc->hn_vf_ifp == NULL)
1866 		goto done;
1867 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1868 		goto done;
1869 
1870 	if (sc->hn_vf_rdytick != 0) {
1871 		/* Mark VF as ready. */
1872 		hn_xpnt_vf_setready(sc);
1873 	}
1874 
1875 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1876 		/*
1877 		 * Delayed VF initialization.
1878 		 */
1879 		if (bootverbose) {
1880 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1881 			    sc->hn_vf_ifp->if_xname);
1882 		}
1883 		hn_xpnt_vf_init(sc);
1884 	}
1885 done:
1886 	HN_UNLOCK(sc);
1887 }
1888 
1889 static void
1890 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1891 {
1892 	struct hn_softc *sc = xsc;
1893 
1894 	HN_LOCK(sc);
1895 
1896 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1897 		goto done;
1898 
1899 	if (!hn_ismyvf(sc, ifp))
1900 		goto done;
1901 
1902 	if (sc->hn_vf_ifp != NULL) {
1903 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1904 		    sc->hn_vf_ifp->if_xname);
1905 		goto done;
1906 	}
1907 
1908 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1909 		/*
1910 		 * ifnet.if_start is _not_ supported by transparent
1911 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1912 		 */
1913 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1914 		    "in transparent VF mode.\n", ifp->if_xname);
1915 		goto done;
1916 	}
1917 
1918 	rm_wlock(&hn_vfmap_lock);
1919 
1920 	if (ifp->if_index >= hn_vfmap_size) {
1921 		struct ifnet **newmap;
1922 		int newsize;
1923 
1924 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1925 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1926 		    M_WAITOK | M_ZERO);
1927 
1928 		memcpy(newmap, hn_vfmap,
1929 		    sizeof(struct ifnet *) * hn_vfmap_size);
1930 		free(hn_vfmap, M_DEVBUF);
1931 		hn_vfmap = newmap;
1932 		hn_vfmap_size = newsize;
1933 	}
1934 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1935 	    ("%s: ifindex %d was mapped to %s",
1936 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1937 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1938 
1939 	rm_wunlock(&hn_vfmap_lock);
1940 
1941 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1942 	rm_wlock(&sc->hn_vf_lock);
1943 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1944 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1945 	sc->hn_vf_ifp = ifp;
1946 	rm_wunlock(&sc->hn_vf_lock);
1947 
1948 	if (hn_xpnt_vf) {
1949 		int wait_ticks;
1950 
1951 		/*
1952 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1953 		 * Save vf_ifp's current if_input for later restoration.
1954 		 */
1955 		sc->hn_vf_input = ifp->if_input;
1956 		ifp->if_input = hn_xpnt_vf_input;
1957 
1958 		/*
1959 		 * Stop link status management; use the VF's.
1960 		 */
1961 		hn_suspend_mgmt(sc);
1962 
1963 		/*
1964 		 * Give VF sometime to complete its attach routing.
1965 		 */
1966 		wait_ticks = hn_xpnt_vf_attwait * hz;
1967 		sc->hn_vf_rdytick = ticks + wait_ticks;
1968 
1969 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1970 		    wait_ticks);
1971 	}
1972 done:
1973 	HN_UNLOCK(sc);
1974 }
1975 
1976 static void
1977 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1978 {
1979 	struct hn_softc *sc = xsc;
1980 
1981 	HN_LOCK(sc);
1982 
1983 	if (sc->hn_vf_ifp == NULL)
1984 		goto done;
1985 
1986 	if (!hn_ismyvf(sc, ifp))
1987 		goto done;
1988 
1989 	if (hn_xpnt_vf) {
1990 		/*
1991 		 * Make sure that the delayed initialization is not running.
1992 		 *
1993 		 * NOTE:
1994 		 * - This lock _must_ be released, since the hn_vf_init task
1995 		 *   will try holding this lock.
1996 		 * - It is safe to release this lock here, since the
1997 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1998 		 *
1999 		 * XXX racy, if hn(4) ever detached.
2000 		 */
2001 		HN_UNLOCK(sc);
2002 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2003 		HN_LOCK(sc);
2004 
2005 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2006 		    sc->hn_ifp->if_xname));
2007 		ifp->if_input = sc->hn_vf_input;
2008 		sc->hn_vf_input = NULL;
2009 
2010 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2011 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2012 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2013 
2014 		if (sc->hn_vf_rdytick == 0) {
2015 			/*
2016 			 * The VF was ready; restore some settings.
2017 			 */
2018 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2019 			/*
2020 			 * NOTE:
2021 			 * There is _no_ need to fixup if_capenable and
2022 			 * if_hwassist, since the if_capabilities before
2023 			 * restoration was an intersection of the VF's
2024 			 * if_capabilites and the synthetic device's
2025 			 * if_capabilites.
2026 			 */
2027 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2028 			sc->hn_ifp->if_hw_tsomaxsegcount =
2029 			    sc->hn_saved_tsosegcnt;
2030 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2031 		}
2032 
2033 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2034 			/*
2035 			 * Restore RSS settings.
2036 			 */
2037 			hn_vf_rss_restore(sc);
2038 
2039 			/*
2040 			 * Resume link status management, which was suspended
2041 			 * by hn_ifnet_attevent().
2042 			 */
2043 			hn_resume_mgmt(sc);
2044 		}
2045 	}
2046 
2047 	/* Mark transparent mode VF as disabled. */
2048 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2049 
2050 	rm_wlock(&hn_vfmap_lock);
2051 
2052 	KASSERT(ifp->if_index < hn_vfmap_size,
2053 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2054 	if (hn_vfmap[ifp->if_index] != NULL) {
2055 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2056 		    ("%s: ifindex %d was mapped to %s",
2057 		     ifp->if_xname, ifp->if_index,
2058 		     hn_vfmap[ifp->if_index]->if_xname));
2059 		hn_vfmap[ifp->if_index] = NULL;
2060 	}
2061 
2062 	rm_wunlock(&hn_vfmap_lock);
2063 done:
2064 	HN_UNLOCK(sc);
2065 }
2066 
2067 static void
2068 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2069 {
2070 	struct hn_softc *sc = xsc;
2071 
2072 	if (sc->hn_vf_ifp == ifp)
2073 		if_link_state_change(sc->hn_ifp, link_state);
2074 }
2075 
2076 static int
2077 hn_probe(device_t dev)
2078 {
2079 
2080 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2081 		device_set_desc(dev, "Hyper-V Network Interface");
2082 		return BUS_PROBE_DEFAULT;
2083 	}
2084 	return ENXIO;
2085 }
2086 
2087 static int
2088 hn_attach(device_t dev)
2089 {
2090 	struct hn_softc *sc = device_get_softc(dev);
2091 	struct sysctl_oid_list *child;
2092 	struct sysctl_ctx_list *ctx;
2093 	uint8_t eaddr[ETHER_ADDR_LEN];
2094 	struct ifnet *ifp = NULL;
2095 	int error, ring_cnt, tx_ring_cnt;
2096 	uint32_t mtu;
2097 
2098 	sc->hn_dev = dev;
2099 	sc->hn_prichan = vmbus_get_channel(dev);
2100 	HN_LOCK_INIT(sc);
2101 	rm_init(&sc->hn_vf_lock, "hnvf");
2102 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2103 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2104 
2105 	/*
2106 	 * Initialize these tunables once.
2107 	 */
2108 	sc->hn_agg_size = hn_tx_agg_size;
2109 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2110 
2111 	/*
2112 	 * Setup taskqueue for transmission.
2113 	 */
2114 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2115 		int i;
2116 
2117 		sc->hn_tx_taskqs =
2118 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2119 		    M_DEVBUF, M_WAITOK);
2120 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2121 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2122 			    M_WAITOK, taskqueue_thread_enqueue,
2123 			    &sc->hn_tx_taskqs[i]);
2124 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2125 			    "%s tx%d", device_get_nameunit(dev), i);
2126 		}
2127 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2128 		sc->hn_tx_taskqs = hn_tx_taskque;
2129 	}
2130 
2131 	/*
2132 	 * Setup taskqueue for mangement tasks, e.g. link status.
2133 	 */
2134 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2135 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2136 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2137 	    device_get_nameunit(dev));
2138 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2139 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2140 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2141 	    hn_netchg_status_taskfunc, sc);
2142 
2143 	if (hn_xpnt_vf) {
2144 		/*
2145 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2146 		 */
2147 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2148 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2149 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2150 		    device_get_nameunit(dev));
2151 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2152 		    hn_xpnt_vf_init_taskfunc, sc);
2153 	}
2154 
2155 	/*
2156 	 * Allocate ifnet and setup its name earlier, so that if_printf
2157 	 * can be used by functions, which will be called after
2158 	 * ether_ifattach().
2159 	 */
2160 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2161 	ifp->if_softc = sc;
2162 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2163 
2164 	/*
2165 	 * Initialize ifmedia earlier so that it can be unconditionally
2166 	 * destroyed, if error happened later on.
2167 	 */
2168 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2169 
2170 	/*
2171 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2172 	 * to use (tx_ring_cnt).
2173 	 *
2174 	 * NOTE:
2175 	 * The # of RX rings to use is same as the # of channels to use.
2176 	 */
2177 	ring_cnt = hn_chan_cnt;
2178 	if (ring_cnt <= 0) {
2179 		/* Default */
2180 		ring_cnt = mp_ncpus;
2181 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2182 			ring_cnt = HN_RING_CNT_DEF_MAX;
2183 	} else if (ring_cnt > mp_ncpus) {
2184 		ring_cnt = mp_ncpus;
2185 	}
2186 #ifdef RSS
2187 	if (ring_cnt > rss_getnumbuckets())
2188 		ring_cnt = rss_getnumbuckets();
2189 #endif
2190 
2191 	tx_ring_cnt = hn_tx_ring_cnt;
2192 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2193 		tx_ring_cnt = ring_cnt;
2194 #ifdef HN_IFSTART_SUPPORT
2195 	if (hn_use_if_start) {
2196 		/* ifnet.if_start only needs one TX ring. */
2197 		tx_ring_cnt = 1;
2198 	}
2199 #endif
2200 
2201 	/*
2202 	 * Set the leader CPU for channels.
2203 	 */
2204 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2205 
2206 	/*
2207 	 * Create enough TX/RX rings, even if only limited number of
2208 	 * channels can be allocated.
2209 	 */
2210 	error = hn_create_tx_data(sc, tx_ring_cnt);
2211 	if (error)
2212 		goto failed;
2213 	error = hn_create_rx_data(sc, ring_cnt);
2214 	if (error)
2215 		goto failed;
2216 
2217 	/*
2218 	 * Create transaction context for NVS and RNDIS transactions.
2219 	 */
2220 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2221 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2222 	if (sc->hn_xact == NULL) {
2223 		error = ENXIO;
2224 		goto failed;
2225 	}
2226 
2227 	/*
2228 	 * Install orphan handler for the revocation of this device's
2229 	 * primary channel.
2230 	 *
2231 	 * NOTE:
2232 	 * The processing order is critical here:
2233 	 * Install the orphan handler, _before_ testing whether this
2234 	 * device's primary channel has been revoked or not.
2235 	 */
2236 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2237 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2238 		error = ENXIO;
2239 		goto failed;
2240 	}
2241 
2242 	/*
2243 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2244 	 */
2245 	error = hn_synth_attach(sc, ETHERMTU);
2246 	if (error)
2247 		goto failed;
2248 
2249 	error = hn_rndis_get_eaddr(sc, eaddr);
2250 	if (error)
2251 		goto failed;
2252 
2253 	error = hn_rndis_get_mtu(sc, &mtu);
2254 	if (error)
2255 		mtu = ETHERMTU;
2256 	else if (bootverbose)
2257 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2258 
2259 #if __FreeBSD_version >= 1100099
2260 	if (sc->hn_rx_ring_inuse > 1) {
2261 		/*
2262 		 * Reduce TCP segment aggregation limit for multiple
2263 		 * RX rings to increase ACK timeliness.
2264 		 */
2265 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2266 	}
2267 #endif
2268 
2269 	/*
2270 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2271 	 */
2272 	hn_fixup_tx_data(sc);
2273 	hn_fixup_rx_data(sc);
2274 
2275 	ctx = device_get_sysctl_ctx(dev);
2276 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2277 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2278 	    &sc->hn_nvs_ver, 0, "NVS version");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_ndis_version_sysctl, "A", "NDIS version");
2282 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2283 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284 	    hn_caps_sysctl, "A", "capabilities");
2285 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2286 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287 	    hn_hwassist_sysctl, "A", "hwassist");
2288 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2289 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2290 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2291 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2292 	    "max # of TSO segments");
2293 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2294 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2295 	    "max size of TSO segment");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_rxfilter_sysctl, "A", "rxfilter");
2299 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2300 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2301 	    hn_rss_hash_sysctl, "A", "RSS hash");
2302 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2303 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2304 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2305 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2306 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2307 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2308 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2309 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2310 #ifndef RSS
2311 	/*
2312 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2313 	 */
2314 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2315 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2316 	    hn_rss_key_sysctl, "IU", "RSS key");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2318 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2320 #endif
2321 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2322 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2323 	    "RNDIS offered packet transmission aggregation size limit");
2324 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2325 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2326 	    "RNDIS offered packet transmission aggregation count limit");
2327 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2328 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2329 	    "RNDIS packet transmission aggregation alignment");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2331 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_txagg_size_sysctl, "I",
2333 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2335 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_txagg_pkts_sysctl, "I",
2337 	    "Packet transmission aggregation packets, "
2338 	    "0 -- disable, -1 -- auto");
2339 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2340 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2341 	    hn_polling_sysctl, "I",
2342 	    "Polling frequency: [100,1000000], 0 disable polling");
2343 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2344 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2345 	    hn_vf_sysctl, "A", "Virtual Function's name");
2346 	if (!hn_xpnt_vf) {
2347 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2348 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2349 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2350 	} else {
2351 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2352 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2353 		    hn_xpnt_vf_enabled_sysctl, "I",
2354 		    "Transparent VF enabled");
2355 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2356 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2357 		    hn_xpnt_vf_accbpf_sysctl, "I",
2358 		    "Accurate BPF for transparent VF");
2359 	}
2360 
2361 	/*
2362 	 * Setup the ifmedia, which has been initialized earlier.
2363 	 */
2364 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2365 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2366 	/* XXX ifmedia_set really should do this for us */
2367 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2368 
2369 	/*
2370 	 * Setup the ifnet for this interface.
2371 	 */
2372 
2373 	ifp->if_baudrate = IF_Gbps(10);
2374 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2375 	ifp->if_ioctl = hn_ioctl;
2376 	ifp->if_init = hn_init;
2377 #ifdef HN_IFSTART_SUPPORT
2378 	if (hn_use_if_start) {
2379 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2380 
2381 		ifp->if_start = hn_start;
2382 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2383 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2384 		IFQ_SET_READY(&ifp->if_snd);
2385 	} else
2386 #endif
2387 	{
2388 		ifp->if_transmit = hn_transmit;
2389 		ifp->if_qflush = hn_xmit_qflush;
2390 	}
2391 
2392 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2393 #ifdef foo
2394 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2395 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2396 #endif
2397 	if (sc->hn_caps & HN_CAP_VLAN) {
2398 		/* XXX not sure about VLAN_MTU. */
2399 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2400 	}
2401 
2402 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2403 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2404 		ifp->if_capabilities |= IFCAP_TXCSUM;
2405 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2406 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2407 	if (sc->hn_caps & HN_CAP_TSO4) {
2408 		ifp->if_capabilities |= IFCAP_TSO4;
2409 		ifp->if_hwassist |= CSUM_IP_TSO;
2410 	}
2411 	if (sc->hn_caps & HN_CAP_TSO6) {
2412 		ifp->if_capabilities |= IFCAP_TSO6;
2413 		ifp->if_hwassist |= CSUM_IP6_TSO;
2414 	}
2415 
2416 	/* Enable all available capabilities by default. */
2417 	ifp->if_capenable = ifp->if_capabilities;
2418 
2419 	/*
2420 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2421 	 * be enabled through SIOCSIFCAP.
2422 	 */
2423 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2424 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2425 
2426 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2427 		/*
2428 		 * Lock hn_set_tso_maxsize() to simplify its
2429 		 * internal logic.
2430 		 */
2431 		HN_LOCK(sc);
2432 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2433 		HN_UNLOCK(sc);
2434 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2435 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2436 	}
2437 
2438 	ether_ifattach(ifp, eaddr);
2439 
2440 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2441 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2442 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2443 	}
2444 	if (mtu < ETHERMTU) {
2445 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2446 		ifp->if_mtu = mtu;
2447 	}
2448 
2449 	/* Inform the upper layer about the long frame support. */
2450 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2451 
2452 	/*
2453 	 * Kick off link status check.
2454 	 */
2455 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2456 	hn_update_link_status(sc);
2457 
2458 	if (!hn_xpnt_vf) {
2459 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2460 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2461 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2462 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2463 	} else {
2464 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2465 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2466 	}
2467 
2468 	/*
2469 	 * NOTE:
2470 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2471 	 * since interface's LLADDR is needed; interface LLADDR is not
2472 	 * available when ifnet_arrival event is triggered.
2473 	 */
2474 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2475 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2476 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2477 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2478 
2479 	return (0);
2480 failed:
2481 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2482 		hn_synth_detach(sc);
2483 	hn_detach(dev);
2484 	return (error);
2485 }
2486 
2487 static int
2488 hn_detach(device_t dev)
2489 {
2490 	struct hn_softc *sc = device_get_softc(dev);
2491 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2492 
2493 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2494 		/*
2495 		 * In case that the vmbus missed the orphan handler
2496 		 * installation.
2497 		 */
2498 		vmbus_xact_ctx_orphan(sc->hn_xact);
2499 	}
2500 
2501 	if (sc->hn_ifaddr_evthand != NULL)
2502 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2503 	if (sc->hn_ifnet_evthand != NULL)
2504 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2505 	if (sc->hn_ifnet_atthand != NULL) {
2506 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2507 		    sc->hn_ifnet_atthand);
2508 	}
2509 	if (sc->hn_ifnet_dethand != NULL) {
2510 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2511 		    sc->hn_ifnet_dethand);
2512 	}
2513 	if (sc->hn_ifnet_lnkhand != NULL)
2514 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2515 
2516 	vf_ifp = sc->hn_vf_ifp;
2517 	__compiler_membar();
2518 	if (vf_ifp != NULL)
2519 		hn_ifnet_detevent(sc, vf_ifp);
2520 
2521 	if (device_is_attached(dev)) {
2522 		HN_LOCK(sc);
2523 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2524 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2525 				hn_stop(sc, true);
2526 			/*
2527 			 * NOTE:
2528 			 * hn_stop() only suspends data, so managment
2529 			 * stuffs have to be suspended manually here.
2530 			 */
2531 			hn_suspend_mgmt(sc);
2532 			hn_synth_detach(sc);
2533 		}
2534 		HN_UNLOCK(sc);
2535 		ether_ifdetach(ifp);
2536 	}
2537 
2538 	ifmedia_removeall(&sc->hn_media);
2539 	hn_destroy_rx_data(sc);
2540 	hn_destroy_tx_data(sc);
2541 
2542 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2543 		int i;
2544 
2545 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2546 			taskqueue_free(sc->hn_tx_taskqs[i]);
2547 		free(sc->hn_tx_taskqs, M_DEVBUF);
2548 	}
2549 	taskqueue_free(sc->hn_mgmt_taskq0);
2550 	if (sc->hn_vf_taskq != NULL)
2551 		taskqueue_free(sc->hn_vf_taskq);
2552 
2553 	if (sc->hn_xact != NULL) {
2554 		/*
2555 		 * Uninstall the orphan handler _before_ the xact is
2556 		 * destructed.
2557 		 */
2558 		vmbus_chan_unset_orphan(sc->hn_prichan);
2559 		vmbus_xact_ctx_destroy(sc->hn_xact);
2560 	}
2561 
2562 	if_free(ifp);
2563 
2564 	HN_LOCK_DESTROY(sc);
2565 	rm_destroy(&sc->hn_vf_lock);
2566 	return (0);
2567 }
2568 
2569 static int
2570 hn_shutdown(device_t dev)
2571 {
2572 
2573 	return (0);
2574 }
2575 
2576 static void
2577 hn_link_status(struct hn_softc *sc)
2578 {
2579 	uint32_t link_status;
2580 	int error;
2581 
2582 	error = hn_rndis_get_linkstatus(sc, &link_status);
2583 	if (error) {
2584 		/* XXX what to do? */
2585 		return;
2586 	}
2587 
2588 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2589 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2590 	else
2591 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2592 	if_link_state_change(sc->hn_ifp,
2593 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2594 	    LINK_STATE_UP : LINK_STATE_DOWN);
2595 }
2596 
2597 static void
2598 hn_link_taskfunc(void *xsc, int pending __unused)
2599 {
2600 	struct hn_softc *sc = xsc;
2601 
2602 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2603 		return;
2604 	hn_link_status(sc);
2605 }
2606 
2607 static void
2608 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2609 {
2610 	struct hn_softc *sc = xsc;
2611 
2612 	/* Prevent any link status checks from running. */
2613 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2614 
2615 	/*
2616 	 * Fake up a [link down --> link up] state change; 5 seconds
2617 	 * delay is used, which closely simulates miibus reaction
2618 	 * upon link down event.
2619 	 */
2620 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2621 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2622 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2623 	    &sc->hn_netchg_status, 5 * hz);
2624 }
2625 
2626 static void
2627 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2628 {
2629 	struct hn_softc *sc = xsc;
2630 
2631 	/* Re-allow link status checks. */
2632 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2633 	hn_link_status(sc);
2634 }
2635 
2636 static void
2637 hn_update_link_status(struct hn_softc *sc)
2638 {
2639 
2640 	if (sc->hn_mgmt_taskq != NULL)
2641 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2642 }
2643 
2644 static void
2645 hn_change_network(struct hn_softc *sc)
2646 {
2647 
2648 	if (sc->hn_mgmt_taskq != NULL)
2649 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2650 }
2651 
2652 static __inline int
2653 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2654     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2655 {
2656 	struct mbuf *m = *m_head;
2657 	int error;
2658 
2659 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2660 
2661 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2662 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2663 	if (error == EFBIG) {
2664 		struct mbuf *m_new;
2665 
2666 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2667 		if (m_new == NULL)
2668 			return ENOBUFS;
2669 		else
2670 			*m_head = m = m_new;
2671 		txr->hn_tx_collapsed++;
2672 
2673 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2674 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2675 	}
2676 	if (!error) {
2677 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2678 		    BUS_DMASYNC_PREWRITE);
2679 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2680 	}
2681 	return error;
2682 }
2683 
2684 static __inline int
2685 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2686 {
2687 
2688 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2689 	    ("put an onlist txd %#x", txd->flags));
2690 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2691 	    ("put an onagg txd %#x", txd->flags));
2692 
2693 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2694 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2695 		return 0;
2696 
2697 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2698 		struct hn_txdesc *tmp_txd;
2699 
2700 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2701 			int freed;
2702 
2703 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2704 			    ("resursive aggregation on aggregated txdesc"));
2705 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2706 			    ("not aggregated txdesc"));
2707 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708 			    ("aggregated txdesc uses dmamap"));
2709 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2710 			    ("aggregated txdesc consumes "
2711 			     "chimney sending buffer"));
2712 			KASSERT(tmp_txd->chim_size == 0,
2713 			    ("aggregated txdesc has non-zero "
2714 			     "chimney sending size"));
2715 
2716 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2717 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2718 			freed = hn_txdesc_put(txr, tmp_txd);
2719 			KASSERT(freed, ("failed to free aggregated txdesc"));
2720 		}
2721 	}
2722 
2723 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2724 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2725 		    ("chim txd uses dmamap"));
2726 		hn_chim_free(txr->hn_sc, txd->chim_index);
2727 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2728 		txd->chim_size = 0;
2729 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2730 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2731 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2732 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2733 		    txd->data_dmap);
2734 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2735 	}
2736 
2737 	if (txd->m != NULL) {
2738 		m_freem(txd->m);
2739 		txd->m = NULL;
2740 	}
2741 
2742 	txd->flags |= HN_TXD_FLAG_ONLIST;
2743 #ifndef HN_USE_TXDESC_BUFRING
2744 	mtx_lock_spin(&txr->hn_txlist_spin);
2745 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2746 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2747 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2748 	txr->hn_txdesc_avail++;
2749 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2750 	mtx_unlock_spin(&txr->hn_txlist_spin);
2751 #else	/* HN_USE_TXDESC_BUFRING */
2752 #ifdef HN_DEBUG
2753 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2754 #endif
2755 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2756 #endif	/* !HN_USE_TXDESC_BUFRING */
2757 
2758 	return 1;
2759 }
2760 
2761 static __inline struct hn_txdesc *
2762 hn_txdesc_get(struct hn_tx_ring *txr)
2763 {
2764 	struct hn_txdesc *txd;
2765 
2766 #ifndef HN_USE_TXDESC_BUFRING
2767 	mtx_lock_spin(&txr->hn_txlist_spin);
2768 	txd = SLIST_FIRST(&txr->hn_txlist);
2769 	if (txd != NULL) {
2770 		KASSERT(txr->hn_txdesc_avail > 0,
2771 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2772 		txr->hn_txdesc_avail--;
2773 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2774 	}
2775 	mtx_unlock_spin(&txr->hn_txlist_spin);
2776 #else
2777 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2778 #endif
2779 
2780 	if (txd != NULL) {
2781 #ifdef HN_USE_TXDESC_BUFRING
2782 #ifdef HN_DEBUG
2783 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2784 #endif
2785 #endif	/* HN_USE_TXDESC_BUFRING */
2786 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2787 		    STAILQ_EMPTY(&txd->agg_list) &&
2788 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2789 		    txd->chim_size == 0 &&
2790 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2791 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2792 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2793 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2794 		txd->refs = 1;
2795 	}
2796 	return txd;
2797 }
2798 
2799 static __inline void
2800 hn_txdesc_hold(struct hn_txdesc *txd)
2801 {
2802 
2803 	/* 0->1 transition will never work */
2804 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2805 	atomic_add_int(&txd->refs, 1);
2806 }
2807 
2808 static __inline void
2809 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2810 {
2811 
2812 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2813 	    ("recursive aggregation on aggregating txdesc"));
2814 
2815 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2816 	    ("already aggregated"));
2817 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2818 	    ("recursive aggregation on to-be-aggregated txdesc"));
2819 
2820 	txd->flags |= HN_TXD_FLAG_ONAGG;
2821 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2822 }
2823 
2824 static bool
2825 hn_tx_ring_pending(struct hn_tx_ring *txr)
2826 {
2827 	bool pending = false;
2828 
2829 #ifndef HN_USE_TXDESC_BUFRING
2830 	mtx_lock_spin(&txr->hn_txlist_spin);
2831 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2832 		pending = true;
2833 	mtx_unlock_spin(&txr->hn_txlist_spin);
2834 #else
2835 	if (!buf_ring_full(txr->hn_txdesc_br))
2836 		pending = true;
2837 #endif
2838 	return (pending);
2839 }
2840 
2841 static __inline void
2842 hn_txeof(struct hn_tx_ring *txr)
2843 {
2844 	txr->hn_has_txeof = 0;
2845 	txr->hn_txeof(txr);
2846 }
2847 
2848 static void
2849 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2850     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2851 {
2852 	struct hn_txdesc *txd = sndc->hn_cbarg;
2853 	struct hn_tx_ring *txr;
2854 
2855 	txr = txd->txr;
2856 	KASSERT(txr->hn_chan == chan,
2857 	    ("channel mismatch, on chan%u, should be chan%u",
2858 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2859 
2860 	txr->hn_has_txeof = 1;
2861 	hn_txdesc_put(txr, txd);
2862 
2863 	++txr->hn_txdone_cnt;
2864 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2865 		txr->hn_txdone_cnt = 0;
2866 		if (txr->hn_oactive)
2867 			hn_txeof(txr);
2868 	}
2869 }
2870 
2871 static void
2872 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2873 {
2874 #if defined(INET) || defined(INET6)
2875 	tcp_lro_flush_all(&rxr->hn_lro);
2876 #endif
2877 
2878 	/*
2879 	 * NOTE:
2880 	 * 'txr' could be NULL, if multiple channels and
2881 	 * ifnet.if_start method are enabled.
2882 	 */
2883 	if (txr == NULL || !txr->hn_has_txeof)
2884 		return;
2885 
2886 	txr->hn_txdone_cnt = 0;
2887 	hn_txeof(txr);
2888 }
2889 
2890 static __inline uint32_t
2891 hn_rndis_pktmsg_offset(uint32_t ofs)
2892 {
2893 
2894 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2895 	    ("invalid RNDIS packet msg offset %u", ofs));
2896 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2897 }
2898 
2899 static __inline void *
2900 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2901     size_t pi_dlen, uint32_t pi_type)
2902 {
2903 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2904 	struct rndis_pktinfo *pi;
2905 
2906 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2907 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2908 
2909 	/*
2910 	 * Per-packet-info does not move; it only grows.
2911 	 *
2912 	 * NOTE:
2913 	 * rm_pktinfooffset in this phase counts from the beginning
2914 	 * of rndis_packet_msg.
2915 	 */
2916 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2917 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2918 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2919 	    pkt->rm_pktinfolen);
2920 	pkt->rm_pktinfolen += pi_size;
2921 
2922 	pi->rm_size = pi_size;
2923 	pi->rm_type = pi_type;
2924 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2925 
2926 	return (pi->rm_data);
2927 }
2928 
2929 static __inline int
2930 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2931 {
2932 	struct hn_txdesc *txd;
2933 	struct mbuf *m;
2934 	int error, pkts;
2935 
2936 	txd = txr->hn_agg_txd;
2937 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2938 
2939 	/*
2940 	 * Since hn_txpkt() will reset this temporary stat, save
2941 	 * it now, so that oerrors can be updated properly, if
2942 	 * hn_txpkt() ever fails.
2943 	 */
2944 	pkts = txr->hn_stat_pkts;
2945 
2946 	/*
2947 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2948 	 * failure, save it for later freeing, if hn_txpkt() ever
2949 	 * fails.
2950 	 */
2951 	m = txd->m;
2952 	error = hn_txpkt(ifp, txr, txd);
2953 	if (__predict_false(error)) {
2954 		/* txd is freed, but m is not. */
2955 		m_freem(m);
2956 
2957 		txr->hn_flush_failed++;
2958 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2959 	}
2960 
2961 	/* Reset all aggregation states. */
2962 	txr->hn_agg_txd = NULL;
2963 	txr->hn_agg_szleft = 0;
2964 	txr->hn_agg_pktleft = 0;
2965 	txr->hn_agg_prevpkt = NULL;
2966 
2967 	return (error);
2968 }
2969 
2970 static void *
2971 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2972     int pktsize)
2973 {
2974 	void *chim;
2975 
2976 	if (txr->hn_agg_txd != NULL) {
2977 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2978 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2979 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2980 			int olen;
2981 
2982 			/*
2983 			 * Update the previous RNDIS packet's total length,
2984 			 * it can be increased due to the mandatory alignment
2985 			 * padding for this RNDIS packet.  And update the
2986 			 * aggregating txdesc's chimney sending buffer size
2987 			 * accordingly.
2988 			 *
2989 			 * XXX
2990 			 * Zero-out the padding, as required by the RNDIS spec.
2991 			 */
2992 			olen = pkt->rm_len;
2993 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2994 			agg_txd->chim_size += pkt->rm_len - olen;
2995 
2996 			/* Link this txdesc to the parent. */
2997 			hn_txdesc_agg(agg_txd, txd);
2998 
2999 			chim = (uint8_t *)pkt + pkt->rm_len;
3000 			/* Save the current packet for later fixup. */
3001 			txr->hn_agg_prevpkt = chim;
3002 
3003 			txr->hn_agg_pktleft--;
3004 			txr->hn_agg_szleft -= pktsize;
3005 			if (txr->hn_agg_szleft <=
3006 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3007 				/*
3008 				 * Probably can't aggregate more packets,
3009 				 * flush this aggregating txdesc proactively.
3010 				 */
3011 				txr->hn_agg_pktleft = 0;
3012 			}
3013 			/* Done! */
3014 			return (chim);
3015 		}
3016 		hn_flush_txagg(ifp, txr);
3017 	}
3018 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3019 
3020 	txr->hn_tx_chimney_tried++;
3021 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3022 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3023 		return (NULL);
3024 	txr->hn_tx_chimney++;
3025 
3026 	chim = txr->hn_sc->hn_chim +
3027 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3028 
3029 	if (txr->hn_agg_pktmax > 1 &&
3030 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3031 		txr->hn_agg_txd = txd;
3032 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3033 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3034 		txr->hn_agg_prevpkt = chim;
3035 	}
3036 	return (chim);
3037 }
3038 
3039 /*
3040  * NOTE:
3041  * If this function fails, then both txd and m_head0 will be freed.
3042  */
3043 static int
3044 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3045     struct mbuf **m_head0)
3046 {
3047 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3048 	int error, nsegs, i;
3049 	struct mbuf *m_head = *m_head0;
3050 	struct rndis_packet_msg *pkt;
3051 	uint32_t *pi_data;
3052 	void *chim = NULL;
3053 	int pkt_hlen, pkt_size;
3054 
3055 	pkt = txd->rndis_pkt;
3056 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3057 	if (pkt_size < txr->hn_chim_size) {
3058 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3059 		if (chim != NULL)
3060 			pkt = chim;
3061 	} else {
3062 		if (txr->hn_agg_txd != NULL)
3063 			hn_flush_txagg(ifp, txr);
3064 	}
3065 
3066 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3067 	pkt->rm_len = m_head->m_pkthdr.len;
3068 	pkt->rm_dataoffset = 0;
3069 	pkt->rm_datalen = m_head->m_pkthdr.len;
3070 	pkt->rm_oobdataoffset = 0;
3071 	pkt->rm_oobdatalen = 0;
3072 	pkt->rm_oobdataelements = 0;
3073 	pkt->rm_pktinfooffset = sizeof(*pkt);
3074 	pkt->rm_pktinfolen = 0;
3075 	pkt->rm_vchandle = 0;
3076 	pkt->rm_reserved = 0;
3077 
3078 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3079 		/*
3080 		 * Set the hash value for this packet, so that the host could
3081 		 * dispatch the TX done event for this packet back to this TX
3082 		 * ring's channel.
3083 		 */
3084 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3085 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3086 		*pi_data = txr->hn_tx_idx;
3087 	}
3088 
3089 	if (m_head->m_flags & M_VLANTAG) {
3090 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3091 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3092 		*pi_data = NDIS_VLAN_INFO_MAKE(
3093 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3094 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3095 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3096 	}
3097 
3098 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3099 #if defined(INET6) || defined(INET)
3100 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3101 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3102 #ifdef INET
3103 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3104 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3105 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3106 			    m_head->m_pkthdr.tso_segsz);
3107 		}
3108 #endif
3109 #if defined(INET6) && defined(INET)
3110 		else
3111 #endif
3112 #ifdef INET6
3113 		{
3114 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3115 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3116 			    m_head->m_pkthdr.tso_segsz);
3117 		}
3118 #endif
3119 #endif	/* INET6 || INET */
3120 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3121 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3122 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3123 		if (m_head->m_pkthdr.csum_flags &
3124 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3125 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3126 		} else {
3127 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3128 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3129 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3130 		}
3131 
3132 		if (m_head->m_pkthdr.csum_flags &
3133 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3134 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3135 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3136 		} else if (m_head->m_pkthdr.csum_flags &
3137 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3138 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3139 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3140 		}
3141 	}
3142 
3143 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3144 	/* Fixup RNDIS packet message total length */
3145 	pkt->rm_len += pkt_hlen;
3146 	/* Convert RNDIS packet message offsets */
3147 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3148 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3149 
3150 	/*
3151 	 * Fast path: Chimney sending.
3152 	 */
3153 	if (chim != NULL) {
3154 		struct hn_txdesc *tgt_txd = txd;
3155 
3156 		if (txr->hn_agg_txd != NULL) {
3157 			tgt_txd = txr->hn_agg_txd;
3158 #ifdef INVARIANTS
3159 			*m_head0 = NULL;
3160 #endif
3161 		}
3162 
3163 		KASSERT(pkt == chim,
3164 		    ("RNDIS pkt not in chimney sending buffer"));
3165 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3166 		    ("chimney sending buffer is not used"));
3167 		tgt_txd->chim_size += pkt->rm_len;
3168 
3169 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3170 		    ((uint8_t *)chim) + pkt_hlen);
3171 
3172 		txr->hn_gpa_cnt = 0;
3173 		txr->hn_sendpkt = hn_txpkt_chim;
3174 		goto done;
3175 	}
3176 
3177 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3178 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3179 	    ("chimney buffer is used"));
3180 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3181 
3182 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3183 	if (__predict_false(error)) {
3184 		int freed;
3185 
3186 		/*
3187 		 * This mbuf is not linked w/ the txd yet, so free it now.
3188 		 */
3189 		m_freem(m_head);
3190 		*m_head0 = NULL;
3191 
3192 		freed = hn_txdesc_put(txr, txd);
3193 		KASSERT(freed != 0,
3194 		    ("fail to free txd upon txdma error"));
3195 
3196 		txr->hn_txdma_failed++;
3197 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3198 		return error;
3199 	}
3200 	*m_head0 = m_head;
3201 
3202 	/* +1 RNDIS packet message */
3203 	txr->hn_gpa_cnt = nsegs + 1;
3204 
3205 	/* send packet with page buffer */
3206 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3207 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3208 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3209 
3210 	/*
3211 	 * Fill the page buffers with mbuf info after the page
3212 	 * buffer for RNDIS packet message.
3213 	 */
3214 	for (i = 0; i < nsegs; ++i) {
3215 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3216 
3217 		gpa->gpa_page = atop(segs[i].ds_addr);
3218 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3219 		gpa->gpa_len = segs[i].ds_len;
3220 	}
3221 
3222 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3223 	txd->chim_size = 0;
3224 	txr->hn_sendpkt = hn_txpkt_sglist;
3225 done:
3226 	txd->m = m_head;
3227 
3228 	/* Set the completion routine */
3229 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3230 
3231 	/* Update temporary stats for later use. */
3232 	txr->hn_stat_pkts++;
3233 	txr->hn_stat_size += m_head->m_pkthdr.len;
3234 	if (m_head->m_flags & M_MCAST)
3235 		txr->hn_stat_mcasts++;
3236 
3237 	return 0;
3238 }
3239 
3240 /*
3241  * NOTE:
3242  * If this function fails, then txd will be freed, but the mbuf
3243  * associated w/ the txd will _not_ be freed.
3244  */
3245 static int
3246 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3247 {
3248 	int error, send_failed = 0, has_bpf;
3249 
3250 again:
3251 	has_bpf = bpf_peers_present(ifp->if_bpf);
3252 	if (has_bpf) {
3253 		/*
3254 		 * Make sure that this txd and any aggregated txds are not
3255 		 * freed before ETHER_BPF_MTAP.
3256 		 */
3257 		hn_txdesc_hold(txd);
3258 	}
3259 	error = txr->hn_sendpkt(txr, txd);
3260 	if (!error) {
3261 		if (has_bpf) {
3262 			const struct hn_txdesc *tmp_txd;
3263 
3264 			ETHER_BPF_MTAP(ifp, txd->m);
3265 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3266 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3267 		}
3268 
3269 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3270 #ifdef HN_IFSTART_SUPPORT
3271 		if (!hn_use_if_start)
3272 #endif
3273 		{
3274 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3275 			    txr->hn_stat_size);
3276 			if (txr->hn_stat_mcasts != 0) {
3277 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3278 				    txr->hn_stat_mcasts);
3279 			}
3280 		}
3281 		txr->hn_pkts += txr->hn_stat_pkts;
3282 		txr->hn_sends++;
3283 	}
3284 	if (has_bpf)
3285 		hn_txdesc_put(txr, txd);
3286 
3287 	if (__predict_false(error)) {
3288 		int freed;
3289 
3290 		/*
3291 		 * This should "really rarely" happen.
3292 		 *
3293 		 * XXX Too many RX to be acked or too many sideband
3294 		 * commands to run?  Ask netvsc_channel_rollup()
3295 		 * to kick start later.
3296 		 */
3297 		txr->hn_has_txeof = 1;
3298 		if (!send_failed) {
3299 			txr->hn_send_failed++;
3300 			send_failed = 1;
3301 			/*
3302 			 * Try sending again after set hn_has_txeof;
3303 			 * in case that we missed the last
3304 			 * netvsc_channel_rollup().
3305 			 */
3306 			goto again;
3307 		}
3308 		if_printf(ifp, "send failed\n");
3309 
3310 		/*
3311 		 * Caller will perform further processing on the
3312 		 * associated mbuf, so don't free it in hn_txdesc_put();
3313 		 * only unload it from the DMA map in hn_txdesc_put(),
3314 		 * if it was loaded.
3315 		 */
3316 		txd->m = NULL;
3317 		freed = hn_txdesc_put(txr, txd);
3318 		KASSERT(freed != 0,
3319 		    ("fail to free txd upon send error"));
3320 
3321 		txr->hn_send_failed++;
3322 	}
3323 
3324 	/* Reset temporary stats, after this sending is done. */
3325 	txr->hn_stat_size = 0;
3326 	txr->hn_stat_pkts = 0;
3327 	txr->hn_stat_mcasts = 0;
3328 
3329 	return (error);
3330 }
3331 
3332 /*
3333  * Append the specified data to the indicated mbuf chain,
3334  * Extend the mbuf chain if the new data does not fit in
3335  * existing space.
3336  *
3337  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3338  * There should be an equivalent in the kernel mbuf code,
3339  * but there does not appear to be one yet.
3340  *
3341  * Differs from m_append() in that additional mbufs are
3342  * allocated with cluster size MJUMPAGESIZE, and filled
3343  * accordingly.
3344  *
3345  * Return 1 if able to complete the job; otherwise 0.
3346  */
3347 static int
3348 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3349 {
3350 	struct mbuf *m, *n;
3351 	int remainder, space;
3352 
3353 	for (m = m0; m->m_next != NULL; m = m->m_next)
3354 		;
3355 	remainder = len;
3356 	space = M_TRAILINGSPACE(m);
3357 	if (space > 0) {
3358 		/*
3359 		 * Copy into available space.
3360 		 */
3361 		if (space > remainder)
3362 			space = remainder;
3363 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3364 		m->m_len += space;
3365 		cp += space;
3366 		remainder -= space;
3367 	}
3368 	while (remainder > 0) {
3369 		/*
3370 		 * Allocate a new mbuf; could check space
3371 		 * and allocate a cluster instead.
3372 		 */
3373 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3374 		if (n == NULL)
3375 			break;
3376 		n->m_len = min(MJUMPAGESIZE, remainder);
3377 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3378 		cp += n->m_len;
3379 		remainder -= n->m_len;
3380 		m->m_next = n;
3381 		m = n;
3382 	}
3383 	if (m0->m_flags & M_PKTHDR)
3384 		m0->m_pkthdr.len += len - remainder;
3385 
3386 	return (remainder == 0);
3387 }
3388 
#if defined(INET) || defined(INET6)
/*
 * Feed a received mbuf to LRO.
 *
 * On FreeBSD versions that have tcp_lro_queue_mbuf(), the mbuf is only
 * queued if hn_lro_mbufq_depth is non-zero, and 0 is returned; in all
 * other cases the tcp_lro_rx() result is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{

#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth != 0) {
		/* Queueing never fails from this function's view. */
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	return (tcp_lro_rx(lc, m, 0));
}
#endif
3402 
3403 static int
3404 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3405     const struct hn_rxinfo *info)
3406 {
3407 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3408 	struct mbuf *m_new;
3409 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3410 	int hash_type = M_HASHTYPE_NONE;
3411 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3412 
3413 	ifp = hn_ifp;
3414 	if (rxr->hn_rxvf_ifp != NULL) {
3415 		/*
3416 		 * Non-transparent mode VF; pretend this packet is from
3417 		 * the VF.
3418 		 */
3419 		ifp = rxr->hn_rxvf_ifp;
3420 		is_vf = 1;
3421 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3422 		/* Transparent mode VF. */
3423 		is_vf = 1;
3424 	}
3425 
3426 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3427 		/*
3428 		 * NOTE:
3429 		 * See the NOTE of hn_rndis_init_fixat().  This
3430 		 * function can be reached, immediately after the
3431 		 * RNDIS is initialized but before the ifnet is
3432 		 * setup on the hn_attach() path; drop the unexpected
3433 		 * packets.
3434 		 */
3435 		return (0);
3436 	}
3437 
3438 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3439 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3440 		return (0);
3441 	}
3442 
3443 	if (dlen <= MHLEN) {
3444 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3445 		if (m_new == NULL) {
3446 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3447 			return (0);
3448 		}
3449 		memcpy(mtod(m_new, void *), data, dlen);
3450 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3451 		rxr->hn_small_pkts++;
3452 	} else {
3453 		/*
3454 		 * Get an mbuf with a cluster.  For packets 2K or less,
3455 		 * get a standard 2K cluster.  For anything larger, get a
3456 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3457 		 * if looped around to the Hyper-V TX channel, so avoid them.
3458 		 */
3459 		size = MCLBYTES;
3460 		if (dlen > MCLBYTES) {
3461 			/* 4096 */
3462 			size = MJUMPAGESIZE;
3463 		}
3464 
3465 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3466 		if (m_new == NULL) {
3467 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3468 			return (0);
3469 		}
3470 
3471 		hv_m_append(m_new, dlen, data);
3472 	}
3473 	m_new->m_pkthdr.rcvif = ifp;
3474 
3475 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3476 		do_csum = 0;
3477 
3478 	/* receive side checksum offload */
3479 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3480 		/* IP csum offload */
3481 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3482 			m_new->m_pkthdr.csum_flags |=
3483 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3484 			rxr->hn_csum_ip++;
3485 		}
3486 
3487 		/* TCP/UDP csum offload */
3488 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3489 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3490 			m_new->m_pkthdr.csum_flags |=
3491 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3492 			m_new->m_pkthdr.csum_data = 0xffff;
3493 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3494 				rxr->hn_csum_tcp++;
3495 			else
3496 				rxr->hn_csum_udp++;
3497 		}
3498 
3499 		/*
3500 		 * XXX
3501 		 * As of this write (Oct 28th, 2016), host side will turn
3502 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3503 		 * the do_lro setting here is actually _not_ accurate.  We
3504 		 * depend on the RSS hash type check to reset do_lro.
3505 		 */
3506 		if ((info->csum_info &
3507 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3508 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3509 			do_lro = 1;
3510 	} else {
3511 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3512 		if (l3proto == ETHERTYPE_IP) {
3513 			if (l4proto == IPPROTO_TCP) {
3514 				if (do_csum &&
3515 				    (rxr->hn_trust_hcsum &
3516 				     HN_TRUST_HCSUM_TCP)) {
3517 					rxr->hn_csum_trusted++;
3518 					m_new->m_pkthdr.csum_flags |=
3519 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3520 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3521 					m_new->m_pkthdr.csum_data = 0xffff;
3522 				}
3523 				do_lro = 1;
3524 			} else if (l4proto == IPPROTO_UDP) {
3525 				if (do_csum &&
3526 				    (rxr->hn_trust_hcsum &
3527 				     HN_TRUST_HCSUM_UDP)) {
3528 					rxr->hn_csum_trusted++;
3529 					m_new->m_pkthdr.csum_flags |=
3530 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3531 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3532 					m_new->m_pkthdr.csum_data = 0xffff;
3533 				}
3534 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3535 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3536 				rxr->hn_csum_trusted++;
3537 				m_new->m_pkthdr.csum_flags |=
3538 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3539 			}
3540 		}
3541 	}
3542 
3543 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3544 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3545 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3546 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3547 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3548 		m_new->m_flags |= M_VLANTAG;
3549 	}
3550 
3551 	/*
3552 	 * If VF is activated (tranparent/non-transparent mode does not
3553 	 * matter here).
3554 	 *
3555 	 * - Disable LRO
3556 	 *
3557 	 *   hn(4) will only receive broadcast packets, multicast packets,
3558 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3559 	 *   packet types.
3560 	 *
3561 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3562 	 *   all, since the LRO flush will use hn(4) as the receiving
3563 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3564 	 */
3565 	if (is_vf)
3566 		do_lro = 0;
3567 
3568 	/*
3569 	 * If VF is activated (tranparent/non-transparent mode does not
3570 	 * matter here), do _not_ mess with unsupported hash types or
3571 	 * functions.
3572 	 */
3573 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3574 		rxr->hn_rss_pkts++;
3575 		m_new->m_pkthdr.flowid = info->hash_value;
3576 		if (!is_vf)
3577 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3578 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3579 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3580 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3581 			    rxr->hn_mbuf_hash);
3582 
3583 			/*
3584 			 * NOTE:
3585 			 * do_lro is resetted, if the hash types are not TCP
3586 			 * related.  See the comment in the above csum_flags
3587 			 * setup section.
3588 			 */
3589 			switch (type) {
3590 			case NDIS_HASH_IPV4:
3591 				hash_type = M_HASHTYPE_RSS_IPV4;
3592 				do_lro = 0;
3593 				break;
3594 
3595 			case NDIS_HASH_TCP_IPV4:
3596 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3597 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3598 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3599 
3600 					if (is_vf)
3601 						def_htype = M_HASHTYPE_NONE;
3602 
3603 					/*
3604 					 * UDP 4-tuple hash is delivered as
3605 					 * TCP 4-tuple hash.
3606 					 */
3607 					if (l3proto == ETHERTYPE_MAX) {
3608 						hn_rxpkt_proto(m_new,
3609 						    &l3proto, &l4proto);
3610 					}
3611 					if (l3proto == ETHERTYPE_IP) {
3612 						if (l4proto == IPPROTO_UDP &&
3613 						    (rxr->hn_mbuf_hash &
3614 						     NDIS_HASH_UDP_IPV4_X)) {
3615 							hash_type =
3616 							M_HASHTYPE_RSS_UDP_IPV4;
3617 							do_lro = 0;
3618 						} else if (l4proto !=
3619 						    IPPROTO_TCP) {
3620 							hash_type = def_htype;
3621 							do_lro = 0;
3622 						}
3623 					} else {
3624 						hash_type = def_htype;
3625 						do_lro = 0;
3626 					}
3627 				}
3628 				break;
3629 
3630 			case NDIS_HASH_IPV6:
3631 				hash_type = M_HASHTYPE_RSS_IPV6;
3632 				do_lro = 0;
3633 				break;
3634 
3635 			case NDIS_HASH_IPV6_EX:
3636 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3637 				do_lro = 0;
3638 				break;
3639 
3640 			case NDIS_HASH_TCP_IPV6:
3641 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3642 				break;
3643 
3644 			case NDIS_HASH_TCP_IPV6_EX:
3645 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3646 				break;
3647 			}
3648 		}
3649 	} else if (!is_vf) {
3650 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3651 		hash_type = M_HASHTYPE_OPAQUE;
3652 	}
3653 	M_HASHTYPE_SET(m_new, hash_type);
3654 
3655 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3656 	if (hn_ifp != ifp) {
3657 		const struct ether_header *eh;
3658 
3659 		/*
3660 		 * Non-transparent mode VF is activated.
3661 		 */
3662 
3663 		/*
3664 		 * Allow tapping on hn(4).
3665 		 */
3666 		ETHER_BPF_MTAP(hn_ifp, m_new);
3667 
3668 		/*
3669 		 * Update hn(4)'s stats.
3670 		 */
3671 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3672 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3673 		/* Checked at the beginning of this function. */
3674 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3675 		eh = mtod(m_new, struct ether_header *);
3676 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3677 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3678 	}
3679 	rxr->hn_pkts++;
3680 
3681 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3682 #if defined(INET) || defined(INET6)
3683 		struct lro_ctrl *lro = &rxr->hn_lro;
3684 
3685 		if (lro->lro_cnt) {
3686 			rxr->hn_lro_tried++;
3687 			if (hn_lro_rx(lro, m_new) == 0) {
3688 				/* DONE! */
3689 				return 0;
3690 			}
3691 		}
3692 #endif
3693 	}
3694 	ifp->if_input(ifp, m_new);
3695 
3696 	return (0);
3697 }
3698 
3699 static int
3700 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3701 {
3702 	struct hn_softc *sc = ifp->if_softc;
3703 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3704 	struct ifnet *vf_ifp;
3705 	int mask, error = 0;
3706 	struct ifrsskey *ifrk;
3707 	struct ifrsshash *ifrh;
3708 	uint32_t mtu;
3709 
3710 	switch (cmd) {
3711 	case SIOCSIFMTU:
3712 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3713 			error = EINVAL;
3714 			break;
3715 		}
3716 
3717 		HN_LOCK(sc);
3718 
3719 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3720 			HN_UNLOCK(sc);
3721 			break;
3722 		}
3723 
3724 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3725 			/* Can't change MTU */
3726 			HN_UNLOCK(sc);
3727 			error = EOPNOTSUPP;
3728 			break;
3729 		}
3730 
3731 		if (ifp->if_mtu == ifr->ifr_mtu) {
3732 			HN_UNLOCK(sc);
3733 			break;
3734 		}
3735 
3736 		if (hn_xpnt_vf_isready(sc)) {
3737 			vf_ifp = sc->hn_vf_ifp;
3738 			ifr_vf = *ifr;
3739 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3740 			    sizeof(ifr_vf.ifr_name));
3741 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3742 			    (caddr_t)&ifr_vf);
3743 			if (error) {
3744 				HN_UNLOCK(sc);
3745 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3746 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3747 				break;
3748 			}
3749 		}
3750 
3751 		/*
3752 		 * Suspend this interface before the synthetic parts
3753 		 * are ripped.
3754 		 */
3755 		hn_suspend(sc);
3756 
3757 		/*
3758 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3759 		 */
3760 		hn_synth_detach(sc);
3761 
3762 		/*
3763 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3764 		 * with the new MTU setting.
3765 		 */
3766 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3767 		if (error) {
3768 			HN_UNLOCK(sc);
3769 			break;
3770 		}
3771 
3772 		error = hn_rndis_get_mtu(sc, &mtu);
3773 		if (error)
3774 			mtu = ifr->ifr_mtu;
3775 		else if (bootverbose)
3776 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3777 
3778 		/*
3779 		 * Commit the requested MTU, after the synthetic parts
3780 		 * have been successfully attached.
3781 		 */
3782 		if (mtu >= ifr->ifr_mtu) {
3783 			mtu = ifr->ifr_mtu;
3784 		} else {
3785 			if_printf(ifp, "fixup mtu %d -> %u\n",
3786 			    ifr->ifr_mtu, mtu);
3787 		}
3788 		ifp->if_mtu = mtu;
3789 
3790 		/*
3791 		 * Synthetic parts' reattach may change the chimney
3792 		 * sending size; update it.
3793 		 */
3794 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3795 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3796 
3797 		/*
3798 		 * Make sure that various parameters based on MTU are
3799 		 * still valid, after the MTU change.
3800 		 */
3801 		hn_mtu_change_fixup(sc);
3802 
3803 		/*
3804 		 * All done!  Resume the interface now.
3805 		 */
3806 		hn_resume(sc);
3807 
3808 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3809 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3810 			/*
3811 			 * Since we have reattached the NVS part,
3812 			 * change the datapath to VF again; in case
3813 			 * that it is lost, after the NVS was detached.
3814 			 */
3815 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3816 		}
3817 
3818 		HN_UNLOCK(sc);
3819 		break;
3820 
3821 	case SIOCSIFFLAGS:
3822 		HN_LOCK(sc);
3823 
3824 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3825 			HN_UNLOCK(sc);
3826 			break;
3827 		}
3828 
3829 		if (hn_xpnt_vf_isready(sc))
3830 			hn_xpnt_vf_saveifflags(sc);
3831 
3832 		if (ifp->if_flags & IFF_UP) {
3833 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3834 				/*
3835 				 * Caller meight hold mutex, e.g.
3836 				 * bpf; use busy-wait for the RNDIS
3837 				 * reply.
3838 				 */
3839 				HN_NO_SLEEPING(sc);
3840 				hn_rxfilter_config(sc);
3841 				HN_SLEEPING_OK(sc);
3842 
3843 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3844 					error = hn_xpnt_vf_iocsetflags(sc);
3845 			} else {
3846 				hn_init_locked(sc);
3847 			}
3848 		} else {
3849 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3850 				hn_stop(sc, false);
3851 		}
3852 		sc->hn_if_flags = ifp->if_flags;
3853 
3854 		HN_UNLOCK(sc);
3855 		break;
3856 
3857 	case SIOCSIFCAP:
3858 		HN_LOCK(sc);
3859 
3860 		if (hn_xpnt_vf_isready(sc)) {
3861 			ifr_vf = *ifr;
3862 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3863 			    sizeof(ifr_vf.ifr_name));
3864 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3865 			HN_UNLOCK(sc);
3866 			break;
3867 		}
3868 
3869 		/*
3870 		 * Fix up requested capabilities w/ supported capabilities,
3871 		 * since the supported capabilities could have been changed.
3872 		 */
3873 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3874 		    ifp->if_capenable;
3875 
3876 		if (mask & IFCAP_TXCSUM) {
3877 			ifp->if_capenable ^= IFCAP_TXCSUM;
3878 			if (ifp->if_capenable & IFCAP_TXCSUM)
3879 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3880 			else
3881 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3882 		}
3883 		if (mask & IFCAP_TXCSUM_IPV6) {
3884 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3885 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3886 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3887 			else
3888 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3889 		}
3890 
3891 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3892 		if (mask & IFCAP_RXCSUM)
3893 			ifp->if_capenable ^= IFCAP_RXCSUM;
3894 #ifdef foo
3895 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3896 		if (mask & IFCAP_RXCSUM_IPV6)
3897 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3898 #endif
3899 
3900 		if (mask & IFCAP_LRO)
3901 			ifp->if_capenable ^= IFCAP_LRO;
3902 
3903 		if (mask & IFCAP_TSO4) {
3904 			ifp->if_capenable ^= IFCAP_TSO4;
3905 			if (ifp->if_capenable & IFCAP_TSO4)
3906 				ifp->if_hwassist |= CSUM_IP_TSO;
3907 			else
3908 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3909 		}
3910 		if (mask & IFCAP_TSO6) {
3911 			ifp->if_capenable ^= IFCAP_TSO6;
3912 			if (ifp->if_capenable & IFCAP_TSO6)
3913 				ifp->if_hwassist |= CSUM_IP6_TSO;
3914 			else
3915 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3916 		}
3917 
3918 		HN_UNLOCK(sc);
3919 		break;
3920 
3921 	case SIOCADDMULTI:
3922 	case SIOCDELMULTI:
3923 		HN_LOCK(sc);
3924 
3925 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3926 			HN_UNLOCK(sc);
3927 			break;
3928 		}
3929 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3930 			/*
3931 			 * Multicast uses mutex; use busy-wait for
3932 			 * the RNDIS reply.
3933 			 */
3934 			HN_NO_SLEEPING(sc);
3935 			hn_rxfilter_config(sc);
3936 			HN_SLEEPING_OK(sc);
3937 		}
3938 
3939 		/* XXX vlan(4) style mcast addr maintenance */
3940 		if (hn_xpnt_vf_isready(sc)) {
3941 			int old_if_flags;
3942 
3943 			old_if_flags = sc->hn_vf_ifp->if_flags;
3944 			hn_xpnt_vf_saveifflags(sc);
3945 
3946 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3947 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3948 			     IFF_ALLMULTI))
3949 				error = hn_xpnt_vf_iocsetflags(sc);
3950 		}
3951 
3952 		HN_UNLOCK(sc);
3953 		break;
3954 
3955 	case SIOCSIFMEDIA:
3956 	case SIOCGIFMEDIA:
3957 		HN_LOCK(sc);
3958 		if (hn_xpnt_vf_isready(sc)) {
3959 			/*
3960 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3961 			 * create and pass ifr_vf to the VF here; just
3962 			 * replace the ifr_name.
3963 			 */
3964 			vf_ifp = sc->hn_vf_ifp;
3965 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3966 			    sizeof(ifr->ifr_name));
3967 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3968 			/* Restore the ifr_name. */
3969 			strlcpy(ifr->ifr_name, ifp->if_xname,
3970 			    sizeof(ifr->ifr_name));
3971 			HN_UNLOCK(sc);
3972 			break;
3973 		}
3974 		HN_UNLOCK(sc);
3975 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3976 		break;
3977 
3978 	case SIOCGIFRSSHASH:
3979 		ifrh = (struct ifrsshash *)data;
3980 		HN_LOCK(sc);
3981 		if (sc->hn_rx_ring_inuse == 1) {
3982 			HN_UNLOCK(sc);
3983 			ifrh->ifrh_func = RSS_FUNC_NONE;
3984 			ifrh->ifrh_types = 0;
3985 			break;
3986 		}
3987 
3988 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3989 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3990 		else
3991 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3992 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3993 		HN_UNLOCK(sc);
3994 		break;
3995 
3996 	case SIOCGIFRSSKEY:
3997 		ifrk = (struct ifrsskey *)data;
3998 		HN_LOCK(sc);
3999 		if (sc->hn_rx_ring_inuse == 1) {
4000 			HN_UNLOCK(sc);
4001 			ifrk->ifrk_func = RSS_FUNC_NONE;
4002 			ifrk->ifrk_keylen = 0;
4003 			break;
4004 		}
4005 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4006 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4007 		else
4008 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4009 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4010 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4011 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4012 		HN_UNLOCK(sc);
4013 		break;
4014 
4015 	default:
4016 		error = ether_ioctl(ifp, cmd, data);
4017 		break;
4018 	}
4019 	return (error);
4020 }
4021 
4022 static void
4023 hn_stop(struct hn_softc *sc, bool detaching)
4024 {
4025 	struct ifnet *ifp = sc->hn_ifp;
4026 	int i;
4027 
4028 	HN_LOCK_ASSERT(sc);
4029 
4030 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4031 	    ("synthetic parts were not attached"));
4032 
4033 	/* Clear RUNNING bit ASAP. */
4034 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4035 
4036 	/* Disable polling. */
4037 	hn_polling(sc, 0);
4038 
4039 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4040 		KASSERT(sc->hn_vf_ifp != NULL,
4041 		    ("%s: VF is not attached", ifp->if_xname));
4042 
4043 		/* Mark transparent mode VF as disabled. */
4044 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4045 
4046 		/*
4047 		 * NOTE:
4048 		 * Datapath setting must happen _before_ bringing
4049 		 * the VF down.
4050 		 */
4051 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4052 
4053 		/*
4054 		 * Bring the VF down.
4055 		 */
4056 		hn_xpnt_vf_saveifflags(sc);
4057 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4058 		hn_xpnt_vf_iocsetflags(sc);
4059 	}
4060 
4061 	/* Suspend data transfers. */
4062 	hn_suspend_data(sc);
4063 
4064 	/* Clear OACTIVE bit. */
4065 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4066 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4067 		sc->hn_tx_ring[i].hn_oactive = 0;
4068 
4069 	/*
4070 	 * If the non-transparent mode VF is active, make sure
4071 	 * that the RX filter still allows packet reception.
4072 	 */
4073 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4074 		hn_rxfilter_config(sc);
4075 }
4076 
4077 static void
4078 hn_init_locked(struct hn_softc *sc)
4079 {
4080 	struct ifnet *ifp = sc->hn_ifp;
4081 	int i;
4082 
4083 	HN_LOCK_ASSERT(sc);
4084 
4085 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4086 		return;
4087 
4088 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4089 		return;
4090 
4091 	/* Configure RX filter */
4092 	hn_rxfilter_config(sc);
4093 
4094 	/* Clear OACTIVE bit. */
4095 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4096 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4097 		sc->hn_tx_ring[i].hn_oactive = 0;
4098 
4099 	/* Clear TX 'suspended' bit. */
4100 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4101 
4102 	if (hn_xpnt_vf_isready(sc)) {
4103 		/* Initialize transparent VF. */
4104 		hn_xpnt_vf_init(sc);
4105 	}
4106 
4107 	/* Everything is ready; unleash! */
4108 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4109 
4110 	/* Re-enable polling if requested. */
4111 	if (sc->hn_pollhz > 0)
4112 		hn_polling(sc, sc->hn_pollhz);
4113 }
4114 
/*
 * Locking wrapper around hn_init_locked(); 'xsc' is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *softc;

	softc = xsc;
	HN_LOCK(softc);
	hn_init_locked(softc);
	HN_UNLOCK(softc);
}
4124 
4125 #if __FreeBSD_version >= 1100099
4126 
4127 static int
4128 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4129 {
4130 	struct hn_softc *sc = arg1;
4131 	unsigned int lenlim;
4132 	int error;
4133 
4134 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4135 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4136 	if (error || req->newptr == NULL)
4137 		return error;
4138 
4139 	HN_LOCK(sc);
4140 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4141 	    lenlim > TCP_LRO_LENGTH_MAX) {
4142 		HN_UNLOCK(sc);
4143 		return EINVAL;
4144 	}
4145 	hn_set_lro_lenlim(sc, lenlim);
4146 	HN_UNLOCK(sc);
4147 
4148 	return 0;
4149 }
4150 
4151 static int
4152 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4153 {
4154 	struct hn_softc *sc = arg1;
4155 	int ackcnt, error, i;
4156 
4157 	/*
4158 	 * lro_ackcnt_lim is append count limit,
4159 	 * +1 to turn it into aggregation limit.
4160 	 */
4161 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4162 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4163 	if (error || req->newptr == NULL)
4164 		return error;
4165 
4166 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4167 		return EINVAL;
4168 
4169 	/*
4170 	 * Convert aggregation limit back to append
4171 	 * count limit.
4172 	 */
4173 	--ackcnt;
4174 	HN_LOCK(sc);
4175 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4176 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4177 	HN_UNLOCK(sc);
4178 	return 0;
4179 }
4180 
4181 #endif
4182 
4183 static int
4184 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4185 {
4186 	struct hn_softc *sc = arg1;
4187 	int hcsum = arg2;
4188 	int on, error, i;
4189 
4190 	on = 0;
4191 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4192 		on = 1;
4193 
4194 	error = sysctl_handle_int(oidp, &on, 0, req);
4195 	if (error || req->newptr == NULL)
4196 		return error;
4197 
4198 	HN_LOCK(sc);
4199 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4200 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4201 
4202 		if (on)
4203 			rxr->hn_trust_hcsum |= hcsum;
4204 		else
4205 			rxr->hn_trust_hcsum &= ~hcsum;
4206 	}
4207 	HN_UNLOCK(sc);
4208 	return 0;
4209 }
4210 
4211 static int
4212 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4213 {
4214 	struct hn_softc *sc = arg1;
4215 	int chim_size, error;
4216 
4217 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4218 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4219 	if (error || req->newptr == NULL)
4220 		return error;
4221 
4222 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4223 		return EINVAL;
4224 
4225 	HN_LOCK(sc);
4226 	hn_set_chim_size(sc, chim_size);
4227 	HN_UNLOCK(sc);
4228 	return 0;
4229 }
4230 
4231 #if __FreeBSD_version < 1100095
4232 static int
4233 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4234 {
4235 	struct hn_softc *sc = arg1;
4236 	int ofs = arg2, i, error;
4237 	struct hn_rx_ring *rxr;
4238 	uint64_t stat;
4239 
4240 	stat = 0;
4241 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4242 		rxr = &sc->hn_rx_ring[i];
4243 		stat += *((int *)((uint8_t *)rxr + ofs));
4244 	}
4245 
4246 	error = sysctl_handle_64(oidp, &stat, 0, req);
4247 	if (error || req->newptr == NULL)
4248 		return error;
4249 
4250 	/* Zero out this stat. */
4251 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4252 		rxr = &sc->hn_rx_ring[i];
4253 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4254 	}
4255 	return 0;
4256 }
4257 #else
4258 static int
4259 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4260 {
4261 	struct hn_softc *sc = arg1;
4262 	int ofs = arg2, i, error;
4263 	struct hn_rx_ring *rxr;
4264 	uint64_t stat;
4265 
4266 	stat = 0;
4267 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4268 		rxr = &sc->hn_rx_ring[i];
4269 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4270 	}
4271 
4272 	error = sysctl_handle_64(oidp, &stat, 0, req);
4273 	if (error || req->newptr == NULL)
4274 		return error;
4275 
4276 	/* Zero out this stat. */
4277 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4278 		rxr = &sc->hn_rx_ring[i];
4279 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4280 	}
4281 	return 0;
4282 }
4283 
4284 #endif
4285 
4286 static int
4287 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4288 {
4289 	struct hn_softc *sc = arg1;
4290 	int ofs = arg2, i, error;
4291 	struct hn_rx_ring *rxr;
4292 	u_long stat;
4293 
4294 	stat = 0;
4295 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4296 		rxr = &sc->hn_rx_ring[i];
4297 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4298 	}
4299 
4300 	error = sysctl_handle_long(oidp, &stat, 0, req);
4301 	if (error || req->newptr == NULL)
4302 		return error;
4303 
4304 	/* Zero out this stat. */
4305 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4306 		rxr = &sc->hn_rx_ring[i];
4307 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4308 	}
4309 	return 0;
4310 }
4311 
4312 static int
4313 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4314 {
4315 	struct hn_softc *sc = arg1;
4316 	int ofs = arg2, i, error;
4317 	struct hn_tx_ring *txr;
4318 	u_long stat;
4319 
4320 	stat = 0;
4321 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4322 		txr = &sc->hn_tx_ring[i];
4323 		stat += *((u_long *)((uint8_t *)txr + ofs));
4324 	}
4325 
4326 	error = sysctl_handle_long(oidp, &stat, 0, req);
4327 	if (error || req->newptr == NULL)
4328 		return error;
4329 
4330 	/* Zero out this stat. */
4331 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4332 		txr = &sc->hn_tx_ring[i];
4333 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4334 	}
4335 	return 0;
4336 }
4337 
4338 static int
4339 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4340 {
4341 	struct hn_softc *sc = arg1;
4342 	int ofs = arg2, i, error, conf;
4343 	struct hn_tx_ring *txr;
4344 
4345 	txr = &sc->hn_tx_ring[0];
4346 	conf = *((int *)((uint8_t *)txr + ofs));
4347 
4348 	error = sysctl_handle_int(oidp, &conf, 0, req);
4349 	if (error || req->newptr == NULL)
4350 		return error;
4351 
4352 	HN_LOCK(sc);
4353 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4354 		txr = &sc->hn_tx_ring[i];
4355 		*((int *)((uint8_t *)txr + ofs)) = conf;
4356 	}
4357 	HN_UNLOCK(sc);
4358 
4359 	return 0;
4360 }
4361 
4362 static int
4363 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4364 {
4365 	struct hn_softc *sc = arg1;
4366 	int error, size;
4367 
4368 	size = sc->hn_agg_size;
4369 	error = sysctl_handle_int(oidp, &size, 0, req);
4370 	if (error || req->newptr == NULL)
4371 		return (error);
4372 
4373 	HN_LOCK(sc);
4374 	sc->hn_agg_size = size;
4375 	hn_set_txagg(sc);
4376 	HN_UNLOCK(sc);
4377 
4378 	return (0);
4379 }
4380 
4381 static int
4382 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4383 {
4384 	struct hn_softc *sc = arg1;
4385 	int error, pkts;
4386 
4387 	pkts = sc->hn_agg_pkts;
4388 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4389 	if (error || req->newptr == NULL)
4390 		return (error);
4391 
4392 	HN_LOCK(sc);
4393 	sc->hn_agg_pkts = pkts;
4394 	hn_set_txagg(sc);
4395 	HN_UNLOCK(sc);
4396 
4397 	return (0);
4398 }
4399 
4400 static int
4401 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4402 {
4403 	struct hn_softc *sc = arg1;
4404 	int pkts;
4405 
4406 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4407 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4408 }
4409 
4410 static int
4411 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4412 {
4413 	struct hn_softc *sc = arg1;
4414 	int align;
4415 
4416 	align = sc->hn_tx_ring[0].hn_agg_align;
4417 	return (sysctl_handle_int(oidp, &align, 0, req));
4418 }
4419 
4420 static void
4421 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4422 {
4423 	if (pollhz == 0)
4424 		vmbus_chan_poll_disable(chan);
4425 	else
4426 		vmbus_chan_poll_enable(chan, pollhz);
4427 }
4428 
4429 static void
4430 hn_polling(struct hn_softc *sc, u_int pollhz)
4431 {
4432 	int nsubch = sc->hn_rx_ring_inuse - 1;
4433 
4434 	HN_LOCK_ASSERT(sc);
4435 
4436 	if (nsubch > 0) {
4437 		struct vmbus_channel **subch;
4438 		int i;
4439 
4440 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4441 		for (i = 0; i < nsubch; ++i)
4442 			hn_chan_polling(subch[i], pollhz);
4443 		vmbus_subchan_rel(subch, nsubch);
4444 	}
4445 	hn_chan_polling(sc->hn_prichan, pollhz);
4446 }
4447 
4448 static int
4449 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4450 {
4451 	struct hn_softc *sc = arg1;
4452 	int pollhz, error;
4453 
4454 	pollhz = sc->hn_pollhz;
4455 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4456 	if (error || req->newptr == NULL)
4457 		return (error);
4458 
4459 	if (pollhz != 0 &&
4460 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4461 		return (EINVAL);
4462 
4463 	HN_LOCK(sc);
4464 	if (sc->hn_pollhz != pollhz) {
4465 		sc->hn_pollhz = pollhz;
4466 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4467 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4468 			hn_polling(sc, sc->hn_pollhz);
4469 	}
4470 	HN_UNLOCK(sc);
4471 
4472 	return (0);
4473 }
4474 
4475 static int
4476 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4477 {
4478 	struct hn_softc *sc = arg1;
4479 	char verstr[16];
4480 
4481 	snprintf(verstr, sizeof(verstr), "%u.%u",
4482 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4483 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4484 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4485 }
4486 
4487 static int
4488 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4489 {
4490 	struct hn_softc *sc = arg1;
4491 	char caps_str[128];
4492 	uint32_t caps;
4493 
4494 	HN_LOCK(sc);
4495 	caps = sc->hn_caps;
4496 	HN_UNLOCK(sc);
4497 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4498 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4499 }
4500 
4501 static int
4502 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4503 {
4504 	struct hn_softc *sc = arg1;
4505 	char assist_str[128];
4506 	uint32_t hwassist;
4507 
4508 	HN_LOCK(sc);
4509 	hwassist = sc->hn_ifp->if_hwassist;
4510 	HN_UNLOCK(sc);
4511 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4512 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4513 }
4514 
4515 static int
4516 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4517 {
4518 	struct hn_softc *sc = arg1;
4519 	char filter_str[128];
4520 	uint32_t filter;
4521 
4522 	HN_LOCK(sc);
4523 	filter = sc->hn_rx_filter;
4524 	HN_UNLOCK(sc);
4525 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4526 	    NDIS_PACKET_TYPES);
4527 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4528 }
4529 
4530 #ifndef RSS
4531 
4532 static int
4533 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4534 {
4535 	struct hn_softc *sc = arg1;
4536 	int error;
4537 
4538 	HN_LOCK(sc);
4539 
4540 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4541 	if (error || req->newptr == NULL)
4542 		goto back;
4543 
4544 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4545 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4546 		/*
4547 		 * RSS key is synchronized w/ VF's, don't allow users
4548 		 * to change it.
4549 		 */
4550 		error = EBUSY;
4551 		goto back;
4552 	}
4553 
4554 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4555 	if (error)
4556 		goto back;
4557 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4558 
4559 	if (sc->hn_rx_ring_inuse > 1) {
4560 		error = hn_rss_reconfig(sc);
4561 	} else {
4562 		/* Not RSS capable, at least for now; just save the RSS key. */
4563 		error = 0;
4564 	}
4565 back:
4566 	HN_UNLOCK(sc);
4567 	return (error);
4568 }
4569 
4570 static int
4571 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4572 {
4573 	struct hn_softc *sc = arg1;
4574 	int error;
4575 
4576 	HN_LOCK(sc);
4577 
4578 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4579 	if (error || req->newptr == NULL)
4580 		goto back;
4581 
4582 	/*
4583 	 * Don't allow RSS indirect table change, if this interface is not
4584 	 * RSS capable currently.
4585 	 */
4586 	if (sc->hn_rx_ring_inuse == 1) {
4587 		error = EOPNOTSUPP;
4588 		goto back;
4589 	}
4590 
4591 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4592 	if (error)
4593 		goto back;
4594 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4595 
4596 	hn_rss_ind_fixup(sc);
4597 	error = hn_rss_reconfig(sc);
4598 back:
4599 	HN_UNLOCK(sc);
4600 	return (error);
4601 }
4602 
4603 #endif	/* !RSS */
4604 
4605 static int
4606 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4607 {
4608 	struct hn_softc *sc = arg1;
4609 	char hash_str[128];
4610 	uint32_t hash;
4611 
4612 	HN_LOCK(sc);
4613 	hash = sc->hn_rss_hash;
4614 	HN_UNLOCK(sc);
4615 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4616 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4617 }
4618 
4619 static int
4620 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4621 {
4622 	struct hn_softc *sc = arg1;
4623 	char hash_str[128];
4624 	uint32_t hash;
4625 
4626 	HN_LOCK(sc);
4627 	hash = sc->hn_rss_hcap;
4628 	HN_UNLOCK(sc);
4629 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4630 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4631 }
4632 
4633 static int
4634 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4635 {
4636 	struct hn_softc *sc = arg1;
4637 	char hash_str[128];
4638 	uint32_t hash;
4639 
4640 	HN_LOCK(sc);
4641 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4642 	HN_UNLOCK(sc);
4643 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4644 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4645 }
4646 
4647 static int
4648 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4649 {
4650 	struct hn_softc *sc = arg1;
4651 	char vf_name[IFNAMSIZ + 1];
4652 	struct ifnet *vf_ifp;
4653 
4654 	HN_LOCK(sc);
4655 	vf_name[0] = '\0';
4656 	vf_ifp = sc->hn_vf_ifp;
4657 	if (vf_ifp != NULL)
4658 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4659 	HN_UNLOCK(sc);
4660 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4661 }
4662 
4663 static int
4664 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4665 {
4666 	struct hn_softc *sc = arg1;
4667 	char vf_name[IFNAMSIZ + 1];
4668 	struct ifnet *vf_ifp;
4669 
4670 	HN_LOCK(sc);
4671 	vf_name[0] = '\0';
4672 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4673 	if (vf_ifp != NULL)
4674 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4675 	HN_UNLOCK(sc);
4676 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4677 }
4678 
4679 static int
4680 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4681 {
4682 	struct rm_priotracker pt;
4683 	struct sbuf *sb;
4684 	int error, i;
4685 	bool first;
4686 
4687 	error = sysctl_wire_old_buffer(req, 0);
4688 	if (error != 0)
4689 		return (error);
4690 
4691 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4692 	if (sb == NULL)
4693 		return (ENOMEM);
4694 
4695 	rm_rlock(&hn_vfmap_lock, &pt);
4696 
4697 	first = true;
4698 	for (i = 0; i < hn_vfmap_size; ++i) {
4699 		struct ifnet *ifp;
4700 
4701 		if (hn_vfmap[i] == NULL)
4702 			continue;
4703 
4704 		ifp = ifnet_byindex(i);
4705 		if (ifp != NULL) {
4706 			if (first)
4707 				sbuf_printf(sb, "%s", ifp->if_xname);
4708 			else
4709 				sbuf_printf(sb, " %s", ifp->if_xname);
4710 			first = false;
4711 		}
4712 	}
4713 
4714 	rm_runlock(&hn_vfmap_lock, &pt);
4715 
4716 	error = sbuf_finish(sb);
4717 	sbuf_delete(sb);
4718 	return (error);
4719 }
4720 
4721 static int
4722 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4723 {
4724 	struct rm_priotracker pt;
4725 	struct sbuf *sb;
4726 	int error, i;
4727 	bool first;
4728 
4729 	error = sysctl_wire_old_buffer(req, 0);
4730 	if (error != 0)
4731 		return (error);
4732 
4733 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4734 	if (sb == NULL)
4735 		return (ENOMEM);
4736 
4737 	rm_rlock(&hn_vfmap_lock, &pt);
4738 
4739 	first = true;
4740 	for (i = 0; i < hn_vfmap_size; ++i) {
4741 		struct ifnet *ifp, *hn_ifp;
4742 
4743 		hn_ifp = hn_vfmap[i];
4744 		if (hn_ifp == NULL)
4745 			continue;
4746 
4747 		ifp = ifnet_byindex(i);
4748 		if (ifp != NULL) {
4749 			if (first) {
4750 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4751 				    hn_ifp->if_xname);
4752 			} else {
4753 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4754 				    hn_ifp->if_xname);
4755 			}
4756 			first = false;
4757 		}
4758 	}
4759 
4760 	rm_runlock(&hn_vfmap_lock, &pt);
4761 
4762 	error = sbuf_finish(sb);
4763 	sbuf_delete(sb);
4764 	return (error);
4765 }
4766 
4767 static int
4768 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4769 {
4770 	struct hn_softc *sc = arg1;
4771 	int error, onoff = 0;
4772 
4773 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4774 		onoff = 1;
4775 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4776 	if (error || req->newptr == NULL)
4777 		return (error);
4778 
4779 	HN_LOCK(sc);
4780 	/* NOTE: hn_vf_lock for hn_transmit() */
4781 	rm_wlock(&sc->hn_vf_lock);
4782 	if (onoff)
4783 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4784 	else
4785 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4786 	rm_wunlock(&sc->hn_vf_lock);
4787 	HN_UNLOCK(sc);
4788 
4789 	return (0);
4790 }
4791 
4792 static int
4793 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4794 {
4795 	struct hn_softc *sc = arg1;
4796 	int enabled = 0;
4797 
4798 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4799 		enabled = 1;
4800 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4801 }
4802 
4803 static int
4804 hn_check_iplen(const struct mbuf *m, int hoff)
4805 {
4806 	const struct ip *ip;
4807 	int len, iphlen, iplen;
4808 	const struct tcphdr *th;
4809 	int thoff;				/* TCP data offset */
4810 
4811 	len = hoff + sizeof(struct ip);
4812 
4813 	/* The packet must be at least the size of an IP header. */
4814 	if (m->m_pkthdr.len < len)
4815 		return IPPROTO_DONE;
4816 
4817 	/* The fixed IP header must reside completely in the first mbuf. */
4818 	if (m->m_len < len)
4819 		return IPPROTO_DONE;
4820 
4821 	ip = mtodo(m, hoff);
4822 
4823 	/* Bound check the packet's stated IP header length. */
4824 	iphlen = ip->ip_hl << 2;
4825 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4826 		return IPPROTO_DONE;
4827 
4828 	/* The full IP header must reside completely in the one mbuf. */
4829 	if (m->m_len < hoff + iphlen)
4830 		return IPPROTO_DONE;
4831 
4832 	iplen = ntohs(ip->ip_len);
4833 
4834 	/*
4835 	 * Check that the amount of data in the buffers is as
4836 	 * at least much as the IP header would have us expect.
4837 	 */
4838 	if (m->m_pkthdr.len < hoff + iplen)
4839 		return IPPROTO_DONE;
4840 
4841 	/*
4842 	 * Ignore IP fragments.
4843 	 */
4844 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4845 		return IPPROTO_DONE;
4846 
4847 	/*
4848 	 * The TCP/IP or UDP/IP header must be entirely contained within
4849 	 * the first fragment of a packet.
4850 	 */
4851 	switch (ip->ip_p) {
4852 	case IPPROTO_TCP:
4853 		if (iplen < iphlen + sizeof(struct tcphdr))
4854 			return IPPROTO_DONE;
4855 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4856 			return IPPROTO_DONE;
4857 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4858 		thoff = th->th_off << 2;
4859 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4860 			return IPPROTO_DONE;
4861 		if (m->m_len < hoff + iphlen + thoff)
4862 			return IPPROTO_DONE;
4863 		break;
4864 	case IPPROTO_UDP:
4865 		if (iplen < iphlen + sizeof(struct udphdr))
4866 			return IPPROTO_DONE;
4867 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4868 			return IPPROTO_DONE;
4869 		break;
4870 	default:
4871 		if (iplen < iphlen)
4872 			return IPPROTO_DONE;
4873 		break;
4874 	}
4875 	return ip->ip_p;
4876 }
4877 
4878 static void
4879 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4880 {
4881 	const struct ether_header *eh;
4882 	uint16_t etype;
4883 	int hoff;
4884 
4885 	hoff = sizeof(*eh);
4886 	/* Checked at the beginning of this function. */
4887 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4888 
4889 	eh = mtod(m_new, const struct ether_header *);
4890 	etype = ntohs(eh->ether_type);
4891 	if (etype == ETHERTYPE_VLAN) {
4892 		const struct ether_vlan_header *evl;
4893 
4894 		hoff = sizeof(*evl);
4895 		if (m_new->m_len < hoff)
4896 			return;
4897 		evl = mtod(m_new, const struct ether_vlan_header *);
4898 		etype = ntohs(evl->evl_proto);
4899 	}
4900 	*l3proto = etype;
4901 
4902 	if (etype == ETHERTYPE_IP)
4903 		*l4proto = hn_check_iplen(m_new, hoff);
4904 	else
4905 		*l4proto = IPPROTO_DONE;
4906 }
4907 
4908 static int
4909 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4910 {
4911 	struct sysctl_oid_list *child;
4912 	struct sysctl_ctx_list *ctx;
4913 	device_t dev = sc->hn_dev;
4914 #if defined(INET) || defined(INET6)
4915 #if __FreeBSD_version >= 1100095
4916 	int lroent_cnt;
4917 #endif
4918 #endif
4919 	int i;
4920 
4921 	/*
4922 	 * Create RXBUF for reception.
4923 	 *
4924 	 * NOTE:
4925 	 * - It is shared by all channels.
4926 	 * - A large enough buffer is allocated, certain version of NVSes
4927 	 *   may further limit the usable space.
4928 	 */
4929 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4930 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4931 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4932 	if (sc->hn_rxbuf == NULL) {
4933 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4934 		return (ENOMEM);
4935 	}
4936 
4937 	sc->hn_rx_ring_cnt = ring_cnt;
4938 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4939 
4940 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4941 	    M_DEVBUF, M_WAITOK | M_ZERO);
4942 
4943 #if defined(INET) || defined(INET6)
4944 #if __FreeBSD_version >= 1100095
4945 	lroent_cnt = hn_lro_entry_count;
4946 	if (lroent_cnt < TCP_LRO_ENTRIES)
4947 		lroent_cnt = TCP_LRO_ENTRIES;
4948 	if (bootverbose)
4949 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4950 #endif
4951 #endif	/* INET || INET6 */
4952 
4953 	ctx = device_get_sysctl_ctx(dev);
4954 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4955 
4956 	/* Create dev.hn.UNIT.rx sysctl tree */
4957 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4958 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4959 
4960 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4961 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4962 
4963 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4964 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4965 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4966 		if (rxr->hn_br == NULL) {
4967 			device_printf(dev, "allocate bufring failed\n");
4968 			return (ENOMEM);
4969 		}
4970 
4971 		if (hn_trust_hosttcp)
4972 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4973 		if (hn_trust_hostudp)
4974 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4975 		if (hn_trust_hostip)
4976 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4977 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4978 		rxr->hn_ifp = sc->hn_ifp;
4979 		if (i < sc->hn_tx_ring_cnt)
4980 			rxr->hn_txr = &sc->hn_tx_ring[i];
4981 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4982 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4983 		rxr->hn_rx_idx = i;
4984 		rxr->hn_rxbuf = sc->hn_rxbuf;
4985 
4986 		/*
4987 		 * Initialize LRO.
4988 		 */
4989 #if defined(INET) || defined(INET6)
4990 #if __FreeBSD_version >= 1100095
4991 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4992 		    hn_lro_mbufq_depth);
4993 #else
4994 		tcp_lro_init(&rxr->hn_lro);
4995 		rxr->hn_lro.ifp = sc->hn_ifp;
4996 #endif
4997 #if __FreeBSD_version >= 1100099
4998 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4999 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5000 #endif
5001 #endif	/* INET || INET6 */
5002 
5003 		if (sc->hn_rx_sysctl_tree != NULL) {
5004 			char name[16];
5005 
5006 			/*
5007 			 * Create per RX ring sysctl tree:
5008 			 * dev.hn.UNIT.rx.RINGID
5009 			 */
5010 			snprintf(name, sizeof(name), "%d", i);
5011 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5012 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5013 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5014 
5015 			if (rxr->hn_rx_sysctl_tree != NULL) {
5016 				SYSCTL_ADD_ULONG(ctx,
5017 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5018 				    OID_AUTO, "packets", CTLFLAG_RW,
5019 				    &rxr->hn_pkts, "# of packets received");
5020 				SYSCTL_ADD_ULONG(ctx,
5021 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5022 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5023 				    &rxr->hn_rss_pkts,
5024 				    "# of packets w/ RSS info received");
5025 				SYSCTL_ADD_INT(ctx,
5026 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5027 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5028 				    &rxr->hn_pktbuf_len, 0,
5029 				    "Temporary channel packet buffer length");
5030 			}
5031 		}
5032 	}
5033 
5034 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5035 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5036 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5037 #if __FreeBSD_version < 1100095
5038 	    hn_rx_stat_int_sysctl,
5039 #else
5040 	    hn_rx_stat_u64_sysctl,
5041 #endif
5042 	    "LU", "LRO queued");
5043 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5044 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5045 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5046 #if __FreeBSD_version < 1100095
5047 	    hn_rx_stat_int_sysctl,
5048 #else
5049 	    hn_rx_stat_u64_sysctl,
5050 #endif
5051 	    "LU", "LRO flushed");
5052 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5053 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5054 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5055 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5056 #if __FreeBSD_version >= 1100099
5057 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5058 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5059 	    hn_lro_lenlim_sysctl, "IU",
5060 	    "Max # of data bytes to be aggregated by LRO");
5061 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5062 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5063 	    hn_lro_ackcnt_sysctl, "I",
5064 	    "Max # of ACKs to be aggregated by LRO");
5065 #endif
5066 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5067 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5068 	    hn_trust_hcsum_sysctl, "I",
5069 	    "Trust tcp segement verification on host side, "
5070 	    "when csum info is missing");
5071 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5072 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5073 	    hn_trust_hcsum_sysctl, "I",
5074 	    "Trust udp datagram verification on host side, "
5075 	    "when csum info is missing");
5076 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5077 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5078 	    hn_trust_hcsum_sysctl, "I",
5079 	    "Trust ip packet verification on host side, "
5080 	    "when csum info is missing");
5081 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5082 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5083 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5084 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5085 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5086 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5087 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5088 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5089 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5090 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5091 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5092 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5093 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5094 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5096 	    hn_rx_stat_ulong_sysctl, "LU",
5097 	    "# of packets that we trust host's csum verification");
5098 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5099 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5100 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5101 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5103 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5104 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5105 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5106 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5107 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5108 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5109 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5110 
5111 	return (0);
5112 }
5113 
5114 static void
5115 hn_destroy_rx_data(struct hn_softc *sc)
5116 {
5117 	int i;
5118 
5119 	if (sc->hn_rxbuf != NULL) {
5120 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5121 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5122 		else
5123 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5124 		sc->hn_rxbuf = NULL;
5125 	}
5126 
5127 	if (sc->hn_rx_ring_cnt == 0)
5128 		return;
5129 
5130 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5131 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5132 
5133 		if (rxr->hn_br == NULL)
5134 			continue;
5135 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5136 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5137 		} else {
5138 			device_printf(sc->hn_dev,
5139 			    "%dth channel bufring is referenced", i);
5140 		}
5141 		rxr->hn_br = NULL;
5142 
5143 #if defined(INET) || defined(INET6)
5144 		tcp_lro_free(&rxr->hn_lro);
5145 #endif
5146 		free(rxr->hn_pktbuf, M_DEVBUF);
5147 	}
5148 	free(sc->hn_rx_ring, M_DEVBUF);
5149 	sc->hn_rx_ring = NULL;
5150 
5151 	sc->hn_rx_ring_cnt = 0;
5152 	sc->hn_rx_ring_inuse = 0;
5153 }
5154 
/*
 * Set up the TX ring 'id' of the softc.
 *
 * This initializes the ring locks, allocates the TX descriptor array and
 * the descriptor free list (or buf_ring), selects the TX taskqueue and
 * the txeof/task handlers, creates the RNDIS and data DMA tags, prepares
 * every TX descriptor (RNDIS packet message memory + data DMA map) and,
 * if the "tx" sysctl tree exists, creates the per-ring sysctl nodes.
 *
 * Returns 0 on success, or the error reported by the failing busdma
 * call.  On failure only the RNDIS memory of the txdesc being set up
 * is undone here; everything created earlier is left in place.
 * NOTE(review): presumably the caller tears the ring down through
 * hn_tx_ring_destroy() on error -- confirm against the caller.
 */
static int
hn_tx_ring_create(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);
	/* Free txdescs live either on an SLIST or in a buf_ring. */
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	/*
	 * Pick the taskqueue that runs this ring's TX tasks: either the
	 * vmbus event taskqueue or one of the driver's own TX taskqueues.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
	} else {
		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
	}

#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else
#endif
	{
		int br_depth;

		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		/* Only the if_transmit path needs the per-ring mbuf buf_ring. */
		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;

	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}

	/* Prepare every TX descriptor and put it onto the free list. */
	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		STAILQ_INIT(&txd->agg_list);

		/*
		 * Allocate and load RNDIS packet message.
		 */
        	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		/* The load callback stores the bus address in rndis_pkt_paddr. */
		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			/* Undo the RNDIS message set up for this txdesc. */
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

	/* The per-ring sysctl nodes are optional; skip them w/o a parent. */
	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

#ifdef HN_DEBUG
			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
#endif
			/* "oactive" is only exported for the if_transmit path. */
#ifdef HN_IFSTART_SUPPORT
			if (!hn_use_if_start)
#endif
			{
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW, &txr->hn_pkts,
			    "# of packets transmitted");
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
		}
	}

	return 0;
}
5353 
5354 static void
5355 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5356 {
5357 	struct hn_tx_ring *txr = txd->txr;
5358 
5359 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5360 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5361 
5362 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5363 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5364 	    txd->rndis_pkt_dmap);
5365 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5366 }
5367 
5368 static void
5369 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5370 {
5371 
5372 	KASSERT(txd->refs == 0 || txd->refs == 1,
5373 	    ("invalid txd refs %d", txd->refs));
5374 
5375 	/* Aggregated txds will be freed by their aggregating txd. */
5376 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5377 		int freed;
5378 
5379 		freed = hn_txdesc_put(txr, txd);
5380 		KASSERT(freed, ("can't free txdesc"));
5381 	}
5382 }
5383 
5384 static void
5385 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5386 {
5387 	int i;
5388 
5389 	if (txr->hn_txdesc == NULL)
5390 		return;
5391 
5392 	/*
5393 	 * NOTE:
5394 	 * Because the freeing of aggregated txds will be deferred
5395 	 * to the aggregating txd, two passes are used here:
5396 	 * - The first pass GCes any pending txds.  This GC is necessary,
5397 	 *   since if the channels are revoked, hypervisor will not
5398 	 *   deliver send-done for all pending txds.
5399 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5400 	 *   were freed.
5401 	 */
5402 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5403 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5404 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5405 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5406 
5407 	if (txr->hn_tx_data_dtag != NULL)
5408 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5409 	if (txr->hn_tx_rndis_dtag != NULL)
5410 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5411 
5412 #ifdef HN_USE_TXDESC_BUFRING
5413 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5414 #endif
5415 
5416 	free(txr->hn_txdesc, M_DEVBUF);
5417 	txr->hn_txdesc = NULL;
5418 
5419 	if (txr->hn_mbuf_br != NULL)
5420 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5421 
5422 #ifndef HN_USE_TXDESC_BUFRING
5423 	mtx_destroy(&txr->hn_txlist_spin);
5424 #endif
5425 	mtx_destroy(&txr->hn_tx_lock);
5426 }
5427 
/*
 * Create the TX side data of the softc for 'ring_cnt' TX rings.
 *
 * This allocates the chimney sending buffer, allocates the TX ring
 * array, creates the dev.hn.UNIT.tx sysctl tree, creates every TX ring
 * through hn_tx_ring_create() and finally registers the device level
 * TX sysctl nodes.
 *
 * Returns 0 on success, ENOMEM if the chimney sending buffer can not
 * be allocated, or the error returned by hn_tx_ring_create().
 * NOTE(review): nothing allocated here is released on the error paths;
 * presumably the caller cleans up through hn_destroy_tx_data() --
 * confirm against the caller.
 */
static int
hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	int i;

	/*
	 * Create TXBUF for chimney sending.
	 *
	 * NOTE: It is shared by all channels.
	 */
	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_chim == NULL) {
		device_printf(sc->hn_dev, "allocate txbuf failed\n");
		return (ENOMEM);
	}

	/* Initially all of the created TX rings are marked as in use. */
	sc->hn_tx_ring_cnt = ring_cnt;
	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;

	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);

	ctx = device_get_sysctl_ctx(sc->hn_dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

	/* Create dev.hn.UNIT.tx sysctl tree */
	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		int error;

		error = hn_tx_ring_create(sc, i);
		if (error)
			return error;
	}

	/*
	 * Device level TX sysctl nodes.
	 *
	 * The handlers below are passed the offset of a struct hn_tx_ring
	 * field as arg2.
	 * NOTE(review): presumably each handler applies that offset to
	 * all TX rings -- confirm against the handlers.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_send_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_flush_failed),
	    hn_tx_stat_ulong_sysctl, "LU",
	    "# of packet transmission aggregation flush failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
	/* The value of the first TX ring is exported for txdesc_cnt. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
	    "# of total TX descs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
	    "Chimney send packet size upper boundary");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
	    hn_tx_conf_int_sysctl, "I",
	    "Size of the packet for direct transmission");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_sched_tx),
	    hn_tx_conf_int_sysctl, "I",
	    "Always schedule transmission "
	    "instead of doing direct transmission");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
	/* The value of the first TX ring is exported for agg_szmax. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
	    "Applied packet transmission aggregation size");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pktmax_sysctl, "I",
	    "Applied packet transmission aggregation packets");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_align_sysctl, "I",
	    "Applied packet transmission aggregation alignment");

	return 0;
}
5536 
5537 static void
5538 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5539 {
5540 	int i;
5541 
5542 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5543 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5544 }
5545 
5546 static void
5547 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5548 {
5549 	struct ifnet *ifp = sc->hn_ifp;
5550 	u_int hw_tsomax;
5551 	int tso_minlen;
5552 
5553 	HN_LOCK_ASSERT(sc);
5554 
5555 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5556 		return;
5557 
5558 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5559 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5560 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5561 
5562 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5563 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5564 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5565 
5566 	if (tso_maxlen < tso_minlen)
5567 		tso_maxlen = tso_minlen;
5568 	else if (tso_maxlen > IP_MAXPACKET)
5569 		tso_maxlen = IP_MAXPACKET;
5570 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5571 		tso_maxlen = sc->hn_ndis_tso_szmax;
5572 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5573 
5574 	if (hn_xpnt_vf_isready(sc)) {
5575 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5576 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5577 	}
5578 	ifp->if_hw_tsomax = hw_tsomax;
5579 	if (bootverbose)
5580 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5581 }
5582 
5583 static void
5584 hn_fixup_tx_data(struct hn_softc *sc)
5585 {
5586 	uint64_t csum_assist;
5587 	int i;
5588 
5589 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5590 	if (hn_tx_chimney_size > 0 &&
5591 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5592 		hn_set_chim_size(sc, hn_tx_chimney_size);
5593 
5594 	csum_assist = 0;
5595 	if (sc->hn_caps & HN_CAP_IPCS)
5596 		csum_assist |= CSUM_IP;
5597 	if (sc->hn_caps & HN_CAP_TCP4CS)
5598 		csum_assist |= CSUM_IP_TCP;
5599 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5600 		csum_assist |= CSUM_IP_UDP;
5601 	if (sc->hn_caps & HN_CAP_TCP6CS)
5602 		csum_assist |= CSUM_IP6_TCP;
5603 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5604 		csum_assist |= CSUM_IP6_UDP;
5605 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5606 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5607 
5608 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5609 		/*
5610 		 * Support HASHVAL pktinfo on TX path.
5611 		 */
5612 		if (bootverbose)
5613 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5614 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5615 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5616 	}
5617 }
5618 
5619 static void
5620 hn_fixup_rx_data(struct hn_softc *sc)
5621 {
5622 
5623 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5624 		int i;
5625 
5626 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5627 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5628 	}
5629 }
5630 
5631 static void
5632 hn_destroy_tx_data(struct hn_softc *sc)
5633 {
5634 	int i;
5635 
5636 	if (sc->hn_chim != NULL) {
5637 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5638 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5639 		} else {
5640 			device_printf(sc->hn_dev,
5641 			    "chimney sending buffer is referenced");
5642 		}
5643 		sc->hn_chim = NULL;
5644 	}
5645 
5646 	if (sc->hn_tx_ring_cnt == 0)
5647 		return;
5648 
5649 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5650 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5651 
5652 	free(sc->hn_tx_ring, M_DEVBUF);
5653 	sc->hn_tx_ring = NULL;
5654 
5655 	sc->hn_tx_ring_cnt = 0;
5656 	sc->hn_tx_ring_inuse = 0;
5657 }
5658 
5659 #ifdef HN_IFSTART_SUPPORT
5660 
5661 static void
5662 hn_start_taskfunc(void *xtxr, int pending __unused)
5663 {
5664 	struct hn_tx_ring *txr = xtxr;
5665 
5666 	mtx_lock(&txr->hn_tx_lock);
5667 	hn_start_locked(txr, 0);
5668 	mtx_unlock(&txr->hn_tx_lock);
5669 }
5670 
/*
 * Drain the interface send queue (if_snd) onto the first TX ring.
 *
 * The TX lock of the ring must be held, and there must be no lingering
 * aggregating txdesc.
 *
 * If 'len' is greater than 0, a packet longer than 'len' is not sent
 * from here; it is put back to the send queue and 1 is returned, so
 * that the caller can dispatch the transmission to the TX taskqueue.
 * In all of the other cases 0 is returned.
 *
 * IFF_DRV_OACTIVE is set if the ring runs out of TX descriptors or
 * the sending fails.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Only proceed when the interface is running and not OACTIVE. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		/* A NULL return means the packet is gone; count and move on. */
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
			m_head = hn_set_hlen(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/* Out of TX descriptors; retry this packet later. */
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		/* No more packets can be aggregated; send now. */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5781 
5782 static void
5783 hn_start(struct ifnet *ifp)
5784 {
5785 	struct hn_softc *sc = ifp->if_softc;
5786 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5787 
5788 	if (txr->hn_sched_tx)
5789 		goto do_sched;
5790 
5791 	if (mtx_trylock(&txr->hn_tx_lock)) {
5792 		int sched;
5793 
5794 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5795 		mtx_unlock(&txr->hn_tx_lock);
5796 		if (!sched)
5797 			return;
5798 	}
5799 do_sched:
5800 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5801 }
5802 
5803 static void
5804 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5805 {
5806 	struct hn_tx_ring *txr = xtxr;
5807 
5808 	mtx_lock(&txr->hn_tx_lock);
5809 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5810 	hn_start_locked(txr, 0);
5811 	mtx_unlock(&txr->hn_tx_lock);
5812 }
5813 
5814 static void
5815 hn_start_txeof(struct hn_tx_ring *txr)
5816 {
5817 	struct hn_softc *sc = txr->hn_sc;
5818 	struct ifnet *ifp = sc->hn_ifp;
5819 
5820 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5821 
5822 	if (txr->hn_sched_tx)
5823 		goto do_sched;
5824 
5825 	if (mtx_trylock(&txr->hn_tx_lock)) {
5826 		int sched;
5827 
5828 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5829 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5830 		mtx_unlock(&txr->hn_tx_lock);
5831 		if (sched) {
5832 			taskqueue_enqueue(txr->hn_tx_taskq,
5833 			    &txr->hn_tx_task);
5834 		}
5835 	} else {
5836 do_sched:
5837 		/*
5838 		 * Release the OACTIVE earlier, with the hope, that
5839 		 * others could catch up.  The task will clear the
5840 		 * flag again with the hn_tx_lock to avoid possible
5841 		 * races.
5842 		 */
5843 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5844 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5845 	}
5846 }
5847 
5848 #endif	/* HN_IFSTART_SUPPORT */
5849 
/*
 * Drain the mbuf buf_ring of the TX ring onto the channel.
 *
 * The TX lock of the ring must be held, and there must be no lingering
 * aggregating txdesc.
 *
 * If 'len' is greater than 0, a packet longer than 'len' is not sent
 * from here; it is left on the buf_ring and 1 is returned, so that the
 * caller can dispatch the transmission to the TX taskqueue.  In all of
 * the other cases 0 is returned.
 *
 * The hn_oactive of the ring is set, if the ring runs out of TX
 * descriptors or the sending fails.
 */
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	/*
	 * The head of the buf_ring is only peeked here; it is consumed
	 * by drbr_advance(), or handed back by drbr_putback().
	 */
	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/* Out of TX descriptors; retry this packet later. */
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		/* No more packets can be aggregated; send now. */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
5942 
/*
 * if_transmit method.
 *
 * If the transparent VF is enabled (HN_XVFFLAG_ENABLED), the packet is
 * handed to the VF's if_transmit, the BPF listeners of this interface
 * are tapped, and the output statistics of this interface are updated
 * according to the result of the VF transmission.
 *
 * Otherwise the packet headers are fixed up for TSO/checksum
 * offloading, a TX ring is selected based on the flowid, and the
 * packet is enqueued onto the mbuf buf_ring of that ring.  The ring is
 * then either drained directly or through the TX task.
 *
 * Returns 0 on success, the error of the VF's if_transmit, EIO if the
 * packet fixup fails, or the error of drbr_enqueue().
 */
static int
hn_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct hn_softc *sc = ifp->if_softc;
	struct hn_tx_ring *txr;
	int error, idx = 0;

	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
		struct rm_priotracker pt;

		rm_rlock(&sc->hn_vf_lock, &pt);
		/* The flag is checked again, now that the VF lock is held. */
		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
			struct mbuf *m_bpf = NULL;
			int obytes, omcast;

			/* Save the stats now; m is not touched after if_transmit. */
			obytes = m->m_pkthdr.len;
			omcast = (m->m_flags & M_MCAST) != 0;

			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
				if (bpf_peers_present(ifp->if_bpf)) {
					m_bpf = m_copypacket(m, M_NOWAIT);
					if (m_bpf == NULL) {
						/*
						 * Failed to grab a shallow
						 * copy; tap now.
						 */
						ETHER_BPF_MTAP(ifp, m);
					}
				}
			} else {
				ETHER_BPF_MTAP(ifp, m);
			}

			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
			rm_runlock(&sc->hn_vf_lock, &pt);

			/* Tap the copy only if the packet was really sent. */
			if (m_bpf != NULL) {
				if (!error)
					ETHER_BPF_MTAP(ifp, m_bpf);
				m_freem(m_bpf);
			}

			if (error == ENOBUFS) {
				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
			} else if (error) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			} else {
				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
				if (omcast) {
					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
					    omcast);
				}
			}
			return (error);
		}
		rm_runlock(&sc->hn_vf_lock, &pt);
	}

#if defined(INET6) || defined(INET)
	/*
	 * Perform TSO packet header fixup or get l2/l3 header length now,
	 * since packet headers should be cache-hot.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
		m = hn_tso_fixup(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	} else if (m->m_pkthdr.csum_flags &
	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
		m = hn_set_hlen(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	}
#endif

	/*
	 * Select the TX ring based on flowid
	 */
	/* Packets w/o a hash type stay on the first TX ring (idx 0). */
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
#ifdef RSS
		uint32_t bid;

		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
		    &bid) == 0)
			idx = bid % sc->hn_tx_ring_inuse;
		else
#endif
		{
#if defined(INET6) || defined(INET)
			int tcpsyn = 0;

			/*
			 * Only short, non-TSO TCP packets are inspected
			 * by hn_check_tcpsyn().
			 */
			if (m->m_pkthdr.len < 128 &&
			    (m->m_pkthdr.csum_flags &
			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
				m = hn_check_tcpsyn(m, &tcpsyn);
				if (__predict_false(m == NULL)) {
					if_inc_counter(ifp,
					    IFCOUNTER_OERRORS, 1);
					return (EIO);
				}
			}
#else
			const int tcpsyn = 0;
#endif
			/* Packets flagged by tcpsyn go to the first TX ring. */
			if (tcpsyn)
				idx = 0;
			else
				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
		}
	}
	txr = &sc->hn_tx_ring[idx];

	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
	if (error) {
		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
		return error;
	}

	/* The ring is busy; the packet just stays queued on the ring. */
	if (txr->hn_oactive)
		return 0;

	if (txr->hn_sched_tx)
		goto do_sched;

	/* Try direct transmission, if the TX lock is not contended. */
	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return 0;
	}
do_sched:
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
	return 0;
}
6085 
6086 static void
6087 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6088 {
6089 	struct mbuf *m;
6090 
6091 	mtx_lock(&txr->hn_tx_lock);
6092 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6093 		m_freem(m);
6094 	mtx_unlock(&txr->hn_tx_lock);
6095 }
6096 
6097 static void
6098 hn_xmit_qflush(struct ifnet *ifp)
6099 {
6100 	struct hn_softc *sc = ifp->if_softc;
6101 	struct rm_priotracker pt;
6102 	int i;
6103 
6104 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6105 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6106 	if_qflush(ifp);
6107 
6108 	rm_rlock(&sc->hn_vf_lock, &pt);
6109 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6110 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6111 	rm_runlock(&sc->hn_vf_lock, &pt);
6112 }
6113 
6114 static void
6115 hn_xmit_txeof(struct hn_tx_ring *txr)
6116 {
6117 
6118 	if (txr->hn_sched_tx)
6119 		goto do_sched;
6120 
6121 	if (mtx_trylock(&txr->hn_tx_lock)) {
6122 		int sched;
6123 
6124 		txr->hn_oactive = 0;
6125 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6126 		mtx_unlock(&txr->hn_tx_lock);
6127 		if (sched) {
6128 			taskqueue_enqueue(txr->hn_tx_taskq,
6129 			    &txr->hn_tx_task);
6130 		}
6131 	} else {
6132 do_sched:
6133 		/*
6134 		 * Release the oactive earlier, with the hope, that
6135 		 * others could catch up.  The task will clear the
6136 		 * oactive again with the hn_tx_lock to avoid possible
6137 		 * races.
6138 		 */
6139 		txr->hn_oactive = 0;
6140 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6141 	}
6142 }
6143 
6144 static void
6145 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6146 {
6147 	struct hn_tx_ring *txr = xtxr;
6148 
6149 	mtx_lock(&txr->hn_tx_lock);
6150 	hn_xmit(txr, 0);
6151 	mtx_unlock(&txr->hn_tx_lock);
6152 }
6153 
6154 static void
6155 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6156 {
6157 	struct hn_tx_ring *txr = xtxr;
6158 
6159 	mtx_lock(&txr->hn_tx_lock);
6160 	txr->hn_oactive = 0;
6161 	hn_xmit(txr, 0);
6162 	mtx_unlock(&txr->hn_tx_lock);
6163 }
6164 
6165 static int
6166 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6167 {
6168 	struct vmbus_chan_br cbr;
6169 	struct hn_rx_ring *rxr;
6170 	struct hn_tx_ring *txr = NULL;
6171 	int idx, error;
6172 
6173 	idx = vmbus_chan_subidx(chan);
6174 
6175 	/*
6176 	 * Link this channel to RX/TX ring.
6177 	 */
6178 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6179 	    ("invalid channel index %d, should > 0 && < %d",
6180 	     idx, sc->hn_rx_ring_inuse));
6181 	rxr = &sc->hn_rx_ring[idx];
6182 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6183 	    ("RX ring %d already attached", idx));
6184 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6185 	rxr->hn_chan = chan;
6186 
6187 	if (bootverbose) {
6188 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6189 		    idx, vmbus_chan_id(chan));
6190 	}
6191 
6192 	if (idx < sc->hn_tx_ring_inuse) {
6193 		txr = &sc->hn_tx_ring[idx];
6194 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6195 		    ("TX ring %d already attached", idx));
6196 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6197 
6198 		txr->hn_chan = chan;
6199 		if (bootverbose) {
6200 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6201 			    idx, vmbus_chan_id(chan));
6202 		}
6203 	}
6204 
6205 	/* Bind this channel to a proper CPU. */
6206 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6207 
6208 	/*
6209 	 * Open this channel
6210 	 */
6211 	cbr.cbr = rxr->hn_br;
6212 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6213 	cbr.cbr_txsz = HN_TXBR_SIZE;
6214 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6215 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6216 	if (error) {
6217 		if (error == EISCONN) {
6218 			if_printf(sc->hn_ifp, "bufring is connected after "
6219 			    "chan%u open failure\n", vmbus_chan_id(chan));
6220 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6221 		} else {
6222 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6223 			    vmbus_chan_id(chan), error);
6224 		}
6225 	}
6226 	return (error);
6227 }
6228 
6229 static void
6230 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6231 {
6232 	struct hn_rx_ring *rxr;
6233 	int idx, error;
6234 
6235 	idx = vmbus_chan_subidx(chan);
6236 
6237 	/*
6238 	 * Link this channel to RX/TX ring.
6239 	 */
6240 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6241 	    ("invalid channel index %d, should > 0 && < %d",
6242 	     idx, sc->hn_rx_ring_inuse));
6243 	rxr = &sc->hn_rx_ring[idx];
6244 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6245 	    ("RX ring %d is not attached", idx));
6246 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6247 
6248 	if (idx < sc->hn_tx_ring_inuse) {
6249 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6250 
6251 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6252 		    ("TX ring %d is not attached attached", idx));
6253 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6254 	}
6255 
6256 	/*
6257 	 * Close this channel.
6258 	 *
6259 	 * NOTE:
6260 	 * Channel closing does _not_ destroy the target channel.
6261 	 */
6262 	error = vmbus_chan_close_direct(chan);
6263 	if (error == EISCONN) {
6264 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6265 		    "after being closed\n", vmbus_chan_id(chan));
6266 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6267 	} else if (error) {
6268 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6269 		    vmbus_chan_id(chan), error);
6270 	}
6271 }
6272 
6273 static int
6274 hn_attach_subchans(struct hn_softc *sc)
6275 {
6276 	struct vmbus_channel **subchans;
6277 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6278 	int i, error = 0;
6279 
6280 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6281 
6282 	/* Attach the sub-channels. */
6283 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6284 	for (i = 0; i < subchan_cnt; ++i) {
6285 		int error1;
6286 
6287 		error1 = hn_chan_attach(sc, subchans[i]);
6288 		if (error1) {
6289 			error = error1;
6290 			/* Move on; all channels will be detached later. */
6291 		}
6292 	}
6293 	vmbus_subchan_rel(subchans, subchan_cnt);
6294 
6295 	if (error) {
6296 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6297 	} else {
6298 		if (bootverbose) {
6299 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6300 			    subchan_cnt);
6301 		}
6302 	}
6303 	return (error);
6304 }
6305 
6306 static void
6307 hn_detach_allchans(struct hn_softc *sc)
6308 {
6309 	struct vmbus_channel **subchans;
6310 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6311 	int i;
6312 
6313 	if (subchan_cnt == 0)
6314 		goto back;
6315 
6316 	/* Detach the sub-channels. */
6317 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6318 	for (i = 0; i < subchan_cnt; ++i)
6319 		hn_chan_detach(sc, subchans[i]);
6320 	vmbus_subchan_rel(subchans, subchan_cnt);
6321 
6322 back:
6323 	/*
6324 	 * Detach the primary channel, _after_ all sub-channels
6325 	 * are detached.
6326 	 */
6327 	hn_chan_detach(sc, sc->hn_prichan);
6328 
6329 	/* Wait for sub-channels to be destroyed, if any. */
6330 	vmbus_subchan_drain(sc->hn_prichan);
6331 
6332 #ifdef INVARIANTS
6333 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6334 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6335 		    HN_RX_FLAG_ATTACHED) == 0,
6336 		    ("%dth RX ring is still attached", i));
6337 	}
6338 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6339 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6340 		    HN_TX_FLAG_ATTACHED) == 0,
6341 		    ("%dth TX ring is still attached", i));
6342 	}
6343 #endif
6344 }
6345 
6346 static int
6347 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6348 {
6349 	struct vmbus_channel **subchans;
6350 	int nchan, rxr_cnt, error;
6351 
6352 	nchan = *nsubch + 1;
6353 	if (nchan == 1) {
6354 		/*
6355 		 * Multiple RX/TX rings are not requested.
6356 		 */
6357 		*nsubch = 0;
6358 		return (0);
6359 	}
6360 
6361 	/*
6362 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6363 	 * table entries.
6364 	 */
6365 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6366 	if (error) {
6367 		/* No RSS; this is benign. */
6368 		*nsubch = 0;
6369 		return (0);
6370 	}
6371 	if (bootverbose) {
6372 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6373 		    rxr_cnt, nchan);
6374 	}
6375 
6376 	if (nchan > rxr_cnt)
6377 		nchan = rxr_cnt;
6378 	if (nchan == 1) {
6379 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6380 		*nsubch = 0;
6381 		return (0);
6382 	}
6383 
6384 	/*
6385 	 * Allocate sub-channels from NVS.
6386 	 */
6387 	*nsubch = nchan - 1;
6388 	error = hn_nvs_alloc_subchans(sc, nsubch);
6389 	if (error || *nsubch == 0) {
6390 		/* Failed to allocate sub-channels. */
6391 		*nsubch = 0;
6392 		return (0);
6393 	}
6394 
6395 	/*
6396 	 * Wait for all sub-channels to become ready before moving on.
6397 	 */
6398 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6399 	vmbus_subchan_rel(subchans, *nsubch);
6400 	return (0);
6401 }
6402 
6403 static bool
6404 hn_synth_attachable(const struct hn_softc *sc)
6405 {
6406 	int i;
6407 
6408 	if (sc->hn_flags & HN_FLAG_ERRORS)
6409 		return (false);
6410 
6411 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6412 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6413 
6414 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6415 			return (false);
6416 	}
6417 	return (true);
6418 }
6419 
/*
 * Force the RX filter to zero once RNDIS has been initialized.
 *
 * NOTE:
 * On some versions of Hyper-V, under certain conditions, the
 * hypervisor-side RNDIS rxfilter is _not_ zero after a successful
 * RNDIS initialization.  That violates the RNDIS API contract and
 * the assumptions of the code that follows.  So: clear the rxfilter
 * explicitly, then drain whatever packets slipped through together
 * with the interrupt taskqueues they scheduled, on 'nchan' channels.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the filter first, then flush what already got in. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
6440 
6441 static int
6442 hn_synth_attach(struct hn_softc *sc, int mtu)
6443 {
6444 #define ATTACHED_NVS		0x0002
6445 #define ATTACHED_RNDIS		0x0004
6446 
6447 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6448 	int error, nsubch, nchan = 1, i, rndis_inited;
6449 	uint32_t old_caps, attached = 0;
6450 
6451 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6452 	    ("synthetic parts were attached"));
6453 
6454 	if (!hn_synth_attachable(sc))
6455 		return (ENXIO);
6456 
6457 	/* Save capabilities for later verification. */
6458 	old_caps = sc->hn_caps;
6459 	sc->hn_caps = 0;
6460 
6461 	/* Clear RSS stuffs. */
6462 	sc->hn_rss_ind_size = 0;
6463 	sc->hn_rss_hash = 0;
6464 	sc->hn_rss_hcap = 0;
6465 
6466 	/*
6467 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6468 	 */
6469 	error = hn_chan_attach(sc, sc->hn_prichan);
6470 	if (error)
6471 		goto failed;
6472 
6473 	/*
6474 	 * Attach NVS.
6475 	 */
6476 	error = hn_nvs_attach(sc, mtu);
6477 	if (error)
6478 		goto failed;
6479 	attached |= ATTACHED_NVS;
6480 
6481 	/*
6482 	 * Attach RNDIS _after_ NVS is attached.
6483 	 */
6484 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6485 	if (rndis_inited)
6486 		attached |= ATTACHED_RNDIS;
6487 	if (error)
6488 		goto failed;
6489 
6490 	/*
6491 	 * Make sure capabilities are not changed.
6492 	 */
6493 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6494 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6495 		    old_caps, sc->hn_caps);
6496 		error = ENXIO;
6497 		goto failed;
6498 	}
6499 
6500 	/*
6501 	 * Allocate sub-channels for multi-TX/RX rings.
6502 	 *
6503 	 * NOTE:
6504 	 * The # of RX rings that can be used is equivalent to the # of
6505 	 * channels to be requested.
6506 	 */
6507 	nsubch = sc->hn_rx_ring_cnt - 1;
6508 	error = hn_synth_alloc_subchans(sc, &nsubch);
6509 	if (error)
6510 		goto failed;
6511 	/* NOTE: _Full_ synthetic parts detach is required now. */
6512 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6513 
6514 	/*
6515 	 * Set the # of TX/RX rings that could be used according to
6516 	 * the # of channels that NVS offered.
6517 	 */
6518 	nchan = nsubch + 1;
6519 	hn_set_ring_inuse(sc, nchan);
6520 	if (nchan == 1) {
6521 		/* Only the primary channel can be used; done */
6522 		goto back;
6523 	}
6524 
6525 	/*
6526 	 * Attach the sub-channels.
6527 	 *
6528 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6529 	 */
6530 	error = hn_attach_subchans(sc);
6531 	if (error)
6532 		goto failed;
6533 
6534 	/*
6535 	 * Configure RSS key and indirect table _after_ all sub-channels
6536 	 * are attached.
6537 	 */
6538 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6539 		/*
6540 		 * RSS key is not set yet; set it to the default RSS key.
6541 		 */
6542 		if (bootverbose)
6543 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6544 #ifdef RSS
6545 		rss_getkey(rss->rss_key);
6546 #else
6547 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6548 #endif
6549 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6550 	}
6551 
6552 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6553 		/*
6554 		 * RSS indirect table is not set yet; set it up in round-
6555 		 * robin fashion.
6556 		 */
6557 		if (bootverbose) {
6558 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6559 			    "table\n");
6560 		}
6561 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6562 			uint32_t subidx;
6563 
6564 #ifdef RSS
6565 			subidx = rss_get_indirection_to_bucket(i);
6566 #else
6567 			subidx = i;
6568 #endif
6569 			rss->rss_ind[i] = subidx % nchan;
6570 		}
6571 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6572 	} else {
6573 		/*
6574 		 * # of usable channels may be changed, so we have to
6575 		 * make sure that all entries in RSS indirect table
6576 		 * are valid.
6577 		 *
6578 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6579 		 */
6580 		hn_rss_ind_fixup(sc);
6581 	}
6582 
6583 	sc->hn_rss_hash = sc->hn_rss_hcap;
6584 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6585 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6586 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6587 		hn_vf_rss_fixup(sc, false);
6588 	}
6589 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6590 	if (error)
6591 		goto failed;
6592 back:
6593 	/*
6594 	 * Fixup transmission aggregation setup.
6595 	 */
6596 	hn_set_txagg(sc);
6597 	hn_rndis_init_fixat(sc, nchan);
6598 	return (0);
6599 
6600 failed:
6601 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6602 		hn_rndis_init_fixat(sc, nchan);
6603 		hn_synth_detach(sc);
6604 	} else {
6605 		if (attached & ATTACHED_RNDIS) {
6606 			hn_rndis_init_fixat(sc, nchan);
6607 			hn_rndis_detach(sc);
6608 		}
6609 		if (attached & ATTACHED_NVS)
6610 			hn_nvs_detach(sc);
6611 		hn_chan_detach(sc, sc->hn_prichan);
6612 		/* Restore old capabilities. */
6613 		sc->hn_caps = old_caps;
6614 	}
6615 	return (error);
6616 
6617 #undef ATTACHED_RNDIS
6618 #undef ATTACHED_NVS
6619 }
6620 
6621 /*
6622  * NOTE:
6623  * The interface must have been suspended though hn_suspend(), before
6624  * this function get called.
6625  */
6626 static void
6627 hn_synth_detach(struct hn_softc *sc)
6628 {
6629 
6630 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6631 	    ("synthetic parts were not attached"));
6632 
6633 	/* Detach the RNDIS first. */
6634 	hn_rndis_detach(sc);
6635 
6636 	/* Detach NVS. */
6637 	hn_nvs_detach(sc);
6638 
6639 	/* Detach all of the channels. */
6640 	hn_detach_allchans(sc);
6641 
6642 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6643 		/*
6644 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6645 		 */
6646 		int error;
6647 
6648 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6649 		    sc->hn_rxbuf_gpadl);
6650 		if (error) {
6651 			if_printf(sc->hn_ifp,
6652 			    "rxbuf gpadl disconn failed: %d\n", error);
6653 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6654 		}
6655 		sc->hn_rxbuf_gpadl = 0;
6656 	}
6657 
6658 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6659 		/*
6660 		 * Host is post-Win2016, disconnect chimney sending buffer from
6661 		 * primary channel here.
6662 		 */
6663 		int error;
6664 
6665 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6666 		    sc->hn_chim_gpadl);
6667 		if (error) {
6668 			if_printf(sc->hn_ifp,
6669 			    "chim gpadl disconn failed: %d\n", error);
6670 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6671 		}
6672 		sc->hn_chim_gpadl = 0;
6673 	}
6674 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6675 }
6676 
6677 static void
6678 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6679 {
6680 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6681 	    ("invalid ring count %d", ring_cnt));
6682 
6683 	if (sc->hn_tx_ring_cnt > ring_cnt)
6684 		sc->hn_tx_ring_inuse = ring_cnt;
6685 	else
6686 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6687 	sc->hn_rx_ring_inuse = ring_cnt;
6688 
6689 #ifdef RSS
6690 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6691 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6692 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6693 		    rss_getnumbuckets());
6694 	}
6695 #endif
6696 
6697 	if (bootverbose) {
6698 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6699 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6700 	}
6701 }
6702 
6703 static void
6704 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6705 {
6706 
6707 	/*
6708 	 * NOTE:
6709 	 * The TX bufring will not be drained by the hypervisor,
6710 	 * if the primary channel is revoked.
6711 	 */
6712 	while (!vmbus_chan_rx_empty(chan) ||
6713 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6714 	     !vmbus_chan_tx_empty(chan)))
6715 		pause("waitch", 1);
6716 	vmbus_chan_intr_drain(chan);
6717 }
6718 
6719 static void
6720 hn_disable_rx(struct hn_softc *sc)
6721 {
6722 
6723 	/*
6724 	 * Disable RX by clearing RX filter forcefully.
6725 	 */
6726 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6727 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6728 
6729 	/*
6730 	 * Give RNDIS enough time to flush all pending data packets.
6731 	 */
6732 	pause("waitrx", (200 * hz) / 1000);
6733 }
6734 
6735 /*
6736  * NOTE:
6737  * RX/TX _must_ have been suspended/disabled, before this function
6738  * is called.
6739  */
6740 static void
6741 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6742 {
6743 	struct vmbus_channel **subch = NULL;
6744 	int nsubch;
6745 
6746 	/*
6747 	 * Drain RX/TX bufrings and interrupts.
6748 	 */
6749 	nsubch = nchan - 1;
6750 	if (nsubch > 0)
6751 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6752 
6753 	if (subch != NULL) {
6754 		int i;
6755 
6756 		for (i = 0; i < nsubch; ++i)
6757 			hn_chan_drain(sc, subch[i]);
6758 	}
6759 	hn_chan_drain(sc, sc->hn_prichan);
6760 
6761 	if (subch != NULL)
6762 		vmbus_subchan_rel(subch, nsubch);
6763 }
6764 
6765 static void
6766 hn_suspend_data(struct hn_softc *sc)
6767 {
6768 	struct hn_tx_ring *txr;
6769 	int i;
6770 
6771 	HN_LOCK_ASSERT(sc);
6772 
6773 	/*
6774 	 * Suspend TX.
6775 	 */
6776 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6777 		txr = &sc->hn_tx_ring[i];
6778 
6779 		mtx_lock(&txr->hn_tx_lock);
6780 		txr->hn_suspended = 1;
6781 		mtx_unlock(&txr->hn_tx_lock);
6782 		/* No one is able send more packets now. */
6783 
6784 		/*
6785 		 * Wait for all pending sends to finish.
6786 		 *
6787 		 * NOTE:
6788 		 * We will _not_ receive all pending send-done, if the
6789 		 * primary channel is revoked.
6790 		 */
6791 		while (hn_tx_ring_pending(txr) &&
6792 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6793 			pause("hnwtx", 1 /* 1 tick */);
6794 	}
6795 
6796 	/*
6797 	 * Disable RX.
6798 	 */
6799 	hn_disable_rx(sc);
6800 
6801 	/*
6802 	 * Drain RX/TX.
6803 	 */
6804 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6805 
6806 	/*
6807 	 * Drain any pending TX tasks.
6808 	 *
6809 	 * NOTE:
6810 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6811 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6812 	 */
6813 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6814 		txr = &sc->hn_tx_ring[i];
6815 
6816 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6817 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6818 	}
6819 }
6820 
6821 static void
6822 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6823 {
6824 
6825 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6826 }
6827 
6828 static void
6829 hn_suspend_mgmt(struct hn_softc *sc)
6830 {
6831 	struct task task;
6832 
6833 	HN_LOCK_ASSERT(sc);
6834 
6835 	/*
6836 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6837 	 * through hn_mgmt_taskq.
6838 	 */
6839 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6840 	vmbus_chan_run_task(sc->hn_prichan, &task);
6841 
6842 	/*
6843 	 * Make sure that all pending management tasks are completed.
6844 	 */
6845 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6846 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6847 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6848 }
6849 
6850 static void
6851 hn_suspend(struct hn_softc *sc)
6852 {
6853 
6854 	/* Disable polling. */
6855 	hn_polling(sc, 0);
6856 
6857 	/*
6858 	 * If the non-transparent mode VF is activated, the synthetic
6859 	 * device is receiving packets, so the data path of the
6860 	 * synthetic device must be suspended.
6861 	 */
6862 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6863 	    (sc->hn_flags & HN_FLAG_RXVF))
6864 		hn_suspend_data(sc);
6865 	hn_suspend_mgmt(sc);
6866 }
6867 
6868 static void
6869 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6870 {
6871 	int i;
6872 
6873 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6874 	    ("invalid TX ring count %d", tx_ring_cnt));
6875 
6876 	for (i = 0; i < tx_ring_cnt; ++i) {
6877 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6878 
6879 		mtx_lock(&txr->hn_tx_lock);
6880 		txr->hn_suspended = 0;
6881 		mtx_unlock(&txr->hn_tx_lock);
6882 	}
6883 }
6884 
6885 static void
6886 hn_resume_data(struct hn_softc *sc)
6887 {
6888 	int i;
6889 
6890 	HN_LOCK_ASSERT(sc);
6891 
6892 	/*
6893 	 * Re-enable RX.
6894 	 */
6895 	hn_rxfilter_config(sc);
6896 
6897 	/*
6898 	 * Make sure to clear suspend status on "all" TX rings,
6899 	 * since hn_tx_ring_inuse can be changed after
6900 	 * hn_suspend_data().
6901 	 */
6902 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6903 
6904 #ifdef HN_IFSTART_SUPPORT
6905 	if (!hn_use_if_start)
6906 #endif
6907 	{
6908 		/*
6909 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6910 		 * reduced.
6911 		 */
6912 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6913 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6914 	}
6915 
6916 	/*
6917 	 * Kick start TX.
6918 	 */
6919 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6920 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6921 
6922 		/*
6923 		 * Use txeof task, so that any pending oactive can be
6924 		 * cleared properly.
6925 		 */
6926 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6927 	}
6928 }
6929 
6930 static void
6931 hn_resume_mgmt(struct hn_softc *sc)
6932 {
6933 
6934 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6935 
6936 	/*
6937 	 * Kick off network change detection, if it was pending.
6938 	 * If no network change was pending, start link status
6939 	 * checks, which is more lightweight than network change
6940 	 * detection.
6941 	 */
6942 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6943 		hn_change_network(sc);
6944 	else
6945 		hn_update_link_status(sc);
6946 }
6947 
6948 static void
6949 hn_resume(struct hn_softc *sc)
6950 {
6951 
6952 	/*
6953 	 * If the non-transparent mode VF is activated, the synthetic
6954 	 * device have to receive packets, so the data path of the
6955 	 * synthetic device must be resumed.
6956 	 */
6957 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6958 	    (sc->hn_flags & HN_FLAG_RXVF))
6959 		hn_resume_data(sc);
6960 
6961 	/*
6962 	 * Don't resume link status change if VF is attached/activated.
6963 	 * - In the non-transparent VF mode, the synthetic device marks
6964 	 *   link down until the VF is deactivated; i.e. VF is down.
6965 	 * - In transparent VF mode, VF's media status is used until
6966 	 *   the VF is detached.
6967 	 */
6968 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6969 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6970 		hn_resume_mgmt(sc);
6971 
6972 	/*
6973 	 * Re-enable polling if this interface is running and
6974 	 * the polling is requested.
6975 	 */
6976 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6977 		hn_polling(sc, sc->hn_pollhz);
6978 }
6979 
6980 static void
6981 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6982 {
6983 	const struct rndis_status_msg *msg;
6984 	int ofs;
6985 
6986 	if (dlen < sizeof(*msg)) {
6987 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6988 		return;
6989 	}
6990 	msg = data;
6991 
6992 	switch (msg->rm_status) {
6993 	case RNDIS_STATUS_MEDIA_CONNECT:
6994 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6995 		hn_update_link_status(sc);
6996 		break;
6997 
6998 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6999 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7000 		/* Not really useful; ignore. */
7001 		break;
7002 
7003 	case RNDIS_STATUS_NETWORK_CHANGE:
7004 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7005 		if (dlen < ofs + msg->rm_stbuflen ||
7006 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7007 			if_printf(sc->hn_ifp, "network changed\n");
7008 		} else {
7009 			uint32_t change;
7010 
7011 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7012 			    sizeof(change));
7013 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7014 			    change);
7015 		}
7016 		hn_change_network(sc);
7017 		break;
7018 
7019 	default:
7020 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7021 		    msg->rm_status);
7022 		break;
7023 	}
7024 }
7025 
7026 static int
7027 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7028 {
7029 	const struct rndis_pktinfo *pi = info_data;
7030 	uint32_t mask = 0;
7031 
7032 	while (info_dlen != 0) {
7033 		const void *data;
7034 		uint32_t dlen;
7035 
7036 		if (__predict_false(info_dlen < sizeof(*pi)))
7037 			return (EINVAL);
7038 		if (__predict_false(info_dlen < pi->rm_size))
7039 			return (EINVAL);
7040 		info_dlen -= pi->rm_size;
7041 
7042 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7043 			return (EINVAL);
7044 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7045 			return (EINVAL);
7046 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7047 		data = pi->rm_data;
7048 
7049 		switch (pi->rm_type) {
7050 		case NDIS_PKTINFO_TYPE_VLAN:
7051 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7052 				return (EINVAL);
7053 			info->vlan_info = *((const uint32_t *)data);
7054 			mask |= HN_RXINFO_VLAN;
7055 			break;
7056 
7057 		case NDIS_PKTINFO_TYPE_CSUM:
7058 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7059 				return (EINVAL);
7060 			info->csum_info = *((const uint32_t *)data);
7061 			mask |= HN_RXINFO_CSUM;
7062 			break;
7063 
7064 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7065 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7066 				return (EINVAL);
7067 			info->hash_value = *((const uint32_t *)data);
7068 			mask |= HN_RXINFO_HASHVAL;
7069 			break;
7070 
7071 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7072 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7073 				return (EINVAL);
7074 			info->hash_info = *((const uint32_t *)data);
7075 			mask |= HN_RXINFO_HASHINF;
7076 			break;
7077 
7078 		default:
7079 			goto next;
7080 		}
7081 
7082 		if (mask == HN_RXINFO_ALL) {
7083 			/* All found; done */
7084 			break;
7085 		}
7086 next:
7087 		pi = (const struct rndis_pktinfo *)
7088 		    ((const uint8_t *)pi + pi->rm_size);
7089 	}
7090 
7091 	/*
7092 	 * Final fixup.
7093 	 * - If there is no hash value, invalidate the hash info.
7094 	 */
7095 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7096 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7097 	return (0);
7098 }
7099 
/*
 * Tell whether the range [off, off + len) is considered to overlap
 * the range [check_off, check_off + check_len).
 *
 * Two ranges that start at the same offset are always reported as
 * overlapping.  The range ends are computed in 64 bits, so that
 * offsets and lengths near INT_MAX cannot overflow the sum.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		/* Ends at or before the other range starts. */
		if (__predict_true((int64_t)off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		/* Starts at or after the other range ends. */
		if (__predict_true((int64_t)check_off + check_len <= off))
			return (false);
	}
	return (true);
}
7113 
7114 static void
7115 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7116 {
7117 	const struct rndis_packet_msg *pkt;
7118 	struct hn_rxinfo info;
7119 	int data_off, pktinfo_off, data_len, pktinfo_len;
7120 
7121 	/*
7122 	 * Check length.
7123 	 */
7124 	if (__predict_false(dlen < sizeof(*pkt))) {
7125 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7126 		return;
7127 	}
7128 	pkt = data;
7129 
7130 	if (__predict_false(dlen < pkt->rm_len)) {
7131 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7132 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7133 		return;
7134 	}
7135 	if (__predict_false(pkt->rm_len <
7136 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7137 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7138 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7139 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7140 		    pkt->rm_pktinfolen);
7141 		return;
7142 	}
7143 	if (__predict_false(pkt->rm_datalen == 0)) {
7144 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7145 		return;
7146 	}
7147 
7148 	/*
7149 	 * Check offests.
7150 	 */
7151 #define IS_OFFSET_INVALID(ofs)			\
7152 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7153 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7154 
7155 	/* XXX Hyper-V does not meet data offset alignment requirement */
7156 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7157 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7158 		    "data offset %u\n", pkt->rm_dataoffset);
7159 		return;
7160 	}
7161 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7162 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7163 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7164 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7165 		return;
7166 	}
7167 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7168 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7169 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7170 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7171 		return;
7172 	}
7173 
7174 #undef IS_OFFSET_INVALID
7175 
7176 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7177 	data_len = pkt->rm_datalen;
7178 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7179 	pktinfo_len = pkt->rm_pktinfolen;
7180 
7181 	/*
7182 	 * Check OOB coverage.
7183 	 */
7184 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7185 		int oob_off, oob_len;
7186 
7187 		if_printf(rxr->hn_ifp, "got oobdata\n");
7188 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7189 		oob_len = pkt->rm_oobdatalen;
7190 
7191 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7192 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7193 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7194 			    pkt->rm_len, oob_off, oob_len);
7195 			return;
7196 		}
7197 
7198 		/*
7199 		 * Check against data.
7200 		 */
7201 		if (hn_rndis_check_overlap(oob_off, oob_len,
7202 		    data_off, data_len)) {
7203 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7204 			    "oob overlaps data, oob abs %d len %d, "
7205 			    "data abs %d len %d\n",
7206 			    oob_off, oob_len, data_off, data_len);
7207 			return;
7208 		}
7209 
7210 		/*
7211 		 * Check against pktinfo.
7212 		 */
7213 		if (pktinfo_len != 0 &&
7214 		    hn_rndis_check_overlap(oob_off, oob_len,
7215 		    pktinfo_off, pktinfo_len)) {
7216 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7217 			    "oob overlaps pktinfo, oob abs %d len %d, "
7218 			    "pktinfo abs %d len %d\n",
7219 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7220 			return;
7221 		}
7222 	}
7223 
7224 	/*
7225 	 * Check per-packet-info coverage and find useful per-packet-info.
7226 	 */
7227 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7228 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7229 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7230 	if (__predict_true(pktinfo_len != 0)) {
7231 		bool overlap;
7232 		int error;
7233 
7234 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7235 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7236 			    "pktinfo overflow, msglen %u, "
7237 			    "pktinfo abs %d len %d\n",
7238 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7239 			return;
7240 		}
7241 
7242 		/*
7243 		 * Check packet info coverage.
7244 		 */
7245 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7246 		    data_off, data_len);
7247 		if (__predict_false(overlap)) {
7248 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7249 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7250 			    "data abs %d len %d\n",
7251 			    pktinfo_off, pktinfo_len, data_off, data_len);
7252 			return;
7253 		}
7254 
7255 		/*
7256 		 * Find useful per-packet-info.
7257 		 */
7258 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7259 		    pktinfo_len, &info);
7260 		if (__predict_false(error)) {
7261 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7262 			    "pktinfo\n");
7263 			return;
7264 		}
7265 	}
7266 
7267 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7268 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7269 		    "data overflow, msglen %u, data abs %d len %d\n",
7270 		    pkt->rm_len, data_off, data_len);
7271 		return;
7272 	}
7273 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7274 }
7275 
7276 static __inline void
7277 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7278 {
7279 	const struct rndis_msghdr *hdr;
7280 
7281 	if (__predict_false(dlen < sizeof(*hdr))) {
7282 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7283 		return;
7284 	}
7285 	hdr = data;
7286 
7287 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7288 		/* Hot data path. */
7289 		hn_rndis_rx_data(rxr, data, dlen);
7290 		/* Done! */
7291 		return;
7292 	}
7293 
7294 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7295 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7296 	else
7297 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7298 }
7299 
7300 static void
7301 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7302 {
7303 	const struct hn_nvs_hdr *hdr;
7304 
7305 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7306 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7307 		return;
7308 	}
7309 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7310 
7311 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7312 		/* Useless; ignore */
7313 		return;
7314 	}
7315 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7316 }
7317 
7318 static void
7319 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7320     const struct vmbus_chanpkt_hdr *pkt)
7321 {
7322 	struct hn_nvs_sendctx *sndc;
7323 
7324 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7325 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7326 	    VMBUS_CHANPKT_DATALEN(pkt));
7327 	/*
7328 	 * NOTE:
7329 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7330 	 * its callback.
7331 	 */
7332 }
7333 
7334 static void
7335 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7336     const struct vmbus_chanpkt_hdr *pkthdr)
7337 {
7338 	const struct vmbus_chanpkt_rxbuf *pkt;
7339 	const struct hn_nvs_hdr *nvs_hdr;
7340 	int count, i, hlen;
7341 
7342 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7343 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7344 		return;
7345 	}
7346 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7347 
7348 	/* Make sure that this is a RNDIS message. */
7349 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7350 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7351 		    nvs_hdr->nvs_type);
7352 		return;
7353 	}
7354 
7355 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7356 	if (__predict_false(hlen < sizeof(*pkt))) {
7357 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7358 		return;
7359 	}
7360 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7361 
7362 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7363 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7364 		    pkt->cp_rxbuf_id);
7365 		return;
7366 	}
7367 
7368 	count = pkt->cp_rxbuf_cnt;
7369 	if (__predict_false(hlen <
7370 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7371 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7372 		return;
7373 	}
7374 
7375 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7376 	for (i = 0; i < count; ++i) {
7377 		int ofs, len;
7378 
7379 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7380 		len = pkt->cp_rxbuf[i].rb_len;
7381 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7382 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7383 			    "ofs %d, len %d\n", i, ofs, len);
7384 			continue;
7385 		}
7386 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7387 	}
7388 
7389 	/*
7390 	 * Ack the consumed RXBUF associated w/ this channel packet,
7391 	 * so that this RXBUF can be recycled by the hypervisor.
7392 	 */
7393 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7394 }
7395 
7396 static void
7397 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7398     uint64_t tid)
7399 {
7400 	struct hn_nvs_rndis_ack ack;
7401 	int retries, error;
7402 
7403 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7404 	ack.nvs_status = HN_NVS_STATUS_OK;
7405 
7406 	retries = 0;
7407 again:
7408 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7409 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7410 	if (__predict_false(error == EAGAIN)) {
7411 		/*
7412 		 * NOTE:
7413 		 * This should _not_ happen in real world, since the
7414 		 * consumption of the TX bufring from the TX path is
7415 		 * controlled.
7416 		 */
7417 		if (rxr->hn_ack_failed == 0)
7418 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7419 		rxr->hn_ack_failed++;
7420 		retries++;
7421 		if (retries < 10) {
7422 			DELAY(100);
7423 			goto again;
7424 		}
7425 		/* RXBUF leaks! */
7426 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7427 	}
7428 }
7429 
7430 static void
7431 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7432 {
7433 	struct hn_rx_ring *rxr = xrxr;
7434 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7435 
7436 	for (;;) {
7437 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7438 		int error, pktlen;
7439 
7440 		pktlen = rxr->hn_pktbuf_len;
7441 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7442 		if (__predict_false(error == ENOBUFS)) {
7443 			void *nbuf;
7444 			int nlen;
7445 
7446 			/*
7447 			 * Expand channel packet buffer.
7448 			 *
7449 			 * XXX
7450 			 * Use M_WAITOK here, since allocation failure
7451 			 * is fatal.
7452 			 */
7453 			nlen = rxr->hn_pktbuf_len * 2;
7454 			while (nlen < pktlen)
7455 				nlen *= 2;
7456 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7457 
7458 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7459 			    rxr->hn_pktbuf_len, nlen);
7460 
7461 			free(rxr->hn_pktbuf, M_DEVBUF);
7462 			rxr->hn_pktbuf = nbuf;
7463 			rxr->hn_pktbuf_len = nlen;
7464 			/* Retry! */
7465 			continue;
7466 		} else if (__predict_false(error == EAGAIN)) {
7467 			/* No more channel packets; done! */
7468 			break;
7469 		}
7470 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7471 
7472 		switch (pkt->cph_type) {
7473 		case VMBUS_CHANPKT_TYPE_COMP:
7474 			hn_nvs_handle_comp(sc, chan, pkt);
7475 			break;
7476 
7477 		case VMBUS_CHANPKT_TYPE_RXBUF:
7478 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7479 			break;
7480 
7481 		case VMBUS_CHANPKT_TYPE_INBAND:
7482 			hn_nvs_handle_notify(sc, pkt);
7483 			break;
7484 
7485 		default:
7486 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7487 			    pkt->cph_type);
7488 			break;
7489 		}
7490 	}
7491 	hn_chan_rollup(rxr, rxr->hn_txr);
7492 }
7493 
7494 static void
7495 hn_sysinit(void *arg __unused)
7496 {
7497 	int i;
7498 
7499 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7500 
7501 #ifdef HN_IFSTART_SUPPORT
7502 	/*
7503 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7504 	 * mainly due to the IFF_DRV_OACTIVE flag.
7505 	 */
7506 	if (hn_xpnt_vf && hn_use_if_start) {
7507 		hn_use_if_start = 0;
7508 		printf("hn: tranparent VF mode, if_transmit will be used, "
7509 		    "instead of if_start\n");
7510 	}
7511 #endif
7512 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7513 		printf("hn: invalid transparent VF attach routing "
7514 		    "wait timeout %d, reset to %d\n",
7515 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7516 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7517 	}
7518 
7519 	/*
7520 	 * Initialize VF map.
7521 	 */
7522 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7523 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7524 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7525 	    M_WAITOK | M_ZERO);
7526 
7527 	/*
7528 	 * Fix the # of TX taskqueues.
7529 	 */
7530 	if (hn_tx_taskq_cnt <= 0)
7531 		hn_tx_taskq_cnt = 1;
7532 	else if (hn_tx_taskq_cnt > mp_ncpus)
7533 		hn_tx_taskq_cnt = mp_ncpus;
7534 
7535 	/*
7536 	 * Fix the TX taskqueue mode.
7537 	 */
7538 	switch (hn_tx_taskq_mode) {
7539 	case HN_TX_TASKQ_M_INDEP:
7540 	case HN_TX_TASKQ_M_GLOBAL:
7541 	case HN_TX_TASKQ_M_EVTTQ:
7542 		break;
7543 	default:
7544 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7545 		break;
7546 	}
7547 
7548 	if (vm_guest != VM_GUEST_HV)
7549 		return;
7550 
7551 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7552 		return;
7553 
7554 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7555 	    M_DEVBUF, M_WAITOK);
7556 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7557 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7558 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7559 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7560 		    "hn tx%d", i);
7561 	}
7562 }
7563 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7564 
7565 static void
7566 hn_sysuninit(void *arg __unused)
7567 {
7568 
7569 	if (hn_tx_taskque != NULL) {
7570 		int i;
7571 
7572 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7573 			taskqueue_free(hn_tx_taskque[i]);
7574 		free(hn_tx_taskque, M_DEVBUF);
7575 	}
7576 
7577 	if (hn_vfmap != NULL)
7578 		free(hn_vfmap, M_DEVBUF);
7579 	rm_destroy(&hn_vfmap_lock);
7580 
7581 	counter_u64_free(hn_udpcs_fixup);
7582 }
7583 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7584