xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 5def4c47d4bd90b209b9b4a4ba9faec15846d8fd)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 
87 #include <machine/atomic.h>
88 #include <machine/in_cksum.h>
89 
90 #include <net/bpf.h>
91 #include <net/ethernet.h>
92 #include <net/if.h>
93 #include <net/if_dl.h>
94 #include <net/if_media.h>
95 #include <net/if_types.h>
96 #include <net/if_var.h>
97 #include <net/rndis.h>
98 #ifdef RSS
99 #include <net/rss_config.h>
100 #endif
101 
102 #include <netinet/in_systm.h>
103 #include <netinet/in.h>
104 #include <netinet/ip.h>
105 #include <netinet/ip6.h>
106 #include <netinet/tcp.h>
107 #include <netinet/tcp_lro.h>
108 #include <netinet/udp.h>
109 
110 #include <dev/hyperv/include/hyperv.h>
111 #include <dev/hyperv/include/hyperv_busdma.h>
112 #include <dev/hyperv/include/vmbus.h>
113 #include <dev/hyperv/include/vmbus_xact.h>
114 
115 #include <dev/hyperv/netvsc/ndis.h>
116 #include <dev/hyperv/netvsc/if_hnreg.h>
117 #include <dev/hyperv/netvsc/if_hnvar.h>
118 #include <dev/hyperv/netvsc/hn_nvs.h>
119 #include <dev/hyperv/netvsc/hn_rndis.h>
120 
121 #include "vmbus_if.h"
122 
123 #define HN_IFSTART_SUPPORT
124 
125 #define HN_RING_CNT_DEF_MAX		8
126 
127 #define HN_VFMAP_SIZE_DEF		8
128 
129 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
130 
131 /* YYY should get it from the underlying channel */
132 #define HN_TX_DESC_CNT			512
133 
134 #define HN_RNDIS_PKT_LEN					\
135 	(sizeof(struct rndis_packet_msg) +			\
136 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
140 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
141 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
142 
143 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
144 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
145 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
146 /* -1 for RNDIS packet message */
147 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
148 
149 #define HN_DIRECT_TX_SIZE_DEF		128
150 
151 #define HN_EARLY_TXEOF_THRESH		8
152 
153 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
154 
155 #define HN_LROENT_CNT_DEF		128
156 
157 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
158 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
159 /* YYY 2*MTU is a bit rough, but should be good enough. */
160 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
161 
162 #define HN_LRO_ACKCNT_DEF		1
163 
164 #define HN_LOCK_INIT(sc)		\
165 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
166 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
167 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
168 #define HN_LOCK(sc)					\
169 do {							\
170 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
171 		/* Relinquish cpu to avoid deadlock */	\
172 		sched_relinquish(curthread);		\
173 		DELAY(1000);				\
174 	}						\
175 } while (0)
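/*
 * NOTE: HN_LOCK() intentionally avoids a blocking sx_xlock(); it retries
 * sx_try_xlock(), yielding the CPU and pausing roughly 1ms (DELAY() takes
 * microseconds) between attempts, per the deadlock note above.
 */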
176 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
177 
178 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
179 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
180 #define HN_CSUM_IP_HWASSIST(sc)		\
181 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
182 #define HN_CSUM_IP6_HWASSIST(sc)	\
183 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
184 
185 #define HN_PKTSIZE_MIN(align)		\
186 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
187 	    HN_RNDIS_PKT_LEN, (align))
188 #define HN_PKTSIZE(m, align)		\
189 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
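/*
 * Illustration only: roundup2() rounds up to the given power-of-2 boundary,
 * e.g. HN_PKTSIZE(m, 64) for a 1514-byte frame is the smallest multiple of
 * 64 covering the frame plus HN_RNDIS_PKT_LEN bytes of RNDIS headroom.
 */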
190 
191 #ifdef RSS
192 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
193 #else
194 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
195 #endif
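/*
 * Without the RSS option, ring CPUs are assigned round-robin starting at
 * sc->hn_cpu; with RSS, a ring is bound to the CPU of its RSS bucket.
 */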
196 
197 struct hn_txdesc {
198 #ifndef HN_USE_TXDESC_BUFRING
199 	SLIST_ENTRY(hn_txdesc)		link;
200 #endif
201 	STAILQ_ENTRY(hn_txdesc)		agg_link;
202 
203 	/* Aggregated txdescs, in sending order. */
204 	STAILQ_HEAD(, hn_txdesc)	agg_list;
205 
206 	/* The oldest packet, if transmission aggregation happens. */
207 	struct mbuf			*m;
208 	struct hn_tx_ring		*txr;
209 	int				refs;
210 	uint32_t			flags;	/* HN_TXD_FLAG_ */
211 	struct hn_nvs_sendctx		send_ctx;
212 	uint32_t			chim_index;
213 	int				chim_size;
214 
215 	bus_dmamap_t			data_dmap;
216 
217 	bus_addr_t			rndis_pkt_paddr;
218 	struct rndis_packet_msg		*rndis_pkt;
219 	bus_dmamap_t			rndis_pkt_dmap;
220 };
221 
222 #define HN_TXD_FLAG_ONLIST		0x0001
223 #define HN_TXD_FLAG_DMAMAP		0x0002
224 #define HN_TXD_FLAG_ONAGG		0x0004
225 
226 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
227 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
228 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
229 
230 struct packet_info_id {
231 	uint8_t				ver;
232 	uint8_t				flag;
233 	uint16_t			pkt_id;
234 };
235 
236 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
237 
238 
239 struct hn_rxinfo {
240 	const uint32_t			*vlan_info;
241 	const uint32_t			*csum_info;
242 	const uint32_t			*hash_info;
243 	const uint32_t			*hash_value;
244 	const struct packet_info_id	*pktinfo_id;
245 };
246 
247 struct hn_rxvf_setarg {
248 	struct hn_rx_ring	*rxr;
249 	struct ifnet		*vf_ifp;
250 };
251 
252 #define HN_RXINFO_VLAN			0x0001
253 #define HN_RXINFO_CSUM			0x0002
254 #define HN_RXINFO_HASHINF		0x0004
255 #define HN_RXINFO_HASHVAL		0x0008
256 #define HN_RXINFO_PKTINFO_ID		0x0010
257 #define HN_RXINFO_ALL			\
258 	(HN_RXINFO_VLAN |		\
259 	 HN_RXINFO_CSUM |		\
260 	 HN_RXINFO_HASHINF |		\
261 	 HN_RXINFO_HASHVAL |		\
262 	 HN_RXINFO_PKTINFO_ID)
263 
264 static int			hn_probe(device_t);
265 static int			hn_attach(device_t);
266 static int			hn_detach(device_t);
267 static int			hn_shutdown(device_t);
268 static void			hn_chan_callback(struct vmbus_channel *,
269 				    void *);
270 
271 static void			hn_init(void *);
272 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
273 #ifdef HN_IFSTART_SUPPORT
274 static void			hn_start(struct ifnet *);
275 #endif
276 static int			hn_transmit(struct ifnet *, struct mbuf *);
277 static void			hn_xmit_qflush(struct ifnet *);
278 static int			hn_ifmedia_upd(struct ifnet *);
279 static void			hn_ifmedia_sts(struct ifnet *,
280 				    struct ifmediareq *);
281 
282 static void			hn_ifnet_event(void *, struct ifnet *, int);
283 static void			hn_ifaddr_event(void *, struct ifnet *);
284 static void			hn_ifnet_attevent(void *, struct ifnet *);
285 static void			hn_ifnet_detevent(void *, struct ifnet *);
286 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
287 
288 static bool			hn_ismyvf(const struct hn_softc *,
289 				    const struct ifnet *);
290 static void			hn_rxvf_change(struct hn_softc *,
291 				    struct ifnet *, bool);
292 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
293 static void			hn_rxvf_set_task(void *, int);
294 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
295 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
296 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
297 				    struct ifreq *);
298 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
299 static bool			hn_xpnt_vf_isready(struct hn_softc *);
300 static void			hn_xpnt_vf_setready(struct hn_softc *);
301 static void			hn_xpnt_vf_init_taskfunc(void *, int);
302 static void			hn_xpnt_vf_init(struct hn_softc *);
303 static void			hn_xpnt_vf_setenable(struct hn_softc *);
304 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
305 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
306 static void			hn_vf_rss_restore(struct hn_softc *);
307 
308 static int			hn_rndis_rxinfo(const void *, int,
309 				    struct hn_rxinfo *);
310 static void			hn_rndis_rx_data(struct hn_rx_ring *,
311 				    const void *, int);
312 static void			hn_rndis_rx_status(struct hn_softc *,
313 				    const void *, int);
314 static void			hn_rndis_init_fixat(struct hn_softc *, int);
315 
316 static void			hn_nvs_handle_notify(struct hn_softc *,
317 				    const struct vmbus_chanpkt_hdr *);
318 static void			hn_nvs_handle_comp(struct hn_softc *,
319 				    struct vmbus_channel *,
320 				    const struct vmbus_chanpkt_hdr *);
321 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
322 				    struct vmbus_channel *,
323 				    const struct vmbus_chanpkt_hdr *);
324 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
325 				    struct vmbus_channel *, uint64_t);
326 
327 #if __FreeBSD_version >= 1100099
328 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330 #endif
331 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
333 #if __FreeBSD_version < 1100095
334 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
335 #else
336 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
337 #endif
338 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
345 #ifndef RSS
346 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
348 #endif
349 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
360 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
361 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
362 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
363 
364 static void			hn_stop(struct hn_softc *, bool);
365 static void			hn_init_locked(struct hn_softc *);
366 static int			hn_chan_attach(struct hn_softc *,
367 				    struct vmbus_channel *);
368 static void			hn_chan_detach(struct hn_softc *,
369 				    struct vmbus_channel *);
370 static int			hn_attach_subchans(struct hn_softc *);
371 static void			hn_detach_allchans(struct hn_softc *);
372 static void			hn_chan_rollup(struct hn_rx_ring *,
373 				    struct hn_tx_ring *);
374 static void			hn_set_ring_inuse(struct hn_softc *, int);
375 static int			hn_synth_attach(struct hn_softc *, int);
376 static void			hn_synth_detach(struct hn_softc *);
377 static int			hn_synth_alloc_subchans(struct hn_softc *,
378 				    int *);
379 static bool			hn_synth_attachable(const struct hn_softc *);
380 static void			hn_suspend(struct hn_softc *);
381 static void			hn_suspend_data(struct hn_softc *);
382 static void			hn_suspend_mgmt(struct hn_softc *);
383 static void			hn_resume(struct hn_softc *);
384 static void			hn_resume_data(struct hn_softc *);
385 static void			hn_resume_mgmt(struct hn_softc *);
386 static void			hn_suspend_mgmt_taskfunc(void *, int);
387 static void			hn_chan_drain(struct hn_softc *,
388 				    struct vmbus_channel *);
389 static void			hn_disable_rx(struct hn_softc *);
390 static void			hn_drain_rxtx(struct hn_softc *, int);
391 static void			hn_polling(struct hn_softc *, u_int);
392 static void			hn_chan_polling(struct vmbus_channel *, u_int);
393 static void			hn_mtu_change_fixup(struct hn_softc *);
394 
395 static void			hn_update_link_status(struct hn_softc *);
396 static void			hn_change_network(struct hn_softc *);
397 static void			hn_link_taskfunc(void *, int);
398 static void			hn_netchg_init_taskfunc(void *, int);
399 static void			hn_netchg_status_taskfunc(void *, int);
400 static void			hn_link_status(struct hn_softc *);
401 
402 static int			hn_create_rx_data(struct hn_softc *, int);
403 static void			hn_destroy_rx_data(struct hn_softc *);
404 static int			hn_check_iplen(const struct mbuf *, int);
405 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
406 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
407 static int			hn_rxfilter_config(struct hn_softc *);
408 static int			hn_rss_reconfig(struct hn_softc *);
409 static void			hn_rss_ind_fixup(struct hn_softc *);
410 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
411 static int			hn_rxpkt(struct hn_rx_ring *);
412 static uint32_t			hn_rss_type_fromndis(uint32_t);
413 static uint32_t			hn_rss_type_tondis(uint32_t);
414 
415 static int			hn_tx_ring_create(struct hn_softc *, int);
416 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
417 static int			hn_create_tx_data(struct hn_softc *, int);
418 static void			hn_fixup_tx_data(struct hn_softc *);
419 static void			hn_fixup_rx_data(struct hn_softc *);
420 static void			hn_destroy_tx_data(struct hn_softc *);
421 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
422 static void			hn_txdesc_gc(struct hn_tx_ring *,
423 				    struct hn_txdesc *);
424 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
425 				    struct hn_txdesc *, struct mbuf **);
426 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
427 				    struct hn_txdesc *);
428 static void			hn_set_chim_size(struct hn_softc *, int);
429 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
430 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
431 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
432 static void			hn_resume_tx(struct hn_softc *, int);
433 static void			hn_set_txagg(struct hn_softc *);
434 static void			*hn_try_txagg(struct ifnet *,
435 				    struct hn_tx_ring *, struct hn_txdesc *,
436 				    int);
437 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
438 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
439 				    struct hn_softc *, struct vmbus_channel *,
440 				    const void *, int);
441 static int			hn_txpkt_sglist(struct hn_tx_ring *,
442 				    struct hn_txdesc *);
443 static int			hn_txpkt_chim(struct hn_tx_ring *,
444 				    struct hn_txdesc *);
445 static int			hn_xmit(struct hn_tx_ring *, int);
446 static void			hn_xmit_taskfunc(void *, int);
447 static void			hn_xmit_txeof(struct hn_tx_ring *);
448 static void			hn_xmit_txeof_taskfunc(void *, int);
449 #ifdef HN_IFSTART_SUPPORT
450 static int			hn_start_locked(struct hn_tx_ring *, int);
451 static void			hn_start_taskfunc(void *, int);
452 static void			hn_start_txeof(struct hn_tx_ring *);
453 static void			hn_start_txeof_taskfunc(void *, int);
454 #endif
455 
456 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
457     "Hyper-V network interface");
458 
459 /* Trust TCP segment verification on host side. */
460 static int			hn_trust_hosttcp = 1;
461 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
462     &hn_trust_hosttcp, 0,
463     "Trust tcp segement verification on host side, "
464     "when csum info is missing (global setting)");
465 
466 /* Trust UDP datagram verification on host side. */
467 static int			hn_trust_hostudp = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
469     &hn_trust_hostudp, 0,
470     "Trust udp datagram verification on host side, "
471     "when csum info is missing (global setting)");
472 
473 /* Trust IP packet verification on host side. */
474 static int			hn_trust_hostip = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
476     &hn_trust_hostip, 0,
477     "Trust ip packet verification on host side, "
478     "when csum info is missing (global setting)");
479 
480 /*
481  * Offload UDP/IPv4 checksum.
482  */
483 static int			hn_enable_udp4cs = 1;
484 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
485     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
486 
487 /*
488  * Offload UDP/IPv6 checksum.
489  */
490 static int			hn_enable_udp6cs = 1;
491 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
492     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
493 
494 /* Stats. */
495 static counter_u64_t		hn_udpcs_fixup;
496 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
497     &hn_udpcs_fixup, "# of UDP checksum fixup");
498 
499 /*
500  * See hn_set_hlen().
501  *
502  * This value is for Azure.  For Hyper-V, set this above
503  * 65536 to disable UDP datagram checksum fixup.
504  */
505 static int			hn_udpcs_fixup_mtu = 1420;
506 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
507     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
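/*
 * Example (hypothetical value): on Hyper-V the fixup can be disabled by
 * setting the loader tunable hw.hn.udpcs_fixup_mtu above 65536, e.g.
 * hw.hn.udpcs_fixup_mtu="65537"; being CTLFLAG_RWTUN, it can also be
 * changed at runtime via sysctl(8).
 */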
508 
509 /* Limit TSO burst size */
510 static int			hn_tso_maxlen = IP_MAXPACKET;
511 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
512     &hn_tso_maxlen, 0, "TSO burst limit");
513 
514 /* Limit chimney send size */
515 static int			hn_tx_chimney_size = 0;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
517     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
518 
519 /* Limit the size of packet for direct transmission */
520 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
521 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
522     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
523 
524 /* # of LRO entries per RX ring */
525 #if defined(INET) || defined(INET6)
526 #if __FreeBSD_version >= 1100095
527 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
528 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
529     &hn_lro_entry_count, 0, "LRO entry count");
530 #endif
531 #endif
532 
533 static int			hn_tx_taskq_cnt = 1;
534 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
535     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
536 
537 #define HN_TX_TASKQ_M_INDEP	0
538 #define HN_TX_TASKQ_M_GLOBAL	1
539 #define HN_TX_TASKQ_M_EVTTQ	2
540 
541 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
542 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
543     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
544     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
545 
546 #ifndef HN_USE_TXDESC_BUFRING
547 static int			hn_use_txdesc_bufring = 0;
548 #else
549 static int			hn_use_txdesc_bufring = 1;
550 #endif
551 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
552     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
553 
554 #ifdef HN_IFSTART_SUPPORT
555 /* Use ifnet.if_start instead of ifnet.if_transmit */
556 static int			hn_use_if_start = 0;
557 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
558     &hn_use_if_start, 0, "Use if_start TX method");
559 #endif
560 
561 /* # of channels to use */
562 static int			hn_chan_cnt = 0;
563 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
564     &hn_chan_cnt, 0,
565     "# of channels to use; each channel has one RX ring and one TX ring");
566 
567 /* # of transmit rings to use */
568 static int			hn_tx_ring_cnt = 0;
569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
570     &hn_tx_ring_cnt, 0, "# of TX rings to use");
571 
572 /* Software TX ring depth */
573 static int			hn_tx_swq_depth = 0;
574 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
575     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
576 
577 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
578 #if __FreeBSD_version >= 1100095
579 static u_int			hn_lro_mbufq_depth = 0;
580 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
581     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
582 #endif
583 
584 /* Packet transmission aggregation size limit */
585 static int			hn_tx_agg_size = -1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
587     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
588 
589 /* Packet transmission aggregation count limit */
590 static int			hn_tx_agg_pkts = -1;
591 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
592     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
593 
594 /* VF list */
595 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
596     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
597     hn_vflist_sysctl, "A",
598     "VF list");
599 
600 /* VF mapping */
601 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
602     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
603     hn_vfmap_sysctl, "A",
604     "VF mapping");
605 
606 /* Transparent VF */
607 static int			hn_xpnt_vf = 1;
608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
609     &hn_xpnt_vf, 0, "Transparent VF mode");
610 
611 /* Accurate BPF support for Transparent VF */
612 static int			hn_xpnt_vf_accbpf = 0;
613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
614     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
615 
616 /* Extra wait for the transparent VF attach routine; unit: seconds. */
617 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
618 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
619     &hn_xpnt_vf_attwait, 0,
620     "Extra wait for transparent VF attach routing; unit: seconds");
621 
622 static u_int			hn_cpu_index;	/* next CPU for channel */
623 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
624 
625 static struct rmlock		hn_vfmap_lock;
626 static int			hn_vfmap_size;
627 static struct ifnet		**hn_vfmap;
628 
629 #ifndef RSS
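/*
 * Default 40-byte Toeplitz hash key; this appears to be the same
 * well-known key that many other NIC drivers install by default.
 */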
630 static const uint8_t
631 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
632 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
633 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
634 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
635 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
636 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
637 };
638 #endif	/* !RSS */
639 
640 static const struct hyperv_guid	hn_guid = {
641 	.hv_guid = {
642 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
643 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
644 };
645 
646 static device_method_t hn_methods[] = {
647 	/* Device interface */
648 	DEVMETHOD(device_probe,		hn_probe),
649 	DEVMETHOD(device_attach,	hn_attach),
650 	DEVMETHOD(device_detach,	hn_detach),
651 	DEVMETHOD(device_shutdown,	hn_shutdown),
652 	DEVMETHOD_END
653 };
654 
655 static driver_t hn_driver = {
656 	"hn",
657 	hn_methods,
658 	sizeof(struct hn_softc)
659 };
660 
661 static devclass_t hn_devclass;
662 
663 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
664 MODULE_VERSION(hn, 1);
665 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
666 
667 #if __FreeBSD_version >= 1100099
668 static void
669 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
670 {
671 	int i;
672 
673 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
674 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
675 }
676 #endif
677 
678 static int
679 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
680 {
681 
682 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
683 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
684 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
685 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
686 }
687 
688 static int
689 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
690 {
691 	struct hn_nvs_rndis rndis;
692 
693 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
694 	    txd->chim_size > 0, ("invalid rndis chim txd"));
695 
696 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
697 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
698 	rndis.nvs_chim_idx = txd->chim_index;
699 	rndis.nvs_chim_sz = txd->chim_size;
700 
701 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
702 	    &rndis, sizeof(rndis), &txd->send_ctx));
703 }
704 
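/*
 * Allocate a chimney (send buffer) slot from the shared bitmap without any
 * lock: atomic_testandset_long() both claims the bit and detects a lost
 * race, in which case the scan simply continues.  E.g. on LP64 (LONG_BIT
 * == 64) a returned index of 70 corresponds to bmap[1], bit 6.
 */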
705 static __inline uint32_t
706 hn_chim_alloc(struct hn_softc *sc)
707 {
708 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
709 	u_long *bmap = sc->hn_chim_bmap;
710 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
711 
712 	for (i = 0; i < bmap_cnt; ++i) {
713 		int idx;
714 
715 		idx = ffsl(~bmap[i]);
716 		if (idx == 0)
717 			continue;
718 
719 		--idx; /* ffsl is 1-based */
720 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
721 		    ("invalid i %d and idx %d", i, idx));
722 
723 		if (atomic_testandset_long(&bmap[i], idx))
724 			continue;
725 
726 		ret = i * LONG_BIT + idx;
727 		break;
728 	}
729 	return (ret);
730 }
731 
732 static __inline void
733 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
734 {
735 	u_long mask;
736 	uint32_t idx;
737 
738 	idx = chim_idx / LONG_BIT;
739 	KASSERT(idx < sc->hn_chim_bmap_cnt,
740 	    ("invalid chimney index 0x%x", chim_idx));
741 
742 	mask = 1UL << (chim_idx % LONG_BIT);
743 	KASSERT(sc->hn_chim_bmap[idx] & mask,
744 	    ("index bitmap 0x%lx, chimney index %u, "
745 	     "bitmap idx %d, bitmask 0x%lx",
746 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
747 
748 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
749 }
750 
751 #if defined(INET6) || defined(INET)
752 
753 #define PULLUP_HDR(m, len)				\
754 do {							\
755 	if (__predict_false((m)->m_len < (len))) {	\
756 		(m) = m_pullup((m), (len));		\
757 		if ((m) == NULL)			\
758 			return (NULL);			\
759 	}						\
760 } while (0)
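/*
 * NOTE: m_pullup() frees the mbuf chain when it fails, and PULLUP_HDR()
 * then makes the *calling* function return NULL; this is why the helpers
 * below document that m_head is consumed on failure.
 */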
761 
762 /*
763  * NOTE: If this function fails, m_head will be freed.
764  */
765 static __inline struct mbuf *
766 hn_tso_fixup(struct mbuf *m_head)
767 {
768 	struct ether_vlan_header *evl;
769 	struct tcphdr *th;
770 	int ehlen;
771 
772 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
773 
774 	PULLUP_HDR(m_head, sizeof(*evl));
775 	evl = mtod(m_head, struct ether_vlan_header *);
776 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
777 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
778 	else
779 		ehlen = ETHER_HDR_LEN;
780 	m_head->m_pkthdr.l2hlen = ehlen;
781 
782 #ifdef INET
783 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
784 		struct ip *ip;
785 		int iphlen;
786 
787 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
788 		ip = mtodo(m_head, ehlen);
789 		iphlen = ip->ip_hl << 2;
790 		m_head->m_pkthdr.l3hlen = iphlen;
791 
792 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
793 		th = mtodo(m_head, ehlen + iphlen);
794 
795 		ip->ip_len = 0;
796 		ip->ip_sum = 0;
797 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
798 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
799 	}
800 #endif
801 #if defined(INET6) && defined(INET)
802 	else
803 #endif
804 #ifdef INET6
805 	{
806 		struct ip6_hdr *ip6;
807 
808 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
809 		ip6 = mtodo(m_head, ehlen);
810 		if (ip6->ip6_nxt != IPPROTO_TCP) {
811 			m_freem(m_head);
812 			return (NULL);
813 		}
814 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
815 
816 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
817 		th = mtodo(m_head, ehlen + sizeof(*ip6));
818 
819 		ip6->ip6_plen = 0;
820 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
821 	}
822 #endif
823 	return (m_head);
824 }
825 
826 /*
827  * NOTE: If this function fails, m_head will be freed.
828  */
829 static __inline struct mbuf *
830 hn_set_hlen(struct mbuf *m_head)
831 {
832 	const struct ether_vlan_header *evl;
833 	int ehlen;
834 
835 	PULLUP_HDR(m_head, sizeof(*evl));
836 	evl = mtod(m_head, const struct ether_vlan_header *);
837 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
838 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
839 	else
840 		ehlen = ETHER_HDR_LEN;
841 	m_head->m_pkthdr.l2hlen = ehlen;
842 
843 #ifdef INET
844 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
845 		const struct ip *ip;
846 		int iphlen;
847 
848 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
849 		ip = mtodo(m_head, ehlen);
850 		iphlen = ip->ip_hl << 2;
851 		m_head->m_pkthdr.l3hlen = iphlen;
852 
853 		/*
854 		 * UDP checksum offload does not work in Azure if the
855 		 * following conditions are met:
856 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
857 		 * - IP_DF is not set in the IP hdr.
858 		 *
859 		 * Fall back to software checksum for these UDP datagrams.
860 		 */
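		/*
		 * Example with the defaults: a plain Ethernet header is
		 * 14 bytes, so with hn_udpcs_fixup_mtu == 1420 any UDP
		 * frame longer than 1434 bytes without IP_DF takes the
		 * software checksum path below.
		 */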
861 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
862 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
863 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
864 			uint16_t off = ehlen + iphlen;
865 
866 			counter_u64_add(hn_udpcs_fixup, 1);
867 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
868 			*(uint16_t *)(m_head->m_data + off +
869                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
870 			    m_head, m_head->m_pkthdr.len, off);
871 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
872 		}
873 	}
874 #endif
875 #if defined(INET6) && defined(INET)
876 	else
877 #endif
878 #ifdef INET6
879 	{
880 		const struct ip6_hdr *ip6;
881 
882 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
883 		ip6 = mtodo(m_head, ehlen);
884 		if (ip6->ip6_nxt != IPPROTO_TCP &&
885 		    ip6->ip6_nxt != IPPROTO_UDP) {
886 			m_freem(m_head);
887 			return (NULL);
888 		}
889 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
890 	}
891 #endif
892 	return (m_head);
893 }
894 
895 /*
896  * NOTE: If this function fails, m_head will be freed.
897  */
898 static __inline struct mbuf *
899 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
900 {
901 	const struct tcphdr *th;
902 	int ehlen, iphlen;
903 
904 	*tcpsyn = 0;
905 	ehlen = m_head->m_pkthdr.l2hlen;
906 	iphlen = m_head->m_pkthdr.l3hlen;
907 
908 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
909 	th = mtodo(m_head, ehlen + iphlen);
910 	if (th->th_flags & TH_SYN)
911 		*tcpsyn = 1;
912 	return (m_head);
913 }
914 
915 #undef PULLUP_HDR
916 
917 #endif	/* INET6 || INET */
918 
919 static int
920 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
921 {
922 	int error = 0;
923 
924 	HN_LOCK_ASSERT(sc);
925 
926 	if (sc->hn_rx_filter != filter) {
927 		error = hn_rndis_set_rxfilter(sc, filter);
928 		if (!error)
929 			sc->hn_rx_filter = filter;
930 	}
931 	return (error);
932 }
933 
934 static int
935 hn_rxfilter_config(struct hn_softc *sc)
936 {
937 	struct ifnet *ifp = sc->hn_ifp;
938 	uint32_t filter;
939 
940 	HN_LOCK_ASSERT(sc);
941 
942 	/*
943 	 * If the non-transparent mode VF is activated, we don't know how
944 	 * its RX filter is configured, so stick the synthetic device in
945 	 * promiscuous mode.
946 	 */
947 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
948 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
949 	} else {
950 		filter = NDIS_PACKET_TYPE_DIRECTED;
951 		if (ifp->if_flags & IFF_BROADCAST)
952 			filter |= NDIS_PACKET_TYPE_BROADCAST;
953 		/* TODO: support multicast list */
954 		if ((ifp->if_flags & IFF_ALLMULTI) ||
955 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
956 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
957 	}
958 	return (hn_set_rxfilter(sc, filter));
959 }
960 
961 static void
962 hn_set_txagg(struct hn_softc *sc)
963 {
964 	uint32_t size, pkts;
965 	int i;
966 
967 	/*
968 	 * Setup aggregation size.
969 	 */
970 	if (sc->hn_agg_size < 0)
971 		size = UINT32_MAX;
972 	else
973 		size = sc->hn_agg_size;
974 
975 	if (sc->hn_rndis_agg_size < size)
976 		size = sc->hn_rndis_agg_size;
977 
978 	/* NOTE: We only aggregate packets using chimney sending buffers. */
979 	if (size > (uint32_t)sc->hn_chim_szmax)
980 		size = sc->hn_chim_szmax;
981 
982 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
983 		/* Disable */
984 		size = 0;
985 		pkts = 0;
986 		goto done;
987 	}
988 
989 	/* NOTE: Type of the per TX ring setting is 'int'. */
990 	if (size > INT_MAX)
991 		size = INT_MAX;
992 
993 	/*
994 	 * Setup aggregation packet count.
995 	 */
996 	if (sc->hn_agg_pkts < 0)
997 		pkts = UINT32_MAX;
998 	else
999 		pkts = sc->hn_agg_pkts;
1000 
1001 	if (sc->hn_rndis_agg_pkts < pkts)
1002 		pkts = sc->hn_rndis_agg_pkts;
1003 
1004 	if (pkts <= 1) {
1005 		/* Disable */
1006 		size = 0;
1007 		pkts = 0;
1008 		goto done;
1009 	}
1010 
1011 	/* NOTE: Type of the per TX ring setting is 'short'. */
1012 	if (pkts > SHRT_MAX)
1013 		pkts = SHRT_MAX;
1014 
1015 done:
1016 	/* NOTE: Type of the per TX ring setting is 'short'. */
1017 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1018 		/* Disable */
1019 		size = 0;
1020 		pkts = 0;
1021 	}
1022 
1023 	if (bootverbose) {
1024 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1025 		    size, pkts, sc->hn_rndis_agg_align);
1026 	}
1027 
1028 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1029 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1030 
1031 		mtx_lock(&txr->hn_tx_lock);
1032 		txr->hn_agg_szmax = size;
1033 		txr->hn_agg_pktmax = pkts;
1034 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1035 		mtx_unlock(&txr->hn_tx_lock);
1036 	}
1037 }
1038 
1039 static int
1040 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1041 {
1042 
1043 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1044 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1045 		return txr->hn_txdesc_cnt;
1046 	return hn_tx_swq_depth;
1047 }
1048 
1049 static int
1050 hn_rss_reconfig(struct hn_softc *sc)
1051 {
1052 	int error;
1053 
1054 	HN_LOCK_ASSERT(sc);
1055 
1056 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1057 		return (ENXIO);
1058 
1059 	/*
1060 	 * Disable RSS first.
1061 	 *
1062 	 * NOTE:
1063 	 * Direct reconfiguration by setting the UNCHG flags does
1064 	 * _not_ work properly.
1065 	 */
1066 	if (bootverbose)
1067 		if_printf(sc->hn_ifp, "disable RSS\n");
1068 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1069 	if (error) {
1070 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1071 		return (error);
1072 	}
1073 
1074 	/*
1075 	 * Reenable the RSS w/ the updated RSS key or indirect
1076 	 * table.
1077 	 */
1078 	if (bootverbose)
1079 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1080 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1081 	if (error) {
1082 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1083 		return (error);
1084 	}
1085 	return (0);
1086 }
1087 
1088 static void
1089 hn_rss_ind_fixup(struct hn_softc *sc)
1090 {
1091 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1092 	int i, nchan;
1093 
1094 	nchan = sc->hn_rx_ring_inuse;
1095 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1096 
1097 	/*
1098 	 * Check indirect table to make sure that all channels in it
1099 	 * can be used.
1100 	 */
1101 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1102 		if (rss->rss_ind[i] >= nchan) {
1103 			if_printf(sc->hn_ifp,
1104 			    "RSS indirect table %d fixup: %u -> %d\n",
1105 			    i, rss->rss_ind[i], nchan - 1);
1106 			rss->rss_ind[i] = nchan - 1;
1107 		}
1108 	}
1109 }
1110 
1111 static int
1112 hn_ifmedia_upd(struct ifnet *ifp __unused)
1113 {
1114 
1115 	return EOPNOTSUPP;
1116 }
1117 
1118 static void
1119 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1120 {
1121 	struct hn_softc *sc = ifp->if_softc;
1122 
1123 	ifmr->ifm_status = IFM_AVALID;
1124 	ifmr->ifm_active = IFM_ETHER;
1125 
1126 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1127 		ifmr->ifm_active |= IFM_NONE;
1128 		return;
1129 	}
1130 	ifmr->ifm_status |= IFM_ACTIVE;
1131 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1132 }
1133 
1134 static void
1135 hn_rxvf_set_task(void *xarg, int pending __unused)
1136 {
1137 	struct hn_rxvf_setarg *arg = xarg;
1138 
1139 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1140 }
1141 
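/*
 * Point every RX ring at the (possibly NULL) VF ifnet.  For rings currently
 * in use, the assignment runs as a task on the ring's channel, which should
 * keep the update serialized with RX processing on that channel; idle rings
 * are updated directly.
 */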
1142 static void
1143 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1144 {
1145 	struct hn_rx_ring *rxr;
1146 	struct hn_rxvf_setarg arg;
1147 	struct task task;
1148 	int i;
1149 
1150 	HN_LOCK_ASSERT(sc);
1151 
1152 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1153 
1154 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1155 		rxr = &sc->hn_rx_ring[i];
1156 
1157 		if (i < sc->hn_rx_ring_inuse) {
1158 			arg.rxr = rxr;
1159 			arg.vf_ifp = vf_ifp;
1160 			vmbus_chan_run_task(rxr->hn_chan, &task);
1161 		} else {
1162 			rxr->hn_rxvf_ifp = vf_ifp;
1163 		}
1164 	}
1165 }
1166 
1167 static bool
1168 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1169 {
1170 	const struct ifnet *hn_ifp;
1171 
1172 	hn_ifp = sc->hn_ifp;
1173 
1174 	if (ifp == hn_ifp)
1175 		return (false);
1176 
1177 	if (ifp->if_alloctype != IFT_ETHER)
1178 		return (false);
1179 
1180 	/* Ignore lagg/vlan interfaces */
1181 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1182 	    strcmp(ifp->if_dname, "vlan") == 0)
1183 		return (false);
1184 
1185 	/*
1186 	 * During detach events ifp->if_addr might be NULL.
1187 	 * Make sure the bcmp() below doesn't panic on that:
1188 	 */
1189 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1190 		return (false);
1191 
1192 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1193 		return (false);
1194 
1195 	return (true);
1196 }
1197 
1198 static void
1199 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1200 {
1201 	struct ifnet *hn_ifp;
1202 
1203 	HN_LOCK(sc);
1204 
1205 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1206 		goto out;
1207 
1208 	if (!hn_ismyvf(sc, ifp))
1209 		goto out;
1210 	hn_ifp = sc->hn_ifp;
1211 
1212 	if (rxvf) {
1213 		if (sc->hn_flags & HN_FLAG_RXVF)
1214 			goto out;
1215 
1216 		sc->hn_flags |= HN_FLAG_RXVF;
1217 		hn_rxfilter_config(sc);
1218 	} else {
1219 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1220 			goto out;
1221 
1222 		sc->hn_flags &= ~HN_FLAG_RXVF;
1223 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1224 			hn_rxfilter_config(sc);
1225 		else
1226 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1227 	}
1228 
1229 	hn_nvs_set_datapath(sc,
1230 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1231 
1232 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1233 
1234 	if (rxvf) {
1235 		hn_vf_rss_fixup(sc, true);
1236 		hn_suspend_mgmt(sc);
1237 		sc->hn_link_flags &=
1238 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1239 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1240 	} else {
1241 		hn_vf_rss_restore(sc);
1242 		hn_resume_mgmt(sc);
1243 	}
1244 
1245 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1246 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1247 
1248 	if (bootverbose) {
1249 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1250 		    rxvf ? "to" : "from", ifp->if_xname);
1251 	}
1252 out:
1253 	HN_UNLOCK(sc);
1254 }
1255 
1256 static void
1257 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1258 {
1259 
1260 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1261 		return;
1262 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1263 }
1264 
1265 static void
1266 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1267 {
1268 
1269 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1270 }
1271 
1272 static int
1273 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1274 {
1275 	struct ifnet *ifp, *vf_ifp;
1276 	uint64_t tmp;
1277 	int error;
1278 
1279 	HN_LOCK_ASSERT(sc);
1280 	ifp = sc->hn_ifp;
1281 	vf_ifp = sc->hn_vf_ifp;
1282 
1283 	/*
1284 	 * Fix up requested capabilities w/ supported capabilities,
1285 	 * since the supported capabilities could have been changed.
1286 	 */
1287 	ifr->ifr_reqcap &= ifp->if_capabilities;
1288 	/* Pass SIOCSIFCAP to VF. */
1289 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1290 
1291 	/*
1292 	 * NOTE:
1293 	 * The error will be propagated to the callers, however, it
1294 	 * is _not_ useful here.
1295 	 */
1296 
1297 	/*
1298 	 * Merge VF's enabled capabilities.
1299 	 */
1300 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1301 
1302 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1303 	if (ifp->if_capenable & IFCAP_TXCSUM)
1304 		ifp->if_hwassist |= tmp;
1305 	else
1306 		ifp->if_hwassist &= ~tmp;
1307 
1308 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1309 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1310 		ifp->if_hwassist |= tmp;
1311 	else
1312 		ifp->if_hwassist &= ~tmp;
1313 
1314 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1315 	if (ifp->if_capenable & IFCAP_TSO4)
1316 		ifp->if_hwassist |= tmp;
1317 	else
1318 		ifp->if_hwassist &= ~tmp;
1319 
1320 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1321 	if (ifp->if_capenable & IFCAP_TSO6)
1322 		ifp->if_hwassist |= tmp;
1323 	else
1324 		ifp->if_hwassist &= ~tmp;
1325 
1326 	return (error);
1327 }
1328 
1329 static int
1330 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1331 {
1332 	struct ifnet *vf_ifp;
1333 	struct ifreq ifr;
1334 
1335 	HN_LOCK_ASSERT(sc);
1336 	vf_ifp = sc->hn_vf_ifp;
1337 
1338 	memset(&ifr, 0, sizeof(ifr));
1339 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1340 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1341 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1342 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1343 }
1344 
1345 static void
1346 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1347 {
1348 	struct ifnet *ifp = sc->hn_ifp;
1349 	int allmulti = 0;
1350 
1351 	HN_LOCK_ASSERT(sc);
1352 
1353 	/* XXX vlan(4) style mcast addr maintenance */
1354 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1355 		allmulti = IFF_ALLMULTI;
1356 
1357 	/* Always set the VF's if_flags */
1358 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1359 }
1360 
1361 static void
1362 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1363 {
1364 	struct rm_priotracker pt;
1365 	struct ifnet *hn_ifp = NULL;
1366 	struct mbuf *mn;
1367 
1368 	/*
1369 	 * XXX racy if hn(4) is ever detached.
1370 	 */
1371 	rm_rlock(&hn_vfmap_lock, &pt);
1372 	if (vf_ifp->if_index < hn_vfmap_size)
1373 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1374 	rm_runlock(&hn_vfmap_lock, &pt);
1375 
1376 	if (hn_ifp != NULL) {
1377 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1378 			/*
1379 			 * Allow tapping on the VF.
1380 			 */
1381 			ETHER_BPF_MTAP(vf_ifp, mn);
1382 
1383 			/*
1384 			 * Update VF stats.
1385 			 */
1386 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1387 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1388 				    mn->m_pkthdr.len);
1389 			}
1390 			/*
1391 			 * XXX IFCOUNTER_IMCAST
1392 			 * This stat updating is kinda invasive, since it
1393 			 * requires two checks on the mbuf: the length check
1394 			 * and the ethernet header check.  As of this writing,
1395 			 * all multicast packets go directly to hn(4), which
1396 			 * makes imcast stat updating in the VF pointless.
1397 			 */
1398 
1399 			/*
1400 			 * Fix up rcvif and increase hn(4)'s ipackets.
1401 			 */
1402 			mn->m_pkthdr.rcvif = hn_ifp;
1403 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1404 		}
1405 		/*
1406 		 * Go through hn(4)'s if_input.
1407 		 */
1408 		hn_ifp->if_input(hn_ifp, m);
1409 	} else {
1410 		/*
1411 		 * In the middle of the transition; free this
1412 		 * mbuf chain.
1413 		 */
1414 		while (m != NULL) {
1415 			mn = m->m_nextpkt;
1416 			m->m_nextpkt = NULL;
1417 			m_freem(m);
1418 			m = mn;
1419 		}
1420 	}
1421 }
1422 
1423 static void
1424 hn_mtu_change_fixup(struct hn_softc *sc)
1425 {
1426 	struct ifnet *ifp;
1427 
1428 	HN_LOCK_ASSERT(sc);
1429 	ifp = sc->hn_ifp;
1430 
1431 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1432 #if __FreeBSD_version >= 1100099
1433 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1434 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1435 #endif
1436 }
1437 
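/*
 * hn_rss_type_fromndis() and hn_rss_type_tondis() translate between the
 * NDIS_HASH_* bits used by the synthetic device and the RSS_TYPE_* bits
 * used by the SIOCGIFRSSHASH interface; for the types listed here they are
 * inverses of each other.  UDP over IPv6 has no NDIS equivalent, hence the
 * KASSERT in hn_rss_type_tondis().
 */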
1438 static uint32_t
1439 hn_rss_type_fromndis(uint32_t rss_hash)
1440 {
1441 	uint32_t types = 0;
1442 
1443 	if (rss_hash & NDIS_HASH_IPV4)
1444 		types |= RSS_TYPE_IPV4;
1445 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1446 		types |= RSS_TYPE_TCP_IPV4;
1447 	if (rss_hash & NDIS_HASH_IPV6)
1448 		types |= RSS_TYPE_IPV6;
1449 	if (rss_hash & NDIS_HASH_IPV6_EX)
1450 		types |= RSS_TYPE_IPV6_EX;
1451 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1452 		types |= RSS_TYPE_TCP_IPV6;
1453 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1454 		types |= RSS_TYPE_TCP_IPV6_EX;
1455 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1456 		types |= RSS_TYPE_UDP_IPV4;
1457 	return (types);
1458 }
1459 
1460 static uint32_t
1461 hn_rss_type_tondis(uint32_t types)
1462 {
1463 	uint32_t rss_hash = 0;
1464 
1465 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1466 	    ("UDP6 and UDP6EX are not supported"));
1467 
1468 	if (types & RSS_TYPE_IPV4)
1469 		rss_hash |= NDIS_HASH_IPV4;
1470 	if (types & RSS_TYPE_TCP_IPV4)
1471 		rss_hash |= NDIS_HASH_TCP_IPV4;
1472 	if (types & RSS_TYPE_IPV6)
1473 		rss_hash |= NDIS_HASH_IPV6;
1474 	if (types & RSS_TYPE_IPV6_EX)
1475 		rss_hash |= NDIS_HASH_IPV6_EX;
1476 	if (types & RSS_TYPE_TCP_IPV6)
1477 		rss_hash |= NDIS_HASH_TCP_IPV6;
1478 	if (types & RSS_TYPE_TCP_IPV6_EX)
1479 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1480 	if (types & RSS_TYPE_UDP_IPV4)
1481 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1482 	return (rss_hash);
1483 }
1484 
1485 static void
1486 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1487 {
1488 	int i;
1489 
1490 	HN_LOCK_ASSERT(sc);
1491 
1492 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1493 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1494 }
1495 
1496 static void
1497 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1498 {
1499 	struct ifnet *ifp, *vf_ifp;
1500 	struct ifrsshash ifrh;
1501 	struct ifrsskey ifrk;
1502 	int error;
1503 	uint32_t my_types, diff_types, mbuf_types = 0;
1504 
1505 	HN_LOCK_ASSERT(sc);
1506 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1507 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1508 
1509 	if (sc->hn_rx_ring_inuse == 1) {
1510 		/* No RSS on synthetic parts; done. */
1511 		return;
1512 	}
1513 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1514 		/* Synthetic parts do not support Toeplitz; done. */
1515 		return;
1516 	}
1517 
1518 	ifp = sc->hn_ifp;
1519 	vf_ifp = sc->hn_vf_ifp;
1520 
1521 	/*
1522 	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
1523 	 * supported.
1524 	 */
1525 	memset(&ifrk, 0, sizeof(ifrk));
1526 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1527 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1528 	if (error) {
1529 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1530 		    vf_ifp->if_xname, error);
1531 		goto done;
1532 	}
1533 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1534 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1535 		    vf_ifp->if_xname, ifrk.ifrk_func);
1536 		goto done;
1537 	}
1538 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1539 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1540 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1541 		goto done;
1542 	}
1543 
1544 	/*
1545 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1546 	 */
1547 	memset(&ifrh, 0, sizeof(ifrh));
1548 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1549 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1550 	if (error) {
1551 		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
1552 		    vf_ifp->if_xname, error);
1553 		goto done;
1554 	}
1555 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1556 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1557 		    vf_ifp->if_xname, ifrh.ifrh_func);
1558 		goto done;
1559 	}
1560 
1561 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1562 	if ((ifrh.ifrh_types & my_types) == 0) {
1563 		/* This would effectively disable RSS; ignore it. */
1564 		if_printf(ifp, "%s intersection of RSS types failed.  "
1565 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1566 		    ifrh.ifrh_types, my_types);
1567 		goto done;
1568 	}
1569 
1570 	diff_types = my_types ^ ifrh.ifrh_types;
1571 	my_types &= ifrh.ifrh_types;
1572 	mbuf_types = my_types;
1573 
1574 	/*
1575 	 * Detect RSS hash value/type conflicts.
1576 	 *
1577 	 * NOTE:
1578 	 * We don't disable the hash type, but we stop delivering the hash
1579 	 * value/type through mbufs on the RX path.
1580 	 *
1581 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1582 	 * hash is delivered with type of TCP_IPV4.  This means if
1583 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1584 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1585 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1586 	 * here.
1587 	 */
1588 	if ((my_types & RSS_TYPE_IPV4) &&
1589 	    (diff_types & ifrh.ifrh_types &
1590 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1591 		/* Conflict; disable IPV4 hash type/value delivery. */
1592 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1593 		mbuf_types &= ~RSS_TYPE_IPV4;
1594 	}
1595 	if ((my_types & RSS_TYPE_IPV6) &&
1596 	    (diff_types & ifrh.ifrh_types &
1597 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1598 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1599 	      RSS_TYPE_IPV6_EX))) {
1600 		/* Conflict; disable IPV6 hash type/value delivery. */
1601 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1602 		mbuf_types &= ~RSS_TYPE_IPV6;
1603 	}
1604 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1605 	    (diff_types & ifrh.ifrh_types &
1606 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1607 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1608 	      RSS_TYPE_IPV6))) {
1609 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1610 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1611 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1612 	}
1613 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1614 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1615 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1616 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1617 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1618 	}
1619 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1620 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1621 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1622 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1623 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1624 	}
1625 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1626 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1627 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1628 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1629 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1630 	}
1631 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1632 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1633 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1634 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1635 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1636 	}
1637 
1638 	/*
1639 	 * Indirect table does not matter.
1640 	 */
1641 
1642 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1643 	    hn_rss_type_tondis(my_types);
1644 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1645 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1646 
1647 	if (reconf) {
1648 		error = hn_rss_reconfig(sc);
1649 		if (error) {
1650 			/* XXX roll-back? */
1651 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1652 			/* XXX keep going. */
1653 		}
1654 	}
1655 done:
1656 	/* Hash deliverability for mbufs. */
1657 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1658 }
1659 
1660 static void
1661 hn_vf_rss_restore(struct hn_softc *sc)
1662 {
1663 
1664 	HN_LOCK_ASSERT(sc);
1665 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1666 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1667 
1668 	if (sc->hn_rx_ring_inuse == 1)
1669 		goto done;
1670 
1671 	/*
1672 	 * Restore hash types.  Key does _not_ matter.
1673 	 */
1674 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1675 		int error;
1676 
1677 		sc->hn_rss_hash = sc->hn_rss_hcap;
1678 		error = hn_rss_reconfig(sc);
1679 		if (error) {
1680 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1681 			    error);
1682 			/* XXX keep going. */
1683 		}
1684 	}
1685 done:
1686 	/* Hash deliverability for mbufs. */
1687 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1688 }
1689 
1690 static void
1691 hn_xpnt_vf_setready(struct hn_softc *sc)
1692 {
1693 	struct ifnet *ifp, *vf_ifp;
1694 	struct ifreq ifr;
1695 
1696 	HN_LOCK_ASSERT(sc);
1697 	ifp = sc->hn_ifp;
1698 	vf_ifp = sc->hn_vf_ifp;
1699 
1700 	/*
1701 	 * Mark the VF ready.
1702 	 */
1703 	sc->hn_vf_rdytick = 0;
1704 
1705 	/*
1706 	 * Save information for restoration.
1707 	 */
1708 	sc->hn_saved_caps = ifp->if_capabilities;
1709 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1710 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1711 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1712 
1713 	/*
1714 	 * Intersect supported/enabled capabilities.
1715 	 *
1716 	 * NOTE:
1717 	 * if_hwassist is not changed here.
1718 	 */
1719 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1720 	ifp->if_capenable &= ifp->if_capabilities;
1721 
1722 	/*
1723 	 * Fix TSO settings.
1724 	 */
1725 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1726 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1727 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1728 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1729 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1730 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1731 
1732 	/*
1733 	 * Change VF's enabled capabilities.
1734 	 */
1735 	memset(&ifr, 0, sizeof(ifr));
1736 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1737 	ifr.ifr_reqcap = ifp->if_capenable;
1738 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1739 
1740 	if (ifp->if_mtu != ETHERMTU) {
1741 		int error;
1742 
1743 		/*
1744 		 * Change VF's MTU.
1745 		 */
1746 		memset(&ifr, 0, sizeof(ifr));
1747 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1748 		ifr.ifr_mtu = ifp->if_mtu;
1749 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1750 		if (error) {
1751 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1752 			    vf_ifp->if_xname, ifp->if_mtu);
1753 			if (ifp->if_mtu > ETHERMTU) {
1754 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1755 
1756 				/*
1757 				 * XXX
1758 				 * No need to adjust the synthetic parts' MTU;
1759 				 * failure of the adjustment will cause us
1760 				 * infinite headache.
1761 				 */
1762 				ifp->if_mtu = ETHERMTU;
1763 				hn_mtu_change_fixup(sc);
1764 			}
1765 		}
1766 	}
1767 }
1768 
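/*
 * A transparent VF is ready once its attach-settling period has
 * expired; hn_vf_rdytick == 0 means it is already ready, while a
 * future tick value means it is still settling.
 */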
1769 static bool
1770 hn_xpnt_vf_isready(struct hn_softc *sc)
1771 {
1772 
1773 	HN_LOCK_ASSERT(sc);
1774 
1775 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1776 		return (false);
1777 
1778 	if (sc->hn_vf_rdytick == 0)
1779 		return (true);
1780 
1781 	if (sc->hn_vf_rdytick > ticks)
1782 		return (false);
1783 
1784 	/* Mark VF as ready. */
1785 	hn_xpnt_vf_setready(sc);
1786 	return (true);
1787 }
1788 
1789 static void
1790 hn_xpnt_vf_setenable(struct hn_softc *sc)
1791 {
1792 	int i;
1793 
1794 	HN_LOCK_ASSERT(sc);
1795 
1796 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1797 	rm_wlock(&sc->hn_vf_lock);
1798 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1799 	rm_wunlock(&sc->hn_vf_lock);
1800 
1801 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1802 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1803 }
1804 
1805 static void
1806 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1807 {
1808 	int i;
1809 
1810 	HN_LOCK_ASSERT(sc);
1811 
1812 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1813 	rm_wlock(&sc->hn_vf_lock);
1814 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1815 	if (clear_vf)
1816 		sc->hn_vf_ifp = NULL;
1817 	rm_wunlock(&sc->hn_vf_lock);
1818 
1819 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1820 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1821 }
1822 
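/*
 * Bring the transparent VF up, switch the NVS datapath to the VF,
 * fix up RSS, and mark the VF enabled on all RX rings.
 */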
1823 static void
1824 hn_xpnt_vf_init(struct hn_softc *sc)
1825 {
1826 	int error;
1827 
1828 	HN_LOCK_ASSERT(sc);
1829 
1830 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1831 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1832 
1833 	if (bootverbose) {
1834 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1835 		    sc->hn_vf_ifp->if_xname);
1836 	}
1837 
1838 	/*
1839 	 * Bring the VF up.
1840 	 */
1841 	hn_xpnt_vf_saveifflags(sc);
1842 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1843 	error = hn_xpnt_vf_iocsetflags(sc);
1844 	if (error) {
1845 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1846 		    sc->hn_vf_ifp->if_xname, error);
1847 		return;
1848 	}
1849 
1850 	/*
1851 	 * NOTE:
1852 	 * Datapath setting must happen _after_ bringing the VF up.
1853 	 */
1854 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1855 
1856 	/*
1857 	 * NOTE:
1858 	 * Fixup RSS related bits _after_ the VF is brought up, since
1859 	 * many VFs generate RSS key during it's initialization.
1860 	 * many VFs generate their RSS keys during initialization.
1861 	hn_vf_rss_fixup(sc, true);
1862 
1863 	/* Mark transparent mode VF as enabled. */
1864 	hn_xpnt_vf_setenable(sc);
1865 }
1866 
1867 static void
1868 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1869 {
1870 	struct hn_softc *sc = xsc;
1871 
1872 	HN_LOCK(sc);
1873 
1874 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1875 		goto done;
1876 	if (sc->hn_vf_ifp == NULL)
1877 		goto done;
1878 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1879 		goto done;
1880 
1881 	if (sc->hn_vf_rdytick != 0) {
1882 		/* Mark VF as ready. */
1883 		hn_xpnt_vf_setready(sc);
1884 	}
1885 
1886 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1887 		/*
1888 		 * Delayed VF initialization.
1889 		 */
1890 		if (bootverbose) {
1891 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1892 			    sc->hn_vf_ifp->if_xname);
1893 		}
1894 		hn_xpnt_vf_init(sc);
1895 	}
1896 done:
1897 	HN_UNLOCK(sc);
1898 }
1899 
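/*
 * ether_ifattach event handler: detect the VF paired with this hn(4)
 * instance, record the ifindex mapping and, in transparent VF mode,
 * hook the VF's if_input and schedule the delayed VF bring-up.
 */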
1900 static void
1901 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1902 {
1903 	struct hn_softc *sc = xsc;
1904 
1905 	HN_LOCK(sc);
1906 
1907 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1908 		goto done;
1909 
1910 	if (!hn_ismyvf(sc, ifp))
1911 		goto done;
1912 
1913 	if (sc->hn_vf_ifp != NULL) {
1914 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1915 		    sc->hn_vf_ifp->if_xname);
1916 		goto done;
1917 	}
1918 
1919 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1920 		/*
1921 		 * ifnet.if_start is _not_ supported by transparent
1922 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1923 		 */
1924 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1925 		    "in transparent VF mode.\n", ifp->if_xname);
1926 		goto done;
1927 	}
1928 
1929 	rm_wlock(&hn_vfmap_lock);
1930 
1931 	if (ifp->if_index >= hn_vfmap_size) {
1932 		struct ifnet **newmap;
1933 		int newsize;
1934 
1935 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1936 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1937 		    M_WAITOK | M_ZERO);
1938 
1939 		memcpy(newmap, hn_vfmap,
1940 		    sizeof(struct ifnet *) * hn_vfmap_size);
1941 		free(hn_vfmap, M_DEVBUF);
1942 		hn_vfmap = newmap;
1943 		hn_vfmap_size = newsize;
1944 	}
1945 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1946 	    ("%s: ifindex %d was mapped to %s",
1947 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1948 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1949 
1950 	rm_wunlock(&hn_vfmap_lock);
1951 
1952 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1953 	rm_wlock(&sc->hn_vf_lock);
1954 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1955 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1956 	sc->hn_vf_ifp = ifp;
1957 	rm_wunlock(&sc->hn_vf_lock);
1958 
1959 	if (hn_xpnt_vf) {
1960 		int wait_ticks;
1961 
1962 		/*
1963 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1964 		 * Save vf_ifp's current if_input for later restoration.
1965 		 */
1966 		sc->hn_vf_input = ifp->if_input;
1967 		ifp->if_input = hn_xpnt_vf_input;
1968 
1969 		/*
1970 		 * Stop link status management; use the VF's.
1971 		 */
1972 		hn_suspend_mgmt(sc);
1973 
1974 		/*
1975 		 * Give the VF some time to complete its attach routine.
1976 		 */
1977 		wait_ticks = hn_xpnt_vf_attwait * hz;
1978 		sc->hn_vf_rdytick = ticks + wait_ticks;
1979 
1980 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1981 		    wait_ticks);
1982 	}
1983 done:
1984 	HN_UNLOCK(sc);
1985 }
1986 
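/*
 * ifnet departure event handler: undo hn_ifnet_attevent(), switch the
 * datapath back to the synthetic device, and restore the saved
 * capabilities, TSO limits and RSS settings.
 */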
1987 static void
1988 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1989 {
1990 	struct hn_softc *sc = xsc;
1991 
1992 	HN_LOCK(sc);
1993 
1994 	if (sc->hn_vf_ifp == NULL)
1995 		goto done;
1996 
1997 	if (!hn_ismyvf(sc, ifp))
1998 		goto done;
1999 
2000 	if (hn_xpnt_vf) {
2001 		/*
2002 		 * Make sure that the delayed initialization is not running.
2003 		 *
2004 		 * NOTE:
2005 		 * - This lock _must_ be released, since the hn_vf_init task
2006 		 *   will try holding this lock.
2007 		 * - It is safe to release this lock here, since the
2008 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2009 		 *
2010 		 * XXX racy, if hn(4) ever detached.
2011 		 */
2012 		HN_UNLOCK(sc);
2013 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2014 		HN_LOCK(sc);
2015 
2016 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2017 		    sc->hn_ifp->if_xname));
2018 		ifp->if_input = sc->hn_vf_input;
2019 		sc->hn_vf_input = NULL;
2020 
2021 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2022 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2023 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2024 
2025 		if (sc->hn_vf_rdytick == 0) {
2026 			/*
2027 			 * The VF was ready; restore some settings.
2028 			 */
2029 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2030 			/*
2031 			 * NOTE:
2032 			 * There is _no_ need to fixup if_capenable and
2033 			 * if_hwassist, since the if_capabilities before
2034 			 * restoration was an intersection of the VF's
2035 			 * if_capabilities and the synthetic device's
2036 			 * if_capabilities.
2037 			 */
2038 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2039 			sc->hn_ifp->if_hw_tsomaxsegcount =
2040 			    sc->hn_saved_tsosegcnt;
2041 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2042 		}
2043 
2044 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2045 			/*
2046 			 * Restore RSS settings.
2047 			 */
2048 			hn_vf_rss_restore(sc);
2049 
2050 			/*
2051 			 * Resume link status management, which was suspended
2052 			 * by hn_ifnet_attevent().
2053 			 */
2054 			hn_resume_mgmt(sc);
2055 		}
2056 	}
2057 
2058 	/* Mark transparent mode VF as disabled. */
2059 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2060 
2061 	rm_wlock(&hn_vfmap_lock);
2062 
2063 	KASSERT(ifp->if_index < hn_vfmap_size,
2064 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2065 	if (hn_vfmap[ifp->if_index] != NULL) {
2066 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2067 		    ("%s: ifindex %d was mapped to %s",
2068 		     ifp->if_xname, ifp->if_index,
2069 		     hn_vfmap[ifp->if_index]->if_xname));
2070 		hn_vfmap[ifp->if_index] = NULL;
2071 	}
2072 
2073 	rm_wunlock(&hn_vfmap_lock);
2074 done:
2075 	HN_UNLOCK(sc);
2076 }
2077 
2078 static void
2079 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2080 {
2081 	struct hn_softc *sc = xsc;
2082 
2083 	if (sc->hn_vf_ifp == ifp)
2084 		if_link_state_change(sc->hn_ifp, link_state);
2085 }
2086 
2087 static int
2088 hn_probe(device_t dev)
2089 {
2090 
2091 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2092 		device_set_desc(dev, "Hyper-V Network Interface");
2093 		return BUS_PROBE_DEFAULT;
2094 	}
2095 	return ENXIO;
2096 }
2097 
2098 static int
2099 hn_attach(device_t dev)
2100 {
2101 	struct hn_softc *sc = device_get_softc(dev);
2102 	struct sysctl_oid_list *child;
2103 	struct sysctl_ctx_list *ctx;
2104 	uint8_t eaddr[ETHER_ADDR_LEN];
2105 	struct ifnet *ifp = NULL;
2106 	int error, ring_cnt, tx_ring_cnt;
2107 	uint32_t mtu;
2108 
2109 	sc->hn_dev = dev;
2110 	sc->hn_prichan = vmbus_get_channel(dev);
2111 	HN_LOCK_INIT(sc);
2112 	rm_init(&sc->hn_vf_lock, "hnvf");
2113 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2114 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2115 
2116 	/*
2117 	 * Initialize these tunables once.
2118 	 */
2119 	sc->hn_agg_size = hn_tx_agg_size;
2120 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2121 
2122 	/*
2123 	 * Setup taskqueue for transmission.
2124 	 */
2125 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2126 		int i;
2127 
2128 		sc->hn_tx_taskqs =
2129 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2130 		    M_DEVBUF, M_WAITOK);
2131 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2132 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2133 			    M_WAITOK, taskqueue_thread_enqueue,
2134 			    &sc->hn_tx_taskqs[i]);
2135 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2136 			    "%s tx%d", device_get_nameunit(dev), i);
2137 		}
2138 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2139 		sc->hn_tx_taskqs = hn_tx_taskque;
2140 	}
2141 
2142 	/*
2143 	 * Setup taskqueue for management tasks, e.g. link status.
2144 	 */
2145 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2146 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2147 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2148 	    device_get_nameunit(dev));
2149 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2150 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2151 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2152 	    hn_netchg_status_taskfunc, sc);
2153 
2154 	if (hn_xpnt_vf) {
2155 		/*
2156 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2157 		 */
2158 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2159 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2160 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2161 		    device_get_nameunit(dev));
2162 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2163 		    hn_xpnt_vf_init_taskfunc, sc);
2164 	}
2165 
2166 	/*
2167 	 * Allocate ifnet and setup its name earlier, so that if_printf
2168 	 * can be used by functions, which will be called after
2169 	 * ether_ifattach().
2170 	 */
2171 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2172 	ifp->if_softc = sc;
2173 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2174 
2175 	/*
2176 	 * Initialize ifmedia earlier so that it can be unconditionally
2177 	 * destroyed, if an error happens later on.
2178 	 */
2179 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2180 
2181 	/*
2182 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2183 	 * to use (tx_ring_cnt).
2184 	 *
2185 	 * NOTE:
2186 	 * The # of RX rings to use is the same as the # of channels to use.
2187 	 */
2188 	ring_cnt = hn_chan_cnt;
2189 	if (ring_cnt <= 0) {
2190 		/* Default */
2191 		ring_cnt = mp_ncpus;
2192 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2193 			ring_cnt = HN_RING_CNT_DEF_MAX;
2194 	} else if (ring_cnt > mp_ncpus) {
2195 		ring_cnt = mp_ncpus;
2196 	}
2197 #ifdef RSS
2198 	if (ring_cnt > rss_getnumbuckets())
2199 		ring_cnt = rss_getnumbuckets();
2200 #endif
2201 
2202 	tx_ring_cnt = hn_tx_ring_cnt;
2203 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2204 		tx_ring_cnt = ring_cnt;
2205 #ifdef HN_IFSTART_SUPPORT
2206 	if (hn_use_if_start) {
2207 		/* ifnet.if_start only needs one TX ring. */
2208 		tx_ring_cnt = 1;
2209 	}
2210 #endif
2211 
2212 	/*
2213 	 * Set the leader CPU for channels.
2214 	 */
2215 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2216 
2217 	/*
2218 	 * Create enough TX/RX rings, even if only a limited number of
2219 	 * channels can be allocated.
2220 	 */
2221 	error = hn_create_tx_data(sc, tx_ring_cnt);
2222 	if (error)
2223 		goto failed;
2224 	error = hn_create_rx_data(sc, ring_cnt);
2225 	if (error)
2226 		goto failed;
2227 
2228 	/*
2229 	 * Create transaction context for NVS and RNDIS transactions.
2230 	 */
2231 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2232 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2233 	if (sc->hn_xact == NULL) {
2234 		error = ENXIO;
2235 		goto failed;
2236 	}
2237 
2238 	/*
2239 	 * Install orphan handler for the revocation of this device's
2240 	 * primary channel.
2241 	 *
2242 	 * NOTE:
2243 	 * The processing order is critical here:
2244 	 * Install the orphan handler, _before_ testing whether this
2245 	 * device's primary channel has been revoked or not.
2246 	 */
2247 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2248 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2249 		error = ENXIO;
2250 		goto failed;
2251 	}
2252 
2253 	/*
2254 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2255 	 */
2256 	error = hn_synth_attach(sc, ETHERMTU);
2257 	if (error)
2258 		goto failed;
2259 
2260 	error = hn_rndis_get_eaddr(sc, eaddr);
2261 	if (error)
2262 		goto failed;
2263 
2264 	error = hn_rndis_get_mtu(sc, &mtu);
2265 	if (error)
2266 		mtu = ETHERMTU;
2267 	else if (bootverbose)
2268 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2269 
2270 #if __FreeBSD_version >= 1100099
2271 	if (sc->hn_rx_ring_inuse > 1) {
2272 		/*
2273 		 * Reduce TCP segment aggregation limit for multiple
2274 		 * RX rings to increase ACK timeliness.
2275 		 */
2276 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2277 	}
2278 #endif
2279 
2280 	/*
2281 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2282 	 */
2283 	hn_fixup_tx_data(sc);
2284 	hn_fixup_rx_data(sc);
2285 
2286 	ctx = device_get_sysctl_ctx(dev);
2287 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2288 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2289 	    &sc->hn_nvs_ver, 0, "NVS version");
2290 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2291 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2292 	    hn_ndis_version_sysctl, "A", "NDIS version");
2293 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2294 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2295 	    hn_caps_sysctl, "A", "capabilities");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_hwassist_sysctl, "A", "hwassist");
2299 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2300 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2301 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2302 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2303 	    "max # of TSO segments");
2304 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2305 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2306 	    "max size of TSO segment");
2307 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2308 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2309 	    hn_rxfilter_sysctl, "A", "rxfilter");
2310 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2311 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2312 	    hn_rss_hash_sysctl, "A", "RSS hash");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2314 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2316 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2317 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2318 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2319 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2320 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2321 #ifndef RSS
2322 	/*
2323 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2324 	 */
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2326 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 	    hn_rss_key_sysctl, "IU", "RSS key");
2328 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2329 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2330 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2331 #endif
2332 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2333 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2334 	    "RNDIS offered packet transmission aggregation size limit");
2335 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2336 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2337 	    "RNDIS offered packet transmission aggregation count limit");
2338 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2339 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2340 	    "RNDIS packet transmission aggregation alignment");
2341 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2342 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2343 	    hn_txagg_size_sysctl, "I",
2344 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2345 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2346 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2347 	    hn_txagg_pkts_sysctl, "I",
2348 	    "Packet transmission aggregation packets, "
2349 	    "0 -- disable, -1 -- auto");
2350 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2351 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2352 	    hn_polling_sysctl, "I",
2353 	    "Polling frequency: [100,1000000], 0 disable polling");
2354 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2355 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2356 	    hn_vf_sysctl, "A", "Virtual Function's name");
2357 	if (!hn_xpnt_vf) {
2358 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2359 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2360 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2361 	} else {
2362 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2363 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2364 		    hn_xpnt_vf_enabled_sysctl, "I",
2365 		    "Transparent VF enabled");
2366 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2367 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2368 		    hn_xpnt_vf_accbpf_sysctl, "I",
2369 		    "Accurate BPF for transparent VF");
2370 	}
2371 
2372 	/*
2373 	 * Setup the ifmedia, which has been initialized earlier.
2374 	 */
2375 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2376 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2377 	/* XXX ifmedia_set really should do this for us */
2378 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2379 
2380 	/*
2381 	 * Setup the ifnet for this interface.
2382 	 */
2383 
2384 	ifp->if_baudrate = IF_Gbps(10);
2385 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2386 	ifp->if_ioctl = hn_ioctl;
2387 	ifp->if_init = hn_init;
2388 #ifdef HN_IFSTART_SUPPORT
2389 	if (hn_use_if_start) {
2390 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2391 
2392 		ifp->if_start = hn_start;
2393 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2394 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2395 		IFQ_SET_READY(&ifp->if_snd);
2396 	} else
2397 #endif
2398 	{
2399 		ifp->if_transmit = hn_transmit;
2400 		ifp->if_qflush = hn_xmit_qflush;
2401 	}
2402 
2403 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2404 #ifdef foo
2405 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2406 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2407 #endif
2408 	if (sc->hn_caps & HN_CAP_VLAN) {
2409 		/* XXX not sure about VLAN_MTU. */
2410 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2411 	}
2412 
2413 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2414 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2415 		ifp->if_capabilities |= IFCAP_TXCSUM;
2416 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2417 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2418 	if (sc->hn_caps & HN_CAP_TSO4) {
2419 		ifp->if_capabilities |= IFCAP_TSO4;
2420 		ifp->if_hwassist |= CSUM_IP_TSO;
2421 	}
2422 	if (sc->hn_caps & HN_CAP_TSO6) {
2423 		ifp->if_capabilities |= IFCAP_TSO6;
2424 		ifp->if_hwassist |= CSUM_IP6_TSO;
2425 	}
2426 
2427 	/* Enable all available capabilities by default. */
2428 	ifp->if_capenable = ifp->if_capabilities;
2429 
2430 	/*
2431 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2432 	 * be enabled through SIOCSIFCAP.
2433 	 */
2434 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2435 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2436 
2437 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2438 		/*
2439 		 * Lock hn_set_tso_maxsize() to simplify its
2440 		 * internal logic.
2441 		 */
2442 		HN_LOCK(sc);
2443 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2444 		HN_UNLOCK(sc);
2445 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2446 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2447 	}
2448 
2449 	ether_ifattach(ifp, eaddr);
2450 
2451 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2452 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2453 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2454 	}
2455 	if (mtu < ETHERMTU) {
2456 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2457 		ifp->if_mtu = mtu;
2458 	}
2459 
2460 	/* Inform the upper layer about the long frame support. */
2461 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2462 
2463 	/*
2464 	 * Kick off link status check.
2465 	 */
2466 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2467 	hn_update_link_status(sc);
2468 
2469 	if (!hn_xpnt_vf) {
2470 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2471 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2472 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2473 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2474 	} else {
2475 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2476 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2477 	}
2478 
2479 	/*
2480 	 * NOTE:
2481 	 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2482 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2483 	 * available when the ifnet_arrival event is triggered.
2484 	 */
2485 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2486 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2487 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2488 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2489 
2490 	return (0);
2491 failed:
2492 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2493 		hn_synth_detach(sc);
2494 	hn_detach(dev);
2495 	return (error);
2496 }
2497 
2498 static int
2499 hn_detach(device_t dev)
2500 {
2501 	struct hn_softc *sc = device_get_softc(dev);
2502 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2503 
2504 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2505 		/*
2506 		 * In case the vmbus missed the orphan handler
2507 		 * installation.
2508 		 */
2509 		vmbus_xact_ctx_orphan(sc->hn_xact);
2510 	}
2511 
2512 	if (sc->hn_ifaddr_evthand != NULL)
2513 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2514 	if (sc->hn_ifnet_evthand != NULL)
2515 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2516 	if (sc->hn_ifnet_atthand != NULL) {
2517 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2518 		    sc->hn_ifnet_atthand);
2519 	}
2520 	if (sc->hn_ifnet_dethand != NULL) {
2521 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2522 		    sc->hn_ifnet_dethand);
2523 	}
2524 	if (sc->hn_ifnet_lnkhand != NULL)
2525 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2526 
2527 	vf_ifp = sc->hn_vf_ifp;
2528 	__compiler_membar();
2529 	if (vf_ifp != NULL)
2530 		hn_ifnet_detevent(sc, vf_ifp);
2531 
2532 	if (device_is_attached(dev)) {
2533 		HN_LOCK(sc);
2534 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2535 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2536 				hn_stop(sc, true);
2537 			/*
2538 			 * NOTE:
2539 			 * hn_stop() only suspends the data path, so management
2540 			 * tasks have to be suspended manually here.
2541 			 */
2542 			hn_suspend_mgmt(sc);
2543 			hn_synth_detach(sc);
2544 		}
2545 		HN_UNLOCK(sc);
2546 		ether_ifdetach(ifp);
2547 	}
2548 
2549 	ifmedia_removeall(&sc->hn_media);
2550 	hn_destroy_rx_data(sc);
2551 	hn_destroy_tx_data(sc);
2552 
2553 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2554 		int i;
2555 
2556 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2557 			taskqueue_free(sc->hn_tx_taskqs[i]);
2558 		free(sc->hn_tx_taskqs, M_DEVBUF);
2559 	}
2560 	taskqueue_free(sc->hn_mgmt_taskq0);
2561 	if (sc->hn_vf_taskq != NULL)
2562 		taskqueue_free(sc->hn_vf_taskq);
2563 
2564 	if (sc->hn_xact != NULL) {
2565 		/*
2566 		 * Uninstall the orphan handler _before_ the xact is
2567 		 * destructed.
2568 		 */
2569 		vmbus_chan_unset_orphan(sc->hn_prichan);
2570 		vmbus_xact_ctx_destroy(sc->hn_xact);
2571 	}
2572 
2573 	if_free(ifp);
2574 
2575 	HN_LOCK_DESTROY(sc);
2576 	rm_destroy(&sc->hn_vf_lock);
2577 	return (0);
2578 }
2579 
2580 static int
2581 hn_shutdown(device_t dev)
2582 {
2583 
2584 	return (0);
2585 }
2586 
2587 static void
2588 hn_link_status(struct hn_softc *sc)
2589 {
2590 	uint32_t link_status;
2591 	int error;
2592 
2593 	error = hn_rndis_get_linkstatus(sc, &link_status);
2594 	if (error) {
2595 		/* XXX what to do? */
2596 		return;
2597 	}
2598 
2599 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2600 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2601 	else
2602 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2603 	if_link_state_change(sc->hn_ifp,
2604 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2605 	    LINK_STATE_UP : LINK_STATE_DOWN);
2606 }
2607 
2608 static void
2609 hn_link_taskfunc(void *xsc, int pending __unused)
2610 {
2611 	struct hn_softc *sc = xsc;
2612 
2613 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2614 		return;
2615 	hn_link_status(sc);
2616 }
2617 
2618 static void
2619 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2620 {
2621 	struct hn_softc *sc = xsc;
2622 
2623 	/* Prevent any link status checks from running. */
2624 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2625 
2626 	/*
2627 	 * Fake up a [link down --> link up] state change; a 5 second
2628 	 * delay is used, which closely simulates the miibus reaction
2629 	 * to a link down event.
2630 	 */
2631 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2632 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2633 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2634 	    &sc->hn_netchg_status, 5 * hz);
2635 }
2636 
2637 static void
2638 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2639 {
2640 	struct hn_softc *sc = xsc;
2641 
2642 	/* Re-allow link status checks. */
2643 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2644 	hn_link_status(sc);
2645 }
2646 
2647 static void
2648 hn_update_link_status(struct hn_softc *sc)
2649 {
2650 
2651 	if (sc->hn_mgmt_taskq != NULL)
2652 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2653 }
2654 
2655 static void
2656 hn_change_network(struct hn_softc *sc)
2657 {
2658 
2659 	if (sc->hn_mgmt_taskq != NULL)
2660 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2661 }
2662 
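/*
 * Load the mbuf chain into the txdesc's DMA map, collapsing the chain
 * and retrying once if it has too many segments.
 */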
2663 static __inline int
2664 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2665     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2666 {
2667 	struct mbuf *m = *m_head;
2668 	int error;
2669 
2670 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2671 
2672 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2673 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2674 	if (error == EFBIG) {
2675 		struct mbuf *m_new;
2676 
2677 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2678 		if (m_new == NULL)
2679 			return ENOBUFS;
2680 		else
2681 			*m_head = m = m_new;
2682 		txr->hn_tx_collapsed++;
2683 
2684 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2685 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2686 	}
2687 	if (!error) {
2688 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2689 		    BUS_DMASYNC_PREWRITE);
2690 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2691 	}
2692 	return error;
2693 }
2694 
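/*
 * Drop one reference on the txdesc.  When the last reference is
 * released, free any aggregated txdescs, return the chimney buffer
 * slot or unload the DMA map, free the mbuf, and put the txdesc back
 * onto the free list/buf_ring.  Returns 1 if the txdesc was freed.
 */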
2695 static __inline int
2696 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2697 {
2698 
2699 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2700 	    ("put an onlist txd %#x", txd->flags));
2701 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2702 	    ("put an onagg txd %#x", txd->flags));
2703 
2704 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2705 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2706 		return 0;
2707 
2708 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2709 		struct hn_txdesc *tmp_txd;
2710 
2711 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2712 			int freed;
2713 
2714 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2715 			    ("recursive aggregation on aggregated txdesc"));
2716 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2717 			    ("not aggregated txdesc"));
2718 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2719 			    ("aggregated txdesc uses dmamap"));
2720 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2721 			    ("aggregated txdesc consumes "
2722 			     "chimney sending buffer"));
2723 			KASSERT(tmp_txd->chim_size == 0,
2724 			    ("aggregated txdesc has non-zero "
2725 			     "chimney sending size"));
2726 
2727 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2728 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2729 			freed = hn_txdesc_put(txr, tmp_txd);
2730 			KASSERT(freed, ("failed to free aggregated txdesc"));
2731 		}
2732 	}
2733 
2734 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2735 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2736 		    ("chim txd uses dmamap"));
2737 		hn_chim_free(txr->hn_sc, txd->chim_index);
2738 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2739 		txd->chim_size = 0;
2740 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2741 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2742 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2743 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2744 		    txd->data_dmap);
2745 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2746 	}
2747 
2748 	if (txd->m != NULL) {
2749 		m_freem(txd->m);
2750 		txd->m = NULL;
2751 	}
2752 
2753 	txd->flags |= HN_TXD_FLAG_ONLIST;
2754 #ifndef HN_USE_TXDESC_BUFRING
2755 	mtx_lock_spin(&txr->hn_txlist_spin);
2756 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2757 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2758 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2759 	txr->hn_txdesc_avail++;
2760 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2761 	mtx_unlock_spin(&txr->hn_txlist_spin);
2762 #else	/* HN_USE_TXDESC_BUFRING */
2763 #ifdef HN_DEBUG
2764 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2765 #endif
2766 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2767 #endif	/* !HN_USE_TXDESC_BUFRING */
2768 
2769 	return 1;
2770 }
2771 
2772 static __inline struct hn_txdesc *
2773 hn_txdesc_get(struct hn_tx_ring *txr)
2774 {
2775 	struct hn_txdesc *txd;
2776 
2777 #ifndef HN_USE_TXDESC_BUFRING
2778 	mtx_lock_spin(&txr->hn_txlist_spin);
2779 	txd = SLIST_FIRST(&txr->hn_txlist);
2780 	if (txd != NULL) {
2781 		KASSERT(txr->hn_txdesc_avail > 0,
2782 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2783 		txr->hn_txdesc_avail--;
2784 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2785 	}
2786 	mtx_unlock_spin(&txr->hn_txlist_spin);
2787 #else
2788 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2789 #endif
2790 
2791 	if (txd != NULL) {
2792 #ifdef HN_USE_TXDESC_BUFRING
2793 #ifdef HN_DEBUG
2794 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2795 #endif
2796 #endif	/* HN_USE_TXDESC_BUFRING */
2797 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2798 		    STAILQ_EMPTY(&txd->agg_list) &&
2799 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2800 		    txd->chim_size == 0 &&
2801 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2802 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2803 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2804 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2805 		txd->refs = 1;
2806 	}
2807 	return txd;
2808 }
2809 
2810 static __inline void
2811 hn_txdesc_hold(struct hn_txdesc *txd)
2812 {
2813 
2814 	/* 0->1 transition will never work */
2815 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2816 	atomic_add_int(&txd->refs, 1);
2817 }
2818 
2819 static __inline void
2820 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2821 {
2822 
2823 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2824 	    ("recursive aggregation on aggregating txdesc"));
2825 
2826 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2827 	    ("already aggregated"));
2828 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2829 	    ("recursive aggregation on to-be-aggregated txdesc"));
2830 
2831 	txd->flags |= HN_TXD_FLAG_ONAGG;
2832 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2833 }
2834 
2835 static bool
2836 hn_tx_ring_pending(struct hn_tx_ring *txr)
2837 {
2838 	bool pending = false;
2839 
2840 #ifndef HN_USE_TXDESC_BUFRING
2841 	mtx_lock_spin(&txr->hn_txlist_spin);
2842 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2843 		pending = true;
2844 	mtx_unlock_spin(&txr->hn_txlist_spin);
2845 #else
2846 	if (!buf_ring_full(txr->hn_txdesc_br))
2847 		pending = true;
2848 #endif
2849 	return (pending);
2850 }
2851 
2852 static __inline void
2853 hn_txeof(struct hn_tx_ring *txr)
2854 {
2855 	txr->hn_has_txeof = 0;
2856 	txr->hn_txeof(txr);
2857 }
2858 
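/*
 * NVS send-completion callback: drop the reference taken for this
 * transmission and, once enough sends have completed on an oactive
 * ring, run TX completion processing early.
 */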
2859 static void
2860 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2861     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2862 {
2863 	struct hn_txdesc *txd = sndc->hn_cbarg;
2864 	struct hn_tx_ring *txr;
2865 
2866 	txr = txd->txr;
2867 	KASSERT(txr->hn_chan == chan,
2868 	    ("channel mismatch, on chan%u, should be chan%u",
2869 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2870 
2871 	txr->hn_has_txeof = 1;
2872 	hn_txdesc_put(txr, txd);
2873 
2874 	++txr->hn_txdone_cnt;
2875 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2876 		txr->hn_txdone_cnt = 0;
2877 		if (txr->hn_oactive)
2878 			hn_txeof(txr);
2879 	}
2880 }
2881 
2882 static void
2883 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2884 {
2885 #if defined(INET) || defined(INET6)
2886 	tcp_lro_flush_all(&rxr->hn_lro);
2887 #endif
2888 
2889 	/*
2890 	 * NOTE:
2891 	 * 'txr' could be NULL, if multiple channels are used and
2892 	 * the ifnet.if_start method is enabled.
2893 	 */
2894 	if (txr == NULL || !txr->hn_has_txeof)
2895 		return;
2896 
2897 	txr->hn_txdone_cnt = 0;
2898 	hn_txeof(txr);
2899 }
2900 
2901 static __inline uint32_t
2902 hn_rndis_pktmsg_offset(uint32_t ofs)
2903 {
2904 
2905 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2906 	    ("invalid RNDIS packet msg offset %u", ofs));
2907 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2908 }
2909 
2910 static __inline void *
2911 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2912     size_t pi_dlen, uint32_t pi_type)
2913 {
2914 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2915 	struct rndis_pktinfo *pi;
2916 
2917 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2918 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2919 
2920 	/*
2921 	 * Per-packet-info does not move; it only grows.
2922 	 *
2923 	 * NOTE:
2924 	 * rm_pktinfooffset in this phase counts from the beginning
2925 	 * of rndis_packet_msg.
2926 	 */
2927 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2928 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2929 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2930 	    pkt->rm_pktinfolen);
2931 	pkt->rm_pktinfolen += pi_size;
2932 
2933 	pi->rm_size = pi_size;
2934 	pi->rm_type = pi_type;
2935 	pi->rm_internal = 0;
2936 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2937 
2938 	return (pi->rm_data);
2939 }
2940 
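/*
 * Transmit the pending aggregating txdesc, which carries one or more
 * RNDIS packets packed into the chimney sending buffer, then reset
 * the ring's aggregation state.
 */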
2941 static __inline int
2942 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2943 {
2944 	struct hn_txdesc *txd;
2945 	struct mbuf *m;
2946 	int error, pkts;
2947 
2948 	txd = txr->hn_agg_txd;
2949 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2950 
2951 	/*
2952 	 * Since hn_txpkt() will reset this temporary stat, save
2953 	 * it now, so that oerrors can be updated properly, if
2954 	 * hn_txpkt() ever fails.
2955 	 */
2956 	pkts = txr->hn_stat_pkts;
2957 
2958 	/*
2959 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2960 	 * failure, save it for later freeing, if hn_txpkt() ever
2961 	 * fails.
2962 	 */
2963 	m = txd->m;
2964 	error = hn_txpkt(ifp, txr, txd);
2965 	if (__predict_false(error)) {
2966 		/* txd is freed, but m is not. */
2967 		m_freem(m);
2968 
2969 		txr->hn_flush_failed++;
2970 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2971 	}
2972 
2973 	/* Reset all aggregation states. */
2974 	txr->hn_agg_txd = NULL;
2975 	txr->hn_agg_szleft = 0;
2976 	txr->hn_agg_pktleft = 0;
2977 	txr->hn_agg_prevpkt = NULL;
2978 
2979 	return (error);
2980 }
2981 
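/*
 * Try to send this packet through the chimney sending buffer,
 * appending it to an already aggregating txdesc when possible.
 * Returns a pointer into the chimney buffer where the RNDIS packet
 * message should be built, or NULL if chimney sending is unavailable.
 */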
2982 static void *
2983 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2984     int pktsize)
2985 {
2986 	void *chim;
2987 
2988 	if (txr->hn_agg_txd != NULL) {
2989 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2990 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2991 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2992 			int olen;
2993 
2994 			/*
2995 			 * Update the previous RNDIS packet's total length,
2996 			 * it can be increased due to the mandatory alignment
2997 			 * padding for this RNDIS packet.  And update the
2998 			 * aggregating txdesc's chimney sending buffer size
2999 			 * accordingly.
3000 			 *
3001 			 * XXX
3002 			 * Zero-out the padding, as required by the RNDIS spec.
3003 			 */
3004 			olen = pkt->rm_len;
3005 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3006 			agg_txd->chim_size += pkt->rm_len - olen;
3007 
3008 			/* Link this txdesc to the parent. */
3009 			hn_txdesc_agg(agg_txd, txd);
3010 
3011 			chim = (uint8_t *)pkt + pkt->rm_len;
3012 			/* Save the current packet for later fixup. */
3013 			txr->hn_agg_prevpkt = chim;
3014 
3015 			txr->hn_agg_pktleft--;
3016 			txr->hn_agg_szleft -= pktsize;
3017 			if (txr->hn_agg_szleft <=
3018 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3019 				/*
3020 				 * Probably can't aggregate more packets;
3021 				 * flush this aggregating txdesc proactively.
3022 				 */
3023 				txr->hn_agg_pktleft = 0;
3024 			}
3025 			/* Done! */
3026 			return (chim);
3027 		}
3028 		hn_flush_txagg(ifp, txr);
3029 	}
3030 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3031 
3032 	txr->hn_tx_chimney_tried++;
3033 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3034 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3035 		return (NULL);
3036 	txr->hn_tx_chimney++;
3037 
3038 	chim = txr->hn_sc->hn_chim +
3039 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3040 
3041 	if (txr->hn_agg_pktmax > 1 &&
3042 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3043 		txr->hn_agg_txd = txd;
3044 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3045 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3046 		txr->hn_agg_prevpkt = chim;
3047 	}
3048 	return (chim);
3049 }
3050 
3051 /*
3052  * NOTE:
3053  * If this function fails, then both txd and m_head0 will be freed.
3054  */
3055 static int
3056 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3057     struct mbuf **m_head0)
3058 {
3059 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3060 	int error, nsegs, i;
3061 	struct mbuf *m_head = *m_head0;
3062 	struct rndis_packet_msg *pkt;
3063 	uint32_t *pi_data;
3064 	void *chim = NULL;
3065 	int pkt_hlen, pkt_size;
3066 
3067 	pkt = txd->rndis_pkt;
3068 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3069 	if (pkt_size < txr->hn_chim_size) {
3070 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3071 		if (chim != NULL)
3072 			pkt = chim;
3073 	} else {
3074 		if (txr->hn_agg_txd != NULL)
3075 			hn_flush_txagg(ifp, txr);
3076 	}
3077 
3078 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3079 	pkt->rm_len = m_head->m_pkthdr.len;
3080 	pkt->rm_dataoffset = 0;
3081 	pkt->rm_datalen = m_head->m_pkthdr.len;
3082 	pkt->rm_oobdataoffset = 0;
3083 	pkt->rm_oobdatalen = 0;
3084 	pkt->rm_oobdataelements = 0;
3085 	pkt->rm_pktinfooffset = sizeof(*pkt);
3086 	pkt->rm_pktinfolen = 0;
3087 	pkt->rm_vchandle = 0;
3088 	pkt->rm_reserved = 0;
3089 
3090 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3091 		/*
3092 		 * Set the hash value for this packet.
3093 		 */
3094 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3095 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3096 
3097 		if (M_HASHTYPE_ISHASH(m_head))
3098 			/*
3099 			 * The flowid field contains the hash value host
3100 			 * The flowid field contains the hash value the host
3101 			 * set on the rx queue, if this is an IP forwarding pkt.
3102 			 * Set the same hash value so the host can send on the
3103 			 * cpu it was received on.
3104 			*pi_data = m_head->m_pkthdr.flowid;
3105 		else
3106 			/*
3107 			 * Otherwise just put the tx queue index.
3108 			 */
3109 			*pi_data = txr->hn_tx_idx;
3110 	}
3111 
3112 	if (m_head->m_flags & M_VLANTAG) {
3113 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3114 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3115 		*pi_data = NDIS_VLAN_INFO_MAKE(
3116 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3117 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3118 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3119 	}
3120 
3121 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3122 #if defined(INET6) || defined(INET)
3123 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3124 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3125 #ifdef INET
3126 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3127 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3128 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3129 			    m_head->m_pkthdr.tso_segsz);
3130 		}
3131 #endif
3132 #if defined(INET6) && defined(INET)
3133 		else
3134 #endif
3135 #ifdef INET6
3136 		{
3137 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3138 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3139 			    m_head->m_pkthdr.tso_segsz);
3140 		}
3141 #endif
3142 #endif	/* INET6 || INET */
3143 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3144 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3145 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3146 		if (m_head->m_pkthdr.csum_flags &
3147 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3148 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3149 		} else {
3150 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3151 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3152 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3153 		}
3154 
3155 		if (m_head->m_pkthdr.csum_flags &
3156 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3157 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3158 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3159 		} else if (m_head->m_pkthdr.csum_flags &
3160 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3161 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3162 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3163 		}
3164 	}
3165 
3166 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3167 	/* Fixup RNDIS packet message total length */
3168 	pkt->rm_len += pkt_hlen;
3169 	/* Convert RNDIS packet message offsets */
3170 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3171 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3172 
3173 	/*
3174 	 * Fast path: Chimney sending.
3175 	 */
3176 	if (chim != NULL) {
3177 		struct hn_txdesc *tgt_txd = txd;
3178 
3179 		if (txr->hn_agg_txd != NULL) {
3180 			tgt_txd = txr->hn_agg_txd;
3181 #ifdef INVARIANTS
3182 			*m_head0 = NULL;
3183 #endif
3184 		}
3185 
3186 		KASSERT(pkt == chim,
3187 		    ("RNDIS pkt not in chimney sending buffer"));
3188 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3189 		    ("chimney sending buffer is not used"));
3190 		tgt_txd->chim_size += pkt->rm_len;
3191 
3192 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3193 		    ((uint8_t *)chim) + pkt_hlen);
3194 
3195 		txr->hn_gpa_cnt = 0;
3196 		txr->hn_sendpkt = hn_txpkt_chim;
3197 		goto done;
3198 	}
3199 
3200 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3201 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3202 	    ("chimney buffer is used"));
3203 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3204 
3205 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3206 	if (__predict_false(error)) {
3207 		int freed;
3208 
3209 		/*
3210 		 * This mbuf is not linked w/ the txd yet, so free it now.
3211 		 */
3212 		m_freem(m_head);
3213 		*m_head0 = NULL;
3214 
3215 		freed = hn_txdesc_put(txr, txd);
3216 		KASSERT(freed != 0,
3217 		    ("fail to free txd upon txdma error"));
3218 
3219 		txr->hn_txdma_failed++;
3220 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3221 		return error;
3222 	}
3223 	*m_head0 = m_head;
3224 
3225 	/* +1 RNDIS packet message */
3226 	txr->hn_gpa_cnt = nsegs + 1;
3227 
3228 	/* send packet with page buffer */
3229 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3230 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3231 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3232 
3233 	/*
3234 	 * Fill the page buffers with mbuf info after the page
3235 	 * buffer for RNDIS packet message.
3236 	 */
3237 	for (i = 0; i < nsegs; ++i) {
3238 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3239 
3240 		gpa->gpa_page = atop(segs[i].ds_addr);
3241 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3242 		gpa->gpa_len = segs[i].ds_len;
3243 	}
3244 
3245 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3246 	txd->chim_size = 0;
3247 	txr->hn_sendpkt = hn_txpkt_sglist;
3248 done:
3249 	txd->m = m_head;
3250 
3251 	/* Set the completion routine */
3252 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3253 
3254 	/* Update temporary stats for later use. */
3255 	txr->hn_stat_pkts++;
3256 	txr->hn_stat_size += m_head->m_pkthdr.len;
3257 	if (m_head->m_flags & M_MCAST)
3258 		txr->hn_stat_mcasts++;
3259 
3260 	return 0;
3261 }
3262 
3263 /*
3264  * NOTE:
3265  * If this function fails, then txd will be freed, but the mbuf
3266  * associated w/ the txd will _not_ be freed.
3267  */
3268 static int
3269 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3270 {
3271 	int error, send_failed = 0, has_bpf;
3272 
3273 again:
3274 	has_bpf = bpf_peers_present(ifp->if_bpf);
3275 	if (has_bpf) {
3276 		/*
3277 		 * Make sure that this txd and any aggregated txds are not
3278 		 * freed before ETHER_BPF_MTAP.
3279 		 */
3280 		hn_txdesc_hold(txd);
3281 	}
3282 	error = txr->hn_sendpkt(txr, txd);
3283 	if (!error) {
3284 		if (has_bpf) {
3285 			const struct hn_txdesc *tmp_txd;
3286 
3287 			ETHER_BPF_MTAP(ifp, txd->m);
3288 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3289 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3290 		}
3291 
3292 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3293 #ifdef HN_IFSTART_SUPPORT
3294 		if (!hn_use_if_start)
3295 #endif
3296 		{
3297 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3298 			    txr->hn_stat_size);
3299 			if (txr->hn_stat_mcasts != 0) {
3300 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3301 				    txr->hn_stat_mcasts);
3302 			}
3303 		}
3304 		txr->hn_pkts += txr->hn_stat_pkts;
3305 		txr->hn_sends++;
3306 	}
3307 	if (has_bpf)
3308 		hn_txdesc_put(txr, txd);
3309 
3310 	if (__predict_false(error)) {
3311 		int freed;
3312 
3313 		/*
3314 		 * This should "really rarely" happen.
3315 		 *
3316 		 * XXX Too many RX to be acked or too many sideband
3317 		 * commands to run?  Ask netvsc_channel_rollup()
3318 		 * to kick start later.
3319 		 */
3320 		txr->hn_has_txeof = 1;
3321 		if (!send_failed) {
3322 			txr->hn_send_failed++;
3323 			send_failed = 1;
3324 			/*
3325 			 * Try sending again after setting hn_has_txeof,
3326 			 * in case we missed the last
3327 			 * netvsc_channel_rollup().
3328 			 */
3329 			goto again;
3330 		}
3331 		if_printf(ifp, "send failed\n");
3332 
3333 		/*
3334 		 * Caller will perform further processing on the
3335 		 * associated mbuf, so don't free it in hn_txdesc_put();
3336 		 * only unload it from the DMA map in hn_txdesc_put(),
3337 		 * if it was loaded.
3338 		 */
3339 		txd->m = NULL;
3340 		freed = hn_txdesc_put(txr, txd);
3341 		KASSERT(freed != 0,
3342 		    ("fail to free txd upon send error"));
3343 
3344 		txr->hn_send_failed++;
3345 	}
3346 
3347 	/* Reset temporary stats, after this sending is done. */
3348 	txr->hn_stat_size = 0;
3349 	txr->hn_stat_pkts = 0;
3350 	txr->hn_stat_mcasts = 0;
3351 
3352 	return (error);
3353 }
3354 
3355 /*
3356  * Append the specified data to the indicated mbuf chain.
3357  * Extend the mbuf chain if the new data does not fit in
3358  * existing space.
3359  *
3360  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3361  * There should be an equivalent in the kernel mbuf code,
3362  * but there does not appear to be one yet.
3363  *
3364  * Differs from m_append() in that additional mbufs are
3365  * allocated with cluster size MJUMPAGESIZE, and filled
3366  * accordingly.
3367  *
3368  * Return the last mbuf in the chain, or NULL if a new mbuf could
3369  * not be allocated.
3370  */
3371 static struct mbuf *
3372 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3373 {
3374 	struct mbuf *m, *n;
3375 	int remainder, space;
3376 
3377 	for (m = m0; m->m_next != NULL; m = m->m_next)
3378 		;
3379 	remainder = len;
3380 	space = M_TRAILINGSPACE(m);
3381 	if (space > 0) {
3382 		/*
3383 		 * Copy into available space.
3384 		 */
3385 		if (space > remainder)
3386 			space = remainder;
3387 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3388 		m->m_len += space;
3389 		cp += space;
3390 		remainder -= space;
3391 	}
3392 	while (remainder > 0) {
3393 		/*
3394 		 * Allocate a new mbuf backed by a MJUMPAGESIZE cluster
3395 		 * and fill it with the remaining data.
3396 		 */
3397 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3398 		if (n == NULL)
3399 			return NULL;
3400 		n->m_len = min(MJUMPAGESIZE, remainder);
3401 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3402 		cp += n->m_len;
3403 		remainder -= n->m_len;
3404 		m->m_next = n;
3405 		m = n;
3406 	}
3407 
3408 	return m;
3409 }
3410 
3411 #if defined(INET) || defined(INET6)
3412 static __inline int
3413 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3414 {
3415 #if __FreeBSD_version >= 1100095
3416 	if (hn_lro_mbufq_depth) {
3417 		tcp_lro_queue_mbuf(lc, m);
3418 		return 0;
3419 	}
3420 #endif
3421 	return tcp_lro_rx(lc, m, 0);
3422 }
3423 #endif
3424 
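/*
 * Deliver one received packet: assemble the RSC fragments into an
 * mbuf (chain) and fill in the host-supplied checksum, VLAN and RSS
 * hash metadata before input.
 */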
3425 static int
3426 hn_rxpkt(struct hn_rx_ring *rxr)
3427 {
3428 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3429 	struct mbuf *m_new, *n;
3430 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3431 	int hash_type = M_HASHTYPE_NONE;
3432 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3433 	int i;
3434 
3435 	ifp = hn_ifp;
3436 	if (rxr->hn_rxvf_ifp != NULL) {
3437 		/*
3438 		 * Non-transparent mode VF; pretend this packet is from
3439 		 * the VF.
3440 		 */
3441 		ifp = rxr->hn_rxvf_ifp;
3442 		is_vf = 1;
3443 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3444 		/* Transparent mode VF. */
3445 		is_vf = 1;
3446 	}
3447 
3448 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3449 		/*
3450 		 * NOTE:
3451 		 * See the NOTE of hn_rndis_init_fixat().  This
3452 		 * function can be reached immediately after the
3453 		 * RNDIS is initialized but before the ifnet is
3454 		 * set up on the hn_attach() path; drop the unexpected
3455 		 * packets.
3456 		 */
3457 		return (0);
3458 	}
3459 
3460 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3461 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3462 		return (0);
3463 	}
3464 
3465 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3466 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3467 		if (m_new == NULL) {
3468 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3469 			return (0);
3470 		}
3471 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3472 		    rxr->rsc.frag_len[0]);
3473 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3474 	} else {
3475 		/*
3476 		 * Get an mbuf with a cluster.  For packets 2K or less,
3477 		 * get a standard 2K cluster.  For anything larger, get a
3478 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3479 		 * if looped around to the Hyper-V TX channel, so avoid them.
3480 		 */
3481 		size = MCLBYTES;
3482 		if (rxr->rsc.pktlen > MCLBYTES) {
3483 			/* 4096 */
3484 			size = MJUMPAGESIZE;
3485 		}
3486 
3487 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3488 		if (m_new == NULL) {
3489 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3490 			return (0);
3491 		}
3492 
3493 		n = m_new;
3494 		for (i = 0; i < rxr->rsc.cnt; i++) {
3495 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3496 			    rxr->rsc.frag_data[i]);
3497 			if (n == NULL) {
3498 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3499 				return (0);
3500 			} else {
3501 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3502 			}
3503 		}
3504 	}
3505 	if (rxr->rsc.pktlen <= MHLEN)
3506 		rxr->hn_small_pkts++;
3507 
3508 	m_new->m_pkthdr.rcvif = ifp;
3509 
3510 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3511 		do_csum = 0;
3512 
3513 	/* receive side checksum offload */
3514 	if (rxr->rsc.csum_info != NULL) {
3515 		/* IP csum offload */
3516 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3517 			m_new->m_pkthdr.csum_flags |=
3518 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3519 			rxr->hn_csum_ip++;
3520 		}
3521 
3522 		/* TCP/UDP csum offload */
3523 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3524 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3525 			m_new->m_pkthdr.csum_flags |=
3526 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3527 			m_new->m_pkthdr.csum_data = 0xffff;
3528 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3529 				rxr->hn_csum_tcp++;
3530 			else
3531 				rxr->hn_csum_udp++;
3532 		}
3533 
3534 		/*
3535 		 * XXX
3536 		 * As of this writing (Oct 28th, 2016), the host side will turn
3537 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3538 		 * the do_lro setting here is actually _not_ accurate.  We
3539 		 * depend on the RSS hash type check to reset do_lro.
3540 		 */
3541 		if ((*(rxr->rsc.csum_info) &
3542 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3543 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3544 			do_lro = 1;
3545 	} else {
3546 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3547 		if (l3proto == ETHERTYPE_IP) {
3548 			if (l4proto == IPPROTO_TCP) {
3549 				if (do_csum &&
3550 				    (rxr->hn_trust_hcsum &
3551 				     HN_TRUST_HCSUM_TCP)) {
3552 					rxr->hn_csum_trusted++;
3553 					m_new->m_pkthdr.csum_flags |=
3554 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3555 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3556 					m_new->m_pkthdr.csum_data = 0xffff;
3557 				}
3558 				do_lro = 1;
3559 			} else if (l4proto == IPPROTO_UDP) {
3560 				if (do_csum &&
3561 				    (rxr->hn_trust_hcsum &
3562 				     HN_TRUST_HCSUM_UDP)) {
3563 					rxr->hn_csum_trusted++;
3564 					m_new->m_pkthdr.csum_flags |=
3565 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3566 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3567 					m_new->m_pkthdr.csum_data = 0xffff;
3568 				}
3569 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3570 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3571 				rxr->hn_csum_trusted++;
3572 				m_new->m_pkthdr.csum_flags |=
3573 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3574 			}
3575 		}
3576 	}
3577 
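	/*
	 * Reconstruct the 802.1Q tag from the host-provided VLAN info and
	 * mark the mbuf accordingly; the tag travels out-of-band in the
	 * packet header.
	 */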
3578 	if (rxr->rsc.vlan_info != NULL) {
3579 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3580 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3581 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3582 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3583 		m_new->m_flags |= M_VLANTAG;
3584 	}
3585 
3586 	/*
3587 	 * If the VF is activated (transparent/non-transparent mode does not
3588 	 * matter here):
3589 	 *
3590 	 * - Disable LRO
3591 	 *
3592 	 *   hn(4) will only receive broadcast packets, multicast packets,
3593 	 *   and TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3594 	 *   packet types.
3595 	 *
3596 	 *   For non-transparent mode, we definitely _cannot_ enable LRO at
3597 	 *   all, since the LRO flush will use hn(4) as the receiving
3598 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3599 	 */
3600 	if (is_vf)
3601 		do_lro = 0;
3602 
3603 	/*
3604 	 * If the VF is activated (transparent/non-transparent mode does not
3605 	 * matter here), do _not_ mess with unsupported hash types or
3606 	 * functions.
3607 	 */
3608 	if (rxr->rsc.hash_info != NULL) {
3609 		rxr->hn_rss_pkts++;
3610 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3611 		if (!is_vf)
3612 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3613 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3614 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3615 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3616 			    rxr->hn_mbuf_hash);
3617 
3618 			/*
3619 			 * NOTE:
3620 			 * do_lro is reset if the hash types are not TCP
3621 			 * related.  See the comment in the above csum_flags
3622 			 * setup section.
3623 			 */
3624 			switch (type) {
3625 			case NDIS_HASH_IPV4:
3626 				hash_type = M_HASHTYPE_RSS_IPV4;
3627 				do_lro = 0;
3628 				break;
3629 
3630 			case NDIS_HASH_TCP_IPV4:
3631 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3632 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3633 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3634 
3635 					if (is_vf)
3636 						def_htype = M_HASHTYPE_NONE;
3637 
3638 					/*
3639 					 * UDP 4-tuple hash is delivered as
3640 					 * TCP 4-tuple hash.
3641 					 */
3642 					if (l3proto == ETHERTYPE_MAX) {
3643 						hn_rxpkt_proto(m_new,
3644 						    &l3proto, &l4proto);
3645 					}
3646 					if (l3proto == ETHERTYPE_IP) {
3647 						if (l4proto == IPPROTO_UDP &&
3648 						    (rxr->hn_mbuf_hash &
3649 						     NDIS_HASH_UDP_IPV4_X)) {
3650 							hash_type =
3651 							M_HASHTYPE_RSS_UDP_IPV4;
3652 							do_lro = 0;
3653 						} else if (l4proto !=
3654 						    IPPROTO_TCP) {
3655 							hash_type = def_htype;
3656 							do_lro = 0;
3657 						}
3658 					} else {
3659 						hash_type = def_htype;
3660 						do_lro = 0;
3661 					}
3662 				}
3663 				break;
3664 
3665 			case NDIS_HASH_IPV6:
3666 				hash_type = M_HASHTYPE_RSS_IPV6;
3667 				do_lro = 0;
3668 				break;
3669 
3670 			case NDIS_HASH_IPV6_EX:
3671 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3672 				do_lro = 0;
3673 				break;
3674 
3675 			case NDIS_HASH_TCP_IPV6:
3676 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3677 				break;
3678 
3679 			case NDIS_HASH_TCP_IPV6_EX:
3680 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3681 				break;
3682 			}
3683 		}
3684 	} else if (!is_vf) {
3685 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3686 		hash_type = M_HASHTYPE_OPAQUE;
3687 	}
3688 	M_HASHTYPE_SET(m_new, hash_type);
3689 
3690 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3691 	if (hn_ifp != ifp) {
3692 		const struct ether_header *eh;
3693 
3694 		/*
3695 		 * Non-transparent mode VF is activated.
3696 		 */
3697 
3698 		/*
3699 		 * Allow tapping on hn(4).
3700 		 */
3701 		ETHER_BPF_MTAP(hn_ifp, m_new);
3702 
3703 		/*
3704 		 * Update hn(4)'s stats.
3705 		 */
3706 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3707 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3708 		/* Checked at the beginning of this function. */
3709 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3710 		eh = mtod(m_new, struct ether_header *);
3711 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3712 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3713 	}
3714 	rxr->hn_pkts++;
3715 
3716 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3717 #if defined(INET) || defined(INET6)
3718 		struct lro_ctrl *lro = &rxr->hn_lro;
3719 
3720 		if (lro->lro_cnt) {
3721 			rxr->hn_lro_tried++;
3722 			if (hn_lro_rx(lro, m_new) == 0) {
3723 				/* DONE! */
3724 				return 0;
3725 			}
3726 		}
3727 #endif
3728 	}
3729 	ifp->if_input(ifp, m_new);
3730 
3731 	return (0);
3732 }
3733 
3734 static int
3735 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3736 {
3737 	struct hn_softc *sc = ifp->if_softc;
3738 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3739 	struct ifnet *vf_ifp;
3740 	int mask, error = 0;
3741 	struct ifrsskey *ifrk;
3742 	struct ifrsshash *ifrh;
3743 	uint32_t mtu;
3744 
3745 	switch (cmd) {
3746 	case SIOCSIFMTU:
3747 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3748 			error = EINVAL;
3749 			break;
3750 		}
3751 
3752 		HN_LOCK(sc);
3753 
3754 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3755 			HN_UNLOCK(sc);
3756 			break;
3757 		}
3758 
3759 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3760 			/* Can't change MTU */
3761 			HN_UNLOCK(sc);
3762 			error = EOPNOTSUPP;
3763 			break;
3764 		}
3765 
3766 		if (ifp->if_mtu == ifr->ifr_mtu) {
3767 			HN_UNLOCK(sc);
3768 			break;
3769 		}
3770 
3771 		if (hn_xpnt_vf_isready(sc)) {
3772 			vf_ifp = sc->hn_vf_ifp;
3773 			ifr_vf = *ifr;
3774 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3775 			    sizeof(ifr_vf.ifr_name));
3776 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3777 			    (caddr_t)&ifr_vf);
3778 			if (error) {
3779 				HN_UNLOCK(sc);
3780 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3781 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3782 				break;
3783 			}
3784 		}
3785 
3786 		/*
3787 		 * Suspend this interface before the synthetic parts
3788 		 * are ripped.
3789 		 */
3790 		hn_suspend(sc);
3791 
3792 		/*
3793 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3794 		 */
3795 		hn_synth_detach(sc);
3796 
3797 		/*
3798 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3799 		 * with the new MTU setting.
3800 		 */
3801 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3802 		if (error) {
3803 			HN_UNLOCK(sc);
3804 			break;
3805 		}
3806 
3807 		error = hn_rndis_get_mtu(sc, &mtu);
3808 		if (error)
3809 			mtu = ifr->ifr_mtu;
3810 		else if (bootverbose)
3811 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3812 
3813 		/*
3814 		 * Commit the requested MTU, after the synthetic parts
3815 		 * have been successfully attached.
3816 		 */
3817 		if (mtu >= ifr->ifr_mtu) {
3818 			mtu = ifr->ifr_mtu;
3819 		} else {
3820 			if_printf(ifp, "fixup mtu %d -> %u\n",
3821 			    ifr->ifr_mtu, mtu);
3822 		}
3823 		ifp->if_mtu = mtu;
3824 
3825 		/*
3826 		 * Synthetic parts' reattach may change the chimney
3827 		 * sending size; update it.
3828 		 */
3829 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3830 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3831 
3832 		/*
3833 		 * Make sure that various parameters based on MTU are
3834 		 * still valid, after the MTU change.
3835 		 */
3836 		hn_mtu_change_fixup(sc);
3837 
3838 		/*
3839 		 * All done!  Resume the interface now.
3840 		 */
3841 		hn_resume(sc);
3842 
3843 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3844 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3845 			/*
3846 			 * Since we have reattached the NVS part,
3847 			 * change the datapath to VF again, in case
3848 			 * it was lost when the NVS was detached.
3849 			 */
3850 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3851 		}
3852 
3853 		HN_UNLOCK(sc);
3854 		break;
3855 
3856 	case SIOCSIFFLAGS:
3857 		HN_LOCK(sc);
3858 
3859 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3860 			HN_UNLOCK(sc);
3861 			break;
3862 		}
3863 
3864 		if (hn_xpnt_vf_isready(sc))
3865 			hn_xpnt_vf_saveifflags(sc);
3866 
3867 		if (ifp->if_flags & IFF_UP) {
3868 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3869 				/*
3870 				 * Caller might hold a mutex, e.g.
3871 				 * bpf; use busy-wait for the RNDIS
3872 				 * reply.
3873 				 */
3874 				HN_NO_SLEEPING(sc);
3875 				hn_rxfilter_config(sc);
3876 				HN_SLEEPING_OK(sc);
3877 
3878 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3879 					error = hn_xpnt_vf_iocsetflags(sc);
3880 			} else {
3881 				hn_init_locked(sc);
3882 			}
3883 		} else {
3884 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3885 				hn_stop(sc, false);
3886 		}
3887 		sc->hn_if_flags = ifp->if_flags;
3888 
3889 		HN_UNLOCK(sc);
3890 		break;
3891 
3892 	case SIOCSIFCAP:
3893 		HN_LOCK(sc);
3894 
3895 		if (hn_xpnt_vf_isready(sc)) {
3896 			ifr_vf = *ifr;
3897 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3898 			    sizeof(ifr_vf.ifr_name));
3899 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3900 			HN_UNLOCK(sc);
3901 			break;
3902 		}
3903 
3904 		/*
3905 		 * Fix up requested capabilities w/ supported capabilities,
3906 		 * since the supported capabilities could have been changed.
3907 		 */
3908 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3909 		    ifp->if_capenable;
3910 
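		/*
		 * Toggle each capability bit that changed and keep
		 * if_hwassist in sync with the resulting checksum/TSO
		 * settings.
		 */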
3911 		if (mask & IFCAP_TXCSUM) {
3912 			ifp->if_capenable ^= IFCAP_TXCSUM;
3913 			if (ifp->if_capenable & IFCAP_TXCSUM)
3914 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3915 			else
3916 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3917 		}
3918 		if (mask & IFCAP_TXCSUM_IPV6) {
3919 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3920 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3921 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3922 			else
3923 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3924 		}
3925 
3926 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3927 		if (mask & IFCAP_RXCSUM)
3928 			ifp->if_capenable ^= IFCAP_RXCSUM;
3929 #ifdef foo
3930 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3931 		if (mask & IFCAP_RXCSUM_IPV6)
3932 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3933 #endif
3934 
3935 		if (mask & IFCAP_LRO)
3936 			ifp->if_capenable ^= IFCAP_LRO;
3937 
3938 		if (mask & IFCAP_TSO4) {
3939 			ifp->if_capenable ^= IFCAP_TSO4;
3940 			if (ifp->if_capenable & IFCAP_TSO4)
3941 				ifp->if_hwassist |= CSUM_IP_TSO;
3942 			else
3943 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3944 		}
3945 		if (mask & IFCAP_TSO6) {
3946 			ifp->if_capenable ^= IFCAP_TSO6;
3947 			if (ifp->if_capenable & IFCAP_TSO6)
3948 				ifp->if_hwassist |= CSUM_IP6_TSO;
3949 			else
3950 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3951 		}
3952 
3953 		HN_UNLOCK(sc);
3954 		break;
3955 
3956 	case SIOCADDMULTI:
3957 	case SIOCDELMULTI:
3958 		HN_LOCK(sc);
3959 
3960 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3961 			HN_UNLOCK(sc);
3962 			break;
3963 		}
3964 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3965 			/*
3966 			 * Multicast uses mutex; use busy-wait for
3967 			 * the RNDIS reply.
3968 			 */
3969 			HN_NO_SLEEPING(sc);
3970 			hn_rxfilter_config(sc);
3971 			HN_SLEEPING_OK(sc);
3972 		}
3973 
3974 		/* XXX vlan(4) style mcast addr maintenance */
3975 		if (hn_xpnt_vf_isready(sc)) {
3976 			int old_if_flags;
3977 
3978 			old_if_flags = sc->hn_vf_ifp->if_flags;
3979 			hn_xpnt_vf_saveifflags(sc);
3980 
3981 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3982 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3983 			     IFF_ALLMULTI))
3984 				error = hn_xpnt_vf_iocsetflags(sc);
3985 		}
3986 
3987 		HN_UNLOCK(sc);
3988 		break;
3989 
3990 	case SIOCSIFMEDIA:
3991 	case SIOCGIFMEDIA:
3992 		HN_LOCK(sc);
3993 		if (hn_xpnt_vf_isready(sc)) {
3994 			/*
3995 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3996 			 * create and pass ifr_vf to the VF here; just
3997 			 * replace the ifr_name.
3998 			 */
3999 			vf_ifp = sc->hn_vf_ifp;
4000 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4001 			    sizeof(ifr->ifr_name));
4002 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4003 			/* Restore the ifr_name. */
4004 			strlcpy(ifr->ifr_name, ifp->if_xname,
4005 			    sizeof(ifr->ifr_name));
4006 			HN_UNLOCK(sc);
4007 			break;
4008 		}
4009 		HN_UNLOCK(sc);
4010 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4011 		break;
4012 
4013 	case SIOCGIFRSSHASH:
4014 		ifrh = (struct ifrsshash *)data;
4015 		HN_LOCK(sc);
4016 		if (sc->hn_rx_ring_inuse == 1) {
4017 			HN_UNLOCK(sc);
4018 			ifrh->ifrh_func = RSS_FUNC_NONE;
4019 			ifrh->ifrh_types = 0;
4020 			break;
4021 		}
4022 
4023 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4024 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4025 		else
4026 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4027 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4028 		HN_UNLOCK(sc);
4029 		break;
4030 
4031 	case SIOCGIFRSSKEY:
4032 		ifrk = (struct ifrsskey *)data;
4033 		HN_LOCK(sc);
4034 		if (sc->hn_rx_ring_inuse == 1) {
4035 			HN_UNLOCK(sc);
4036 			ifrk->ifrk_func = RSS_FUNC_NONE;
4037 			ifrk->ifrk_keylen = 0;
4038 			break;
4039 		}
4040 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4041 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4042 		else
4043 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4044 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4045 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4046 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4047 		HN_UNLOCK(sc);
4048 		break;
4049 
4050 	default:
4051 		error = ether_ioctl(ifp, cmd, data);
4052 		break;
4053 	}
4054 	return (error);
4055 }
4056 
4057 static void
4058 hn_stop(struct hn_softc *sc, bool detaching)
4059 {
4060 	struct ifnet *ifp = sc->hn_ifp;
4061 	int i;
4062 
4063 	HN_LOCK_ASSERT(sc);
4064 
4065 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4066 	    ("synthetic parts were not attached"));
4067 
4068 	/* Clear RUNNING bit ASAP. */
4069 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4070 
4071 	/* Disable polling. */
4072 	hn_polling(sc, 0);
4073 
4074 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4075 		KASSERT(sc->hn_vf_ifp != NULL,
4076 		    ("%s: VF is not attached", ifp->if_xname));
4077 
4078 		/* Mark transparent mode VF as disabled. */
4079 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4080 
4081 		/*
4082 		 * NOTE:
4083 		 * Datapath setting must happen _before_ bringing
4084 		 * the VF down.
4085 		 */
4086 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4087 
4088 		/*
4089 		 * Bring the VF down.
4090 		 */
4091 		hn_xpnt_vf_saveifflags(sc);
4092 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4093 		hn_xpnt_vf_iocsetflags(sc);
4094 	}
4095 
4096 	/* Suspend data transfers. */
4097 	hn_suspend_data(sc);
4098 
4099 	/* Clear OACTIVE bit. */
4100 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4101 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4102 		sc->hn_tx_ring[i].hn_oactive = 0;
4103 
4104 	/*
4105 	 * If the non-transparent mode VF is active, make sure
4106 	 * that the RX filter still allows packet reception.
4107 	 */
4108 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4109 		hn_rxfilter_config(sc);
4110 }
4111 
4112 static void
4113 hn_init_locked(struct hn_softc *sc)
4114 {
4115 	struct ifnet *ifp = sc->hn_ifp;
4116 	int i;
4117 
4118 	HN_LOCK_ASSERT(sc);
4119 
4120 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4121 		return;
4122 
4123 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4124 		return;
4125 
4126 	/* Configure RX filter */
4127 	hn_rxfilter_config(sc);
4128 
4129 	/* Clear OACTIVE bit. */
4130 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4131 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4132 		sc->hn_tx_ring[i].hn_oactive = 0;
4133 
4134 	/* Clear TX 'suspended' bit. */
4135 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4136 
4137 	if (hn_xpnt_vf_isready(sc)) {
4138 		/* Initialize transparent VF. */
4139 		hn_xpnt_vf_init(sc);
4140 	}
4141 
4142 	/* Everything is ready; unleash! */
4143 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4144 
4145 	/* Re-enable polling if requested. */
4146 	if (sc->hn_pollhz > 0)
4147 		hn_polling(sc, sc->hn_pollhz);
4148 }
4149 
4150 static void
4151 hn_init(void *xsc)
4152 {
4153 	struct hn_softc *sc = xsc;
4154 
4155 	HN_LOCK(sc);
4156 	hn_init_locked(sc);
4157 	HN_UNLOCK(sc);
4158 }
4159 
4160 #if __FreeBSD_version >= 1100099
4161 
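/*
 * Handler for the dev.hn.UNIT.lro_length_lim sysctl (see hn_create_rx_data()),
 * which bounds how many data bytes LRO may aggregate.  A hypothetical usage
 * example, assuming unit 0 and a value within the accepted range
 * [HN_LRO_LENLIM_MIN, TCP_LRO_LENGTH_MAX]:
 *
 *	sysctl dev.hn.0.lro_length_lim=65535
 */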
4162 static int
4163 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4164 {
4165 	struct hn_softc *sc = arg1;
4166 	unsigned int lenlim;
4167 	int error;
4168 
4169 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4170 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4171 	if (error || req->newptr == NULL)
4172 		return error;
4173 
4174 	HN_LOCK(sc);
4175 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4176 	    lenlim > TCP_LRO_LENGTH_MAX) {
4177 		HN_UNLOCK(sc);
4178 		return EINVAL;
4179 	}
4180 	hn_set_lro_lenlim(sc, lenlim);
4181 	HN_UNLOCK(sc);
4182 
4183 	return 0;
4184 }
4185 
4186 static int
4187 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4188 {
4189 	struct hn_softc *sc = arg1;
4190 	int ackcnt, error, i;
4191 
4192 	/*
4193 	 * lro_ackcnt_lim is the append count limit;
4194 	 * +1 turns it into the aggregation limit.
4195 	 */
4196 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4197 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4198 	if (error || req->newptr == NULL)
4199 		return error;
4200 
4201 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4202 		return EINVAL;
4203 
4204 	/*
4205 	 * Convert aggregation limit back to append
4206 	 * count limit.
4207 	 */
4208 	--ackcnt;
4209 	HN_LOCK(sc);
4210 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4211 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4212 	HN_UNLOCK(sc);
4213 	return 0;
4214 }
4215 
4216 #endif
4217 
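/*
 * Shared handler for the dev.hn.UNIT.trust_host{tcp,udp,ip} sysctls created
 * in hn_create_rx_data(); arg2 selects which HN_TRUST_HCSUM_* bit is set or
 * cleared on every RX ring.
 */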
4218 static int
4219 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4220 {
4221 	struct hn_softc *sc = arg1;
4222 	int hcsum = arg2;
4223 	int on, error, i;
4224 
4225 	on = 0;
4226 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4227 		on = 1;
4228 
4229 	error = sysctl_handle_int(oidp, &on, 0, req);
4230 	if (error || req->newptr == NULL)
4231 		return error;
4232 
4233 	HN_LOCK(sc);
4234 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4235 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4236 
4237 		if (on)
4238 			rxr->hn_trust_hcsum |= hcsum;
4239 		else
4240 			rxr->hn_trust_hcsum &= ~hcsum;
4241 	}
4242 	HN_UNLOCK(sc);
4243 	return 0;
4244 }
4245 
4246 static int
4247 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4248 {
4249 	struct hn_softc *sc = arg1;
4250 	int chim_size, error;
4251 
4252 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4253 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4254 	if (error || req->newptr == NULL)
4255 		return error;
4256 
4257 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4258 		return EINVAL;
4259 
4260 	HN_LOCK(sc);
4261 	hn_set_chim_size(sc, chim_size);
4262 	HN_UNLOCK(sc);
4263 	return 0;
4264 }
4265 
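/*
 * Per-ring statistics handlers: a read returns the counter summed across
 * all RX (or TX) rings, while writing any value resets the per-ring
 * counters to zero.
 */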
4266 #if __FreeBSD_version < 1100095
4267 static int
4268 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4269 {
4270 	struct hn_softc *sc = arg1;
4271 	int ofs = arg2, i, error;
4272 	struct hn_rx_ring *rxr;
4273 	uint64_t stat;
4274 
4275 	stat = 0;
4276 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4277 		rxr = &sc->hn_rx_ring[i];
4278 		stat += *((int *)((uint8_t *)rxr + ofs));
4279 	}
4280 
4281 	error = sysctl_handle_64(oidp, &stat, 0, req);
4282 	if (error || req->newptr == NULL)
4283 		return error;
4284 
4285 	/* Zero out this stat. */
4286 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4287 		rxr = &sc->hn_rx_ring[i];
4288 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4289 	}
4290 	return 0;
4291 }
4292 #else
4293 static int
4294 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4295 {
4296 	struct hn_softc *sc = arg1;
4297 	int ofs = arg2, i, error;
4298 	struct hn_rx_ring *rxr;
4299 	uint64_t stat;
4300 
4301 	stat = 0;
4302 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4303 		rxr = &sc->hn_rx_ring[i];
4304 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4305 	}
4306 
4307 	error = sysctl_handle_64(oidp, &stat, 0, req);
4308 	if (error || req->newptr == NULL)
4309 		return error;
4310 
4311 	/* Zero out this stat. */
4312 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4313 		rxr = &sc->hn_rx_ring[i];
4314 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4315 	}
4316 	return 0;
4317 }
4318 
4319 #endif
4320 
4321 static int
4322 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4323 {
4324 	struct hn_softc *sc = arg1;
4325 	int ofs = arg2, i, error;
4326 	struct hn_rx_ring *rxr;
4327 	u_long stat;
4328 
4329 	stat = 0;
4330 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4331 		rxr = &sc->hn_rx_ring[i];
4332 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4333 	}
4334 
4335 	error = sysctl_handle_long(oidp, &stat, 0, req);
4336 	if (error || req->newptr == NULL)
4337 		return error;
4338 
4339 	/* Zero out this stat. */
4340 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4341 		rxr = &sc->hn_rx_ring[i];
4342 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4343 	}
4344 	return 0;
4345 }
4346 
4347 static int
4348 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4349 {
4350 	struct hn_softc *sc = arg1;
4351 	int ofs = arg2, i, error;
4352 	struct hn_tx_ring *txr;
4353 	u_long stat;
4354 
4355 	stat = 0;
4356 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4357 		txr = &sc->hn_tx_ring[i];
4358 		stat += *((u_long *)((uint8_t *)txr + ofs));
4359 	}
4360 
4361 	error = sysctl_handle_long(oidp, &stat, 0, req);
4362 	if (error || req->newptr == NULL)
4363 		return error;
4364 
4365 	/* Zero out this stat. */
4366 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4367 		txr = &sc->hn_tx_ring[i];
4368 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4369 	}
4370 	return 0;
4371 }
4372 
4373 static int
4374 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4375 {
4376 	struct hn_softc *sc = arg1;
4377 	int ofs = arg2, i, error, conf;
4378 	struct hn_tx_ring *txr;
4379 
4380 	txr = &sc->hn_tx_ring[0];
4381 	conf = *((int *)((uint8_t *)txr + ofs));
4382 
4383 	error = sysctl_handle_int(oidp, &conf, 0, req);
4384 	if (error || req->newptr == NULL)
4385 		return error;
4386 
4387 	HN_LOCK(sc);
4388 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4389 		txr = &sc->hn_tx_ring[i];
4390 		*((int *)((uint8_t *)txr + ofs)) = conf;
4391 	}
4392 	HN_UNLOCK(sc);
4393 
4394 	return 0;
4395 }
4396 
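/*
 * TX aggregation tunables: hn_agg_size and hn_agg_pkts bound how much data
 * and how many packets may be aggregated per send; hn_set_txagg() propagates
 * the new limits to the TX rings.
 */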
4397 static int
4398 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4399 {
4400 	struct hn_softc *sc = arg1;
4401 	int error, size;
4402 
4403 	size = sc->hn_agg_size;
4404 	error = sysctl_handle_int(oidp, &size, 0, req);
4405 	if (error || req->newptr == NULL)
4406 		return (error);
4407 
4408 	HN_LOCK(sc);
4409 	sc->hn_agg_size = size;
4410 	hn_set_txagg(sc);
4411 	HN_UNLOCK(sc);
4412 
4413 	return (0);
4414 }
4415 
4416 static int
4417 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4418 {
4419 	struct hn_softc *sc = arg1;
4420 	int error, pkts;
4421 
4422 	pkts = sc->hn_agg_pkts;
4423 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4424 	if (error || req->newptr == NULL)
4425 		return (error);
4426 
4427 	HN_LOCK(sc);
4428 	sc->hn_agg_pkts = pkts;
4429 	hn_set_txagg(sc);
4430 	HN_UNLOCK(sc);
4431 
4432 	return (0);
4433 }
4434 
4435 static int
4436 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4437 {
4438 	struct hn_softc *sc = arg1;
4439 	int pkts;
4440 
4441 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4442 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4443 }
4444 
4445 static int
4446 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4447 {
4448 	struct hn_softc *sc = arg1;
4449 	int align;
4450 
4451 	align = sc->hn_tx_ring[0].hn_agg_align;
4452 	return (sysctl_handle_int(oidp, &align, 0, req));
4453 }
4454 
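/*
 * Switch a single VMBus channel between interrupt-driven operation
 * (pollhz == 0) and polling at the given frequency.
 */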
4455 static void
4456 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4457 {
4458 	if (pollhz == 0)
4459 		vmbus_chan_poll_disable(chan);
4460 	else
4461 		vmbus_chan_poll_enable(chan, pollhz);
4462 }
4463 
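/*
 * Apply the polling setting to the primary channel and all sub-channels;
 * there is one sub-channel per RX ring beyond the first.
 */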
4464 static void
4465 hn_polling(struct hn_softc *sc, u_int pollhz)
4466 {
4467 	int nsubch = sc->hn_rx_ring_inuse - 1;
4468 
4469 	HN_LOCK_ASSERT(sc);
4470 
4471 	if (nsubch > 0) {
4472 		struct vmbus_channel **subch;
4473 		int i;
4474 
4475 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4476 		for (i = 0; i < nsubch; ++i)
4477 			hn_chan_polling(subch[i], pollhz);
4478 		vmbus_subchan_rel(subch, nsubch);
4479 	}
4480 	hn_chan_polling(sc->hn_prichan, pollhz);
4481 }
4482 
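/*
 * Sysctl handler for the per-device polling frequency.  Writing 0 reverts
 * to interrupt-driven operation; non-zero values must fall within
 * [VMBUS_CHAN_POLLHZ_MIN, VMBUS_CHAN_POLLHZ_MAX].
 */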
4483 static int
4484 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4485 {
4486 	struct hn_softc *sc = arg1;
4487 	int pollhz, error;
4488 
4489 	pollhz = sc->hn_pollhz;
4490 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4491 	if (error || req->newptr == NULL)
4492 		return (error);
4493 
4494 	if (pollhz != 0 &&
4495 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4496 		return (EINVAL);
4497 
4498 	HN_LOCK(sc);
4499 	if (sc->hn_pollhz != pollhz) {
4500 		sc->hn_pollhz = pollhz;
4501 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4502 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4503 			hn_polling(sc, sc->hn_pollhz);
4504 	}
4505 	HN_UNLOCK(sc);
4506 
4507 	return (0);
4508 }
4509 
4510 static int
4511 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4512 {
4513 	struct hn_softc *sc = arg1;
4514 	char verstr[16];
4515 
4516 	snprintf(verstr, sizeof(verstr), "%u.%u",
4517 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4518 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4519 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4520 }
4521 
4522 static int
4523 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4524 {
4525 	struct hn_softc *sc = arg1;
4526 	char caps_str[128];
4527 	uint32_t caps;
4528 
4529 	HN_LOCK(sc);
4530 	caps = sc->hn_caps;
4531 	HN_UNLOCK(sc);
4532 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4533 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4534 }
4535 
4536 static int
4537 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4538 {
4539 	struct hn_softc *sc = arg1;
4540 	char assist_str[128];
4541 	uint32_t hwassist;
4542 
4543 	HN_LOCK(sc);
4544 	hwassist = sc->hn_ifp->if_hwassist;
4545 	HN_UNLOCK(sc);
4546 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4547 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4548 }
4549 
4550 static int
4551 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4552 {
4553 	struct hn_softc *sc = arg1;
4554 	char filter_str[128];
4555 	uint32_t filter;
4556 
4557 	HN_LOCK(sc);
4558 	filter = sc->hn_rx_filter;
4559 	HN_UNLOCK(sc);
4560 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4561 	    NDIS_PACKET_TYPES);
4562 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4563 }
4564 
4565 #ifndef RSS
4566 
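/*
 * Read or replace the RSS (Toeplitz) key.  Changing the key is refused
 * (EBUSY) while a VF is attached, since the key is kept in sync with the
 * VF's.
 */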
4567 static int
4568 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4569 {
4570 	struct hn_softc *sc = arg1;
4571 	int error;
4572 
4573 	HN_LOCK(sc);
4574 
4575 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4576 	if (error || req->newptr == NULL)
4577 		goto back;
4578 
4579 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4580 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4581 		/*
4582 		 * The RSS key is synchronized w/ the VF's; don't allow
4583 		 * users to change it.
4584 		 */
4585 		error = EBUSY;
4586 		goto back;
4587 	}
4588 
4589 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4590 	if (error)
4591 		goto back;
4592 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4593 
4594 	if (sc->hn_rx_ring_inuse > 1) {
4595 		error = hn_rss_reconfig(sc);
4596 	} else {
4597 		/* Not RSS capable, at least for now; just save the RSS key. */
4598 		error = 0;
4599 	}
4600 back:
4601 	HN_UNLOCK(sc);
4602 	return (error);
4603 }
4604 
4605 static int
4606 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4607 {
4608 	struct hn_softc *sc = arg1;
4609 	int error;
4610 
4611 	HN_LOCK(sc);
4612 
4613 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4614 	if (error || req->newptr == NULL)
4615 		goto back;
4616 
4617 	/*
4618 	 * Don't allow RSS indirect table changes if this interface is not
4619 	 * currently RSS capable.
4620 	 */
4621 	if (sc->hn_rx_ring_inuse == 1) {
4622 		error = EOPNOTSUPP;
4623 		goto back;
4624 	}
4625 
4626 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4627 	if (error)
4628 		goto back;
4629 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4630 
4631 	hn_rss_ind_fixup(sc);
4632 	error = hn_rss_reconfig(sc);
4633 back:
4634 	HN_UNLOCK(sc);
4635 	return (error);
4636 }
4637 
4638 #endif	/* !RSS */
4639 
4640 static int
4641 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4642 {
4643 	struct hn_softc *sc = arg1;
4644 	char hash_str[128];
4645 	uint32_t hash;
4646 
4647 	HN_LOCK(sc);
4648 	hash = sc->hn_rss_hash;
4649 	HN_UNLOCK(sc);
4650 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4651 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4652 }
4653 
4654 static int
4655 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4656 {
4657 	struct hn_softc *sc = arg1;
4658 	char hash_str[128];
4659 	uint32_t hash;
4660 
4661 	HN_LOCK(sc);
4662 	hash = sc->hn_rss_hcap;
4663 	HN_UNLOCK(sc);
4664 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4665 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4666 }
4667 
4668 static int
4669 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4670 {
4671 	struct hn_softc *sc = arg1;
4672 	char hash_str[128];
4673 	uint32_t hash;
4674 
4675 	HN_LOCK(sc);
4676 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4677 	HN_UNLOCK(sc);
4678 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4679 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4680 }
4681 
4682 static int
4683 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4684 {
4685 	struct hn_softc *sc = arg1;
4686 	char vf_name[IFNAMSIZ + 1];
4687 	struct ifnet *vf_ifp;
4688 
4689 	HN_LOCK(sc);
4690 	vf_name[0] = '\0';
4691 	vf_ifp = sc->hn_vf_ifp;
4692 	if (vf_ifp != NULL)
4693 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4694 	HN_UNLOCK(sc);
4695 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4696 }
4697 
4698 static int
4699 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4700 {
4701 	struct hn_softc *sc = arg1;
4702 	char vf_name[IFNAMSIZ + 1];
4703 	struct ifnet *vf_ifp;
4704 
4705 	HN_LOCK(sc);
4706 	vf_name[0] = '\0';
4707 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4708 	if (vf_ifp != NULL)
4709 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4710 	HN_UNLOCK(sc);
4711 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4712 }
4713 
4714 static int
4715 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4716 {
4717 	struct rm_priotracker pt;
4718 	struct sbuf *sb;
4719 	int error, i;
4720 	bool first;
4721 
4722 	error = sysctl_wire_old_buffer(req, 0);
4723 	if (error != 0)
4724 		return (error);
4725 
4726 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4727 	if (sb == NULL)
4728 		return (ENOMEM);
4729 
4730 	rm_rlock(&hn_vfmap_lock, &pt);
4731 
4732 	first = true;
4733 	for (i = 0; i < hn_vfmap_size; ++i) {
4734 		struct ifnet *ifp;
4735 
4736 		if (hn_vfmap[i] == NULL)
4737 			continue;
4738 
4739 		ifp = ifnet_byindex(i);
4740 		if (ifp != NULL) {
4741 			if (first)
4742 				sbuf_printf(sb, "%s", ifp->if_xname);
4743 			else
4744 				sbuf_printf(sb, " %s", ifp->if_xname);
4745 			first = false;
4746 		}
4747 	}
4748 
4749 	rm_runlock(&hn_vfmap_lock, &pt);
4750 
4751 	error = sbuf_finish(sb);
4752 	sbuf_delete(sb);
4753 	return (error);
4754 }
4755 
4756 static int
4757 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4758 {
4759 	struct rm_priotracker pt;
4760 	struct sbuf *sb;
4761 	int error, i;
4762 	bool first;
4763 
4764 	error = sysctl_wire_old_buffer(req, 0);
4765 	if (error != 0)
4766 		return (error);
4767 
4768 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4769 	if (sb == NULL)
4770 		return (ENOMEM);
4771 
4772 	rm_rlock(&hn_vfmap_lock, &pt);
4773 
4774 	first = true;
4775 	for (i = 0; i < hn_vfmap_size; ++i) {
4776 		struct ifnet *ifp, *hn_ifp;
4777 
4778 		hn_ifp = hn_vfmap[i];
4779 		if (hn_ifp == NULL)
4780 			continue;
4781 
4782 		ifp = ifnet_byindex(i);
4783 		if (ifp != NULL) {
4784 			if (first) {
4785 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4786 				    hn_ifp->if_xname);
4787 			} else {
4788 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4789 				    hn_ifp->if_xname);
4790 			}
4791 			first = false;
4792 		}
4793 	}
4794 
4795 	rm_runlock(&hn_vfmap_lock, &pt);
4796 
4797 	error = sbuf_finish(sb);
4798 	sbuf_delete(sb);
4799 	return (error);
4800 }
4801 
4802 static int
4803 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4804 {
4805 	struct hn_softc *sc = arg1;
4806 	int error, onoff = 0;
4807 
4808 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4809 		onoff = 1;
4810 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4811 	if (error || req->newptr == NULL)
4812 		return (error);
4813 
4814 	HN_LOCK(sc);
4815 	/* NOTE: hn_vf_lock for hn_transmit() */
4816 	rm_wlock(&sc->hn_vf_lock);
4817 	if (onoff)
4818 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4819 	else
4820 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4821 	rm_wunlock(&sc->hn_vf_lock);
4822 	HN_UNLOCK(sc);
4823 
4824 	return (0);
4825 }
4826 
4827 static int
4828 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4829 {
4830 	struct hn_softc *sc = arg1;
4831 	int enabled = 0;
4832 
4833 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4834 		enabled = 1;
4835 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4836 }
4837 
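/*
 * Sanity-check the IPv4 packet starting at byte offset hoff within the mbuf
 * and return its L4 protocol, or IPPROTO_DONE if the headers are truncated,
 * malformed, or the packet is a fragment.
 */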
4838 static int
4839 hn_check_iplen(const struct mbuf *m, int hoff)
4840 {
4841 	const struct ip *ip;
4842 	int len, iphlen, iplen;
4843 	const struct tcphdr *th;
4844 	int thoff;				/* TCP data offset */
4845 
4846 	len = hoff + sizeof(struct ip);
4847 
4848 	/* The packet must be at least the size of an IP header. */
4849 	if (m->m_pkthdr.len < len)
4850 		return IPPROTO_DONE;
4851 
4852 	/* The fixed IP header must reside completely in the first mbuf. */
4853 	if (m->m_len < len)
4854 		return IPPROTO_DONE;
4855 
4856 	ip = mtodo(m, hoff);
4857 
4858 	/* Bounds check the packet's stated IP header length. */
4859 	iphlen = ip->ip_hl << 2;
4860 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4861 		return IPPROTO_DONE;
4862 
4863 	/* The full IP header must reside completely in the one mbuf. */
4864 	if (m->m_len < hoff + iphlen)
4865 		return IPPROTO_DONE;
4866 
4867 	iplen = ntohs(ip->ip_len);
4868 
4869 	/*
4870 	 * Check that the amount of data in the buffers is at
4871 	 * least as much as the IP header would have us expect.
4872 	 */
4873 	if (m->m_pkthdr.len < hoff + iplen)
4874 		return IPPROTO_DONE;
4875 
4876 	/*
4877 	 * Ignore IP fragments.
4878 	 */
4879 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4880 		return IPPROTO_DONE;
4881 
4882 	/*
4883 	 * The TCP/IP or UDP/IP header must be entirely contained within
4884 	 * the first fragment of a packet.
4885 	 */
4886 	switch (ip->ip_p) {
4887 	case IPPROTO_TCP:
4888 		if (iplen < iphlen + sizeof(struct tcphdr))
4889 			return IPPROTO_DONE;
4890 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4891 			return IPPROTO_DONE;
4892 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4893 		thoff = th->th_off << 2;
4894 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4895 			return IPPROTO_DONE;
4896 		if (m->m_len < hoff + iphlen + thoff)
4897 			return IPPROTO_DONE;
4898 		break;
4899 	case IPPROTO_UDP:
4900 		if (iplen < iphlen + sizeof(struct udphdr))
4901 			return IPPROTO_DONE;
4902 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4903 			return IPPROTO_DONE;
4904 		break;
4905 	default:
4906 		if (iplen < iphlen)
4907 			return IPPROTO_DONE;
4908 		break;
4909 	}
4910 	return ip->ip_p;
4911 }
4912 
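/*
 * Extract the L3 ethertype and, for IPv4, the L4 protocol of a received
 * frame; *l4proto is set to IPPROTO_DONE when the frame cannot (or need
 * not) be parsed further.
 */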
4913 static void
4914 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4915 {
4916 	const struct ether_header *eh;
4917 	uint16_t etype;
4918 	int hoff;
4919 
4920 	hoff = sizeof(*eh);
4921 	/* Checked at the beginning of this function. */
4922 	/* Checked at the beginning of the caller. */
4923 
4924 	eh = mtod(m_new, const struct ether_header *);
4925 	etype = ntohs(eh->ether_type);
4926 	if (etype == ETHERTYPE_VLAN) {
4927 		const struct ether_vlan_header *evl;
4928 
4929 		hoff = sizeof(*evl);
4930 		if (m_new->m_len < hoff)
4931 			return;
4932 		evl = mtod(m_new, const struct ether_vlan_header *);
4933 		etype = ntohs(evl->evl_proto);
4934 	}
4935 	*l3proto = etype;
4936 
4937 	if (etype == ETHERTYPE_IP)
4938 		*l4proto = hn_check_iplen(m_new, hoff);
4939 	else
4940 		*l4proto = IPPROTO_DONE;
4941 }
4942 
4943 static int
4944 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4945 {
4946 	struct sysctl_oid_list *child;
4947 	struct sysctl_ctx_list *ctx;
4948 	device_t dev = sc->hn_dev;
4949 #if defined(INET) || defined(INET6)
4950 #if __FreeBSD_version >= 1100095
4951 	int lroent_cnt;
4952 #endif
4953 #endif
4954 	int i;
4955 
4956 	/*
4957 	 * Create RXBUF for reception.
4958 	 *
4959 	 * NOTE:
4960 	 * - It is shared by all channels.
4961 	 * - A large enough buffer is allocated; certain versions of the
4962 	 *   NVS may further limit the usable space.
4963 	 */
4964 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4965 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4966 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4967 	if (sc->hn_rxbuf == NULL) {
4968 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4969 		return (ENOMEM);
4970 	}
4971 
4972 	sc->hn_rx_ring_cnt = ring_cnt;
4973 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4974 
4975 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4976 	    M_DEVBUF, M_WAITOK | M_ZERO);
4977 
4978 #if defined(INET) || defined(INET6)
4979 #if __FreeBSD_version >= 1100095
4980 	lroent_cnt = hn_lro_entry_count;
4981 	if (lroent_cnt < TCP_LRO_ENTRIES)
4982 		lroent_cnt = TCP_LRO_ENTRIES;
4983 	if (bootverbose)
4984 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4985 #endif
4986 #endif	/* INET || INET6 */
4987 
4988 	ctx = device_get_sysctl_ctx(dev);
4989 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4990 
4991 	/* Create dev.hn.UNIT.rx sysctl tree */
4992 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4993 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4994 
4995 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4996 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4997 
4998 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4999 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5000 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5001 		if (rxr->hn_br == NULL) {
5002 			device_printf(dev, "allocate bufring failed\n");
5003 			return (ENOMEM);
5004 		}
5005 
5006 		if (hn_trust_hosttcp)
5007 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5008 		if (hn_trust_hostudp)
5009 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5010 		if (hn_trust_hostip)
5011 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5012 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5013 		rxr->hn_ifp = sc->hn_ifp;
5014 		if (i < sc->hn_tx_ring_cnt)
5015 			rxr->hn_txr = &sc->hn_tx_ring[i];
5016 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5017 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5018 		rxr->hn_rx_idx = i;
5019 		rxr->hn_rxbuf = sc->hn_rxbuf;
5020 
5021 		/*
5022 		 * Initialize LRO.
5023 		 */
5024 #if defined(INET) || defined(INET6)
5025 #if __FreeBSD_version >= 1100095
5026 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5027 		    hn_lro_mbufq_depth);
5028 #else
5029 		tcp_lro_init(&rxr->hn_lro);
5030 		rxr->hn_lro.ifp = sc->hn_ifp;
5031 #endif
5032 #if __FreeBSD_version >= 1100099
5033 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5034 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5035 #endif
5036 #endif	/* INET || INET6 */
5037 
5038 		if (sc->hn_rx_sysctl_tree != NULL) {
5039 			char name[16];
5040 
5041 			/*
5042 			 * Create per RX ring sysctl tree:
5043 			 * dev.hn.UNIT.rx.RINGID
5044 			 */
5045 			snprintf(name, sizeof(name), "%d", i);
5046 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5047 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5048 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5049 
5050 			if (rxr->hn_rx_sysctl_tree != NULL) {
5051 				SYSCTL_ADD_ULONG(ctx,
5052 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5053 				    OID_AUTO, "packets", CTLFLAG_RW,
5054 				    &rxr->hn_pkts, "# of packets received");
5055 				SYSCTL_ADD_ULONG(ctx,
5056 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5057 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5058 				    &rxr->hn_rss_pkts,
5059 				    "# of packets w/ RSS info received");
5060 				SYSCTL_ADD_ULONG(ctx,
5061 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5062 				    OID_AUTO, "rsc_pkts", CTLFLAG_RW,
5063 				    &rxr->hn_rsc_pkts,
5064 				    "# of RSC packets received");
5065 				SYSCTL_ADD_ULONG(ctx,
5066 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5067 				    OID_AUTO, "rsc_drop", CTLFLAG_RW,
5068 				    &rxr->hn_rsc_drop,
5069 				    "# of RSC fragments dropped");
5070 				SYSCTL_ADD_INT(ctx,
5071 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5072 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5073 				    &rxr->hn_pktbuf_len, 0,
5074 				    "Temporary channel packet buffer length");
5075 			}
5076 		}
5077 	}
5078 
5079 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5080 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5081 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5082 #if __FreeBSD_version < 1100095
5083 	    hn_rx_stat_int_sysctl,
5084 #else
5085 	    hn_rx_stat_u64_sysctl,
5086 #endif
5087 	    "LU", "LRO queued");
5088 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5089 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5090 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5091 #if __FreeBSD_version < 1100095
5092 	    hn_rx_stat_int_sysctl,
5093 #else
5094 	    hn_rx_stat_u64_sysctl,
5095 #endif
5096 	    "LU", "LRO flushed");
5097 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5098 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5099 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5100 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5101 #if __FreeBSD_version >= 1100099
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5103 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5104 	    hn_lro_lenlim_sysctl, "IU",
5105 	    "Max # of data bytes to be aggregated by LRO");
5106 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5107 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5108 	    hn_lro_ackcnt_sysctl, "I",
5109 	    "Max # of ACKs to be aggregated by LRO");
5110 #endif
5111 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5112 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5113 	    hn_trust_hcsum_sysctl, "I",
5114 	    "Trust tcp segment verification on host side, "
5115 	    "when csum info is missing");
5116 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5117 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5118 	    hn_trust_hcsum_sysctl, "I",
5119 	    "Trust udp datagram verification on host side, "
5120 	    "when csum info is missing");
5121 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5122 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5123 	    hn_trust_hcsum_sysctl, "I",
5124 	    "Trust ip packet verification on host side, "
5125 	    "when csum info is missing");
5126 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5127 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5128 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5129 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5130 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5131 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5132 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5133 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5134 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5135 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5136 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5137 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5138 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5139 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5140 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5141 	    hn_rx_stat_ulong_sysctl, "LU",
5142 	    "# of packets that we trust host's csum verification");
5143 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5144 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5145 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5146 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5147 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5148 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5149 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5150 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5151 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5152 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5153 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5154 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5155 
5156 	return (0);
5157 }
5158 
5159 static void
5160 hn_destroy_rx_data(struct hn_softc *sc)
5161 {
5162 	int i;
5163 
5164 	if (sc->hn_rxbuf != NULL) {
5165 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5166 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5167 		else
5168 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5169 		sc->hn_rxbuf = NULL;
5170 	}
5171 
5172 	if (sc->hn_rx_ring_cnt == 0)
5173 		return;
5174 
5175 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5176 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5177 
5178 		if (rxr->hn_br == NULL)
5179 			continue;
5180 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5181 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5182 		} else {
5183 			device_printf(sc->hn_dev,
5184 			    "%dth channel bufring is referenced\n", i);
5185 		}
5186 		rxr->hn_br = NULL;
5187 
5188 #if defined(INET) || defined(INET6)
5189 		tcp_lro_free(&rxr->hn_lro);
5190 #endif
5191 		free(rxr->hn_pktbuf, M_DEVBUF);
5192 	}
5193 	free(sc->hn_rx_ring, M_DEVBUF);
5194 	sc->hn_rx_ring = NULL;
5195 
5196 	sc->hn_rx_ring_cnt = 0;
5197 	sc->hn_rx_ring_inuse = 0;
5198 }
5199 
5200 static int
5201 hn_tx_ring_create(struct hn_softc *sc, int id)
5202 {
5203 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5204 	device_t dev = sc->hn_dev;
5205 	bus_dma_tag_t parent_dtag;
5206 	int error, i;
5207 
5208 	txr->hn_sc = sc;
5209 	txr->hn_tx_idx = id;
5210 
5211 #ifndef HN_USE_TXDESC_BUFRING
5212 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5213 #endif
5214 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5215 
5216 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5217 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5218 	    M_DEVBUF, M_WAITOK | M_ZERO);
5219 #ifndef HN_USE_TXDESC_BUFRING
5220 	SLIST_INIT(&txr->hn_txlist);
5221 #else
5222 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5223 	    M_WAITOK, &txr->hn_tx_lock);
5224 #endif
5225 
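	/*
	 * Bind this ring's TX taskqueue: either the VMBus event taskqueue
	 * of the ring's CPU, or one of the driver's own TX taskqueues,
	 * chosen round-robin by ring id.
	 */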
5226 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5227 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5228 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5229 	} else {
5230 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5231 	}
5232 
5233 #ifdef HN_IFSTART_SUPPORT
5234 	if (hn_use_if_start) {
5235 		txr->hn_txeof = hn_start_txeof;
5236 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5237 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5238 	} else
5239 #endif
5240 	{
5241 		int br_depth;
5242 
5243 		txr->hn_txeof = hn_xmit_txeof;
5244 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5245 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5246 
5247 		br_depth = hn_get_txswq_depth(txr);
5248 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5249 		    M_WAITOK, &txr->hn_tx_lock);
5250 	}
5251 
5252 	txr->hn_direct_tx_size = hn_direct_tx_size;
5253 
5254 	/*
5255 	 * Always schedule transmission instead of trying to do direct
5256 	 * transmission.  This one gives the best performance so far.
5257 	 */
5258 	txr->hn_sched_tx = 1;
5259 
5260 	parent_dtag = bus_get_dma_tag(dev);
5261 
5262 	/* DMA tag for RNDIS packet messages. */
5263 	error = bus_dma_tag_create(parent_dtag, /* parent */
5264 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5265 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5266 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5267 	    BUS_SPACE_MAXADDR,		/* highaddr */
5268 	    NULL, NULL,			/* filter, filterarg */
5269 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5270 	    1,				/* nsegments */
5271 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5272 	    0,				/* flags */
5273 	    NULL,			/* lockfunc */
5274 	    NULL,			/* lockfuncarg */
5275 	    &txr->hn_tx_rndis_dtag);
5276 	if (error) {
5277 		device_printf(dev, "failed to create rndis dmatag\n");
5278 		return error;
5279 	}
5280 
5281 	/* DMA tag for data. */
5282 	error = bus_dma_tag_create(parent_dtag, /* parent */
5283 	    1,				/* alignment */
5284 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5285 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5286 	    BUS_SPACE_MAXADDR,		/* highaddr */
5287 	    NULL, NULL,			/* filter, filterarg */
5288 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5289 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5290 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5291 	    0,				/* flags */
5292 	    NULL,			/* lockfunc */
5293 	    NULL,			/* lockfuncarg */
5294 	    &txr->hn_tx_data_dtag);
5295 	if (error) {
5296 		device_printf(dev, "failed to create data dmatag\n");
5297 		return error;
5298 	}
5299 
5300 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5301 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5302 
5303 		txd->txr = txr;
5304 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5305 		STAILQ_INIT(&txd->agg_list);
5306 
5307 		/*
5308 		 * Allocate and load RNDIS packet message.
5309 		 */
5310 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5311 		    (void **)&txd->rndis_pkt,
5312 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5313 		    &txd->rndis_pkt_dmap);
5314 		if (error) {
5315 			device_printf(dev,
5316 			    "failed to allocate rndis_packet_msg, %d\n", i);
5317 			return error;
5318 		}
5319 
5320 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5321 		    txd->rndis_pkt_dmap,
5322 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5323 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5324 		    BUS_DMA_NOWAIT);
5325 		if (error) {
5326 			device_printf(dev,
5327 			    "failed to load rndis_packet_msg, %d\n", i);
5328 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5329 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5330 			return error;
5331 		}
5332 
5333 		/* DMA map for TX data. */
5334 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5335 		    &txd->data_dmap);
5336 		if (error) {
5337 			device_printf(dev,
5338 			    "failed to allocate tx data dmamap\n");
5339 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5340 			    txd->rndis_pkt_dmap);
5341 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5342 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5343 			return error;
5344 		}
5345 
5346 		/* All set, put it to list */
5347 		txd->flags |= HN_TXD_FLAG_ONLIST;
5348 #ifndef HN_USE_TXDESC_BUFRING
5349 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5350 #else
5351 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5352 #endif
5353 	}
5354 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5355 
5356 	if (sc->hn_tx_sysctl_tree != NULL) {
5357 		struct sysctl_oid_list *child;
5358 		struct sysctl_ctx_list *ctx;
5359 		char name[16];
5360 
5361 		/*
5362 		 * Create per TX ring sysctl tree:
5363 		 * dev.hn.UNIT.tx.RINGID
5364 		 */
5365 		ctx = device_get_sysctl_ctx(dev);
5366 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5367 
5368 		snprintf(name, sizeof(name), "%d", id);
5369 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5370 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5371 
5372 		if (txr->hn_tx_sysctl_tree != NULL) {
5373 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5374 
5375 #ifdef HN_DEBUG
5376 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5377 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5378 			    "# of available TX descs");
5379 #endif
5380 #ifdef HN_IFSTART_SUPPORT
5381 			if (!hn_use_if_start)
5382 #endif
5383 			{
5384 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5385 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5386 				    "over active");
5387 			}
5388 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5389 			    CTLFLAG_RW, &txr->hn_pkts,
5390 			    "# of packets transmitted");
5391 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5392 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5393 		}
5394 	}
5395 
5396 	return 0;
5397 }
5398 
5399 static void
5400 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5401 {
5402 	struct hn_tx_ring *txr = txd->txr;
5403 
5404 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5405 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5406 
5407 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5408 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5409 	    txd->rndis_pkt_dmap);
5410 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5411 }
5412 
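/*
 * Garbage-collect a txd that may still be pending when the TX ring is torn
 * down; txds on an aggregation list are left to their aggregating txd.
 */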
5413 static void
5414 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5415 {
5416 
5417 	KASSERT(txd->refs == 0 || txd->refs == 1,
5418 	    ("invalid txd refs %d", txd->refs));
5419 
5420 	/* Aggregated txds will be freed by their aggregating txd. */
5421 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5422 		int freed;
5423 
5424 		freed = hn_txdesc_put(txr, txd);
5425 		KASSERT(freed, ("can't free txdesc"));
5426 	}
5427 }
5428 
5429 static void
5430 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5431 {
5432 	int i;
5433 
5434 	if (txr->hn_txdesc == NULL)
5435 		return;
5436 
5437 	/*
5438 	 * NOTE:
5439 	 * Because the freeing of aggregated txds will be deferred
5440 	 * to the aggregating txd, two passes are used here:
5441 	 * - The first pass GCes any pending txds.  This GC is necessary,
5442 	 *   since if the channels are revoked, the hypervisor will not
5443 	 *   deliver send-done for all pending txds.
5444 	 * - The second pass frees the busdma resources, i.e. after all txds
5445 	 *   have been freed.
5446 	 */
5447 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5448 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5449 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5450 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5451 
5452 	if (txr->hn_tx_data_dtag != NULL)
5453 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5454 	if (txr->hn_tx_rndis_dtag != NULL)
5455 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5456 
5457 #ifdef HN_USE_TXDESC_BUFRING
5458 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5459 #endif
5460 
5461 	free(txr->hn_txdesc, M_DEVBUF);
5462 	txr->hn_txdesc = NULL;
5463 
5464 	if (txr->hn_mbuf_br != NULL)
5465 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5466 
5467 #ifndef HN_USE_TXDESC_BUFRING
5468 	mtx_destroy(&txr->hn_txlist_spin);
5469 #endif
5470 	mtx_destroy(&txr->hn_tx_lock);
5471 }
5472 
5473 static int
5474 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5475 {
5476 	struct sysctl_oid_list *child;
5477 	struct sysctl_ctx_list *ctx;
5478 	int i;
5479 
5480 	/*
5481 	 * Create TXBUF for chimney sending.
5482 	 *
5483 	 * NOTE: It is shared by all channels.
5484 	 */
5485 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5486 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5487 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5488 	if (sc->hn_chim == NULL) {
5489 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5490 		return (ENOMEM);
5491 	}
5492 
5493 	sc->hn_tx_ring_cnt = ring_cnt;
5494 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5495 
5496 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5497 	    M_DEVBUF, M_WAITOK | M_ZERO);
5498 
5499 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5500 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5501 
5502 	/* Create dev.hn.UNIT.tx sysctl tree */
5503 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5504 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5505 
5506 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5507 		int error;
5508 
5509 		error = hn_tx_ring_create(sc, i);
5510 		if (error)
5511 			return error;
5512 	}
5513 
5514 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5515 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5516 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5517 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5518 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5519 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5520 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5521 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5522 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5523 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5524 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5525 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5526 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5527 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5528 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5529 	    hn_tx_stat_ulong_sysctl, "LU",
5530 	    "# of packet transmission aggregation flush failures");
5531 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5532 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5533 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5534 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
5535 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5536 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5537 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5538 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5539 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5540 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5541 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5542 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5543 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5544 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5545 	    "# of total TX descs");
5546 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5547 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5548 	    "Chimney send packet size upper boundary");
5549 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5550 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5551 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5552 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5553 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5554 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5555 	    hn_tx_conf_int_sysctl, "I",
5556 	    "Size of the packet for direct transmission");
5557 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5558 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5559 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5560 	    hn_tx_conf_int_sysctl, "I",
5561 	    "Always schedule transmission "
5562 	    "instead of doing direct transmission");
5563 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5564 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5565 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5566 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5567 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5568 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5569 	    "Applied packet transmission aggregation size");
5570 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5571 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5572 	    hn_txagg_pktmax_sysctl, "I",
5573 	    "Applied packet transmission aggregation packets");
5574 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5575 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5576 	    hn_txagg_align_sysctl, "I",
5577 	    "Applied packet transmission aggregation alignment");
5578 
5579 	return 0;
5580 }
5581 
5582 static void
5583 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5584 {
5585 	int i;
5586 
5587 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5588 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5589 }
5590 
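/*
 * Clamp the interface TSO size limit according to the NDIS constraints
 * (hn_ndis_tso_sgmin/szmax), IP_MAXPACKET and, when the transparent VF
 * is ready, the VF's own if_hw_tsomax.
 */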
5591 static void
5592 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5593 {
5594 	struct ifnet *ifp = sc->hn_ifp;
5595 	u_int hw_tsomax;
5596 	int tso_minlen;
5597 
5598 	HN_LOCK_ASSERT(sc);
5599 
5600 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5601 		return;
5602 
5603 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5604 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5605 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5606 
5607 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5608 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5609 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5610 
5611 	if (tso_maxlen < tso_minlen)
5612 		tso_maxlen = tso_minlen;
5613 	else if (tso_maxlen > IP_MAXPACKET)
5614 		tso_maxlen = IP_MAXPACKET;
5615 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5616 		tso_maxlen = sc->hn_ndis_tso_szmax;
5617 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5618 
5619 	if (hn_xpnt_vf_isready(sc)) {
5620 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5621 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5622 	}
5623 	ifp->if_hw_tsomax = hw_tsomax;
5624 	if (bootverbose)
5625 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5626 }
5627 
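/*
 * Apply the negotiated TX capabilities to all TX rings: chimney size,
 * checksum offload assistance and HASHVAL pktinfo support.
 */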
5628 static void
5629 hn_fixup_tx_data(struct hn_softc *sc)
5630 {
5631 	uint64_t csum_assist;
5632 	int i;
5633 
5634 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5635 	if (hn_tx_chimney_size > 0 &&
5636 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5637 		hn_set_chim_size(sc, hn_tx_chimney_size);
5638 
5639 	csum_assist = 0;
5640 	if (sc->hn_caps & HN_CAP_IPCS)
5641 		csum_assist |= CSUM_IP;
5642 	if (sc->hn_caps & HN_CAP_TCP4CS)
5643 		csum_assist |= CSUM_IP_TCP;
5644 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5645 		csum_assist |= CSUM_IP_UDP;
5646 	if (sc->hn_caps & HN_CAP_TCP6CS)
5647 		csum_assist |= CSUM_IP6_TCP;
5648 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5649 		csum_assist |= CSUM_IP6_UDP;
5650 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5651 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5652 
5653 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5654 		/*
5655 		 * Support HASHVAL pktinfo on TX path.
5656 		 */
5657 		if (bootverbose)
5658 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5659 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5660 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5661 	}
5662 }
5663 
5664 static void
5665 hn_fixup_rx_data(struct hn_softc *sc)
5666 {
5667 
5668 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5669 		int i;
5670 
5671 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5672 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5673 	}
5674 }
5675 
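/*
 * Free the chimney TXBUF, unless it is still referenced by the host,
 * and destroy all TX rings.
 */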
5676 static void
5677 hn_destroy_tx_data(struct hn_softc *sc)
5678 {
5679 	int i;
5680 
5681 	if (sc->hn_chim != NULL) {
5682 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5683 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5684 		} else {
5685 			device_printf(sc->hn_dev,
5686 			    "chimney sending buffer is referenced");
5687 		}
5688 		sc->hn_chim = NULL;
5689 	}
5690 
5691 	if (sc->hn_tx_ring_cnt == 0)
5692 		return;
5693 
5694 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5695 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5696 
5697 	free(sc->hn_tx_ring, M_DEVBUF);
5698 	sc->hn_tx_ring = NULL;
5699 
5700 	sc->hn_tx_ring_cnt = 0;
5701 	sc->hn_tx_ring_inuse = 0;
5702 }
5703 
5704 #ifdef HN_IFSTART_SUPPORT
5705 
5706 static void
5707 hn_start_taskfunc(void *xtxr, int pending __unused)
5708 {
5709 	struct hn_tx_ring *txr = xtxr;
5710 
5711 	mtx_lock(&txr->hn_tx_lock);
5712 	hn_start_locked(txr, 0);
5713 	mtx_unlock(&txr->hn_tx_lock);
5714 }
5715 
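/*
 * if_start TX path: drain if_snd on the first TX ring, the only ring
 * used in if_start mode.  Returns non-zero if the remaining packets
 * should be handed to the TX taskqueue, e.g. when a packet larger
 * than 'len' is seen.
 */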
5716 static int
5717 hn_start_locked(struct hn_tx_ring *txr, int len)
5718 {
5719 	struct hn_softc *sc = txr->hn_sc;
5720 	struct ifnet *ifp = sc->hn_ifp;
5721 	int sched = 0;
5722 
5723 	KASSERT(hn_use_if_start,
5724 	    ("hn_start_locked is called, when if_start is disabled"));
5725 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5726 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5727 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5728 
5729 	if (__predict_false(txr->hn_suspended))
5730 		return (0);
5731 
5732 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5733 	    IFF_DRV_RUNNING)
5734 		return (0);
5735 
5736 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5737 		struct hn_txdesc *txd;
5738 		struct mbuf *m_head;
5739 		int error;
5740 
5741 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5742 		if (m_head == NULL)
5743 			break;
5744 
5745 		if (len > 0 && m_head->m_pkthdr.len > len) {
5746 			/*
5747 			 * This sending could be time-consuming; let callers
5748 			 * dispatch this packet sending (and the sending of any
5749 			 * follow-up packets) to the tx taskqueue.
5750 			 */
5751 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5752 			sched = 1;
5753 			break;
5754 		}
5755 
5756 #if defined(INET6) || defined(INET)
5757 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5758 			m_head = hn_tso_fixup(m_head);
5759 			if (__predict_false(m_head == NULL)) {
5760 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5761 				continue;
5762 			}
5763 		} else if (m_head->m_pkthdr.csum_flags &
5764 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5765 			m_head = hn_set_hlen(m_head);
5766 			if (__predict_false(m_head == NULL)) {
5767 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5768 				continue;
5769 			}
5770 		}
5771 #endif
5772 
5773 		txd = hn_txdesc_get(txr);
5774 		if (txd == NULL) {
5775 			txr->hn_no_txdescs++;
5776 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5777 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5778 			break;
5779 		}
5780 
5781 		error = hn_encap(ifp, txr, txd, &m_head);
5782 		if (error) {
5783 			/* Both txd and m_head are freed */
5784 			KASSERT(txr->hn_agg_txd == NULL,
5785 			    ("encap failed w/ pending aggregating txdesc"));
5786 			continue;
5787 		}
5788 
5789 		if (txr->hn_agg_pktleft == 0) {
5790 			if (txr->hn_agg_txd != NULL) {
5791 				KASSERT(m_head == NULL,
5792 				    ("pending mbuf for aggregating txdesc"));
5793 				error = hn_flush_txagg(ifp, txr);
5794 				if (__predict_false(error)) {
5795 					atomic_set_int(&ifp->if_drv_flags,
5796 					    IFF_DRV_OACTIVE);
5797 					break;
5798 				}
5799 			} else {
5800 				KASSERT(m_head != NULL, ("mbuf was freed"));
5801 				error = hn_txpkt(ifp, txr, txd);
5802 				if (__predict_false(error)) {
5803 					/* txd is freed, but m_head is not */
5804 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5805 					atomic_set_int(&ifp->if_drv_flags,
5806 					    IFF_DRV_OACTIVE);
5807 					break;
5808 				}
5809 			}
5810 		}
5811 #ifdef INVARIANTS
5812 		else {
5813 			KASSERT(txr->hn_agg_txd != NULL,
5814 			    ("no aggregating txdesc"));
5815 			KASSERT(m_head == NULL,
5816 			    ("pending mbuf for aggregating txdesc"));
5817 		}
5818 #endif
5819 	}
5820 
5821 	/* Flush pending aggregated transmission. */
5822 	if (txr->hn_agg_txd != NULL)
5823 		hn_flush_txagg(ifp, txr);
5824 	return (sched);
5825 }
5826 
5827 static void
5828 hn_start(struct ifnet *ifp)
5829 {
5830 	struct hn_softc *sc = ifp->if_softc;
5831 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5832 
5833 	if (txr->hn_sched_tx)
5834 		goto do_sched;
5835 
5836 	if (mtx_trylock(&txr->hn_tx_lock)) {
5837 		int sched;
5838 
5839 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5840 		mtx_unlock(&txr->hn_tx_lock);
5841 		if (!sched)
5842 			return;
5843 	}
5844 do_sched:
5845 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5846 }
5847 
5848 static void
5849 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5850 {
5851 	struct hn_tx_ring *txr = xtxr;
5852 
5853 	mtx_lock(&txr->hn_tx_lock);
5854 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5855 	hn_start_locked(txr, 0);
5856 	mtx_unlock(&txr->hn_tx_lock);
5857 }
5858 
5859 static void
5860 hn_start_txeof(struct hn_tx_ring *txr)
5861 {
5862 	struct hn_softc *sc = txr->hn_sc;
5863 	struct ifnet *ifp = sc->hn_ifp;
5864 
5865 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5866 
5867 	if (txr->hn_sched_tx)
5868 		goto do_sched;
5869 
5870 	if (mtx_trylock(&txr->hn_tx_lock)) {
5871 		int sched;
5872 
5873 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5874 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5875 		mtx_unlock(&txr->hn_tx_lock);
5876 		if (sched) {
5877 			taskqueue_enqueue(txr->hn_tx_taskq,
5878 			    &txr->hn_tx_task);
5879 		}
5880 	} else {
5881 do_sched:
5882 		/*
5883 		 * Release the OACTIVE earlier, with the hope that
5884 		 * others could catch up.  The task will clear the
5885 		 * flag again with the hn_tx_lock to avoid possible
5886 		 * races.
5887 		 */
5888 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5889 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5890 	}
5891 }
5892 
5893 #endif	/* HN_IFSTART_SUPPORT */
5894 
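/*
 * if_transmit TX path: drain the TX ring's mbuf buf_ring.  'len' acts
 * as in the if_start path; the return value tells the caller whether
 * the remaining work should be dispatched to the TX taskqueue.
 */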
5895 static int
5896 hn_xmit(struct hn_tx_ring *txr, int len)
5897 {
5898 	struct hn_softc *sc = txr->hn_sc;
5899 	struct ifnet *ifp = sc->hn_ifp;
5900 	struct mbuf *m_head;
5901 	int sched = 0;
5902 
5903 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5904 #ifdef HN_IFSTART_SUPPORT
5905 	KASSERT(hn_use_if_start == 0,
5906 	    ("hn_xmit is called, when if_start is enabled"));
5907 #endif
5908 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5909 
5910 	if (__predict_false(txr->hn_suspended))
5911 		return (0);
5912 
5913 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5914 		return (0);
5915 
5916 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5917 		struct hn_txdesc *txd;
5918 		int error;
5919 
5920 		if (len > 0 && m_head->m_pkthdr.len > len) {
5921 			/*
5922 			 * This sending could be time-consuming; let callers
5923 			 * dispatch this packet sending (and the sending of any
5924 			 * follow-up packets) to the tx taskqueue.
5925 			 */
5926 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5927 			sched = 1;
5928 			break;
5929 		}
5930 
5931 		txd = hn_txdesc_get(txr);
5932 		if (txd == NULL) {
5933 			txr->hn_no_txdescs++;
5934 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5935 			txr->hn_oactive = 1;
5936 			break;
5937 		}
5938 
5939 		error = hn_encap(ifp, txr, txd, &m_head);
5940 		if (error) {
5941 			/* Both txd and m_head are freed; discard */
5942 			KASSERT(txr->hn_agg_txd == NULL,
5943 			    ("encap failed w/ pending aggregating txdesc"));
5944 			drbr_advance(ifp, txr->hn_mbuf_br);
5945 			continue;
5946 		}
5947 
5948 		if (txr->hn_agg_pktleft == 0) {
5949 			if (txr->hn_agg_txd != NULL) {
5950 				KASSERT(m_head == NULL,
5951 				    ("pending mbuf for aggregating txdesc"));
5952 				error = hn_flush_txagg(ifp, txr);
5953 				if (__predict_false(error)) {
5954 					txr->hn_oactive = 1;
5955 					break;
5956 				}
5957 			} else {
5958 				KASSERT(m_head != NULL, ("mbuf was freed"));
5959 				error = hn_txpkt(ifp, txr, txd);
5960 				if (__predict_false(error)) {
5961 					/* txd is freed, but m_head is not */
5962 					drbr_putback(ifp, txr->hn_mbuf_br,
5963 					    m_head);
5964 					txr->hn_oactive = 1;
5965 					break;
5966 				}
5967 			}
5968 		}
5969 #ifdef INVARIANTS
5970 		else {
5971 			KASSERT(txr->hn_agg_txd != NULL,
5972 			    ("no aggregating txdesc"));
5973 			KASSERT(m_head == NULL,
5974 			    ("pending mbuf for aggregating txdesc"));
5975 		}
5976 #endif
5977 
5978 		/* Sent */
5979 		drbr_advance(ifp, txr->hn_mbuf_br);
5980 	}
5981 
5982 	/* Flush pending aggregated transmission. */
5983 	if (txr->hn_agg_txd != NULL)
5984 		hn_flush_txagg(ifp, txr);
5985 	return (sched);
5986 }
5987 
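/*
 * if_transmit method: hand the mbuf to the transparent VF if one is
 * active; otherwise fix up the packet headers, select a TX ring from
 * the flowid (TCP SYNs are steered to ring 0) and enqueue the mbuf to
 * that ring's buf_ring.
 */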
5988 static int
5989 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5990 {
5991 	struct hn_softc *sc = ifp->if_softc;
5992 	struct hn_tx_ring *txr;
5993 	int error, idx = 0;
5994 
5995 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5996 		struct rm_priotracker pt;
5997 
5998 		rm_rlock(&sc->hn_vf_lock, &pt);
5999 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6000 			struct mbuf *m_bpf = NULL;
6001 			int obytes, omcast;
6002 
6003 			obytes = m->m_pkthdr.len;
6004 			omcast = (m->m_flags & M_MCAST) != 0;
6005 
6006 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6007 				if (bpf_peers_present(ifp->if_bpf)) {
6008 					m_bpf = m_copypacket(m, M_NOWAIT);
6009 					if (m_bpf == NULL) {
6010 						/*
6011 						 * Failed to grab a shallow
6012 						 * copy; tap now.
6013 						 */
6014 						ETHER_BPF_MTAP(ifp, m);
6015 					}
6016 				}
6017 			} else {
6018 				ETHER_BPF_MTAP(ifp, m);
6019 			}
6020 
6021 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
6022 			rm_runlock(&sc->hn_vf_lock, &pt);
6023 
6024 			if (m_bpf != NULL) {
6025 				if (!error)
6026 					ETHER_BPF_MTAP(ifp, m_bpf);
6027 				m_freem(m_bpf);
6028 			}
6029 
6030 			if (error == ENOBUFS) {
6031 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6032 			} else if (error) {
6033 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6034 			} else {
6035 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6036 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6037 				if (omcast) {
6038 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6039 					    omcast);
6040 				}
6041 			}
6042 			return (error);
6043 		}
6044 		rm_runlock(&sc->hn_vf_lock, &pt);
6045 	}
6046 
6047 #if defined(INET6) || defined(INET)
6048 	/*
6049 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6050 	 * since packet headers should be cache-hot.
6051 	 */
6052 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6053 		m = hn_tso_fixup(m);
6054 		if (__predict_false(m == NULL)) {
6055 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6056 			return EIO;
6057 		}
6058 	} else if (m->m_pkthdr.csum_flags &
6059 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6060 		m = hn_set_hlen(m);
6061 		if (__predict_false(m == NULL)) {
6062 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6063 			return EIO;
6064 		}
6065 	}
6066 #endif
6067 
6068 	/*
6069 	 * Select the TX ring based on flowid
6070 	 */
6071 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6072 #ifdef RSS
6073 		uint32_t bid;
6074 
6075 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6076 		    &bid) == 0)
6077 			idx = bid % sc->hn_tx_ring_inuse;
6078 		else
6079 #endif
6080 		{
6081 #if defined(INET6) || defined(INET)
6082 			int tcpsyn = 0;
6083 
6084 			if (m->m_pkthdr.len < 128 &&
6085 			    (m->m_pkthdr.csum_flags &
6086 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6087 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6088 				m = hn_check_tcpsyn(m, &tcpsyn);
6089 				if (__predict_false(m == NULL)) {
6090 					if_inc_counter(ifp,
6091 					    IFCOUNTER_OERRORS, 1);
6092 					return (EIO);
6093 				}
6094 			}
6095 #else
6096 			const int tcpsyn = 0;
6097 #endif
6098 			if (tcpsyn)
6099 				idx = 0;
6100 			else
6101 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6102 		}
6103 	}
6104 	txr = &sc->hn_tx_ring[idx];
6105 
6106 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6107 	if (error) {
6108 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6109 		return error;
6110 	}
6111 
6112 	if (txr->hn_oactive)
6113 		return 0;
6114 
6115 	if (txr->hn_sched_tx)
6116 		goto do_sched;
6117 
6118 	if (mtx_trylock(&txr->hn_tx_lock)) {
6119 		int sched;
6120 
6121 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6122 		mtx_unlock(&txr->hn_tx_lock);
6123 		if (!sched)
6124 			return 0;
6125 	}
6126 do_sched:
6127 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6128 	return 0;
6129 }
6130 
6131 static void
6132 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6133 {
6134 	struct mbuf *m;
6135 
6136 	mtx_lock(&txr->hn_tx_lock);
6137 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6138 		m_freem(m);
6139 	mtx_unlock(&txr->hn_tx_lock);
6140 }
6141 
6142 static void
6143 hn_xmit_qflush(struct ifnet *ifp)
6144 {
6145 	struct hn_softc *sc = ifp->if_softc;
6146 	struct rm_priotracker pt;
6147 	int i;
6148 
6149 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6150 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6151 	if_qflush(ifp);
6152 
6153 	rm_rlock(&sc->hn_vf_lock, &pt);
6154 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6155 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6156 	rm_runlock(&sc->hn_vf_lock, &pt);
6157 }
6158 
6159 static void
6160 hn_xmit_txeof(struct hn_tx_ring *txr)
6161 {
6162 
6163 	if (txr->hn_sched_tx)
6164 		goto do_sched;
6165 
6166 	if (mtx_trylock(&txr->hn_tx_lock)) {
6167 		int sched;
6168 
6169 		txr->hn_oactive = 0;
6170 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6171 		mtx_unlock(&txr->hn_tx_lock);
6172 		if (sched) {
6173 			taskqueue_enqueue(txr->hn_tx_taskq,
6174 			    &txr->hn_tx_task);
6175 		}
6176 	} else {
6177 do_sched:
6178 		/*
6179 		 * Release the oactive earlier, with the hope that
6180 		 * others could catch up.  The task will clear the
6181 		 * oactive again with the hn_tx_lock to avoid possible
6182 		 * races.
6183 		 */
6184 		txr->hn_oactive = 0;
6185 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6186 	}
6187 }
6188 
6189 static void
6190 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6191 {
6192 	struct hn_tx_ring *txr = xtxr;
6193 
6194 	mtx_lock(&txr->hn_tx_lock);
6195 	hn_xmit(txr, 0);
6196 	mtx_unlock(&txr->hn_tx_lock);
6197 }
6198 
6199 static void
6200 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6201 {
6202 	struct hn_tx_ring *txr = xtxr;
6203 
6204 	mtx_lock(&txr->hn_tx_lock);
6205 	txr->hn_oactive = 0;
6206 	hn_xmit(txr, 0);
6207 	mtx_unlock(&txr->hn_tx_lock);
6208 }
6209 
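/*
 * Link a VMBus channel to its RX ring (and TX ring, if available),
 * bind the channel to a CPU and open it with the ring's bufring.
 */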
6210 static int
6211 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6212 {
6213 	struct vmbus_chan_br cbr;
6214 	struct hn_rx_ring *rxr;
6215 	struct hn_tx_ring *txr = NULL;
6216 	int idx, error;
6217 
6218 	idx = vmbus_chan_subidx(chan);
6219 
6220 	/*
6221 	 * Link this channel to RX/TX ring.
6222 	 */
6223 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6224 	    ("invalid channel index %d, should be >= 0 && < %d",
6225 	     idx, sc->hn_rx_ring_inuse));
6226 	rxr = &sc->hn_rx_ring[idx];
6227 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6228 	    ("RX ring %d already attached", idx));
6229 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6230 	rxr->hn_chan = chan;
6231 
6232 	if (bootverbose) {
6233 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6234 		    idx, vmbus_chan_id(chan));
6235 	}
6236 
6237 	if (idx < sc->hn_tx_ring_inuse) {
6238 		txr = &sc->hn_tx_ring[idx];
6239 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6240 		    ("TX ring %d already attached", idx));
6241 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6242 
6243 		txr->hn_chan = chan;
6244 		if (bootverbose) {
6245 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6246 			    idx, vmbus_chan_id(chan));
6247 		}
6248 	}
6249 
6250 	/* Bind this channel to a proper CPU. */
6251 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6252 
6253 	/*
6254 	 * Open this channel
6255 	 */
6256 	cbr.cbr = rxr->hn_br;
6257 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6258 	cbr.cbr_txsz = HN_TXBR_SIZE;
6259 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6260 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6261 	if (error) {
6262 		if (error == EISCONN) {
6263 			if_printf(sc->hn_ifp, "bufring is connected after "
6264 			    "chan%u open failure\n", vmbus_chan_id(chan));
6265 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6266 		} else {
6267 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6268 			    vmbus_chan_id(chan), error);
6269 		}
6270 	}
6271 	return (error);
6272 }
6273 
6274 static void
6275 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6276 {
6277 	struct hn_rx_ring *rxr;
6278 	int idx, error;
6279 
6280 	idx = vmbus_chan_subidx(chan);
6281 
6282 	/*
6283 	 * Link this channel to RX/TX ring.
6284 	 */
6285 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6286 	    ("invalid channel index %d, should be >= 0 && < %d",
6287 	     idx, sc->hn_rx_ring_inuse));
6288 	rxr = &sc->hn_rx_ring[idx];
6289 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6290 	    ("RX ring %d is not attached", idx));
6291 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6292 
6293 	if (idx < sc->hn_tx_ring_inuse) {
6294 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6295 
6296 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6297 		    ("TX ring %d is not attached", idx));
6298 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6299 	}
6300 
6301 	/*
6302 	 * Close this channel.
6303 	 *
6304 	 * NOTE:
6305 	 * Channel closing does _not_ destroy the target channel.
6306 	 */
6307 	error = vmbus_chan_close_direct(chan);
6308 	if (error == EISCONN) {
6309 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6310 		    "after being closed\n", vmbus_chan_id(chan));
6311 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6312 	} else if (error) {
6313 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6314 		    vmbus_chan_id(chan), error);
6315 	}
6316 }
6317 
6318 static int
6319 hn_attach_subchans(struct hn_softc *sc)
6320 {
6321 	struct vmbus_channel **subchans;
6322 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6323 	int i, error = 0;
6324 
6325 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6326 
6327 	/* Attach the sub-channels. */
6328 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6329 	for (i = 0; i < subchan_cnt; ++i) {
6330 		int error1;
6331 
6332 		error1 = hn_chan_attach(sc, subchans[i]);
6333 		if (error1) {
6334 			error = error1;
6335 			/* Move on; all channels will be detached later. */
6336 		}
6337 	}
6338 	vmbus_subchan_rel(subchans, subchan_cnt);
6339 
6340 	if (error) {
6341 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6342 	} else {
6343 		if (bootverbose) {
6344 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6345 			    subchan_cnt);
6346 		}
6347 	}
6348 	return (error);
6349 }
6350 
6351 static void
6352 hn_detach_allchans(struct hn_softc *sc)
6353 {
6354 	struct vmbus_channel **subchans;
6355 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6356 	int i;
6357 
6358 	if (subchan_cnt == 0)
6359 		goto back;
6360 
6361 	/* Detach the sub-channels. */
6362 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6363 	for (i = 0; i < subchan_cnt; ++i)
6364 		hn_chan_detach(sc, subchans[i]);
6365 	vmbus_subchan_rel(subchans, subchan_cnt);
6366 
6367 back:
6368 	/*
6369 	 * Detach the primary channel, _after_ all sub-channels
6370 	 * are detached.
6371 	 */
6372 	hn_chan_detach(sc, sc->hn_prichan);
6373 
6374 	/* Wait for sub-channels to be destroyed, if any. */
6375 	vmbus_subchan_drain(sc->hn_prichan);
6376 
6377 #ifdef INVARIANTS
6378 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6379 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6380 		    HN_RX_FLAG_ATTACHED) == 0,
6381 		    ("%dth RX ring is still attached", i));
6382 	}
6383 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6384 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6385 		    HN_TX_FLAG_ATTACHED) == 0,
6386 		    ("%dth TX ring is still attached", i));
6387 	}
6388 #endif
6389 }
6390 
6391 static int
6392 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6393 {
6394 	struct vmbus_channel **subchans;
6395 	int nchan, rxr_cnt, error;
6396 
6397 	nchan = *nsubch + 1;
6398 	if (nchan == 1) {
6399 		/*
6400 		 * Multiple RX/TX rings are not requested.
6401 		 */
6402 		*nsubch = 0;
6403 		return (0);
6404 	}
6405 
6406 	/*
6407 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6408 	 * table entries.
6409 	 */
6410 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6411 	if (error) {
6412 		/* No RSS; this is benign. */
6413 		*nsubch = 0;
6414 		return (0);
6415 	}
6416 	if (bootverbose) {
6417 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6418 		    rxr_cnt, nchan);
6419 	}
6420 
6421 	if (nchan > rxr_cnt)
6422 		nchan = rxr_cnt;
6423 	if (nchan == 1) {
6424 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6425 		*nsubch = 0;
6426 		return (0);
6427 	}
6428 
6429 	/*
6430 	 * Allocate sub-channels from NVS.
6431 	 */
6432 	*nsubch = nchan - 1;
6433 	error = hn_nvs_alloc_subchans(sc, nsubch);
6434 	if (error || *nsubch == 0) {
6435 		/* Failed to allocate sub-channels. */
6436 		*nsubch = 0;
6437 		return (0);
6438 	}
6439 
6440 	/*
6441 	 * Wait for all sub-channels to become ready before moving on.
6442 	 */
6443 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6444 	vmbus_subchan_rel(subchans, *nsubch);
6445 	return (0);
6446 }
6447 
6448 static bool
6449 hn_synth_attachable(const struct hn_softc *sc)
6450 {
6451 	int i;
6452 
6453 	if (sc->hn_flags & HN_FLAG_ERRORS)
6454 		return (false);
6455 
6456 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6457 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6458 
6459 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6460 			return (false);
6461 	}
6462 	return (true);
6463 }
6464 
6465 /*
6466  * Make sure that the RX filter is zero after the successful
6467  * RNDIS initialization.
6468  *
6469  * NOTE:
6470  * Under certain conditions on certain versions of Hyper-V,
6471  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6472  * after the successful RNDIS initialization, which breaks
6473  * the assumption of any following code (well, it breaks the
6474  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6475  * explicitly, drain packets sneaking through, and drain the
6476  * interrupt taskqueues scheduled due to the stealth packets.
6477  */
6478 static void
6479 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6480 {
6481 
6482 	hn_disable_rx(sc);
6483 	hn_drain_rxtx(sc, nchan);
6484 }
6485 
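/*
 * Attach the synthetic parts, i.e. the primary channel, NVS and RNDIS,
 * then allocate and attach the sub-channels and configure RSS.  On
 * failure, whatever has been attached is rolled back.
 */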
6486 static int
6487 hn_synth_attach(struct hn_softc *sc, int mtu)
6488 {
6489 #define ATTACHED_NVS		0x0002
6490 #define ATTACHED_RNDIS		0x0004
6491 
6492 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6493 	int error, nsubch, nchan = 1, i, rndis_inited;
6494 	uint32_t old_caps, attached = 0;
6495 
6496 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6497 	    ("synthetic parts were attached"));
6498 
6499 	if (!hn_synth_attachable(sc))
6500 		return (ENXIO);
6501 
6502 	/* Save capabilities for later verification. */
6503 	old_caps = sc->hn_caps;
6504 	sc->hn_caps = 0;
6505 
6506 	/* Clear RSS state. */
6507 	sc->hn_rss_ind_size = 0;
6508 	sc->hn_rss_hash = 0;
6509 	sc->hn_rss_hcap = 0;
6510 
6511 	/*
6512 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6513 	 */
6514 	error = hn_chan_attach(sc, sc->hn_prichan);
6515 	if (error)
6516 		goto failed;
6517 
6518 	/*
6519 	 * Attach NVS.
6520 	 */
6521 	error = hn_nvs_attach(sc, mtu);
6522 	if (error)
6523 		goto failed;
6524 	attached |= ATTACHED_NVS;
6525 
6526 	/*
6527 	 * Attach RNDIS _after_ NVS is attached.
6528 	 */
6529 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6530 	if (rndis_inited)
6531 		attached |= ATTACHED_RNDIS;
6532 	if (error)
6533 		goto failed;
6534 
6535 	/*
6536 	 * Make sure capabilities are not changed.
6537 	 */
6538 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6539 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6540 		    old_caps, sc->hn_caps);
6541 		error = ENXIO;
6542 		goto failed;
6543 	}
6544 
6545 	/*
6546 	 * Allocate sub-channels for multi-TX/RX rings.
6547 	 *
6548 	 * NOTE:
6549 	 * The # of RX rings that can be used is equivalent to the # of
6550 	 * channels to be requested.
6551 	 */
6552 	nsubch = sc->hn_rx_ring_cnt - 1;
6553 	error = hn_synth_alloc_subchans(sc, &nsubch);
6554 	if (error)
6555 		goto failed;
6556 	/* NOTE: _Full_ synthetic parts detach is required now. */
6557 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6558 
6559 	/*
6560 	 * Set the # of TX/RX rings that could be used according to
6561 	 * the # of channels that NVS offered.
6562 	 */
6563 	nchan = nsubch + 1;
6564 	hn_set_ring_inuse(sc, nchan);
6565 	if (nchan == 1) {
6566 		/* Only the primary channel can be used; done */
6567 		goto back;
6568 	}
6569 
6570 	/*
6571 	 * Attach the sub-channels.
6572 	 *
6573 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6574 	 */
6575 	error = hn_attach_subchans(sc);
6576 	if (error)
6577 		goto failed;
6578 
6579 	/*
6580 	 * Configure RSS key and indirect table _after_ all sub-channels
6581 	 * are attached.
6582 	 */
6583 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6584 		/*
6585 		 * RSS key is not set yet; set it to the default RSS key.
6586 		 */
6587 		if (bootverbose)
6588 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6589 #ifdef RSS
6590 		rss_getkey(rss->rss_key);
6591 #else
6592 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6593 #endif
6594 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6595 	}
6596 
6597 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6598 		/*
6599 		 * RSS indirect table is not set yet; set it up in round-
6600 		 * robin fashion.
6601 		 */
6602 		if (bootverbose) {
6603 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6604 			    "table\n");
6605 		}
6606 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6607 			uint32_t subidx;
6608 
6609 #ifdef RSS
6610 			subidx = rss_get_indirection_to_bucket(i);
6611 #else
6612 			subidx = i;
6613 #endif
6614 			rss->rss_ind[i] = subidx % nchan;
6615 		}
6616 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6617 	} else {
6618 		/*
6619 		 * # of usable channels may be changed, so we have to
6620 		 * make sure that all entries in RSS indirect table
6621 		 * are valid.
6622 		 *
6623 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6624 		 */
6625 		hn_rss_ind_fixup(sc);
6626 	}
6627 
6628 	sc->hn_rss_hash = sc->hn_rss_hcap;
6629 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6630 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6631 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6632 		hn_vf_rss_fixup(sc, false);
6633 	}
6634 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6635 	if (error)
6636 		goto failed;
6637 back:
6638 	/*
6639 	 * Fixup transmission aggregation setup.
6640 	 */
6641 	hn_set_txagg(sc);
6642 	hn_rndis_init_fixat(sc, nchan);
6643 	return (0);
6644 
6645 failed:
6646 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6647 		hn_rndis_init_fixat(sc, nchan);
6648 		hn_synth_detach(sc);
6649 	} else {
6650 		if (attached & ATTACHED_RNDIS) {
6651 			hn_rndis_init_fixat(sc, nchan);
6652 			hn_rndis_detach(sc);
6653 		}
6654 		if (attached & ATTACHED_NVS)
6655 			hn_nvs_detach(sc);
6656 		hn_chan_detach(sc, sc->hn_prichan);
6657 		/* Restore old capabilities. */
6658 		sc->hn_caps = old_caps;
6659 	}
6660 	return (error);
6661 
6662 #undef ATTACHED_RNDIS
6663 #undef ATTACHED_NVS
6664 }
6665 
6666 /*
6667  * NOTE:
6668  * The interface must have been suspended through hn_suspend(), before
6669  * this function gets called.
6670  */
6671 static void
6672 hn_synth_detach(struct hn_softc *sc)
6673 {
6674 
6675 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6676 	    ("synthetic parts were not attached"));
6677 
6678 	/* Detach the RNDIS first. */
6679 	hn_rndis_detach(sc);
6680 
6681 	/* Detach NVS. */
6682 	hn_nvs_detach(sc);
6683 
6684 	/* Detach all of the channels. */
6685 	hn_detach_allchans(sc);
6686 
6687 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6688 		/*
6689 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6690 		 */
6691 		int error;
6692 
6693 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6694 		    sc->hn_rxbuf_gpadl);
6695 		if (error) {
6696 			if_printf(sc->hn_ifp,
6697 			    "rxbuf gpadl disconn failed: %d\n", error);
6698 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6699 		}
6700 		sc->hn_rxbuf_gpadl = 0;
6701 	}
6702 
6703 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6704 		/*
6705 		 * Host is post-Win2016, disconnect chimney sending buffer from
6706 		 * primary channel here.
6707 		 */
6708 		int error;
6709 
6710 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6711 		    sc->hn_chim_gpadl);
6712 		if (error) {
6713 			if_printf(sc->hn_ifp,
6714 			    "chim gpadl disconn failed: %d\n", error);
6715 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6716 		}
6717 		sc->hn_chim_gpadl = 0;
6718 	}
6719 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6720 }
6721 
6722 static void
6723 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6724 {
6725 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6726 	    ("invalid ring count %d", ring_cnt));
6727 
6728 	if (sc->hn_tx_ring_cnt > ring_cnt)
6729 		sc->hn_tx_ring_inuse = ring_cnt;
6730 	else
6731 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6732 	sc->hn_rx_ring_inuse = ring_cnt;
6733 
6734 #ifdef RSS
6735 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6736 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6737 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6738 		    rss_getnumbuckets());
6739 	}
6740 #endif
6741 
6742 	if (bootverbose) {
6743 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6744 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6745 	}
6746 }
6747 
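/*
 * Wait until the channel's RX bufring is empty (and its TX bufring too,
 * unless the primary channel has been revoked), then drain the
 * channel's interrupt taskqueue.
 */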
6748 static void
6749 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6750 {
6751 
6752 	/*
6753 	 * NOTE:
6754 	 * The TX bufring will not be drained by the hypervisor,
6755 	 * if the primary channel is revoked.
6756 	 */
6757 	while (!vmbus_chan_rx_empty(chan) ||
6758 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6759 	     !vmbus_chan_tx_empty(chan)))
6760 		pause("waitch", 1);
6761 	vmbus_chan_intr_drain(chan);
6762 }
6763 
6764 static void
6765 hn_disable_rx(struct hn_softc *sc)
6766 {
6767 
6768 	/*
6769 	 * Disable RX by clearing RX filter forcefully.
6770 	 */
6771 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6772 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6773 
6774 	/*
6775 	 * Give RNDIS enough time to flush all pending data packets.
6776 	 */
6777 	pause("waitrx", (200 * hz) / 1000);
6778 }
6779 
6780 /*
6781  * NOTE:
6782  * RX/TX _must_ have been suspended/disabled, before this function
6783  * is called.
6784  */
6785 static void
6786 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6787 {
6788 	struct vmbus_channel **subch = NULL;
6789 	int nsubch;
6790 
6791 	/*
6792 	 * Drain RX/TX bufrings and interrupts.
6793 	 */
6794 	nsubch = nchan - 1;
6795 	if (nsubch > 0)
6796 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6797 
6798 	if (subch != NULL) {
6799 		int i;
6800 
6801 		for (i = 0; i < nsubch; ++i)
6802 			hn_chan_drain(sc, subch[i]);
6803 	}
6804 	hn_chan_drain(sc, sc->hn_prichan);
6805 
6806 	if (subch != NULL)
6807 		vmbus_subchan_rel(subch, nsubch);
6808 }
6809 
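/*
 * Suspend all TX rings, wait for pending sends to complete, disable RX,
 * then drain the bufrings, interrupts and pending TX tasks.
 */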
6810 static void
6811 hn_suspend_data(struct hn_softc *sc)
6812 {
6813 	struct hn_tx_ring *txr;
6814 	int i;
6815 
6816 	HN_LOCK_ASSERT(sc);
6817 
6818 	/*
6819 	 * Suspend TX.
6820 	 */
6821 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6822 		txr = &sc->hn_tx_ring[i];
6823 
6824 		mtx_lock(&txr->hn_tx_lock);
6825 		txr->hn_suspended = 1;
6826 		mtx_unlock(&txr->hn_tx_lock);
6827 		/* No one is able send more packets now. */
6828 		/* No one is able to send more packets now. */
6829 		/*
6830 		 * Wait for all pending sends to finish.
6831 		 *
6832 		 * NOTE:
6833 		 * We will _not_ receive all pending send-done, if the
6834 		 * primary channel is revoked.
6835 		 */
6836 		while (hn_tx_ring_pending(txr) &&
6837 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6838 			pause("hnwtx", 1 /* 1 tick */);
6839 	}
6840 
6841 	/*
6842 	 * Disable RX.
6843 	 */
6844 	hn_disable_rx(sc);
6845 
6846 	/*
6847 	 * Drain RX/TX.
6848 	 */
6849 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6850 
6851 	/*
6852 	 * Drain any pending TX tasks.
6853 	 *
6854 	 * NOTE:
6855 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6856 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6857 	 */
6858 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6859 		txr = &sc->hn_tx_ring[i];
6860 
6861 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6862 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6863 	}
6864 }
6865 
6866 static void
6867 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6868 {
6869 
6870 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6871 }
6872 
6873 static void
6874 hn_suspend_mgmt(struct hn_softc *sc)
6875 {
6876 	struct task task;
6877 
6878 	HN_LOCK_ASSERT(sc);
6879 
6880 	/*
6881 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6882 	 * through hn_mgmt_taskq.
6883 	 */
6884 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6885 	vmbus_chan_run_task(sc->hn_prichan, &task);
6886 
6887 	/*
6888 	 * Make sure that all pending management tasks are completed.
6889 	 */
6890 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6891 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6892 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6893 }
6894 
6895 static void
6896 hn_suspend(struct hn_softc *sc)
6897 {
6898 
6899 	/* Disable polling. */
6900 	hn_polling(sc, 0);
6901 
6902 	/*
6903 	 * If the non-transparent mode VF is activated, the synthetic
6904 	 * device is receiving packets, so the data path of the
6905 	 * synthetic device must be suspended.
6906 	 */
6907 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6908 	    (sc->hn_flags & HN_FLAG_RXVF))
6909 		hn_suspend_data(sc);
6910 	hn_suspend_mgmt(sc);
6911 }
6912 
6913 static void
6914 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6915 {
6916 	int i;
6917 
6918 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6919 	    ("invalid TX ring count %d", tx_ring_cnt));
6920 
6921 	for (i = 0; i < tx_ring_cnt; ++i) {
6922 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6923 
6924 		mtx_lock(&txr->hn_tx_lock);
6925 		txr->hn_suspended = 0;
6926 		mtx_unlock(&txr->hn_tx_lock);
6927 	}
6928 }
6929 
6930 static void
6931 hn_resume_data(struct hn_softc *sc)
6932 {
6933 	int i;
6934 
6935 	HN_LOCK_ASSERT(sc);
6936 
6937 	/*
6938 	 * Re-enable RX.
6939 	 */
6940 	hn_rxfilter_config(sc);
6941 
6942 	/*
6943 	 * Make sure to clear suspend status on "all" TX rings,
6944 	 * since hn_tx_ring_inuse can be changed after
6945 	 * hn_suspend_data().
6946 	 */
6947 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6948 
6949 #ifdef HN_IFSTART_SUPPORT
6950 	if (!hn_use_if_start)
6951 #endif
6952 	{
6953 		/*
6954 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6955 		 * reduced.
6956 		 */
6957 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6958 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6959 	}
6960 
6961 	/*
6962 	 * Kick start TX.
6963 	 */
6964 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6965 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6966 
6967 		/*
6968 		 * Use txeof task, so that any pending oactive can be
6969 		 * cleared properly.
6970 		 */
6971 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6972 	}
6973 }
6974 
6975 static void
6976 hn_resume_mgmt(struct hn_softc *sc)
6977 {
6978 
6979 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6980 
6981 	/*
6982 	 * Kick off network change detection, if it was pending.
6983 	 * If no network change was pending, start link status
6984 	 * checks, which is more lightweight than network change
6985 	 * detection.
6986 	 */
6987 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6988 		hn_change_network(sc);
6989 	else
6990 		hn_update_link_status(sc);
6991 }
6992 
6993 static void
6994 hn_resume(struct hn_softc *sc)
6995 {
6996 
6997 	/*
6998 	 * If the non-transparent mode VF is activated, the synthetic
6999 	 * device has to receive packets, so the data path of the
7000 	 * synthetic device must be resumed.
7001 	 */
7002 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7003 	    (sc->hn_flags & HN_FLAG_RXVF))
7004 		hn_resume_data(sc);
7005 
7006 	/*
7007 	 * Don't resume link status change if VF is attached/activated.
7008 	 * - In the non-transparent VF mode, the synthetic device marks
7009 	 *   link down until the VF is deactivated; i.e. VF is down.
7010 	 * - In transparent VF mode, VF's media status is used until
7011 	 *   the VF is detached.
7012 	 */
7013 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7014 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7015 		hn_resume_mgmt(sc);
7016 
7017 	/*
7018 	 * Re-enable polling if this interface is running and
7019 	 * the polling is requested.
7020 	 */
7021 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7022 		hn_polling(sc, sc->hn_pollhz);
7023 }
7024 
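/*
 * Handle an RNDIS status (indication) message: update the link status
 * on media connect/disconnect and kick off network change detection on
 * RNDIS_STATUS_NETWORK_CHANGE.
 */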
7025 static void
7026 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7027 {
7028 	const struct rndis_status_msg *msg;
7029 	int ofs;
7030 
7031 	if (dlen < sizeof(*msg)) {
7032 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7033 		return;
7034 	}
7035 	msg = data;
7036 
7037 	switch (msg->rm_status) {
7038 	case RNDIS_STATUS_MEDIA_CONNECT:
7039 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7040 		hn_update_link_status(sc);
7041 		break;
7042 
7043 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7044 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7045 		/* Not really useful; ignore. */
7046 		break;
7047 
7048 	case RNDIS_STATUS_NETWORK_CHANGE:
7049 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7050 		if (dlen < ofs + msg->rm_stbuflen ||
7051 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7052 			if_printf(sc->hn_ifp, "network changed\n");
7053 		} else {
7054 			uint32_t change;
7055 
7056 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7057 			    sizeof(change));
7058 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7059 			    change);
7060 		}
7061 		hn_change_network(sc);
7062 		break;
7063 
7064 	default:
7065 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7066 		    msg->rm_status);
7067 		break;
7068 	}
7069 }
7070 
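/*
 * Walk the RNDIS per-packet-info list and record pointers to the VLAN,
 * checksum, hash value/info and pktinfo-id entries in 'info'.  Returns
 * EINVAL if the pktinfo list is malformed.
 */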
7071 static int
7072 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7073 {
7074 	const struct rndis_pktinfo *pi = info_data;
7075 	uint32_t mask = 0;
7076 
7077 	while (info_dlen != 0) {
7078 		const void *data;
7079 		uint32_t dlen;
7080 
7081 		if (__predict_false(info_dlen < sizeof(*pi)))
7082 			return (EINVAL);
7083 		if (__predict_false(info_dlen < pi->rm_size))
7084 			return (EINVAL);
7085 		info_dlen -= pi->rm_size;
7086 
7087 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7088 			return (EINVAL);
7089 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7090 			return (EINVAL);
7091 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7092 		data = pi->rm_data;
7093 
7094 		if (pi->rm_internal == 1) {
7095 			switch (pi->rm_type) {
7096 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7097 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7098 					return (EINVAL);
7099 				info->pktinfo_id =
7100 				    (const struct packet_info_id *)data;
7101 				mask |= HN_RXINFO_PKTINFO_ID;
7102 				break;
7103 
7104 			default:
7105 				goto next;
7106 			}
7107 		} else {
7108 			switch (pi->rm_type) {
7109 			case NDIS_PKTINFO_TYPE_VLAN:
7110 				if (__predict_false(dlen
7111 				    < NDIS_VLAN_INFO_SIZE))
7112 					return (EINVAL);
7113 				info->vlan_info = (const uint32_t *)data;
7114 				mask |= HN_RXINFO_VLAN;
7115 				break;
7116 
7117 			case NDIS_PKTINFO_TYPE_CSUM:
7118 				if (__predict_false(dlen
7119 				    < NDIS_RXCSUM_INFO_SIZE))
7120 					return (EINVAL);
7121 				info->csum_info = (const uint32_t *)data;
7122 				mask |= HN_RXINFO_CSUM;
7123 				break;
7124 
7125 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7126 				if (__predict_false(dlen
7127 				    < HN_NDIS_HASH_VALUE_SIZE))
7128 					return (EINVAL);
7129 				info->hash_value = (const uint32_t *)data;
7130 				mask |= HN_RXINFO_HASHVAL;
7131 				break;
7132 
7133 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7134 				if (__predict_false(dlen
7135 				    < HN_NDIS_HASH_INFO_SIZE))
7136 					return (EINVAL);
7137 				info->hash_info = (const uint32_t *)data;
7138 				mask |= HN_RXINFO_HASHINF;
7139 				break;
7140 
7141 			default:
7142 				goto next;
7143 			}
7144 		}
7145 
7146 		if (mask == HN_RXINFO_ALL) {
7147 			/* All found; done */
7148 			break;
7149 		}
7150 next:
7151 		pi = (const struct rndis_pktinfo *)
7152 		    ((const uint8_t *)pi + pi->rm_size);
7153 	}
7154 
7155 	/*
7156 	 * Final fixup.
7157 	 * - If there is no hash value, invalidate the hash info.
7158 	 */
7159 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7160 		info->hash_info = NULL;
7161 	return (0);
7162 }
7163 
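/*
 * Return true if the two ranges [off, off + len) and
 * [check_off, check_off + check_len) overlap.
 */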
7164 static __inline bool
7165 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7166 {
7167 
7168 	if (off < check_off) {
7169 		if (__predict_true(off + len <= check_off))
7170 			return (false);
7171 	} else if (off > check_off) {
7172 		if (__predict_true(check_off + check_len <= off))
7173 			return (false);
7174 	}
7175 	return (true);
7176 }
7177 
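/*
 * Append one RSC fragment to the RX ring's aggregation state; the
 * first fragment also latches the packet's VLAN/checksum/hash info.
 */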
7178 static __inline void
7179 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7180 		uint32_t len, struct hn_rxinfo *info)
7181 {
7182 	uint32_t cnt = rxr->rsc.cnt;
7183 
7184 	if (cnt) {
7185 		rxr->rsc.pktlen += len;
7186 	} else {
7187 		rxr->rsc.vlan_info = info->vlan_info;
7188 		rxr->rsc.csum_info = info->csum_info;
7189 		rxr->rsc.hash_info = info->hash_info;
7190 		rxr->rsc.hash_value = info->hash_value;
7191 		rxr->rsc.pktlen = len;
7192 	}
7193 
7194 	rxr->rsc.frag_data[cnt] = data;
7195 	rxr->rsc.frag_len[cnt] = len;
7196 	rxr->rsc.cnt++;
7197 }
7198 
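/*
 * Validate an RNDIS data message (length, offsets, OOB and pktinfo
 * coverage), collect its per-packet-info, and either queue it as an
 * RSC fragment or deliver the completed packet through hn_rxpkt().
 */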
7199 static void
7200 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7201 {
7202 	const struct rndis_packet_msg *pkt;
7203 	struct hn_rxinfo info;
7204 	int data_off, pktinfo_off, data_len, pktinfo_len;
7205 	bool rsc_more = false;
7206 
7207 	/*
7208 	 * Check length.
7209 	 */
7210 	if (__predict_false(dlen < sizeof(*pkt))) {
7211 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7212 		return;
7213 	}
7214 	pkt = data;
7215 
7216 	if (__predict_false(dlen < pkt->rm_len)) {
7217 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7218 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7219 		return;
7220 	}
7221 	if (__predict_false(pkt->rm_len <
7222 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7223 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7224 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7225 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7226 		    pkt->rm_pktinfolen);
7227 		return;
7228 	}
7229 	if (__predict_false(pkt->rm_datalen == 0)) {
7230 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7231 		return;
7232 	}
7233 
7234 	/*
7235 	 * Check offsets.
7236 	 */
7237 #define IS_OFFSET_INVALID(ofs)			\
7238 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7239 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7240 
7241 	/* XXX Hyper-V does not meet data offset alignment requirement */
7242 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7243 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7244 		    "data offset %u\n", pkt->rm_dataoffset);
7245 		return;
7246 	}
7247 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7248 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7249 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7250 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7251 		return;
7252 	}
7253 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7254 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7255 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7256 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7257 		return;
7258 	}
7259 
7260 #undef IS_OFFSET_INVALID
7261 
7262 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7263 	data_len = pkt->rm_datalen;
7264 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7265 	pktinfo_len = pkt->rm_pktinfolen;
7266 
7267 	/*
7268 	 * Check OOB coverage.
7269 	 */
7270 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7271 		int oob_off, oob_len;
7272 
7273 		if_printf(rxr->hn_ifp, "got oobdata\n");
7274 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7275 		oob_len = pkt->rm_oobdatalen;
7276 
7277 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7278 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7279 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7280 			    pkt->rm_len, oob_off, oob_len);
7281 			return;
7282 		}
7283 
7284 		/*
7285 		 * Check against data.
7286 		 */
7287 		if (hn_rndis_check_overlap(oob_off, oob_len,
7288 		    data_off, data_len)) {
7289 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7290 			    "oob overlaps data, oob abs %d len %d, "
7291 			    "data abs %d len %d\n",
7292 			    oob_off, oob_len, data_off, data_len);
7293 			return;
7294 		}
7295 
7296 		/*
7297 		 * Check against pktinfo.
7298 		 */
7299 		if (pktinfo_len != 0 &&
7300 		    hn_rndis_check_overlap(oob_off, oob_len,
7301 		    pktinfo_off, pktinfo_len)) {
7302 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7303 			    "oob overlaps pktinfo, oob abs %d len %d, "
7304 			    "pktinfo abs %d len %d\n",
7305 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7306 			return;
7307 		}
7308 	}
7309 
7310 	/*
7311 	 * Check per-packet-info coverage and find useful per-packet-info.
7312 	 */
7313 	info.vlan_info = NULL;
7314 	info.csum_info = NULL;
7315 	info.hash_info = NULL;
7316 	info.pktinfo_id = NULL;
7317 
7318 	if (__predict_true(pktinfo_len != 0)) {
7319 		bool overlap;
7320 		int error;
7321 
7322 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7323 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7324 			    "pktinfo overflow, msglen %u, "
7325 			    "pktinfo abs %d len %d\n",
7326 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7327 			return;
7328 		}
7329 
7330 		/*
7331 		 * Check packet info coverage.
7332 		 */
7333 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7334 		    data_off, data_len);
7335 		if (__predict_false(overlap)) {
7336 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7337 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7338 			    "data abs %d len %d\n",
7339 			    pktinfo_off, pktinfo_len, data_off, data_len);
7340 			return;
7341 		}
7342 
7343 		/*
7344 		 * Find useful per-packet-info.
7345 		 */
7346 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7347 		    pktinfo_len, &info);
7348 		if (__predict_false(error)) {
7349 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7350 			    "pktinfo\n");
7351 			return;
7352 		}
7353 	}
7354 
7355 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7356 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7357 		    "data overflow, msglen %u, data abs %d len %d\n",
7358 		    pkt->rm_len, data_off, data_len);
7359 		return;
7360 	}
7361 
7362 	/* Identify RSC fragments, drop invalid packets */
7363 	if ((info.pktinfo_id != NULL) &&
7364 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7365 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7366 			rxr->rsc.cnt = 0;
7367 			rxr->hn_rsc_pkts++;
7368 		} else if (rxr->rsc.cnt == 0)
7369 			goto drop;
7370 
7371 		rsc_more = true;
7372 
7373 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7374 			rsc_more = false;
7375 
7376 		if (rsc_more && rxr->rsc.is_last)
7377 			goto drop;
7378 	} else {
7379 		rxr->rsc.cnt = 0;
7380 	}
7381 
7382 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7383 		goto drop;
7384 
7385 	/* Store data in per rx ring structure */
7386 	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7387 	    data_len, &info);
7388 
7389 	if (rsc_more)
7390 		return;
7391 
7392 	hn_rxpkt(rxr);
7393 	rxr->rsc.cnt = 0;
7394 	return;
7395 drop:
7396 	rxr->hn_rsc_drop++;
7397 	return;
7398 }
7399 
7400 static __inline void
7401 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7402 {
7403 	const struct rndis_msghdr *hdr;
7404 
7405 	if (__predict_false(dlen < sizeof(*hdr))) {
7406 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7407 		return;
7408 	}
7409 	hdr = data;
7410 
7411 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7412 		/* Hot data path. */
7413 		hn_rndis_rx_data(rxr, data, dlen);
7414 		/* Done! */
7415 		return;
7416 	}
7417 
7418 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7419 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7420 	else
7421 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7422 }
7423 
7424 static void
7425 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7426 {
7427 	const struct hn_nvs_hdr *hdr;
7428 
7429 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7430 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7431 		return;
7432 	}
7433 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7434 
7435 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7436 		/* Useless; ignore */
7437 		return;
7438 	}
7439 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7440 }
7441 
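/*
 * Handle a send completion; the transaction id carries the sendctx,
 * whose callback completes the transaction and may free the sendctx.
 */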
7442 static void
7443 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7444     const struct vmbus_chanpkt_hdr *pkt)
7445 {
7446 	struct hn_nvs_sendctx *sndc;
7447 
7448 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7449 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7450 	    VMBUS_CHANPKT_DATALEN(pkt));
7451 	/*
7452 	 * NOTE:
7453 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7454 	 * its callback.
7455 	 */
7456 }
7457 
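/*
 * Handle an RXBUF channel packet: validate the embedded NVS/RNDIS
 * headers, pass each receive buffer range to the RNDIS layer, then
 * ack the RXBUF back to the host.
 */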
7458 static void
7459 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7460     const struct vmbus_chanpkt_hdr *pkthdr)
7461 {
7462 	const struct vmbus_chanpkt_rxbuf *pkt;
7463 	const struct hn_nvs_hdr *nvs_hdr;
7464 	int count, i, hlen;
7465 
7466 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7467 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7468 		return;
7469 	}
7470 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7471 
7472 	/* Make sure that this is a RNDIS message. */
7473 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7474 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7475 		    nvs_hdr->nvs_type);
7476 		return;
7477 	}
7478 
7479 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7480 	if (__predict_false(hlen < sizeof(*pkt))) {
7481 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7482 		return;
7483 	}
7484 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7485 
7486 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7487 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7488 		    pkt->cp_rxbuf_id);
7489 		return;
7490 	}
7491 
7492 	count = pkt->cp_rxbuf_cnt;
7493 	if (__predict_false(hlen <
7494 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7495 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7496 		return;
7497 	}
7498 
7499 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7500 	for (i = 0; i < count; ++i) {
7501 		int ofs, len;
7502 
7503 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7504 		len = pkt->cp_rxbuf[i].rb_len;
7505 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7506 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7507 			    "ofs %d, len %d\n", i, ofs, len);
7508 			continue;
7509 		}
7510 
7511 		rxr->rsc.is_last = (i == (count - 1));
7512 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7513 	}
7514 
7515 	/*
7516 	 * Ack the consumed RXBUF associated w/ this channel packet,
7517 	 * so that this RXBUF can be recycled by the hypervisor.
7518 	 */
7519 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7520 }
7521 
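/*
 * Ack the RXBUF identified by the transaction id back to the host so
 * that it can be recycled; retry a few times if the bufring is full.
 */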
7522 static void
7523 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7524     uint64_t tid)
7525 {
7526 	struct hn_nvs_rndis_ack ack;
7527 	int retries, error;
7528 
7529 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7530 	ack.nvs_status = HN_NVS_STATUS_OK;
7531 
7532 	retries = 0;
7533 again:
7534 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7535 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7536 	if (__predict_false(error == EAGAIN)) {
7537 		/*
7538 		 * NOTE:
7539 		 * This should _not_ happen in real world, since the
7540 		 * consumption of the TX bufring from the TX path is
7541 		 * controlled.
7542 		 */
7543 		if (rxr->hn_ack_failed == 0)
7544 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7545 		rxr->hn_ack_failed++;
7546 		retries++;
7547 		if (retries < 10) {
7548 			DELAY(100);
7549 			goto again;
7550 		}
7551 		/* RXBUF leaks! */
7552 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7553 	}
7554 }
7555 
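/*
 * Per-channel receive callback: drain all pending channel packets,
 * growing the packet buffer on demand, and dispatch them by type.
 */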
7556 static void
7557 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7558 {
7559 	struct hn_rx_ring *rxr = xrxr;
7560 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7561 
7562 	for (;;) {
7563 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7564 		int error, pktlen;
7565 
7566 		pktlen = rxr->hn_pktbuf_len;
7567 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7568 		if (__predict_false(error == ENOBUFS)) {
7569 			void *nbuf;
7570 			int nlen;
7571 
7572 			/*
7573 			 * Expand channel packet buffer.
7574 			 *
7575 			 * XXX
7576 			 * Use M_WAITOK here, since allocation failure
7577 			 * is fatal.
7578 			 */
7579 			nlen = rxr->hn_pktbuf_len * 2;
7580 			while (nlen < pktlen)
7581 				nlen *= 2;
7582 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7583 
7584 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7585 			    rxr->hn_pktbuf_len, nlen);
7586 
7587 			free(rxr->hn_pktbuf, M_DEVBUF);
7588 			rxr->hn_pktbuf = nbuf;
7589 			rxr->hn_pktbuf_len = nlen;
7590 			/* Retry! */
7591 			continue;
7592 		} else if (__predict_false(error == EAGAIN)) {
7593 			/* No more channel packets; done! */
7594 			break;
7595 		}
7596 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7597 
7598 		switch (pkt->cph_type) {
7599 		case VMBUS_CHANPKT_TYPE_COMP:
7600 			hn_nvs_handle_comp(sc, chan, pkt);
7601 			break;
7602 
7603 		case VMBUS_CHANPKT_TYPE_RXBUF:
7604 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7605 			break;
7606 
7607 		case VMBUS_CHANPKT_TYPE_INBAND:
7608 			hn_nvs_handle_notify(sc, pkt);
7609 			break;
7610 
7611 		default:
7612 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7613 			    pkt->cph_type);
7614 			break;
7615 		}
7616 	}
7617 	hn_chan_rollup(rxr, rxr->hn_txr);
7618 }
7619 
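/*
 * Driver-wide initialization: allocate global counters, sanitize
 * tunables, set up the VF map, and create the global TX taskqueues
 * when applicable.
 */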
7620 static void
7621 hn_sysinit(void *arg __unused)
7622 {
7623 	int i;
7624 
7625 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7626 
7627 #ifdef HN_IFSTART_SUPPORT
7628 	/*
7629 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7630 	 * mainly due to the IFF_DRV_OACTIVE flag.
7631 	 */
7632 	if (hn_xpnt_vf && hn_use_if_start) {
7633 		hn_use_if_start = 0;
7634 		printf("hn: tranparent VF mode, if_transmit will be used, "
7635 		    "instead of if_start\n");
7636 	}
7637 #endif
7638 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7639 		printf("hn: invalid transparent VF attach routing "
7640 		    "wait timeout %d, reset to %d\n",
7641 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7642 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7643 	}
7644 
7645 	/*
7646 	 * Initialize VF map.
7647 	 */
7648 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7649 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7650 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7651 	    M_WAITOK | M_ZERO);
7652 
7653 	/*
7654 	 * Fix the # of TX taskqueues.
7655 	 */
7656 	if (hn_tx_taskq_cnt <= 0)
7657 		hn_tx_taskq_cnt = 1;
7658 	else if (hn_tx_taskq_cnt > mp_ncpus)
7659 		hn_tx_taskq_cnt = mp_ncpus;
7660 
7661 	/*
7662 	 * Fix the TX taskqueue mode.
7663 	 */
7664 	switch (hn_tx_taskq_mode) {
7665 	case HN_TX_TASKQ_M_INDEP:
7666 	case HN_TX_TASKQ_M_GLOBAL:
7667 	case HN_TX_TASKQ_M_EVTTQ:
7668 		break;
7669 	default:
7670 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7671 		break;
7672 	}
7673 
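	/*
	 * Global TX taskqueues are only needed when running on Hyper-V
	 * with the global TX taskqueue mode selected.
	 */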
7674 	if (vm_guest != VM_GUEST_HV)
7675 		return;
7676 
7677 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7678 		return;
7679 
7680 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7681 	    M_DEVBUF, M_WAITOK);
7682 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7683 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7684 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7685 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7686 		    "hn tx%d", i);
7687 	}
7688 }
7689 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7690 
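/*
 * Driver-wide teardown: release the global TX taskqueues, the VF map,
 * and the counters allocated in hn_sysinit().
 */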
7691 static void
7692 hn_sysuninit(void *arg __unused)
7693 {
7694 
7695 	if (hn_tx_taskque != NULL) {
7696 		int i;
7697 
7698 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7699 			taskqueue_free(hn_tx_taskque[i]);
7700 		free(hn_tx_taskque, M_DEVBUF);
7701 	}
7702 
7703 	if (hn_vfmap != NULL)
7704 		free(hn_vfmap, M_DEVBUF);
7705 	rm_destroy(&hn_vfmap_lock);
7706 
7707 	counter_u64_free(hn_udpcs_fixup);
7708 }
7709 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7710