xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision e2afbc45258f2fa4bdcf126e959ac660e76fc802)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 #include "opt_hn.h"
57 #include "opt_inet6.h"
58 #include "opt_inet.h"
59 #include "opt_rss.h"
60 
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/bus.h>
64 #include <sys/counter.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/proc.h>
73 #include <sys/rmlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 #include <sys/epoch.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_extern.h>
88 #include <vm/pmap.h>
89 
90 #include <machine/atomic.h>
91 #include <machine/in_cksum.h>
92 
93 #include <net/bpf.h>
94 #include <net/ethernet.h>
95 #include <net/if.h>
96 #include <net/if_dl.h>
97 #include <net/if_media.h>
98 #include <net/if_types.h>
99 #include <net/if_var.h>
100 #include <net/rndis.h>
101 #include <net/rss_config.h>
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
124 #define HN_IFSTART_SUPPORT
125 
126 #define HN_RING_CNT_DEF_MAX		8
127 
128 #define HN_VFMAP_SIZE_DEF		8
129 
130 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
131 
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT			512
134 
135 #define HN_RNDIS_PKT_LEN					\
136 	(sizeof(struct rndis_packet_msg) +			\
137 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
143 
144 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
149 
150 #define HN_DIRECT_TX_SIZE_DEF		128
151 
152 #define HN_EARLY_TXEOF_THRESH		8
153 
154 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
155 
156 #define HN_LROENT_CNT_DEF		128
157 
158 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
162 
163 #define HN_LRO_ACKCNT_DEF		1
164 
165 #define HN_LOCK_INIT(sc)		\
166 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc)					\
170 do {							\
171 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
172 		/* Relinquish cpu to avoid deadlock */	\
173 		sched_relinquish(curthread);		\
174 		DELAY(1000);				\
175 	}						\
176 } while (0)
177 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
179 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc)		\
182 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc)	\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185 
186 #define HN_PKTSIZE_MIN(align)		\
187 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 	    HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align)		\
190 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
191 
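/*
 * Map a ring/channel index to the CPU that will service it: with the
 * kernel RSS option the ring is bound to the CPU backing the matching
 * RSS bucket; otherwise rings are distributed round-robin starting
 * from the device's base CPU (sc->hn_cpu).
 */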
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
197 
198 struct hn_txdesc {
199 #ifndef HN_USE_TXDESC_BUFRING
200 	SLIST_ENTRY(hn_txdesc)		link;
201 #endif
202 	STAILQ_ENTRY(hn_txdesc)		agg_link;
203 
204 	/* Aggregated txdescs, in sending order. */
205 	STAILQ_HEAD(, hn_txdesc)	agg_list;
206 
207 	/* The oldest packet, if transmission aggregation happens. */
208 	struct mbuf			*m;
209 	struct hn_tx_ring		*txr;
210 	int				refs;
211 	uint32_t			flags;	/* HN_TXD_FLAG_ */
212 	struct hn_nvs_sendctx		send_ctx;
213 	uint32_t			chim_index;
214 	int				chim_size;
215 
216 	bus_dmamap_t			data_dmap;
217 
218 	bus_addr_t			rndis_pkt_paddr;
219 	struct rndis_packet_msg		*rndis_pkt;
220 	bus_dmamap_t			rndis_pkt_dmap;
221 };
222 
223 #define HN_TXD_FLAG_ONLIST		0x0001
224 #define HN_TXD_FLAG_DMAMAP		0x0002
225 #define HN_TXD_FLAG_ONAGG		0x0004
226 
227 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
228 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
229 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
231 struct packet_info_id {
232 	uint8_t				ver;
233 	uint8_t				flag;
234 	uint16_t			pkt_id;
235 };
236 
237 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
240 struct hn_rxinfo {
241 	const uint32_t			*vlan_info;
242 	const uint32_t			*csum_info;
243 	const uint32_t			*hash_info;
244 	const uint32_t			*hash_value;
245 	const struct packet_info_id	*pktinfo_id;
246 };
247 
248 struct hn_rxvf_setarg {
249 	struct hn_rx_ring	*rxr;
250 	if_t			vf_ifp;
251 };
252 
253 #define HN_RXINFO_VLAN			0x0001
254 #define HN_RXINFO_CSUM			0x0002
255 #define HN_RXINFO_HASHINF		0x0004
256 #define HN_RXINFO_HASHVAL		0x0008
257 #define HN_RXINFO_PKTINFO_ID		0x0010
258 #define HN_RXINFO_ALL			\
259 	(HN_RXINFO_VLAN |		\
260 	 HN_RXINFO_CSUM |		\
261 	 HN_RXINFO_HASHINF |		\
262 	 HN_RXINFO_HASHVAL |		\
263 	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(if_t, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(if_t);
276 #endif
277 static int			hn_transmit(if_t, struct mbuf *);
278 static void			hn_xmit_qflush(if_t);
279 static int			hn_ifmedia_upd(if_t);
280 static void			hn_ifmedia_sts(if_t,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, if_t, int);
284 static void			hn_ifaddr_event(void *, if_t);
285 static void			hn_ifnet_attevent(void *, if_t);
286 static void			hn_ifnet_detevent(void *, if_t);
287 static void			hn_ifnet_lnkevent(void *, if_t, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const if_t);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    if_t, bool);
293 static void			hn_rxvf_set(struct hn_softc *, if_t);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
340 #ifndef RSS
341 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
343 #endif
344 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
346 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
358 
359 static void			hn_stop(struct hn_softc *, bool);
360 static void			hn_init_locked(struct hn_softc *);
361 static int			hn_chan_attach(struct hn_softc *,
362 				    struct vmbus_channel *);
363 static void			hn_chan_detach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static int			hn_attach_subchans(struct hn_softc *);
366 static void			hn_detach_allchans(struct hn_softc *);
367 static void			hn_chan_rollup(struct hn_rx_ring *,
368 				    struct hn_tx_ring *);
369 static void			hn_set_ring_inuse(struct hn_softc *, int);
370 static int			hn_synth_attach(struct hn_softc *, int);
371 static void			hn_synth_detach(struct hn_softc *);
372 static int			hn_synth_alloc_subchans(struct hn_softc *,
373 				    int *);
374 static bool			hn_synth_attachable(const struct hn_softc *);
375 static void			hn_suspend(struct hn_softc *);
376 static void			hn_suspend_data(struct hn_softc *);
377 static void			hn_suspend_mgmt(struct hn_softc *);
378 static void			hn_resume(struct hn_softc *);
379 static void			hn_resume_data(struct hn_softc *);
380 static void			hn_resume_mgmt(struct hn_softc *);
381 static void			hn_suspend_mgmt_taskfunc(void *, int);
382 static void			hn_chan_drain(struct hn_softc *,
383 				    struct vmbus_channel *);
384 static void			hn_disable_rx(struct hn_softc *);
385 static void			hn_drain_rxtx(struct hn_softc *, int);
386 static void			hn_polling(struct hn_softc *, u_int);
387 static void			hn_chan_polling(struct vmbus_channel *, u_int);
388 static void			hn_mtu_change_fixup(struct hn_softc *);
389 
390 static void			hn_update_link_status(struct hn_softc *);
391 static void			hn_change_network(struct hn_softc *);
392 static void			hn_link_taskfunc(void *, int);
393 static void			hn_netchg_init_taskfunc(void *, int);
394 static void			hn_netchg_status_taskfunc(void *, int);
395 static void			hn_link_status(struct hn_softc *);
396 
397 static int			hn_create_rx_data(struct hn_softc *, int);
398 static void			hn_destroy_rx_data(struct hn_softc *);
399 static int			hn_check_iplen(const struct mbuf *, int);
400 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
401 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
402 static int			hn_rxfilter_config(struct hn_softc *);
403 static int			hn_rss_reconfig(struct hn_softc *);
404 static void			hn_rss_ind_fixup(struct hn_softc *);
405 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
406 static int			hn_rxpkt(struct hn_rx_ring *);
407 static uint32_t			hn_rss_type_fromndis(uint32_t);
408 static uint32_t			hn_rss_type_tondis(uint32_t);
409 
410 static int			hn_tx_ring_create(struct hn_softc *, int);
411 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
412 static int			hn_create_tx_data(struct hn_softc *, int);
413 static void			hn_fixup_tx_data(struct hn_softc *);
414 static void			hn_fixup_rx_data(struct hn_softc *);
415 static void			hn_destroy_tx_data(struct hn_softc *);
416 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
417 static void			hn_txdesc_gc(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_encap(if_t, struct hn_tx_ring *,
420 				    struct hn_txdesc *, struct mbuf **);
421 static int			hn_txpkt(if_t, struct hn_tx_ring *,
422 				    struct hn_txdesc *);
423 static void			hn_set_chim_size(struct hn_softc *, int);
424 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
425 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
426 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
427 static void			hn_resume_tx(struct hn_softc *, int);
428 static void			hn_set_txagg(struct hn_softc *);
429 static void			*hn_try_txagg(if_t,
430 				    struct hn_tx_ring *, struct hn_txdesc *,
431 				    int);
432 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
433 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
434 				    struct hn_softc *, struct vmbus_channel *,
435 				    const void *, int);
436 static int			hn_txpkt_sglist(struct hn_tx_ring *,
437 				    struct hn_txdesc *);
438 static int			hn_txpkt_chim(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_xmit(struct hn_tx_ring *, int);
441 static void			hn_xmit_taskfunc(void *, int);
442 static void			hn_xmit_txeof(struct hn_tx_ring *);
443 static void			hn_xmit_txeof_taskfunc(void *, int);
444 #ifdef HN_IFSTART_SUPPORT
445 static int			hn_start_locked(struct hn_tx_ring *, int);
446 static void			hn_start_taskfunc(void *, int);
447 static void			hn_start_txeof(struct hn_tx_ring *);
448 static void			hn_start_txeof_taskfunc(void *, int);
449 #endif
450 
451 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
452 
453 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
454     "Hyper-V network interface");
455 
456 /* Trust tcp segment verification on host side. */
457 static int			hn_trust_hosttcp = 1;
458 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
459     &hn_trust_hosttcp, 0,
460     "Trust tcp segment verification on host side, "
461     "when csum info is missing (global setting)");
462 
463 /* Trust udp datagram verification on host side. */
464 static int			hn_trust_hostudp = 1;
465 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
466     &hn_trust_hostudp, 0,
467     "Trust udp datagram verification on host side, "
468     "when csum info is missing (global setting)");
469 
470 /* Trust ip packet verification on host side. */
471 static int			hn_trust_hostip = 1;
472 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
473     &hn_trust_hostip, 0,
474     "Trust ip packet verification on host side, "
475     "when csum info is missing (global setting)");
476 
477 /*
478  * Offload UDP/IPv4 checksum.
479  */
480 static int			hn_enable_udp4cs = 1;
481 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
482     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
483 
484 /*
485  * Offload UDP/IPv6 checksum.
486  */
487 static int			hn_enable_udp6cs = 1;
488 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
489     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
490 
491 /* Stats. */
492 static counter_u64_t		hn_udpcs_fixup;
493 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
494     &hn_udpcs_fixup, "# of UDP checksum fixups");
495 
496 /*
497  * See hn_set_hlen().
498  *
499  * This value is for Azure.  For Hyper-V, set this above
500  * 65536 to disable UDP datagram checksum fixup.
501  */
502 static int			hn_udpcs_fixup_mtu = 1420;
503 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
504     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
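
/*
 * Illustrative usage, not part of the driver: since the OID above is
 * declared under _hw_hn with CTLFLAG_RWTUN, it is reachable as the
 * hw.hn.udpcs_fixup_mtu sysctl/tunable.  On plain Hyper-V (per the note
 * above, any value above 65536 disables the fixup) one might use, e.g.:
 *
 *   # sysctl hw.hn.udpcs_fixup_mtu=65537              (at runtime)
 *   # echo 'hw.hn.udpcs_fixup_mtu=65537' >> /boot/loader.conf
 *
 * The hw.hn.udpcs_fixup counter above can then be watched to confirm
 * that no further UDP datagrams are being checksummed in software.
 */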
505 
506 /* Limit TSO burst size */
507 static int			hn_tso_maxlen = IP_MAXPACKET;
508 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
509     &hn_tso_maxlen, 0, "TSO burst limit");
510 
511 /* Limit chimney send size */
512 static int			hn_tx_chimney_size = 0;
513 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
514     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
515 
516 /* Limit the size of packet for direct transmission */
517 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
518 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
519     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
520 
521 /* # of LRO entries per RX ring */
522 #if defined(INET) || defined(INET6)
523 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
524 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
525     &hn_lro_entry_count, 0, "LRO entry count");
526 #endif
527 
528 static int			hn_tx_taskq_cnt = 1;
529 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
530     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
531 
532 #define HN_TX_TASKQ_M_INDEP	0
533 #define HN_TX_TASKQ_M_GLOBAL	1
534 #define HN_TX_TASKQ_M_EVTTQ	2
535 
536 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
538     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
539     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
540 
541 #ifndef HN_USE_TXDESC_BUFRING
542 static int			hn_use_txdesc_bufring = 0;
543 #else
544 static int			hn_use_txdesc_bufring = 1;
545 #endif
546 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
547     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
548 
549 #ifdef HN_IFSTART_SUPPORT
550 /* Use ifnet.if_start instead of ifnet.if_transmit */
551 static int			hn_use_if_start = 0;
552 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
553     &hn_use_if_start, 0, "Use if_start TX method");
554 #endif
555 
556 /* # of channels to use */
557 static int			hn_chan_cnt = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
559     &hn_chan_cnt, 0,
560     "# of channels to use; each channel has one RX ring and one TX ring");
561 
562 /* # of transmit rings to use */
563 static int			hn_tx_ring_cnt = 0;
564 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
565     &hn_tx_ring_cnt, 0, "# of TX rings to use");
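
/*
 * Illustrative tuning sketch, not part of the driver: both OIDs above are
 * CTLFLAG_RDTUN, so they can only be set as loader tunables, e.g. in
 * /boot/loader.conf:
 *
 *   hw.hn.chan_cnt=4        # cap the number of channels (RX/TX ring pairs)
 *   hw.hn.tx_ring_cnt=2     # use fewer TX rings than channels
 *
 * The defaults of 0 leave the choice of counts to the driver.
 */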
566 
567 /* Software TX ring depth */
568 static int			hn_tx_swq_depth = 0;
569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
570     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
571 
572 /* Depth of the per-channel mbuf queue; a non-zero value enables sorted LRO */
573 static u_int			hn_lro_mbufq_depth = 0;
574 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
575     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
576 
577 /* Packet transmission aggregation size limit */
578 static int			hn_tx_agg_size = -1;
579 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
580     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
581 
582 /* Packet transmission aggregation count limit */
583 static int			hn_tx_agg_pkts = -1;
584 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
585     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
586 
587 /* VF list */
588 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
589     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
590     hn_vflist_sysctl, "A",
591     "VF list");
592 
593 /* VF mapping */
594 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
595     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
596     hn_vfmap_sysctl, "A",
597     "VF mapping");
598 
599 /* Transparent VF */
600 static int			hn_xpnt_vf = 1;
601 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
602     &hn_xpnt_vf, 0, "Transparent VF mod");
603 
604 /* Accurate BPF support for Transparent VF */
605 static int			hn_xpnt_vf_accbpf = 0;
606 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
607     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
608 
609 /* Extra wait for the transparent VF attach routine; unit: seconds. */
610 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
612     &hn_xpnt_vf_attwait, 0,
613     "Extra wait for transparent VF attach routing; unit: seconds");
614 
615 static u_int			hn_cpu_index;	/* next CPU for channel */
616 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
617 
618 static struct rmlock		hn_vfmap_lock;
619 static int			hn_vfmap_size;
620 static if_t			*hn_vfmap;
621 
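/*
 * Class ID of the Hyper-V synthetic network VMBus device:
 * f8615163-df3e-46c5-913f-f2d2f965ed0e (stored below in its
 * little-endian on-the-wire byte order).
 */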
622 static const struct hyperv_guid	hn_guid = {
623 	.hv_guid = {
624 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
625 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
626 };
627 
628 static device_method_t hn_methods[] = {
629 	/* Device interface */
630 	DEVMETHOD(device_probe,		hn_probe),
631 	DEVMETHOD(device_attach,	hn_attach),
632 	DEVMETHOD(device_detach,	hn_detach),
633 	DEVMETHOD(device_shutdown,	hn_shutdown),
634 	DEVMETHOD_END
635 };
636 
637 static driver_t hn_driver = {
638 	"hn",
639 	hn_methods,
640 	sizeof(struct hn_softc)
641 };
642 
643 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
647 static void
648 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
649 {
650 	int i;
651 
652 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
653 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
654 }
655 
656 static int
657 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
658 {
659 
660 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
661 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
662 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
663 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
664 }
665 
666 static int
667 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
668 {
669 	struct hn_nvs_rndis rndis;
670 
671 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size > 0, ("invalid rndis chim txd"));
673 
674 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
675 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
676 	rndis.nvs_chim_idx = txd->chim_index;
677 	rndis.nvs_chim_sz = txd->chim_size;
678 
679 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
680 	    &rndis, sizeof(rndis), &txd->send_ctx));
681 }
682 
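/*
 * Allocate a free chimney (send buffer) slot by scanning the shared
 * allocation bitmap.  ffsl() finds the first clear bit in each word and
 * atomic_testandset_long() claims it without taking any lock; if another
 * CPU claims that bit first, the scan simply moves on to the next word.
 * Returns HN_NVS_CHIM_IDX_INVALID when no slot is available.
 */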
683 static __inline uint32_t
684 hn_chim_alloc(struct hn_softc *sc)
685 {
686 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
687 	u_long *bmap = sc->hn_chim_bmap;
688 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
689 
690 	for (i = 0; i < bmap_cnt; ++i) {
691 		int idx;
692 
693 		idx = ffsl(~bmap[i]);
694 		if (idx == 0)
695 			continue;
696 
697 		--idx; /* ffsl is 1-based */
698 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
699 		    ("invalid i %d and idx %d", i, idx));
700 
701 		if (atomic_testandset_long(&bmap[i], idx))
702 			continue;
703 
704 		ret = i * LONG_BIT + idx;
705 		break;
706 	}
707 	return (ret);
708 }
709 
710 static __inline void
711 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
712 {
713 	u_long mask;
714 	uint32_t idx;
715 
716 	idx = chim_idx / LONG_BIT;
717 	KASSERT(idx < sc->hn_chim_bmap_cnt,
718 	    ("invalid chimney index 0x%x", chim_idx));
719 
720 	mask = 1UL << (chim_idx % LONG_BIT);
721 	KASSERT(sc->hn_chim_bmap[idx] & mask,
722 	    ("index bitmap 0x%lx, chimney index %u, "
723 	     "bitmap idx %d, bitmask 0x%lx",
724 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
725 
726 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
727 }
728 
729 #if defined(INET6) || defined(INET)
730 
731 #define PULLUP_HDR(m, len)				\
732 do {							\
733 	if (__predict_false((m)->m_len < (len))) {	\
734 		(m) = m_pullup((m), (len));		\
735 		if ((m) == NULL)			\
736 			return (NULL);			\
737 	}						\
738 } while (0)
739 
740 /*
741  * NOTE: If this function fails, m_head will be freed.
742  */
743 static __inline struct mbuf *
744 hn_tso_fixup(struct mbuf *m_head)
745 {
746 	struct ether_vlan_header *evl;
747 	struct tcphdr *th;
748 	int ehlen;
749 
750 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
751 
752 	PULLUP_HDR(m_head, sizeof(*evl));
753 	evl = mtod(m_head, struct ether_vlan_header *);
754 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
755 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
756 	else
757 		ehlen = ETHER_HDR_LEN;
758 	m_head->m_pkthdr.l2hlen = ehlen;
759 
760 #ifdef INET
761 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
762 		struct ip *ip;
763 		int iphlen;
764 
765 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
766 		ip = mtodo(m_head, ehlen);
767 		iphlen = ip->ip_hl << 2;
768 		m_head->m_pkthdr.l3hlen = iphlen;
769 
770 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
771 		th = mtodo(m_head, ehlen + iphlen);
772 
773 		ip->ip_len = 0;
774 		ip->ip_sum = 0;
775 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
776 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
777 	}
778 #endif
779 #if defined(INET6) && defined(INET)
780 	else
781 #endif
782 #ifdef INET6
783 	{
784 		struct ip6_hdr *ip6;
785 
786 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
787 		ip6 = mtodo(m_head, ehlen);
788 		if (ip6->ip6_nxt != IPPROTO_TCP) {
789 			m_freem(m_head);
790 			return (NULL);
791 		}
792 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
793 
794 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
795 		th = mtodo(m_head, ehlen + sizeof(*ip6));
796 
797 		ip6->ip6_plen = 0;
798 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
799 	}
800 #endif
801 	return (m_head);
802 }
803 
804 /*
805  * NOTE: If this function fails, m_head will be freed.
806  */
807 static __inline struct mbuf *
808 hn_set_hlen(struct mbuf *m_head)
809 {
810 	const struct ether_vlan_header *evl;
811 	int ehlen;
812 
813 	PULLUP_HDR(m_head, sizeof(*evl));
814 	evl = mtod(m_head, const struct ether_vlan_header *);
815 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
816 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
817 	else
818 		ehlen = ETHER_HDR_LEN;
819 	m_head->m_pkthdr.l2hlen = ehlen;
820 
821 #ifdef INET
822 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
823 		const struct ip *ip;
824 		int iphlen;
825 
826 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
827 		ip = mtodo(m_head, ehlen);
828 		iphlen = ip->ip_hl << 2;
829 		m_head->m_pkthdr.l3hlen = iphlen;
830 
831 		/*
832 		 * UDP checksum offload does not work in Azure if both of the
833 		 * following conditions are met:
834 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
835 		 * - IP_DF is not set in the IP hdr.
836 		 *
837 		 * Fall back to software checksum for these UDP datagrams.
838 		 */
839 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
840 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
841 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
842 			uint16_t off = ehlen + iphlen;
843 
844 			counter_u64_add(hn_udpcs_fixup, 1);
845 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
846 			*(uint16_t *)(m_head->m_data + off +
847                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
848 			    m_head, m_head->m_pkthdr.len, off);
849 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
850 		}
851 	}
852 #endif
853 #if defined(INET6) && defined(INET)
854 	else
855 #endif
856 #ifdef INET6
857 	{
858 		const struct ip6_hdr *ip6;
859 
860 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
861 		ip6 = mtodo(m_head, ehlen);
862 		if (ip6->ip6_nxt != IPPROTO_TCP &&
863 		    ip6->ip6_nxt != IPPROTO_UDP) {
864 			m_freem(m_head);
865 			return (NULL);
866 		}
867 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
868 	}
869 #endif
870 	return (m_head);
871 }
872 
873 /*
874  * NOTE: If this function fails, m_head will be freed.
875  */
876 static __inline struct mbuf *
877 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
878 {
879 	const struct tcphdr *th;
880 	int ehlen, iphlen;
881 
882 	*tcpsyn = 0;
883 	ehlen = m_head->m_pkthdr.l2hlen;
884 	iphlen = m_head->m_pkthdr.l3hlen;
885 
886 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
887 	th = mtodo(m_head, ehlen + iphlen);
888 	if (tcp_get_flags(th) & TH_SYN)
889 		*tcpsyn = 1;
890 	return (m_head);
891 }
892 
893 #undef PULLUP_HDR
894 
895 #endif	/* INET6 || INET */
896 
897 static int
898 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
899 {
900 	int error = 0;
901 
902 	HN_LOCK_ASSERT(sc);
903 
904 	if (sc->hn_rx_filter != filter) {
905 		error = hn_rndis_set_rxfilter(sc, filter);
906 		if (!error)
907 			sc->hn_rx_filter = filter;
908 	}
909 	return (error);
910 }
911 
912 static int
913 hn_rxfilter_config(struct hn_softc *sc)
914 {
915 	if_t ifp = sc->hn_ifp;
916 	uint32_t filter;
917 
918 	HN_LOCK_ASSERT(sc);
919 
920 	/*
921 	 * If the non-transparent mode VF is activated, we don't know how
922 	 * its RX filter is configured, so stick the synthetic device in
923 	 * promiscuous mode.
924 	 */
925 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
926 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
927 	} else {
928 		filter = NDIS_PACKET_TYPE_DIRECTED;
929 		if (if_getflags(ifp) & IFF_BROADCAST)
930 			filter |= NDIS_PACKET_TYPE_BROADCAST;
931 		/* TODO: support multicast list */
932 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
933 		    !if_maddr_empty(ifp))
934 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
935 	}
936 	return (hn_set_rxfilter(sc, filter));
937 }
938 
939 static void
940 hn_set_txagg(struct hn_softc *sc)
941 {
942 	uint32_t size, pkts;
943 	int i;
944 
945 	/*
946 	 * Set up the aggregation size.
947 	 */
948 	if (sc->hn_agg_size < 0)
949 		size = UINT32_MAX;
950 	else
951 		size = sc->hn_agg_size;
952 
953 	if (sc->hn_rndis_agg_size < size)
954 		size = sc->hn_rndis_agg_size;
955 
956 	/* NOTE: We only aggregate packets using chimney sending buffers. */
957 	if (size > (uint32_t)sc->hn_chim_szmax)
958 		size = sc->hn_chim_szmax;
959 
960 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
961 		/* Disable */
962 		size = 0;
963 		pkts = 0;
964 		goto done;
965 	}
966 
967 	/* NOTE: Type of the per TX ring setting is 'int'. */
968 	if (size > INT_MAX)
969 		size = INT_MAX;
970 
971 	/*
972 	 * Set up the aggregation packet count.
973 	 */
974 	if (sc->hn_agg_pkts < 0)
975 		pkts = UINT32_MAX;
976 	else
977 		pkts = sc->hn_agg_pkts;
978 
979 	if (sc->hn_rndis_agg_pkts < pkts)
980 		pkts = sc->hn_rndis_agg_pkts;
981 
982 	if (pkts <= 1) {
983 		/* Disable */
984 		size = 0;
985 		pkts = 0;
986 		goto done;
987 	}
988 
989 	/* NOTE: Type of the per TX ring setting is 'short'. */
990 	if (pkts > SHRT_MAX)
991 		pkts = SHRT_MAX;
992 
993 done:
994 	/* NOTE: Type of the per TX ring setting is 'short'. */
995 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
996 		/* Disable */
997 		size = 0;
998 		pkts = 0;
999 	}
1000 
1001 	if (bootverbose) {
1002 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1003 		    size, pkts, sc->hn_rndis_agg_align);
1004 	}
1005 
1006 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1007 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1008 
1009 		mtx_lock(&txr->hn_tx_lock);
1010 		txr->hn_agg_szmax = size;
1011 		txr->hn_agg_pktmax = pkts;
1012 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1013 		mtx_unlock(&txr->hn_tx_lock);
1014 	}
1015 }
1016 
1017 static int
1018 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1019 {
1020 
1021 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1022 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1023 		return txr->hn_txdesc_cnt;
1024 	return hn_tx_swq_depth;
1025 }
1026 
1027 static int
1028 hn_rss_reconfig(struct hn_softc *sc)
1029 {
1030 	int error;
1031 
1032 	HN_LOCK_ASSERT(sc);
1033 
1034 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1035 		return (ENXIO);
1036 
1037 	/*
1038 	 * Disable RSS first.
1039 	 *
1040 	 * NOTE:
1041 	 * Direct reconfiguration by setting the UNCHG flags does
1042 	 * _not_ work properly.
1043 	 */
1044 	if (bootverbose)
1045 		if_printf(sc->hn_ifp, "disable RSS\n");
1046 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1047 	if (error) {
1048 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1049 		return (error);
1050 	}
1051 
1052 	/*
1053 	 * Re-enable RSS with the updated RSS key or indirect
1054 	 * table.
1055 	 */
1056 	if (bootverbose)
1057 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1058 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1059 	if (error) {
1060 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1061 		return (error);
1062 	}
1063 	return (0);
1064 }
1065 
1066 static void
1067 hn_rss_ind_fixup(struct hn_softc *sc)
1068 {
1069 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1070 	int i, nchan;
1071 
1072 	nchan = sc->hn_rx_ring_inuse;
1073 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1074 
1075 	/*
1076 	 * Check indirect table to make sure that all channels in it
1077 	 * can be used.
1078 	 */
1079 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1080 		if (rss->rss_ind[i] >= nchan) {
1081 			if_printf(sc->hn_ifp,
1082 			    "RSS indirect table %d fixup: %u -> %d\n",
1083 			    i, rss->rss_ind[i], nchan - 1);
1084 			rss->rss_ind[i] = nchan - 1;
1085 		}
1086 	}
1087 }
1088 
1089 static int
1090 hn_ifmedia_upd(if_t ifp __unused)
1091 {
1092 
1093 	/* Ignore since autoselect is the only defined and valid media */
1094 	return (0);
1095 }
1096 
1097 static void
1098 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1099 {
1100 	struct hn_softc *sc = if_getsoftc(ifp);
1101 
1102 	ifmr->ifm_status = IFM_AVALID;
1103 	ifmr->ifm_active = IFM_ETHER;
1104 
1105 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1106 		ifmr->ifm_active |= IFM_NONE;
1107 		return;
1108 	}
1109 	ifmr->ifm_status |= IFM_ACTIVE;
1110 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1111 }
1112 
1113 static void
1114 hn_rxvf_set_task(void *xarg, int pending __unused)
1115 {
1116 	struct hn_rxvf_setarg *arg = xarg;
1117 
1118 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1119 }
1120 
1121 static void
1122 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1123 {
1124 	struct hn_rx_ring *rxr;
1125 	struct hn_rxvf_setarg arg;
1126 	struct task task;
1127 	int i;
1128 
1129 	HN_LOCK_ASSERT(sc);
1130 
1131 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1132 
1133 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1134 		rxr = &sc->hn_rx_ring[i];
1135 
1136 		if (i < sc->hn_rx_ring_inuse) {
1137 			arg.rxr = rxr;
1138 			arg.vf_ifp = vf_ifp;
1139 			vmbus_chan_run_task(rxr->hn_chan, &task);
1140 		} else {
1141 			rxr->hn_rxvf_ifp = vf_ifp;
1142 		}
1143 	}
1144 }
1145 
1146 static bool
1147 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1148 {
1149 	if_t hn_ifp;
1150 
1151 	hn_ifp = sc->hn_ifp;
1152 
1153 	if (ifp == hn_ifp)
1154 		return (false);
1155 
1156 	if (if_getalloctype(ifp) != IFT_ETHER)
1157 		return (false);
1158 
1159 	/* Ignore lagg/vlan interfaces */
1160 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1161 	    strcmp(if_getdname(ifp), "vlan") == 0)
1162 		return (false);
1163 
1164 	/*
1165 	 * During detach events if_getifaddr(ifp) might be NULL.
1166 	 * Make sure the bcmp() below doesn't panic on that:
1167 	 */
1168 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1169 		return (false);
1170 
1171 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1172 		return (false);
1173 
1174 	return (true);
1175 }
1176 
1177 static void
1178 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1179 {
1180 	if_t hn_ifp;
1181 
1182 	HN_LOCK(sc);
1183 
1184 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1185 		goto out;
1186 
1187 	if (!hn_ismyvf(sc, ifp))
1188 		goto out;
1189 	hn_ifp = sc->hn_ifp;
1190 
1191 	if (rxvf) {
1192 		if (sc->hn_flags & HN_FLAG_RXVF)
1193 			goto out;
1194 
1195 		sc->hn_flags |= HN_FLAG_RXVF;
1196 		hn_rxfilter_config(sc);
1197 	} else {
1198 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1199 			goto out;
1200 
1201 		sc->hn_flags &= ~HN_FLAG_RXVF;
1202 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1203 			hn_rxfilter_config(sc);
1204 		else
1205 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1206 	}
1207 
1208 	hn_nvs_set_datapath(sc,
1209 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1210 
1211 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1212 
1213 	if (rxvf) {
1214 		hn_vf_rss_fixup(sc, true);
1215 		hn_suspend_mgmt(sc);
1216 		sc->hn_link_flags &=
1217 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1218 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1219 	} else {
1220 		hn_vf_rss_restore(sc);
1221 		hn_resume_mgmt(sc);
1222 	}
1223 
1224 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1225 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1226 
1227 	if (bootverbose) {
1228 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1229 		    rxvf ? "to" : "from", if_name(ifp));
1230 	}
1231 out:
1232 	HN_UNLOCK(sc);
1233 }
1234 
1235 static void
1236 hn_ifnet_event(void *arg, if_t ifp, int event)
1237 {
1238 
1239 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1240 		return;
1241 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1242 }
1243 
1244 static void
1245 hn_ifaddr_event(void *arg, if_t ifp)
1246 {
1247 
1248 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1249 }
1250 
1251 static int
1252 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused)
1253 {
1254 	if_t ifp, vf_ifp;
1255 
1256 	HN_LOCK_ASSERT(sc);
1257 	ifp = sc->hn_ifp;
1258 	vf_ifp = sc->hn_vf_ifp;
1259 
1260 	/*
1261 	 * Just sync up with VF's enabled capabilities.
1262 	 */
1263 	if_setcapenable(ifp, if_getcapenable(vf_ifp));
1264 	if_sethwassist(ifp, if_gethwassist(vf_ifp));
1265 
1266 	return (0);
1267 }
1268 
1269 static int
1270 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1271 {
1272 	if_t vf_ifp;
1273 	struct ifreq ifr;
1274 
1275 	HN_LOCK_ASSERT(sc);
1276 	vf_ifp = sc->hn_vf_ifp;
1277 
1278 	memset(&ifr, 0, sizeof(ifr));
1279 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1280 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1281 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1282 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1283 }
1284 
1285 static void
1286 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1287 {
1288 	if_t ifp = sc->hn_ifp;
1289 	int allmulti = 0;
1290 
1291 	HN_LOCK_ASSERT(sc);
1292 
1293 	/* XXX vlan(4) style mcast addr maintenance */
1294 	if (!if_maddr_empty(ifp))
1295 		allmulti = IFF_ALLMULTI;
1296 
1297 	/* Always set the VF's if_flags */
1298 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1299 }
1300 
1301 static void
1302 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1303 {
1304 	struct rm_priotracker pt;
1305 	if_t hn_ifp = NULL;
1306 	struct mbuf *mn;
1307 
1308 	/*
1309 	 * XXX racy, if hn(4) is ever detached.
1310 	 */
1311 	rm_rlock(&hn_vfmap_lock, &pt);
1312 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1313 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1314 	rm_runlock(&hn_vfmap_lock, &pt);
1315 
1316 	if (hn_ifp != NULL) {
1317 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1318 			/*
1319 			 * Allow tapping on the VF.
1320 			 */
1321 			ETHER_BPF_MTAP(vf_ifp, mn);
1322 
1323 			/*
1324 			 * Update VF stats.
1325 			 */
1326 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1327 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1328 				    mn->m_pkthdr.len);
1329 			}
1330 			/*
1331 			 * XXX IFCOUNTER_IMCAST
1332 			 * Updating this stat is rather invasive, since it
1333 			 * requires two checks on the mbuf: the length check
1334 			 * and the ethernet header check.  As of this writing,
1335 			 * all multicast packets go directly to hn(4), which
1336 			 * makes imcast stat updating in the VF pointless.
1337 			 */
1338 
1339 			/*
1340 			 * Fix up rcvif and increase hn(4)'s ipackets.
1341 			 */
1342 			mn->m_pkthdr.rcvif = hn_ifp;
1343 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1344 		}
1345 		/*
1346 		 * Go through hn(4)'s if_input.
1347 		 */
1348 		if_input(hn_ifp, m);
1349 	} else {
1350 		/*
1351 		 * In the middle of the transition; free this
1352 		 * mbuf chain.
1353 		 */
1354 		while (m != NULL) {
1355 			mn = m->m_nextpkt;
1356 			m->m_nextpkt = NULL;
1357 			m_freem(m);
1358 			m = mn;
1359 		}
1360 	}
1361 }
1362 
1363 static void
1364 hn_mtu_change_fixup(struct hn_softc *sc)
1365 {
1366 	if_t ifp;
1367 
1368 	HN_LOCK_ASSERT(sc);
1369 	ifp = sc->hn_ifp;
1370 
1371 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1372 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1373 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1374 }
1375 
1376 static uint32_t
1377 hn_rss_type_fromndis(uint32_t rss_hash)
1378 {
1379 	uint32_t types = 0;
1380 
1381 	if (rss_hash & NDIS_HASH_IPV4)
1382 		types |= RSS_TYPE_IPV4;
1383 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1384 		types |= RSS_TYPE_TCP_IPV4;
1385 	if (rss_hash & NDIS_HASH_IPV6)
1386 		types |= RSS_TYPE_IPV6;
1387 	if (rss_hash & NDIS_HASH_IPV6_EX)
1388 		types |= RSS_TYPE_IPV6_EX;
1389 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1390 		types |= RSS_TYPE_TCP_IPV6;
1391 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1392 		types |= RSS_TYPE_TCP_IPV6_EX;
1393 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1394 		types |= RSS_TYPE_UDP_IPV4;
1395 	return (types);
1396 }
1397 
1398 static uint32_t
1399 hn_rss_type_tondis(uint32_t types)
1400 {
1401 	uint32_t rss_hash = 0;
1402 
1403 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1404 	    ("UDP6 and UDP6EX are not supported"));
1405 
1406 	if (types & RSS_TYPE_IPV4)
1407 		rss_hash |= NDIS_HASH_IPV4;
1408 	if (types & RSS_TYPE_TCP_IPV4)
1409 		rss_hash |= NDIS_HASH_TCP_IPV4;
1410 	if (types & RSS_TYPE_IPV6)
1411 		rss_hash |= NDIS_HASH_IPV6;
1412 	if (types & RSS_TYPE_IPV6_EX)
1413 		rss_hash |= NDIS_HASH_IPV6_EX;
1414 	if (types & RSS_TYPE_TCP_IPV6)
1415 		rss_hash |= NDIS_HASH_TCP_IPV6;
1416 	if (types & RSS_TYPE_TCP_IPV6_EX)
1417 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1418 	if (types & RSS_TYPE_UDP_IPV4)
1419 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1420 	return (rss_hash);
1421 }
1422 
1423 static void
1424 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1425 {
1426 	int i;
1427 
1428 	HN_LOCK_ASSERT(sc);
1429 
1430 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1431 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1432 }
1433 
1434 static void
1435 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1436 {
1437 	if_t ifp, vf_ifp;
1438 	struct ifrsshash ifrh;
1439 	struct ifrsskey ifrk;
1440 	int error;
1441 	uint32_t my_types, diff_types, mbuf_types = 0;
1442 
1443 	HN_LOCK_ASSERT(sc);
1444 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1445 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1446 
1447 	if (sc->hn_rx_ring_inuse == 1) {
1448 		/* No RSS on synthetic parts; done. */
1449 		return;
1450 	}
1451 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1452 		/* Synthetic parts do not support Toeplitz; done. */
1453 		return;
1454 	}
1455 
1456 	ifp = sc->hn_ifp;
1457 	vf_ifp = sc->hn_vf_ifp;
1458 
1459 	/*
1460 	 * Extract the VF's RSS key.  Only the 40-byte Toeplitz key is
1461 	 * supported.
1462 	 */
1463 	memset(&ifrk, 0, sizeof(ifrk));
1464 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1465 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1466 	if (error) {
1467 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1468 		    if_name(vf_ifp), error);
1469 		goto done;
1470 	}
1471 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1472 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1473 		    if_name(vf_ifp), ifrk.ifrk_func);
1474 		goto done;
1475 	}
1476 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1477 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1478 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1479 		goto done;
1480 	}
1481 
1482 	/*
1483 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1484 	 */
1485 	memset(&ifrh, 0, sizeof(ifrh));
1486 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1487 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1488 	if (error) {
1489 		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
1490 		    if_name(vf_ifp), error);
1491 		goto done;
1492 	}
1493 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1494 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1495 		    if_name(vf_ifp), ifrh.ifrh_func);
1496 		goto done;
1497 	}
1498 
1499 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1500 	if ((ifrh.ifrh_types & my_types) == 0) {
1501 		/* This disables RSS; ignore it then */
1502 		if_printf(ifp, "%s intersection of RSS types failed.  "
1503 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1504 		    ifrh.ifrh_types, my_types);
1505 		goto done;
1506 	}
1507 
1508 	diff_types = my_types ^ ifrh.ifrh_types;
1509 	my_types &= ifrh.ifrh_types;
1510 	mbuf_types = my_types;
1511 
1512 	/*
1513 	 * Detect RSS hash value/type conflicts.
1514 	 *
1515 	 * NOTE:
1516 	 * We don't disable the hash type, but we stop delivering the
1517 	 * hash value/type through mbufs on the RX path.
1518 	 *
1519 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then the UDP 4-tuple
1520 	 * hash is delivered with a type of TCP_IPV4.  This means if
1521 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1522 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1523 	 * NICs implement TCP_IPV4, this will _not_ cause any issues
1524 	 * here.
1525 	 */
1526 	if ((my_types & RSS_TYPE_IPV4) &&
1527 	    (diff_types & ifrh.ifrh_types &
1528 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1529 		/* Conflict; disable IPV4 hash type/value delivery. */
1530 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1531 		mbuf_types &= ~RSS_TYPE_IPV4;
1532 	}
1533 	if ((my_types & RSS_TYPE_IPV6) &&
1534 	    (diff_types & ifrh.ifrh_types &
1535 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1536 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1537 	      RSS_TYPE_IPV6_EX))) {
1538 		/* Conflict; disable IPV6 hash type/value delivery. */
1539 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1540 		mbuf_types &= ~RSS_TYPE_IPV6;
1541 	}
1542 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1543 	    (diff_types & ifrh.ifrh_types &
1544 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1545 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1546 	      RSS_TYPE_IPV6))) {
1547 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1548 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1549 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1550 	}
1551 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1552 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1553 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1554 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1555 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1556 	}
1557 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1558 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1559 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1560 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1561 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1562 	}
1563 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1564 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1565 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1566 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1567 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1568 	}
1569 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1570 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1571 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1572 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1573 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1574 	}
1575 
1576 	/*
1577 	 * Indirect table does not matter.
1578 	 */
1579 
1580 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1581 	    hn_rss_type_tondis(my_types);
1582 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1583 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1584 
1585 	if (reconf) {
1586 		error = hn_rss_reconfig(sc);
1587 		if (error) {
1588 			/* XXX roll-back? */
1589 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1590 			/* XXX keep going. */
1591 		}
1592 	}
1593 done:
1594 	/* Hash deliverability for mbufs. */
1595 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1596 }
1597 
1598 static void
1599 hn_vf_rss_restore(struct hn_softc *sc)
1600 {
1601 
1602 	HN_LOCK_ASSERT(sc);
1603 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1604 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1605 
1606 	if (sc->hn_rx_ring_inuse == 1)
1607 		goto done;
1608 
1609 	/*
1610 	 * Restore hash types.  Key does _not_ matter.
1611 	 */
1612 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1613 		int error;
1614 
1615 		sc->hn_rss_hash = sc->hn_rss_hcap;
1616 		error = hn_rss_reconfig(sc);
1617 		if (error) {
1618 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1619 			    error);
1620 			/* XXX keep going. */
1621 		}
1622 	}
1623 done:
1624 	/* Hash deliverability for mbufs. */
1625 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1626 }
1627 
1628 static void
1629 hn_xpnt_vf_setready(struct hn_softc *sc)
1630 {
1631 	if_t ifp, vf_ifp;
1632 	struct ifreq ifr;
1633 
1634 	HN_LOCK_ASSERT(sc);
1635 	ifp = sc->hn_ifp;
1636 	vf_ifp = sc->hn_vf_ifp;
1637 
1638 	/*
1639 	 * Mark the VF ready.
1640 	 */
1641 	sc->hn_vf_rdytick = 0;
1642 
1643 	/*
1644 	 * Save information for restoration.
1645 	 */
1646 	sc->hn_saved_caps = if_getcapabilities(ifp);
1647 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1648 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1649 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1650 	sc->hn_saved_capenable = if_getcapenable(ifp);
1651 	sc->hn_saved_hwassist = if_gethwassist(ifp);
1652 
1653 	/*
1654 	 * Intersect supported/enabled capabilities.
1655 	 *
1656 	 * NOTE:
1657 	 * if_hwassist is not changed here.
1658 	 */
1659 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1660 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1661 
1662 	/*
1663 	 * Fix TSO settings.
1664 	 */
1665 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1666 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1667 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1668 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1669 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1670 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1671 
1672 	/*
1673 	 * Change VF's enabled capabilities.
1674 	 */
1675 	memset(&ifr, 0, sizeof(ifr));
1676 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1677 	ifr.ifr_reqcap = if_getcapenable(ifp);
1678 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1679 
1680 	if (if_getmtu(ifp) != ETHERMTU) {
1681 		int error;
1682 
1683 		/*
1684 		 * Change VF's MTU.
1685 		 */
1686 		memset(&ifr, 0, sizeof(ifr));
1687 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1688 		ifr.ifr_mtu = if_getmtu(ifp);
1689 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1690 		if (error) {
1691 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1692 			    if_name(vf_ifp), if_getmtu(ifp));
1693 			if (if_getmtu(ifp) > ETHERMTU) {
1694 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1695 
1696 				/*
1697 				 * XXX
1698 				 * No need to adjust the synthetic parts' MTU;
1699 				 * failure of the adjustment would cause us
1700 				 * endless headaches.
1701 				 */
1702 				if_setmtu(ifp, ETHERMTU);
1703 				hn_mtu_change_fixup(sc);
1704 			}
1705 		}
1706 	}
1707 }
1708 
1709 static bool
1710 hn_xpnt_vf_isready(struct hn_softc *sc)
1711 {
1712 
1713 	HN_LOCK_ASSERT(sc);
1714 
1715 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1716 		return (false);
1717 
1718 	if (sc->hn_vf_rdytick == 0)
1719 		return (true);
1720 
1721 	if (sc->hn_vf_rdytick > ticks)
1722 		return (false);
1723 
1724 	/* Mark VF as ready. */
1725 	hn_xpnt_vf_setready(sc);
1726 	return (true);
1727 }
1728 
1729 static void
1730 hn_xpnt_vf_setenable(struct hn_softc *sc)
1731 {
1732 	int i;
1733 
1734 	HN_LOCK_ASSERT(sc);
1735 
1736 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1737 	rm_wlock(&sc->hn_vf_lock);
1738 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1739 	rm_wunlock(&sc->hn_vf_lock);
1740 
1741 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1742 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1743 }
1744 
1745 static void
1746 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1747 {
1748 	int i;
1749 
1750 	HN_LOCK_ASSERT(sc);
1751 
1752 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1753 	rm_wlock(&sc->hn_vf_lock);
1754 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1755 	if (clear_vf)
1756 		sc->hn_vf_ifp = NULL;
1757 	rm_wunlock(&sc->hn_vf_lock);
1758 
1759 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1760 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1761 }
1762 
1763 static void
1764 hn_xpnt_vf_init(struct hn_softc *sc)
1765 {
1766 	int error;
1767 
1768 	HN_LOCK_ASSERT(sc);
1769 
1770 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1771 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1772 
1773 	if (bootverbose) {
1774 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1775 		    if_name(sc->hn_vf_ifp));
1776 	}
1777 
1778 	/*
1779 	 * Bring the VF up.
1780 	 */
1781 	hn_xpnt_vf_saveifflags(sc);
1782 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1783 	error = hn_xpnt_vf_iocsetflags(sc);
1784 	if (error) {
1785 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1786 		    if_name(sc->hn_vf_ifp), error);
1787 		return;
1788 	}
1789 
1790 	/*
1791 	 * NOTE:
1792 	 * Datapath setting must happen _after_ bringing the VF up.
1793 	 */
1794 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1795 
1796 	/*
1797 	 * NOTE:
1798 	 * Fixup RSS related bits _after_ the VF is brought up, since
1799 	 * many VFs generate their RSS key during initialization.
1800 	 */
1801 	hn_vf_rss_fixup(sc, true);
1802 
1803 	/* Mark transparent mode VF as enabled. */
1804 	hn_xpnt_vf_setenable(sc);
1805 }
1806 
1807 static void
1808 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1809 {
1810 	struct hn_softc *sc = xsc;
1811 
1812 	HN_LOCK(sc);
1813 
1814 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1815 		goto done;
1816 	if (sc->hn_vf_ifp == NULL)
1817 		goto done;
1818 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1819 		goto done;
1820 
1821 	if (sc->hn_vf_rdytick != 0) {
1822 		/* Mark VF as ready. */
1823 		hn_xpnt_vf_setready(sc);
1824 	}
1825 
1826 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1827 		/*
1828 		 * Delayed VF initialization.
1829 		 */
1830 		if (bootverbose) {
1831 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1832 			    if_name(sc->hn_vf_ifp));
1833 		}
1834 		hn_xpnt_vf_init(sc);
1835 	}
1836 done:
1837 	HN_UNLOCK(sc);
1838 }
1839 
1840 static void
1841 hn_ifnet_attevent(void *xsc, if_t ifp)
1842 {
1843 	struct hn_softc *sc = xsc;
1844 
1845 	HN_LOCK(sc);
1846 
1847 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1848 		goto done;
1849 
1850 	if (!hn_ismyvf(sc, ifp))
1851 		goto done;
1852 
1853 	if (sc->hn_vf_ifp != NULL) {
1854 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1855 		    if_name(sc->hn_vf_ifp));
1856 		goto done;
1857 	}
1858 
1859 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1860 		/*
1861 		 * ifnet.if_start is _not_ supported by transparent
1862 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1863 		 */
1864 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1865 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1866 
1867 		goto done;
1868 	}
1869 
1870 	rm_wlock(&hn_vfmap_lock);
1871 
1872 	if (if_getindex(ifp) >= hn_vfmap_size) {
1873 		if_t *newmap;
1874 		int newsize;
1875 
1876 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1877 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1878 		    M_WAITOK | M_ZERO);
1879 
1880 		memcpy(newmap, hn_vfmap,
1881 		    sizeof(if_t) * hn_vfmap_size);
1882 		free(hn_vfmap, M_DEVBUF);
1883 		hn_vfmap = newmap;
1884 		hn_vfmap_size = newsize;
1885 	}
1886 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1887 	    ("%s: ifindex %d was mapped to %s",
1888 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1889 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1890 
1891 	rm_wunlock(&hn_vfmap_lock);
1892 
1893 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1894 	rm_wlock(&sc->hn_vf_lock);
1895 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1896 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1897 	sc->hn_vf_ifp = ifp;
1898 	rm_wunlock(&sc->hn_vf_lock);
1899 
1900 	if (hn_xpnt_vf) {
1901 		int wait_ticks;
1902 
1903 		/*
1904 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1905 		 * Save vf_ifp's current if_input for later restoration.
1906 		 */
1907 		sc->hn_vf_input = if_getinputfn(ifp);
1908 		if_setinputfn(ifp, hn_xpnt_vf_input);
1909 
1910 		/*
1911 		 * Stop link status management; use the VF's.
1912 		 */
1913 		hn_suspend_mgmt(sc);
1914 
1915 		/*
1916 		 * Give the VF some time to complete its attach routine.
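		 * hn_xpnt_vf_attwait is in seconds; the resulting deadline
		 * is recorded in hn_vf_rdytick and re-checked by the
		 * delayed initialization task enqueued below.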
1917 		 */
1918 		wait_ticks = hn_xpnt_vf_attwait * hz;
1919 		sc->hn_vf_rdytick = ticks + wait_ticks;
1920 
1921 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1922 		    wait_ticks);
1923 	}
1924 done:
1925 	HN_UNLOCK(sc);
1926 }
1927 
1928 static void
1929 hn_ifnet_detevent(void *xsc, if_t ifp)
1930 {
1931 	struct hn_softc *sc = xsc;
1932 
1933 	HN_LOCK(sc);
1934 
1935 	if (sc->hn_vf_ifp == NULL)
1936 		goto done;
1937 
1938 	if (!hn_ismyvf(sc, ifp))
1939 		goto done;
1940 
1941 	if (hn_xpnt_vf) {
1942 		/*
1943 		 * Make sure that the delayed initialization is not running.
1944 		 *
1945 		 * NOTE:
1946 		 * - This lock _must_ be released, since the hn_vf_init task
1947 		 *   will try holding this lock.
1948 		 * - It is safe to release this lock here, since the
1949 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1950 		 *
1951 		 * XXX racy, if hn(4) is ever detached.
1952 		 */
1953 		HN_UNLOCK(sc);
1954 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1955 		HN_LOCK(sc);
1956 
1957 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1958 		    if_name(sc->hn_ifp)));
1959 		if_setinputfn(ifp, sc->hn_vf_input);
1960 		sc->hn_vf_input = NULL;
1961 
1962 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1963 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1964 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1965 
1966 		if (sc->hn_vf_rdytick == 0) {
1967 			/*
1968 			 * The VF was ready; restore some settings.
1969 			 */
1970 			if_setcapabilities(ifp, sc->hn_saved_caps);
1971 
1972 			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
1973 			if_sethwtsomaxsegcount(sc->hn_ifp,
1974 			    sc->hn_saved_tsosegcnt);
1975 			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
1976 
1977 			if_setcapenable(ifp, sc->hn_saved_capenable);
1978 			if_sethwassist(ifp, sc->hn_saved_hwassist);
1979 		}
1980 
1981 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1982 			/*
1983 			 * Restore RSS settings.
1984 			 */
1985 			hn_vf_rss_restore(sc);
1986 
1987 			/*
1988 			 * Resume link status management, which was suspended
1989 			 * by hn_ifnet_attevent().
1990 			 */
1991 			hn_resume_mgmt(sc);
1992 		}
1993 	}
1994 
1995 	/* Mark transparent mode VF as disabled. */
1996 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
1997 
1998 	rm_wlock(&hn_vfmap_lock);
1999 
2000 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2001 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2002 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2003 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2004 		    ("%s: ifindex %d was mapped to %s",
2005 		     if_name(ifp), if_getindex(ifp),
2006 		     if_name(hn_vfmap[if_getindex(ifp)])));
2007 		hn_vfmap[if_getindex(ifp)] = NULL;
2008 	}
2009 
2010 	rm_wunlock(&hn_vfmap_lock);
2011 done:
2012 	HN_UNLOCK(sc);
2013 }
2014 
2015 static void
2016 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2017 {
2018 	struct hn_softc *sc = xsc;
2019 
2020 	if (sc->hn_vf_ifp == ifp)
2021 		if_link_state_change(sc->hn_ifp, link_state);
2022 }
2023 
2024 static int
2025 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2026 {
2027 	struct hn_softc *sc = arg1;
2028 	unsigned int tsomax;
2029 	int error;
2030 
2031 	tsomax = if_gethwtsomax(sc->hn_ifp);
2032 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2033 	return error;
2034 }
2035 
2036 static int
2037 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2038 {
2039 	struct hn_softc *sc = arg1;
2040 	unsigned int tsomaxsegcnt;
2041 	int error;
2042 
2043 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2044 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2045 	return error;
2046 }
2047 
2048 static int
2049 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2050 {
2051 	struct hn_softc *sc = arg1;
2052 	unsigned int tsomaxsegsz;
2053 	int error;
2054 
2055 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2056 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2057 	return error;
2058 }
2059 
2060 static int
2061 hn_probe(device_t dev)
2062 {
2063 
2064 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2065 		device_set_desc(dev, "Hyper-V Network Interface");
2066 		return BUS_PROBE_DEFAULT;
2067 	}
2068 	return ENXIO;
2069 }
2070 
2071 static int
2072 hn_attach(device_t dev)
2073 {
2074 	struct hn_softc *sc = device_get_softc(dev);
2075 	struct sysctl_oid_list *child;
2076 	struct sysctl_ctx_list *ctx;
2077 	uint8_t eaddr[ETHER_ADDR_LEN];
2078 	if_t ifp = NULL;
2079 	int error, ring_cnt, tx_ring_cnt;
2080 	uint32_t mtu;
2081 
2082 	sc->hn_dev = dev;
2083 	sc->hn_prichan = vmbus_get_channel(dev);
2084 	HN_LOCK_INIT(sc);
2085 	rm_init(&sc->hn_vf_lock, "hnvf");
2086 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2087 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2088 
2089 	/*
2090 	 * Initialize these tunables once.
2091 	 */
2092 	sc->hn_agg_size = hn_tx_agg_size;
2093 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2094 
2095 	/*
2096 	 * Setup taskqueue for transmission.
2097 	 */
2098 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2099 		int i;
2100 
2101 		sc->hn_tx_taskqs =
2102 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2103 		    M_DEVBUF, M_WAITOK);
2104 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2105 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2106 			    M_WAITOK, taskqueue_thread_enqueue,
2107 			    &sc->hn_tx_taskqs[i]);
2108 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2109 			    "%s tx%d", device_get_nameunit(dev), i);
2110 		}
2111 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2112 		sc->hn_tx_taskqs = hn_tx_taskque;
2113 	}
2114 
2115 	/*
2116 	 * Setup taskqueue for management tasks, e.g. link status.
2117 	 */
2118 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2119 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2120 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2121 	    device_get_nameunit(dev));
2122 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2123 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2124 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2125 	    hn_netchg_status_taskfunc, sc);
2126 
2127 	if (hn_xpnt_vf) {
2128 		/*
2129 		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2130 		 */
2131 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2132 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2133 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2134 		    device_get_nameunit(dev));
2135 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2136 		    hn_xpnt_vf_init_taskfunc, sc);
2137 	}
2138 
2139 	/*
2140 	 * Allocate the ifnet and set up its name early, so that if_printf
2141 	 * can be used by functions that will be called after
2142 	 * ether_ifattach().
2143 	 */
2144 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2145 	if_setsoftc(ifp, sc);
2146 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2147 
2148 	/*
2149 	 * Initialize ifmedia early so that it can be unconditionally
2150 	 * destroyed, if an error happens later on.
2151 	 */
2152 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2153 
2154 	/*
2155 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2156 	 * to use (tx_ring_cnt).
2157 	 *
2158 	 * NOTE:
2159 	 * The # of RX rings to use is the same as the # of channels to use.
2160 	 */
2161 	ring_cnt = hn_chan_cnt;
2162 	if (ring_cnt <= 0) {
2163 		/* Default */
2164 		ring_cnt = mp_ncpus;
2165 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2166 			ring_cnt = HN_RING_CNT_DEF_MAX;
2167 	} else if (ring_cnt > mp_ncpus) {
2168 		ring_cnt = mp_ncpus;
2169 	}
2170 #ifdef RSS
2171 	if (ring_cnt > rss_getnumbuckets())
2172 		ring_cnt = rss_getnumbuckets();
2173 #endif
2174 
2175 	tx_ring_cnt = hn_tx_ring_cnt;
2176 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2177 		tx_ring_cnt = ring_cnt;
2178 #ifdef HN_IFSTART_SUPPORT
2179 	if (hn_use_if_start) {
2180 		/* ifnet.if_start only needs one TX ring. */
2181 		tx_ring_cnt = 1;
2182 	}
2183 #endif
2184 
2185 	/*
2186 	 * Set the leader CPU for channels.
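	 * hn_cpu_index is advanced by ring_cnt, so successive hn(4)
	 * instances start their channel CPU assignment at different
	 * offsets, presumably to spread the interrupt load across CPUs.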
2187 	 */
2188 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2189 
2190 	/*
2191 	 * Create enough TX/RX rings, even if only a limited number of
2192 	 * channels can be allocated.
2193 	 */
2194 	error = hn_create_tx_data(sc, tx_ring_cnt);
2195 	if (error)
2196 		goto failed;
2197 	error = hn_create_rx_data(sc, ring_cnt);
2198 	if (error)
2199 		goto failed;
2200 
2201 	/*
2202 	 * Create transaction context for NVS and RNDIS transactions.
2203 	 */
2204 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2205 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2206 	if (sc->hn_xact == NULL) {
2207 		error = ENXIO;
2208 		goto failed;
2209 	}
2210 
2211 	/*
2212 	 * Install orphan handler for the revocation of this device's
2213 	 * primary channel.
2214 	 *
2215 	 * NOTE:
2216 	 * The processing order is critical here:
2217 	 * Install the orphan handler, _before_ testing whether this
2218 	 * device's primary channel has been revoked or not.
2219 	 */
2220 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2221 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2222 		error = ENXIO;
2223 		goto failed;
2224 	}
2225 
2226 	/*
2227 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2228 	 */
2229 	error = hn_synth_attach(sc, ETHERMTU);
2230 	if (error)
2231 		goto failed;
2232 
2233 	error = hn_rndis_get_eaddr(sc, eaddr);
2234 	if (error)
2235 		goto failed;
2236 
2237 	error = hn_rndis_get_mtu(sc, &mtu);
2238 	if (error)
2239 		mtu = ETHERMTU;
2240 	else if (bootverbose)
2241 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2242 
2243 	if (sc->hn_rx_ring_inuse > 1) {
2244 		/*
2245 		 * Reduce TCP segment aggregation limit for multiple
2246 		 * RX rings to increase ACK timeliness.
2247 		 */
2248 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2249 	}
2250 
2251 	/*
2252 	 * Fixup TX/RX settings after the synthetic parts are attached.
2253 	 */
2254 	hn_fixup_tx_data(sc);
2255 	hn_fixup_rx_data(sc);
2256 
2257 	ctx = device_get_sysctl_ctx(dev);
2258 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2259 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2260 	    &sc->hn_nvs_ver, 0, "NVS version");
2261 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2262 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2263 	    hn_ndis_version_sysctl, "A", "NDIS version");
2264 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2265 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2266 	    hn_caps_sysctl, "A", "capabilities");
2267 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2268 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2269 	    hn_hwassist_sysctl, "A", "hwassist");
2270 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2271 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2272 	    "IU", "max TSO size");
2273 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2274 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2275 	    "IU", "max # of TSO segments");
2276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2277 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2278 	    "IU", "max size of TSO segment");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_rxfilter_sysctl, "A", "rxfilter");
2282 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2283 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284 	    hn_rss_hash_sysctl, "A", "RSS hash");
2285 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2286 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2288 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2289 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2290 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2291 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2292 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2293 #ifndef RSS
2294 	/*
2295 	 * Don't allow RSS key/indirect table changes if kernel RSS is defined.
2296 	 */
2297 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2298 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2299 	    hn_rss_key_sysctl, "IU", "RSS key");
2300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2301 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2302 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2303 #endif
2304 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2305 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2306 	    "RNDIS offered packet transmission aggregation size limit");
2307 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2308 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2309 	    "RNDIS offered packet transmission aggregation count limit");
2310 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2311 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2312 	    "RNDIS packet transmission aggregation alignment");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2314 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_txagg_size_sysctl, "I",
2316 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2318 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_txagg_pkts_sysctl, "I",
2320 	    "Packet transmission aggregation packets, "
2321 	    "0 -- disable, -1 -- auto");
2322 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2323 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2324 	    hn_polling_sysctl, "I",
2325 	    "Polling frequency: [100,1000000], 0 disable polling");
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2327 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2328 	    hn_vf_sysctl, "A", "Virtual Function's name");
2329 	if (!hn_xpnt_vf) {
2330 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2331 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2332 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2333 	} else {
2334 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2335 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2336 		    hn_xpnt_vf_enabled_sysctl, "I",
2337 		    "Transparent VF enabled");
2338 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2339 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2340 		    hn_xpnt_vf_accbpf_sysctl, "I",
2341 		    "Accurate BPF for transparent VF");
2342 	}
2343 
2344 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2345 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "I",
2346 	    "switch to rsc");
2347 
2348 	/*
2349 	 * Setup the ifmedia, which has been initialized earlier.
2350 	 */
2351 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2352 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2353 	/* XXX ifmedia_set really should do this for us */
2354 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2355 
2356 	/*
2357 	 * Setup the ifnet for this interface.
2358 	 */
2359 
2360 	if_setbaudrate(ifp, IF_Gbps(10));
2361 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2362 	if_setioctlfn(ifp, hn_ioctl);
2363 	if_setinitfn(ifp, hn_init);
2364 #ifdef HN_IFSTART_SUPPORT
2365 	if (hn_use_if_start) {
2366 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2367 
2368 		if_setstartfn(ifp, hn_start);
2369 		if_setsendqlen(ifp, qdepth);
2370 		if_setsendqready(ifp);
2371 	} else
2372 #endif
2373 	{
2374 		if_settransmitfn(ifp, hn_transmit);
2375 		if_setqflushfn(ifp, hn_xmit_qflush);
2376 	}
2377 
2378 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2379 #ifdef foo
2380 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2381 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2382 #endif
2383 	if (sc->hn_caps & HN_CAP_VLAN) {
2384 		/* XXX not sure about VLAN_MTU. */
2385 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2386 	}
2387 
2388 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2389 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2390 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2391 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2392 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2393 	if (sc->hn_caps & HN_CAP_TSO4) {
2394 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2395 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2396 	}
2397 	if (sc->hn_caps & HN_CAP_TSO6) {
2398 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2399 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2400 	}
2401 
2402 	/* Enable all available capabilities by default. */
2403 	if_setcapenable(ifp, if_getcapabilities(ifp));
2404 
2405 	/*
2406 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2407 	 * be enabled through SIOCSIFCAP.
2408 	 */
2409 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2410 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2411 
2412 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2413 		/*
2414 		 * Lock hn_set_tso_maxsize() to simplify its
2415 		 * internal logic.
2416 		 */
2417 		HN_LOCK(sc);
2418 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2419 		HN_UNLOCK(sc);
2420 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2421 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2422 	}
2423 
2424 	ether_ifattach(ifp, eaddr);
2425 
2426 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2427 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2428 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2429 	}
2430 	if (mtu < ETHERMTU) {
2432 		if_setmtu(ifp, mtu);
2433 	}
2434 
2435 	/* Inform the upper layer about the long frame support. */
2436 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2437 
2438 	/*
2439 	 * Kick off link status check.
2440 	 */
2441 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2442 	hn_update_link_status(sc);
2443 
2444 	if (!hn_xpnt_vf) {
2445 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2446 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2447 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2448 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2449 	} else {
2450 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2451 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2452 	}
2453 
2454 	/*
2455 	 * NOTE:
2456 	 * Subscribe the ether_ifattach event, instead of the ifnet_arrival
2457 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2458 	 * yet available when the ifnet_arrival event is triggered.
2459 	 */
2460 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2461 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2462 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2463 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2464 
2465 	return (0);
2466 failed:
2467 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2468 		hn_synth_detach(sc);
2469 	hn_detach(dev);
2470 	return (error);
2471 }
2472 
2473 static int
2474 hn_detach(device_t dev)
2475 {
2476 	struct hn_softc *sc = device_get_softc(dev);
2477 	if_t ifp = sc->hn_ifp, vf_ifp;
2478 
2479 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2480 		/*
2481 		 * In case the vmbus missed the orphan handler
2482 		 * installation.
2483 		 */
2484 		vmbus_xact_ctx_orphan(sc->hn_xact);
2485 	}
2486 
2487 	if (sc->hn_ifaddr_evthand != NULL)
2488 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2489 	if (sc->hn_ifnet_evthand != NULL)
2490 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2491 	if (sc->hn_ifnet_atthand != NULL) {
2492 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2493 		    sc->hn_ifnet_atthand);
2494 	}
2495 	if (sc->hn_ifnet_dethand != NULL) {
2496 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2497 		    sc->hn_ifnet_dethand);
2498 	}
2499 	if (sc->hn_ifnet_lnkhand != NULL)
2500 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2501 
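	/*
	 * Snapshot hn_vf_ifp once; the compiler barrier presumably keeps
	 * the load from being repeated around the NULL check below.
	 */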
2502 	vf_ifp = sc->hn_vf_ifp;
2503 	__compiler_membar();
2504 	if (vf_ifp != NULL)
2505 		hn_ifnet_detevent(sc, vf_ifp);
2506 
2507 	if (device_is_attached(dev)) {
2508 		HN_LOCK(sc);
2509 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2510 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2511 				hn_stop(sc, true);
2512 			/*
2513 			 * NOTE:
2514 			 * hn_stop() only suspends the data path, so the
2515 			 * management tasks have to be suspended manually here.
2516 			 */
2517 			hn_suspend_mgmt(sc);
2518 			hn_synth_detach(sc);
2519 		}
2520 		HN_UNLOCK(sc);
2521 		ether_ifdetach(ifp);
2522 	}
2523 
2524 	ifmedia_removeall(&sc->hn_media);
2525 	hn_destroy_rx_data(sc);
2526 	hn_destroy_tx_data(sc);
2527 
2528 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2529 		int i;
2530 
2531 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2532 			taskqueue_free(sc->hn_tx_taskqs[i]);
2533 		free(sc->hn_tx_taskqs, M_DEVBUF);
2534 	}
2535 	taskqueue_free(sc->hn_mgmt_taskq0);
2536 	if (sc->hn_vf_taskq != NULL)
2537 		taskqueue_free(sc->hn_vf_taskq);
2538 
2539 	if (sc->hn_xact != NULL) {
2540 		/*
2541 		 * Uninstall the orphan handler _before_ the xact is
2542 		 * destructed.
2543 		 */
2544 		vmbus_chan_unset_orphan(sc->hn_prichan);
2545 		vmbus_xact_ctx_destroy(sc->hn_xact);
2546 	}
2547 
2548 	if_free(ifp);
2549 
2550 	HN_LOCK_DESTROY(sc);
2551 	rm_destroy(&sc->hn_vf_lock);
2552 	return (0);
2553 }
2554 
2555 static int
2556 hn_shutdown(device_t dev)
2557 {
2558 
2559 	return (0);
2560 }
2561 
2562 static void
2563 hn_link_status(struct hn_softc *sc)
2564 {
2565 	uint32_t link_status;
2566 	int error;
2567 
2568 	error = hn_rndis_get_linkstatus(sc, &link_status);
2569 	if (error) {
2570 		/* XXX what to do? */
2571 		return;
2572 	}
2573 
2574 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2575 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2576 	else
2577 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2578 	if_link_state_change(sc->hn_ifp,
2579 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2580 	    LINK_STATE_UP : LINK_STATE_DOWN);
2581 }
2582 
2583 static void
2584 hn_link_taskfunc(void *xsc, int pending __unused)
2585 {
2586 	struct hn_softc *sc = xsc;
2587 
2588 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2589 		return;
2590 	hn_link_status(sc);
2591 }
2592 
2593 static void
2594 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2595 {
2596 	struct hn_softc *sc = xsc;
2597 
2598 	/* Prevent any link status checks from running. */
2599 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2600 
2601 	/*
2602 	 * Fake up a [link down --> link up] state change; a 5 second
2603 	 * delay is used, which closely simulates the miibus reaction
2604 	 * to a link down event.
2605 	 */
2606 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2607 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2608 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2609 	    &sc->hn_netchg_status, 5 * hz);
2610 }
2611 
2612 static void
2613 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2614 {
2615 	struct hn_softc *sc = xsc;
2616 
2617 	/* Re-allow link status checks. */
2618 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2619 	hn_link_status(sc);
2620 }
2621 
2622 static void
2623 hn_update_link_status(struct hn_softc *sc)
2624 {
2625 
2626 	if (sc->hn_mgmt_taskq != NULL)
2627 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2628 }
2629 
2630 static void
2631 hn_change_network(struct hn_softc *sc)
2632 {
2633 
2634 	if (sc->hn_mgmt_taskq != NULL)
2635 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2636 }
2637 
2638 static __inline int
2639 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2640     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2641 {
2642 	struct mbuf *m = *m_head;
2643 	int error;
2644 
2645 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2646 
2647 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2648 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2649 	if (error == EFBIG) {
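		/*
		 * The mbuf chain has more segments than the DMA tag
		 * allows; collapse it and retry the load once.
		 */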
2650 		struct mbuf *m_new;
2651 
2652 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2653 		if (m_new == NULL)
2654 			return ENOBUFS;
2655 		else
2656 			*m_head = m = m_new;
2657 		txr->hn_tx_collapsed++;
2658 
2659 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2660 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2661 	}
2662 	if (!error) {
2663 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2664 		    BUS_DMASYNC_PREWRITE);
2665 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2666 	}
2667 	return error;
2668 }
2669 
2670 static __inline int
2671 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2672 {
2673 
2674 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2675 	    ("put an onlist txd %#x", txd->flags));
2676 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2677 	    ("put an onagg txd %#x", txd->flags));
2678 
2679 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2680 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2681 		return 0;
2682 
2683 	if (!STAILQ_EMPTY(&txd->agg_list)) {
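		/*
		 * This is an aggregating txdesc; releasing it also
		 * releases every txdesc that was aggregated under it.
		 */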
2684 		struct hn_txdesc *tmp_txd;
2685 
2686 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2687 			int freed __diagused;
2688 
2689 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2690 			    ("recursive aggregation on aggregated txdesc"));
2691 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2692 			    ("not aggregated txdesc"));
2693 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2694 			    ("aggregated txdesc uses dmamap"));
2695 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2696 			    ("aggregated txdesc consumes "
2697 			     "chimney sending buffer"));
2698 			KASSERT(tmp_txd->chim_size == 0,
2699 			    ("aggregated txdesc has non-zero "
2700 			     "chimney sending size"));
2701 
2702 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2703 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2704 			freed = hn_txdesc_put(txr, tmp_txd);
2705 			KASSERT(freed, ("failed to free aggregated txdesc"));
2706 		}
2707 	}
2708 
2709 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2710 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2711 		    ("chim txd uses dmamap"));
2712 		hn_chim_free(txr->hn_sc, txd->chim_index);
2713 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2714 		txd->chim_size = 0;
2715 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2716 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2717 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2718 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2719 		    txd->data_dmap);
2720 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2721 	}
2722 
2723 	if (txd->m != NULL) {
2724 		m_freem(txd->m);
2725 		txd->m = NULL;
2726 	}
2727 
2728 	txd->flags |= HN_TXD_FLAG_ONLIST;
2729 #ifndef HN_USE_TXDESC_BUFRING
2730 	mtx_lock_spin(&txr->hn_txlist_spin);
2731 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2732 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2733 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2734 	txr->hn_txdesc_avail++;
2735 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2736 	mtx_unlock_spin(&txr->hn_txlist_spin);
2737 #else	/* HN_USE_TXDESC_BUFRING */
2738 #ifdef HN_DEBUG
2739 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2740 #endif
2741 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2742 #endif	/* !HN_USE_TXDESC_BUFRING */
2743 
2744 	return 1;
2745 }
2746 
2747 static __inline struct hn_txdesc *
2748 hn_txdesc_get(struct hn_tx_ring *txr)
2749 {
2750 	struct hn_txdesc *txd;
2751 
2752 #ifndef HN_USE_TXDESC_BUFRING
2753 	mtx_lock_spin(&txr->hn_txlist_spin);
2754 	txd = SLIST_FIRST(&txr->hn_txlist);
2755 	if (txd != NULL) {
2756 		KASSERT(txr->hn_txdesc_avail > 0,
2757 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2758 		txr->hn_txdesc_avail--;
2759 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2760 	}
2761 	mtx_unlock_spin(&txr->hn_txlist_spin);
2762 #else
2763 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2764 #endif
2765 
2766 	if (txd != NULL) {
2767 #ifdef HN_USE_TXDESC_BUFRING
2768 #ifdef HN_DEBUG
2769 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2770 #endif
2771 #endif	/* HN_USE_TXDESC_BUFRING */
2772 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2773 		    STAILQ_EMPTY(&txd->agg_list) &&
2774 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2775 		    txd->chim_size == 0 &&
2776 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2777 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2778 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2779 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2780 		txd->refs = 1;
2781 	}
2782 	return txd;
2783 }
2784 
2785 static __inline void
2786 hn_txdesc_hold(struct hn_txdesc *txd)
2787 {
2788 
2789 	/* 0->1 transition will never work */
2790 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2791 	atomic_add_int(&txd->refs, 1);
2792 }
2793 
2794 static __inline void
2795 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2796 {
2797 
2798 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2799 	    ("recursive aggregation on aggregating txdesc"));
2800 
2801 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2802 	    ("already aggregated"));
2803 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2804 	    ("recursive aggregation on to-be-aggregated txdesc"));
2805 
2806 	txd->flags |= HN_TXD_FLAG_ONAGG;
2807 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2808 }
2809 
2810 static bool
2811 hn_tx_ring_pending(struct hn_tx_ring *txr)
2812 {
2813 	bool pending = false;
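	/*
	 * All txdescs sit on the free list/buf_ring when the ring is
	 * idle; if any are missing, transmissions are still pending.
	 */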
2814 
2815 #ifndef HN_USE_TXDESC_BUFRING
2816 	mtx_lock_spin(&txr->hn_txlist_spin);
2817 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2818 		pending = true;
2819 	mtx_unlock_spin(&txr->hn_txlist_spin);
2820 #else
2821 	if (!buf_ring_full(txr->hn_txdesc_br))
2822 		pending = true;
2823 #endif
2824 	return (pending);
2825 }
2826 
2827 static __inline void
2828 hn_txeof(struct hn_tx_ring *txr)
2829 {
2830 	txr->hn_has_txeof = 0;
2831 	txr->hn_txeof(txr);
2832 }
2833 
2834 static void
2835 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2836     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2837 {
2838 	struct hn_txdesc *txd = sndc->hn_cbarg;
2839 	struct hn_tx_ring *txr;
2840 
2841 	txr = txd->txr;
2842 	KASSERT(txr->hn_chan == chan,
2843 	    ("channel mismatch, on chan%u, should be chan%u",
2844 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2845 
2846 	txr->hn_has_txeof = 1;
2847 	hn_txdesc_put(txr, txd);
2848 
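	/*
	 * Run the txeof processing early once enough completions have
	 * accumulated while the ring is marked oactive.
	 */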
2849 	++txr->hn_txdone_cnt;
2850 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2851 		txr->hn_txdone_cnt = 0;
2852 		if (txr->hn_oactive)
2853 			hn_txeof(txr);
2854 	}
2855 }
2856 
2857 static void
2858 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2859 {
2860 #if defined(INET) || defined(INET6)
2861 	struct epoch_tracker et;
2862 
2863 	NET_EPOCH_ENTER(et);
2864 	tcp_lro_flush_all(&rxr->hn_lro);
2865 	NET_EPOCH_EXIT(et);
2866 #endif
2867 
2868 	/*
2869 	 * NOTE:
2870 	 * 'txr' could be NULL, if multiple channels and the
2871 	 * ifnet.if_start method are used.
2872 	 */
2873 	if (txr == NULL || !txr->hn_has_txeof)
2874 		return;
2875 
2876 	txr->hn_txdone_cnt = 0;
2877 	hn_txeof(txr);
2878 }
2879 
2880 static __inline uint32_t
2881 hn_rndis_pktmsg_offset(uint32_t ofs)
2882 {
2883 
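	/*
	 * RNDIS packet message offsets are counted from the rm_dataoffset
	 * field rather than from the beginning of the message; convert
	 * the message-relative offset accordingly.
	 */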
2884 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2885 	    ("invalid RNDIS packet msg offset %u", ofs));
2886 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2887 }
2888 
2889 static __inline void *
2890 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2891     size_t pi_dlen, uint32_t pi_type)
2892 {
2893 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2894 	struct rndis_pktinfo *pi;
2895 
2896 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2897 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2898 
2899 	/*
2900 	 * Per-packet-info does not move; it only grows.
2901 	 *
2902 	 * NOTE:
2903 	 * rm_pktinfooffset in this phase counts from the beginning
2904 	 * of rndis_packet_msg.
2905 	 */
2906 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2907 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2908 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2909 	    pkt->rm_pktinfolen);
2910 	pkt->rm_pktinfolen += pi_size;
2911 
2912 	pi->rm_size = pi_size;
2913 	pi->rm_type = pi_type;
2914 	pi->rm_internal = 0;
2915 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2916 
2917 	return (pi->rm_data);
2918 }
2919 
2920 static __inline int
2921 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2922 {
2923 	struct hn_txdesc *txd;
2924 	struct mbuf *m;
2925 	int error, pkts;
2926 
2927 	txd = txr->hn_agg_txd;
2928 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2929 
2930 	/*
2931 	 * Since hn_txpkt() will reset this temporary stat, save
2932 	 * it now, so that oerrors can be updated properly, if
2933 	 * hn_txpkt() ever fails.
2934 	 */
2935 	pkts = txr->hn_stat_pkts;
2936 
2937 	/*
2938 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2939 	 * failure, save it for later freeing, if hn_txpkt() ever
2940 	 * fails.
2941 	 */
2942 	m = txd->m;
2943 	error = hn_txpkt(ifp, txr, txd);
2944 	if (__predict_false(error)) {
2945 		/* txd is freed, but m is not. */
2946 		m_freem(m);
2947 
2948 		txr->hn_flush_failed++;
2949 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2950 	}
2951 
2952 	/* Reset all aggregation states. */
2953 	txr->hn_agg_txd = NULL;
2954 	txr->hn_agg_szleft = 0;
2955 	txr->hn_agg_pktleft = 0;
2956 	txr->hn_agg_prevpkt = NULL;
2957 
2958 	return (error);
2959 }
2960 
2961 static void *
2962 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2963     int pktsize)
2964 {
2965 	void *chim;
2966 
2967 	if (txr->hn_agg_txd != NULL) {
2968 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2969 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2970 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2971 			int olen;
2972 
2973 			/*
2974 			 * Update the previous RNDIS packet's total length;
2975 			 * it can be increased due to the mandatory alignment
2976 			 * padding for this RNDIS packet.  And update the
2977 			 * aggregating txdesc's chimney sending buffer size
2978 			 * accordingly.
2979 			 *
2980 			 * XXX
2981 			 * Zero-out the padding, as required by the RNDIS spec.
2982 			 */
2983 			olen = pkt->rm_len;
2984 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2985 			agg_txd->chim_size += pkt->rm_len - olen;
2986 
2987 			/* Link this txdesc to the parent. */
2988 			hn_txdesc_agg(agg_txd, txd);
2989 
2990 			chim = (uint8_t *)pkt + pkt->rm_len;
2991 			/* Save the current packet for later fixup. */
2992 			txr->hn_agg_prevpkt = chim;
2993 
2994 			txr->hn_agg_pktleft--;
2995 			txr->hn_agg_szleft -= pktsize;
2996 			if (txr->hn_agg_szleft <=
2997 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2998 				/*
2999 				 * Probably can't aggregate more packets;
3000 				 * flush this aggregating txdesc proactively.
3001 				 */
3002 				txr->hn_agg_pktleft = 0;
3003 			}
3004 			/* Done! */
3005 			return (chim);
3006 		}
3007 		hn_flush_txagg(ifp, txr);
3008 	}
3009 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3010 
3011 	txr->hn_tx_chimney_tried++;
3012 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3013 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3014 		return (NULL);
3015 	txr->hn_tx_chimney++;
3016 
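	/*
	 * Each chimney slot is hn_chim_szmax bytes; compute the address
	 * of the just-allocated slot within the shared chimney sending
	 * buffer.
	 */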
3017 	chim = txr->hn_sc->hn_chim +
3018 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3019 
3020 	if (txr->hn_agg_pktmax > 1 &&
3021 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3022 		txr->hn_agg_txd = txd;
3023 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3024 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3025 		txr->hn_agg_prevpkt = chim;
3026 	}
3027 	return (chim);
3028 }
3029 
3030 /*
3031  * NOTE:
3032  * If this function fails, then both txd and m_head0 will be freed.
3033  */
3034 static int
3035 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3036     struct mbuf **m_head0)
3037 {
3038 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3039 	int error, nsegs, i;
3040 	struct mbuf *m_head = *m_head0;
3041 	struct rndis_packet_msg *pkt;
3042 	uint32_t *pi_data;
3043 	void *chim = NULL;
3044 	int pkt_hlen, pkt_size;
3045 
3046 	pkt = txd->rndis_pkt;
3047 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3048 	if (pkt_size < txr->hn_chim_size) {
3049 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3050 		if (chim != NULL)
3051 			pkt = chim;
3052 	} else {
3053 		if (txr->hn_agg_txd != NULL)
3054 			hn_flush_txagg(ifp, txr);
3055 	}
3056 
3057 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3058 	pkt->rm_len = m_head->m_pkthdr.len;
3059 	pkt->rm_dataoffset = 0;
3060 	pkt->rm_datalen = m_head->m_pkthdr.len;
3061 	pkt->rm_oobdataoffset = 0;
3062 	pkt->rm_oobdatalen = 0;
3063 	pkt->rm_oobdataelements = 0;
3064 	pkt->rm_pktinfooffset = sizeof(*pkt);
3065 	pkt->rm_pktinfolen = 0;
3066 	pkt->rm_vchandle = 0;
3067 	pkt->rm_reserved = 0;
3068 
3069 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3070 		/*
3071 		 * Set the hash value for this packet.
3072 		 */
3073 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3074 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3075 
3076 		if (M_HASHTYPE_ISHASH(m_head))
3077 			/*
3078 			 * The flowid field contains the hash value the host
3079 			 * set on the RX queue, if this is an IP forwarding
3080 			 * packet.  Set the same hash value so the host can
3081 			 * send it on the CPU on which it was received.
3082 			 */
3083 			*pi_data = m_head->m_pkthdr.flowid;
3084 		else
3085 			/*
3086 			 * Otherwise just put the tx queue index.
3087 			 */
3088 			*pi_data = txr->hn_tx_idx;
3089 	}
3090 
3091 	if (m_head->m_flags & M_VLANTAG) {
3092 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3093 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3094 		*pi_data = NDIS_VLAN_INFO_MAKE(
3095 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3096 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3097 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3098 	}
3099 
3100 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3101 #if defined(INET6) || defined(INET)
3102 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3103 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3104 #ifdef INET
3105 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3106 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3107 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3108 			    m_head->m_pkthdr.tso_segsz);
3109 		}
3110 #endif
3111 #if defined(INET6) && defined(INET)
3112 		else
3113 #endif
3114 #ifdef INET6
3115 		{
3116 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3117 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3118 			    m_head->m_pkthdr.tso_segsz);
3119 		}
3120 #endif
3121 #endif	/* INET6 || INET */
3122 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3123 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3124 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3125 		if (m_head->m_pkthdr.csum_flags &
3126 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3127 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3128 		} else {
3129 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3130 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3131 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3132 		}
3133 
3134 		if (m_head->m_pkthdr.csum_flags &
3135 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3136 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3137 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3138 		} else if (m_head->m_pkthdr.csum_flags &
3139 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3140 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3141 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3142 		}
3143 	}
3144 
3145 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3146 	/* Fixup RNDIS packet message total length */
3147 	pkt->rm_len += pkt_hlen;
3148 	/* Convert RNDIS packet message offsets */
3149 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3150 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3151 
3152 	/*
3153 	 * Fast path: Chimney sending.
3154 	 */
3155 	if (chim != NULL) {
3156 		struct hn_txdesc *tgt_txd = txd;
3157 
3158 		if (txr->hn_agg_txd != NULL) {
3159 			tgt_txd = txr->hn_agg_txd;
3160 #ifdef INVARIANTS
3161 			*m_head0 = NULL;
3162 #endif
3163 		}
3164 
3165 		KASSERT(pkt == chim,
3166 		    ("RNDIS pkt not in chimney sending buffer"));
3167 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3168 		    ("chimney sending buffer is not used"));
3169 		tgt_txd->chim_size += pkt->rm_len;
3170 
3171 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3172 		    ((uint8_t *)chim) + pkt_hlen);
3173 
3174 		txr->hn_gpa_cnt = 0;
3175 		txr->hn_sendpkt = hn_txpkt_chim;
3176 		goto done;
3177 	}
3178 
3179 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3180 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3181 	    ("chimney buffer is used"));
3182 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3183 
3184 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3185 	if (__predict_false(error)) {
3186 		int freed __diagused;
3187 
3188 		/*
3189 		 * This mbuf is not linked w/ the txd yet, so free it now.
3190 		 */
3191 		m_freem(m_head);
3192 		*m_head0 = NULL;
3193 
3194 		freed = hn_txdesc_put(txr, txd);
3195 		KASSERT(freed != 0,
3196 		    ("fail to free txd upon txdma error"));
3197 
3198 		txr->hn_txdma_failed++;
3199 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3200 		return error;
3201 	}
3202 	*m_head0 = m_head;
3203 
3204 	/* +1 RNDIS packet message */
3205 	txr->hn_gpa_cnt = nsegs + 1;
3206 
3207 	/* send packet with page buffer */
3208 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3209 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3210 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3211 
3212 	/*
3213 	 * Fill the page buffers with mbuf info after the page
3214 	 * buffer for RNDIS packet message.
3215 	 */
3216 	for (i = 0; i < nsegs; ++i) {
3217 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3218 
3219 		gpa->gpa_page = atop(segs[i].ds_addr);
3220 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3221 		gpa->gpa_len = segs[i].ds_len;
3222 	}
3223 
3224 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3225 	txd->chim_size = 0;
3226 	txr->hn_sendpkt = hn_txpkt_sglist;
3227 done:
3228 	txd->m = m_head;
3229 
3230 	/* Set the completion routine */
3231 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3232 
3233 	/* Update temporary stats for later use. */
3234 	txr->hn_stat_pkts++;
3235 	txr->hn_stat_size += m_head->m_pkthdr.len;
3236 	if (m_head->m_flags & M_MCAST)
3237 		txr->hn_stat_mcasts++;
3238 
3239 	return 0;
3240 }
3241 
3242 /*
3243  * NOTE:
3244  * If this function fails, then txd will be freed, but the mbuf
3245  * associated w/ the txd will _not_ be freed.
3246  */
3247 static int
3248 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3249 {
3250 	int error, send_failed = 0, has_bpf;
3251 
3252 again:
3253 	has_bpf = bpf_peers_present_if(ifp);
3254 	if (has_bpf) {
3255 		/*
3256 		 * Make sure that this txd and any aggregated txds are not
3257 		 * freed before ETHER_BPF_MTAP.
3258 		 */
3259 		hn_txdesc_hold(txd);
3260 	}
3261 	error = txr->hn_sendpkt(txr, txd);
3262 	if (!error) {
3263 		if (has_bpf) {
3264 			const struct hn_txdesc *tmp_txd;
3265 
3266 			ETHER_BPF_MTAP(ifp, txd->m);
3267 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3268 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3269 		}
3270 
3271 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3272 #ifdef HN_IFSTART_SUPPORT
3273 		if (!hn_use_if_start)
3274 #endif
3275 		{
3276 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3277 			    txr->hn_stat_size);
3278 			if (txr->hn_stat_mcasts != 0) {
3279 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3280 				    txr->hn_stat_mcasts);
3281 			}
3282 		}
3283 		txr->hn_pkts += txr->hn_stat_pkts;
3284 		txr->hn_sends++;
3285 	}
3286 	if (has_bpf)
3287 		hn_txdesc_put(txr, txd);
3288 
3289 	if (__predict_false(error)) {
3290 		int freed __diagused;
3291 
3292 		/*
3293 		 * This should "really rarely" happen.
3294 		 *
3295 		 * XXX Too many RX to be acked or too many sideband
3296 		 * commands to run?  Ask netvsc_channel_rollup()
3297 		 * to kick start later.
3298 		 */
3299 		txr->hn_has_txeof = 1;
3300 		if (!send_failed) {
3301 			txr->hn_send_failed++;
3302 			send_failed = 1;
3303 			/*
3304 			 * Try sending again after setting hn_has_txeof,
3305 			 * in case we missed the last
3306 			 * netvsc_channel_rollup().
3307 			 */
3308 			goto again;
3309 		}
3310 		if_printf(ifp, "send failed\n");
3311 
3312 		/*
3313 		 * Caller will perform further processing on the
3314 		 * associated mbuf, so don't free it in hn_txdesc_put();
3315 		 * only unload it from the DMA map in hn_txdesc_put(),
3316 		 * if it was loaded.
3317 		 */
3318 		txd->m = NULL;
3319 		freed = hn_txdesc_put(txr, txd);
3320 		KASSERT(freed != 0,
3321 		    ("fail to free txd upon send error"));
3322 
3323 		txr->hn_send_failed++;
3324 	}
3325 
3326 	/* Reset temporary stats, after this sending is done. */
3327 	txr->hn_stat_size = 0;
3328 	txr->hn_stat_pkts = 0;
3329 	txr->hn_stat_mcasts = 0;
3330 
3331 	return (error);
3332 }
3333 
3334 /*
3335  * Append the specified data to the indicated mbuf chain.
3336  * Extend the mbuf chain if the new data does not fit in
3337  * existing space.
3338  *
3339  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3340  * There should be an equivalent in the kernel mbuf code,
3341  * but there does not appear to be one yet.
3342  *
3343  * Differs from m_append() in that additional mbufs are
3344  * allocated with cluster size MJUMPAGESIZE, and filled
3345  * accordingly.
3346  *
3347  * Return the last mbuf in the chain, or NULL if a new mbuf
3348  * could not be allocated.
3349  */
3350 static struct mbuf *
3351 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3352 {
3353 	struct mbuf *m, *n;
3354 	int remainder, space;
3355 
3356 	for (m = m0; m->m_next != NULL; m = m->m_next)
3357 		;
3358 	remainder = len;
3359 	space = M_TRAILINGSPACE(m);
3360 	if (space > 0) {
3361 		/*
3362 		 * Copy into available space.
3363 		 */
3364 		if (space > remainder)
3365 			space = remainder;
3366 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3367 		m->m_len += space;
3368 		cp += space;
3369 		remainder -= space;
3370 	}
3371 	while (remainder > 0) {
3372 		/*
3373 		 * Allocate a new mbuf; could check space
3374 		 * and allocate a cluster instead.
3375 		 */
3376 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3377 		if (n == NULL)
3378 			return NULL;
3379 		n->m_len = min(MJUMPAGESIZE, remainder);
3380 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3381 		cp += n->m_len;
3382 		remainder -= n->m_len;
3383 		m->m_next = n;
3384 		m = n;
3385 	}
3386 
3387 	return m;
3388 }
3389 
3390 #if defined(INET) || defined(INET6)
3391 static __inline int
3392 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3393 {
3394 	if (hn_lro_mbufq_depth) {
3395 		tcp_lro_queue_mbuf(lc, m);
3396 		return 0;
3397 	}
3398 	return tcp_lro_rx(lc, m, 0);
3399 }
3400 #endif
3401 
3402 static int
3403 hn_rxpkt(struct hn_rx_ring *rxr)
3404 {
3405 	if_t ifp, hn_ifp = rxr->hn_ifp;
3406 	struct mbuf *m_new, *n;
3407 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3408 	int hash_type = M_HASHTYPE_NONE;
3409 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3410 	int i;
3411 
3412 	ifp = hn_ifp;
3413 	if (rxr->hn_rxvf_ifp != NULL) {
3414 		/*
3415 		 * Non-transparent mode VF; pretend this packet is from
3416 		 * the VF.
3417 		 */
3418 		ifp = rxr->hn_rxvf_ifp;
3419 		is_vf = 1;
3420 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3421 		/* Transparent mode VF. */
3422 		is_vf = 1;
3423 	}
3424 
3425 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3426 		/*
3427 		 * NOTE:
3428 		 * See the NOTE of hn_rndis_init_fixat().  This
3429 		 * function can be reached immediately after the
3430 		 * RNDIS is initialized, but before the ifnet is
3431 		 * set up on the hn_attach() path; drop the unexpected
3432 		 * packets.
3433 		 */
3434 		return (0);
3435 	}
3436 
3437 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3438 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3439 		return (0);
3440 	}
3441 
3442 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3443 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3444 		if (m_new == NULL) {
3445 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3446 			return (0);
3447 		}
3448 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3449 		    rxr->rsc.frag_len[0]);
3450 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3451 	} else {
3452 		/*
3453 		 * Get an mbuf with a cluster.  For packets 2K or less,
3454 		 * get a standard 2K cluster.  For anything larger, get a
3455 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3456 		 * if looped around to the Hyper-V TX channel, so avoid them.
3457 		 */
3458 		size = MCLBYTES;
3459 		if (rxr->rsc.pktlen > MCLBYTES) {
3460 			/* 4096 */
3461 			size = MJUMPAGESIZE;
3462 		}
3463 
3464 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3465 		if (m_new == NULL) {
3466 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3467 			return (0);
3468 		}
3469 
3470 		n = m_new;
3471 		for (i = 0; i < rxr->rsc.cnt; i++) {
3472 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3473 			    rxr->rsc.frag_data[i]);
3474 			if (n == NULL) {
3475 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3476 				return (0);
3477 			} else {
3478 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3479 			}
3480 		}
3481 	}
3482 	if (rxr->rsc.pktlen <= MHLEN)
3483 		rxr->hn_small_pkts++;
3484 
3485 	m_new->m_pkthdr.rcvif = ifp;
3486 
3487 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3488 		do_csum = 0;
3489 
3490 	/* receive side checksum offload */
3491 	if (rxr->rsc.csum_info != NULL) {
3492 		/* IP csum offload */
3493 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3494 			m_new->m_pkthdr.csum_flags |=
3495 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3496 			rxr->hn_csum_ip++;
3497 		}
3498 
3499 		/* TCP/UDP csum offload */
3500 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3501 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3502 			m_new->m_pkthdr.csum_flags |=
3503 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3504 			m_new->m_pkthdr.csum_data = 0xffff;
3505 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3506 				rxr->hn_csum_tcp++;
3507 			else
3508 				rxr->hn_csum_udp++;
3509 		}
3510 
3511 		/*
3512 		 * XXX
3513 		 * As of this writing (Oct 28th, 2016), the host side will turn
3514 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3515 		 * the do_lro setting here is actually _not_ accurate.  We
3516 		 * depend on the RSS hash type check to reset do_lro.
3517 		 */
3518 		if ((*(rxr->rsc.csum_info) &
3519 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3520 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3521 			do_lro = 1;
3522 	} else {
3523 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3524 		if (l3proto == ETHERTYPE_IP) {
3525 			if (l4proto == IPPROTO_TCP) {
3526 				if (do_csum &&
3527 				    (rxr->hn_trust_hcsum &
3528 				     HN_TRUST_HCSUM_TCP)) {
3529 					rxr->hn_csum_trusted++;
3530 					m_new->m_pkthdr.csum_flags |=
3531 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3532 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3533 					m_new->m_pkthdr.csum_data = 0xffff;
3534 				}
3535 				do_lro = 1;
3536 			} else if (l4proto == IPPROTO_UDP) {
3537 				if (do_csum &&
3538 				    (rxr->hn_trust_hcsum &
3539 				     HN_TRUST_HCSUM_UDP)) {
3540 					rxr->hn_csum_trusted++;
3541 					m_new->m_pkthdr.csum_flags |=
3542 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3543 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3544 					m_new->m_pkthdr.csum_data = 0xffff;
3545 				}
3546 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3547 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3548 				rxr->hn_csum_trusted++;
3549 				m_new->m_pkthdr.csum_flags |=
3550 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3551 			}
3552 		}
3553 	}
3554 
3555 	if (rxr->rsc.vlan_info != NULL) {
3556 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3557 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3558 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3559 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3560 		m_new->m_flags |= M_VLANTAG;
3561 	}
3562 
3563 	/*
3564 	 * If VF is activated (transparent/non-transparent mode does not
3565 	 * matter here).
3566 	 *
3567 	 * - Disable LRO
3568 	 *
3569 	 *   hn(4) will only receive broadcast packets, multicast packets,
3570 	 *   and TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3571 	 *   packet types.
3572 	 *
3573 	 *   For non-transparent mode, we definitely _cannot_ enable LRO
3574 	 *   at all, since the LRO flush will use hn(4) as the receiving
3575 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3576 	 */
3577 	if (is_vf)
3578 		do_lro = 0;
3579 
3580 	/*
3581 	 * If VF is activated (transparent/non-transparent mode does not
3582 	 * matter here), do _not_ mess with unsupported hash types or
3583 	 * functions.
3584 	 */
3585 	if (rxr->rsc.hash_info != NULL) {
3586 		rxr->hn_rss_pkts++;
3587 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3588 		if (!is_vf)
3589 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3590 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3591 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3592 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3593 			    rxr->hn_mbuf_hash);
3594 
3595 			/*
3596 			 * NOTE:
3597 			 * do_lro is reset if the hash types are not TCP
3598 			 * related.  See the comment in the above csum_flags
3599 			 * setup section.
3600 			 */
3601 			switch (type) {
3602 			case NDIS_HASH_IPV4:
3603 				hash_type = M_HASHTYPE_RSS_IPV4;
3604 				do_lro = 0;
3605 				break;
3606 
3607 			case NDIS_HASH_TCP_IPV4:
3608 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3609 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3610 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3611 
3612 					if (is_vf)
3613 						def_htype = M_HASHTYPE_NONE;
3614 
3615 					/*
3616 					 * UDP 4-tuple hash is delivered as
3617 					 * TCP 4-tuple hash.
3618 					 */
3619 					if (l3proto == ETHERTYPE_MAX) {
3620 						hn_rxpkt_proto(m_new,
3621 						    &l3proto, &l4proto);
3622 					}
3623 					if (l3proto == ETHERTYPE_IP) {
3624 						if (l4proto == IPPROTO_UDP &&
3625 						    (rxr->hn_mbuf_hash &
3626 						     NDIS_HASH_UDP_IPV4_X)) {
3627 							hash_type =
3628 							M_HASHTYPE_RSS_UDP_IPV4;
3629 							do_lro = 0;
3630 						} else if (l4proto !=
3631 						    IPPROTO_TCP) {
3632 							hash_type = def_htype;
3633 							do_lro = 0;
3634 						}
3635 					} else {
3636 						hash_type = def_htype;
3637 						do_lro = 0;
3638 					}
3639 				}
3640 				break;
3641 
3642 			case NDIS_HASH_IPV6:
3643 				hash_type = M_HASHTYPE_RSS_IPV6;
3644 				do_lro = 0;
3645 				break;
3646 
3647 			case NDIS_HASH_IPV6_EX:
3648 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3649 				do_lro = 0;
3650 				break;
3651 
3652 			case NDIS_HASH_TCP_IPV6:
3653 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3654 				break;
3655 
3656 			case NDIS_HASH_TCP_IPV6_EX:
3657 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3658 				break;
3659 			}
3660 		}
3661 	} else if (!is_vf) {
3662 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3663 		hash_type = M_HASHTYPE_OPAQUE;
3664 	}
3665 	M_HASHTYPE_SET(m_new, hash_type);
3666 
3667 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3668 	if (hn_ifp != ifp) {
3669 		const struct ether_header *eh;
3670 
3671 		/*
3672 		 * Non-transparent mode VF is activated.
3673 		 */
3674 
3675 		/*
3676 		 * Allow tapping on hn(4).
3677 		 */
3678 		ETHER_BPF_MTAP(hn_ifp, m_new);
3679 
3680 		/*
3681 		 * Update hn(4)'s stats.
3682 		 */
3683 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3684 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3685 		/* Checked at the beginning of this function. */
3686 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3687 		eh = mtod(m_new, struct ether_header *);
3688 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3689 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3690 	}
3691 	rxr->hn_pkts++;
3692 
3693 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3694 #if defined(INET) || defined(INET6)
3695 		struct lro_ctrl *lro = &rxr->hn_lro;
3696 
3697 		if (lro->lro_cnt) {
3698 			rxr->hn_lro_tried++;
3699 			if (hn_lro_rx(lro, m_new) == 0) {
3700 				/* DONE! */
3701 				return 0;
3702 			}
3703 		}
3704 #endif
3705 	}
3706 	if_input(ifp, m_new);
3707 
3708 	return (0);
3709 }
3710 
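/*
 * Interface ioctl handler.  MTU changes require the synthetic parts
 * (NVS and RNDIS) to be detached and reattached; when a transparent
 * VF is ready, MTU, capability and media requests are forwarded to
 * the VF, and interface flag changes are propagated to it.
 */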
3711 static int
3712 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3713 {
3714 	struct hn_softc *sc = if_getsoftc(ifp);
3715 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3716 	if_t vf_ifp;
3717 	int mask, error = 0;
3718 	struct ifrsskey *ifrk;
3719 	struct ifrsshash *ifrh;
3720 	uint32_t mtu;
3721 
3722 	switch (cmd) {
3723 	case SIOCSIFMTU:
3724 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3725 			error = EINVAL;
3726 			break;
3727 		}
3728 
3729 		HN_LOCK(sc);
3730 
3731 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3732 			HN_UNLOCK(sc);
3733 			break;
3734 		}
3735 
3736 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3737 			/* Can't change MTU */
3738 			HN_UNLOCK(sc);
3739 			error = EOPNOTSUPP;
3740 			break;
3741 		}
3742 
3743 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3744 			HN_UNLOCK(sc);
3745 			break;
3746 		}
3747 
3748 		if (hn_xpnt_vf_isready(sc)) {
3749 			vf_ifp = sc->hn_vf_ifp;
3750 			ifr_vf = *ifr;
3751 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3752 			    sizeof(ifr_vf.ifr_name));
3753 			error = ifhwioctl(SIOCSIFMTU, vf_ifp,
3754 			    (caddr_t)&ifr_vf, curthread);
3755 			HN_UNLOCK(sc);
3756 			if (error) {
3757 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3758 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3759 			} else {
3760 				if_setmtu(ifp, ifr->ifr_mtu);
3761 			}
3762 			break;
3763 		}
3764 
3765 		/*
3766 		 * Suspend this interface before the synthetic parts
3767 		 * are ripped.
3768 		 */
3769 		hn_suspend(sc);
3770 
3771 		/*
3772 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3773 		 */
3774 		hn_synth_detach(sc);
3775 
3776 		/*
3777 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3778 		 * with the new MTU setting.
3779 		 */
3780 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3781 		if (error) {
3782 			HN_UNLOCK(sc);
3783 			break;
3784 		}
3785 
3786 		error = hn_rndis_get_mtu(sc, &mtu);
3787 		if (error)
3788 			mtu = ifr->ifr_mtu;
3789 		else if (bootverbose)
3790 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3791 
3792 		/*
3793 		 * Commit the requested MTU, after the synthetic parts
3794 		 * have been successfully attached.
3795 		 */
3796 		if (mtu >= ifr->ifr_mtu) {
3797 			mtu = ifr->ifr_mtu;
3798 		} else {
3799 			if_printf(ifp, "fixup mtu %d -> %u\n",
3800 			    ifr->ifr_mtu, mtu);
3801 		}
3802 		if_setmtu(ifp, mtu);
3803 
3804 		/*
3805 		 * Synthetic parts' reattach may change the chimney
3806 		 * sending size; update it.
3807 		 */
3808 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3809 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3810 
3811 		/*
3812 		 * Make sure that various parameters based on MTU are
3813 		 * still valid, after the MTU change.
3814 		 */
3815 		hn_mtu_change_fixup(sc);
3816 
3817 		/*
3818 		 * All done!  Resume the interface now.
3819 		 */
3820 		hn_resume(sc);
3821 
3822 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3823 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3824 			/*
3825 			 * Since we have reattached the NVS part,
3826 			 * change the datapath to the VF again, in case
3827 			 * it was lost when the NVS was detached.
3828 			 */
3829 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3830 		}
3831 
3832 		HN_UNLOCK(sc);
3833 		break;
3834 
3835 	case SIOCSIFFLAGS:
3836 		HN_LOCK(sc);
3837 
3838 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3839 			HN_UNLOCK(sc);
3840 			break;
3841 		}
3842 
3843 		if (hn_xpnt_vf_isready(sc))
3844 			hn_xpnt_vf_saveifflags(sc);
3845 
3846 		if (if_getflags(ifp) & IFF_UP) {
3847 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3848 				/*
3849 				 * Caller might hold a mutex, e.g.
3850 				 * bpf; use busy-wait for the RNDIS
3851 				 * reply.
3852 				 */
3853 				HN_NO_SLEEPING(sc);
3854 				hn_rxfilter_config(sc);
3855 				HN_SLEEPING_OK(sc);
3856 
3857 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3858 					error = hn_xpnt_vf_iocsetflags(sc);
3859 			} else {
3860 				hn_init_locked(sc);
3861 			}
3862 		} else {
3863 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3864 				hn_stop(sc, false);
3865 		}
3866 		sc->hn_if_flags = if_getflags(ifp);
3867 
3868 		HN_UNLOCK(sc);
3869 		break;
3870 
3871 	case SIOCSIFCAP:
3872 		HN_LOCK(sc);
3873 
3874 		if (hn_xpnt_vf_isready(sc)) {
3875 			ifr_vf = *ifr;
3876 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3877 			    sizeof(ifr_vf.ifr_name));
3878 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3879 			HN_UNLOCK(sc);
3880 			break;
3881 		}
3882 
3883 		/*
3884 		 * Fix up requested capabilities w/ supported capabilities,
3885 		 * since the supported capabilities could have been changed.
3886 		 */
3887 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3888 		    if_getcapenable(ifp);
3889 
3890 		if (mask & IFCAP_TXCSUM) {
3891 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3892 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3893 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3894 			else
3895 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3896 		}
3897 		if (mask & IFCAP_TXCSUM_IPV6) {
3898 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3899 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3900 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3901 			else
3902 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3903 		}
3904 
3905 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3906 		if (mask & IFCAP_RXCSUM)
3907 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3908 #ifdef foo
3909 		/* We can't distinguish IPv6 packets from IPv4 on the RX path. */
3910 		if (mask & IFCAP_RXCSUM_IPV6)
3911 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3912 #endif
3913 
3914 		if (mask & IFCAP_LRO)
3915 			if_togglecapenable(ifp, IFCAP_LRO);
3916 
3917 		if (mask & IFCAP_TSO4) {
3918 			if_togglecapenable(ifp, IFCAP_TSO4);
3919 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3920 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3921 			else
3922 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3923 		}
3924 		if (mask & IFCAP_TSO6) {
3925 			if_togglecapenable(ifp, IFCAP_TSO6);
3926 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3927 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3928 			else
3929 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3930 		}
3931 
3932 		HN_UNLOCK(sc);
3933 		break;
3934 
3935 	case SIOCADDMULTI:
3936 	case SIOCDELMULTI:
3937 		HN_LOCK(sc);
3938 
3939 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3940 			HN_UNLOCK(sc);
3941 			break;
3942 		}
3943 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3944 			/*
3945 			 * Multicast uses mutex; use busy-wait for
3946 			 * the RNDIS reply.
3947 			 */
3948 			HN_NO_SLEEPING(sc);
3949 			hn_rxfilter_config(sc);
3950 			HN_SLEEPING_OK(sc);
3951 		}
3952 
3953 		/* XXX vlan(4) style mcast addr maintenance */
3954 		if (hn_xpnt_vf_isready(sc)) {
3955 			int old_if_flags;
3956 
3957 			old_if_flags = if_getflags(sc->hn_vf_ifp);
3958 			hn_xpnt_vf_saveifflags(sc);
3959 
3960 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3961 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
3962 			     IFF_ALLMULTI))
3963 				error = hn_xpnt_vf_iocsetflags(sc);
3964 		}
3965 
3966 		HN_UNLOCK(sc);
3967 		break;
3968 
3969 	case SIOCSIFMEDIA:
3970 	case SIOCGIFMEDIA:
3971 		HN_LOCK(sc);
3972 		if (hn_xpnt_vf_isready(sc)) {
3973 			/*
3974 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3975 			 * create and pass ifr_vf to the VF here; just
3976 			 * replace the ifr_name.
3977 			 */
3978 			vf_ifp = sc->hn_vf_ifp;
3979 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
3980 			    sizeof(ifr->ifr_name));
3981 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
3982 			/* Restore the ifr_name. */
3983 			strlcpy(ifr->ifr_name, if_name(ifp),
3984 			    sizeof(ifr->ifr_name));
3985 			HN_UNLOCK(sc);
3986 			break;
3987 		}
3988 		HN_UNLOCK(sc);
3989 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3990 		break;
3991 
3992 	case SIOCGIFRSSHASH:
3993 		ifrh = (struct ifrsshash *)data;
3994 		HN_LOCK(sc);
3995 		if (sc->hn_rx_ring_inuse == 1) {
3996 			HN_UNLOCK(sc);
3997 			ifrh->ifrh_func = RSS_FUNC_NONE;
3998 			ifrh->ifrh_types = 0;
3999 			break;
4000 		}
4001 
4002 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4003 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4004 		else
4005 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4006 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4007 		HN_UNLOCK(sc);
4008 		break;
4009 
4010 	case SIOCGIFRSSKEY:
4011 		ifrk = (struct ifrsskey *)data;
4012 		HN_LOCK(sc);
4013 		if (sc->hn_rx_ring_inuse == 1) {
4014 			HN_UNLOCK(sc);
4015 			ifrk->ifrk_func = RSS_FUNC_NONE;
4016 			ifrk->ifrk_keylen = 0;
4017 			break;
4018 		}
4019 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4020 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4021 		else
4022 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4023 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4024 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4025 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4026 		HN_UNLOCK(sc);
4027 		break;
4028 
4029 	default:
4030 		error = ether_ioctl(ifp, cmd, data);
4031 		break;
4032 	}
4033 	return (error);
4034 }
4035 
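/*
 * Stop the interface: clear IFF_DRV_RUNNING, disable channel polling,
 * switch the datapath back to the synthetic device and bring the
 * transparent VF down if it was enabled, then suspend data transfers.
 */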
4036 static void
4037 hn_stop(struct hn_softc *sc, bool detaching)
4038 {
4039 	if_t ifp = sc->hn_ifp;
4040 	int i;
4041 
4042 	HN_LOCK_ASSERT(sc);
4043 
4044 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4045 	    ("synthetic parts were not attached"));
4046 
4047 	/* Clear RUNNING bit ASAP. */
4048 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4049 
4050 	/* Disable polling. */
4051 	hn_polling(sc, 0);
4052 
4053 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4054 		KASSERT(sc->hn_vf_ifp != NULL,
4055 		    ("%s: VF is not attached", if_name(ifp)));
4056 
4057 		/* Mark transparent mode VF as disabled. */
4058 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4059 
4060 		/*
4061 		 * NOTE:
4062 		 * Datapath setting must happen _before_ bringing
4063 		 * the VF down.
4064 		 */
4065 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4066 
4067 		/*
4068 		 * Bring the VF down.
4069 		 */
4070 		hn_xpnt_vf_saveifflags(sc);
4071 		if_setflagbits(ifp, 0, IFF_UP);
4072 		hn_xpnt_vf_iocsetflags(sc);
4073 	}
4074 
4075 	/* Suspend data transfers. */
4076 	hn_suspend_data(sc);
4077 
4078 	/* Clear OACTIVE bit. */
4079 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4080 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4081 		sc->hn_tx_ring[i].hn_oactive = 0;
4082 
4083 	/*
4084 	 * If the non-transparent mode VF is active, make sure
4085 	 * that the RX filter still allows packet reception.
4086 	 */
4087 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4088 		hn_rxfilter_config(sc);
4089 }
4090 
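/*
 * Bring the interface up: configure the RX filter, clear OACTIVE and
 * the TX "suspended" state, initialize the transparent VF if it is
 * ready, and re-enable channel polling if it was requested.
 */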
4091 static void
4092 hn_init_locked(struct hn_softc *sc)
4093 {
4094 	if_t ifp = sc->hn_ifp;
4095 	int i;
4096 
4097 	HN_LOCK_ASSERT(sc);
4098 
4099 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4100 		return;
4101 
4102 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4103 		return;
4104 
4105 	/* Configure RX filter */
4106 	hn_rxfilter_config(sc);
4107 
4108 	/* Clear OACTIVE bit. */
4109 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4110 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4111 		sc->hn_tx_ring[i].hn_oactive = 0;
4112 
4113 	/* Clear TX 'suspended' bit. */
4114 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4115 
4116 	if (hn_xpnt_vf_isready(sc)) {
4117 		/* Initialize transparent VF. */
4118 		hn_xpnt_vf_init(sc);
4119 	}
4120 
4121 	/* Everything is ready; unleash! */
4122 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4123 
4124 	/* Re-enable polling if requested. */
4125 	if (sc->hn_pollhz > 0)
4126 		hn_polling(sc, sc->hn_pollhz);
4127 }
4128 
4129 static void
4130 hn_init(void *xsc)
4131 {
4132 	struct hn_softc *sc = xsc;
4133 
4134 	HN_LOCK(sc);
4135 	hn_init_locked(sc);
4136 	HN_UNLOCK(sc);
4137 }
4138 
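/*
 * Sysctl handler for the LRO aggregation length limit; the new value
 * is validated against HN_LRO_LENLIM_MIN and TCP_LRO_LENGTH_MAX
 * before being applied.
 */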
4139 static int
4140 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4141 {
4142 	struct hn_softc *sc = arg1;
4143 	unsigned int lenlim;
4144 	int error;
4145 
4146 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4147 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4148 	if (error || req->newptr == NULL)
4149 		return error;
4150 
4151 	HN_LOCK(sc);
4152 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4153 	    lenlim > TCP_LRO_LENGTH_MAX) {
4154 		HN_UNLOCK(sc);
4155 		return EINVAL;
4156 	}
4157 	hn_set_lro_lenlim(sc, lenlim);
4158 	HN_UNLOCK(sc);
4159 
4160 	return 0;
4161 }
4162 
4163 static int
4164 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4165 {
4166 	struct hn_softc *sc = arg1;
4167 	int ackcnt, error, i;
4168 
4169 	/*
4170 	 * lro_ackcnt_lim is the append count limit;
4171 	 * +1 turns it into the aggregation limit.
4172 	 */
4173 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4174 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4175 	if (error || req->newptr == NULL)
4176 		return error;
4177 
4178 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4179 		return EINVAL;
4180 
4181 	/*
4182 	 * Convert aggregation limit back to append
4183 	 * count limit.
4184 	 */
4185 	--ackcnt;
4186 	HN_LOCK(sc);
4187 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4188 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4189 	HN_UNLOCK(sc);
4190 	return 0;
4191 }
4192 
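/*
 * Sysctl handler toggling whether the host's checksum verification
 * (TCP, UDP or IP, selected by arg2) is trusted when a received
 * packet carries no checksum information; applied to all RX rings.
 */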
4193 static int
4194 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4195 {
4196 	struct hn_softc *sc = arg1;
4197 	int hcsum = arg2;
4198 	int on, error, i;
4199 
4200 	on = 0;
4201 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4202 		on = 1;
4203 
4204 	error = sysctl_handle_int(oidp, &on, 0, req);
4205 	if (error || req->newptr == NULL)
4206 		return error;
4207 
4208 	HN_LOCK(sc);
4209 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4210 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4211 
4212 		if (on)
4213 			rxr->hn_trust_hcsum |= hcsum;
4214 		else
4215 			rxr->hn_trust_hcsum &= ~hcsum;
4216 	}
4217 	HN_UNLOCK(sc);
4218 	return 0;
4219 }
4220 
4221 static int
4222 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4223 {
4224 	struct hn_softc *sc = arg1;
4225 	int chim_size, error;
4226 
4227 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4228 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4229 	if (error || req->newptr == NULL)
4230 		return error;
4231 
4232 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4233 		return EINVAL;
4234 
4235 	HN_LOCK(sc);
4236 	hn_set_chim_size(sc, chim_size);
4237 	HN_UNLOCK(sc);
4238 	return 0;
4239 }
4240 
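/*
 * Sysctl handler for a 64-bit per-RX-ring statistic located at byte
 * offset arg2: report the sum across all rings; a write zeroes the
 * statistic in every ring.
 */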
4241 static int
4242 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4243 {
4244 	struct hn_softc *sc = arg1;
4245 	int ofs = arg2, i, error;
4246 	struct hn_rx_ring *rxr;
4247 	uint64_t stat;
4248 
4249 	stat = 0;
4250 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4251 		rxr = &sc->hn_rx_ring[i];
4252 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4253 	}
4254 
4255 	error = sysctl_handle_64(oidp, &stat, 0, req);
4256 	if (error || req->newptr == NULL)
4257 		return error;
4258 
4259 	/* Zero out this stat. */
4260 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4261 		rxr = &sc->hn_rx_ring[i];
4262 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4263 	}
4264 	return 0;
4265 }
4266 
4267 static int
4268 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4269 {
4270 	struct hn_softc *sc = arg1;
4271 	int ofs = arg2, i, error;
4272 	struct hn_rx_ring *rxr;
4273 	u_long stat;
4274 
4275 	stat = 0;
4276 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4277 		rxr = &sc->hn_rx_ring[i];
4278 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4279 	}
4280 
4281 	error = sysctl_handle_long(oidp, &stat, 0, req);
4282 	if (error || req->newptr == NULL)
4283 		return error;
4284 
4285 	/* Zero out this stat. */
4286 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4287 		rxr = &sc->hn_rx_ring[i];
4288 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4289 	}
4290 	return 0;
4291 }
4292 
4293 static int
4294 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4295 {
4296 	struct hn_softc *sc = arg1;
4297 	int ofs = arg2, i, error;
4298 	struct hn_tx_ring *txr;
4299 	u_long stat;
4300 
4301 	stat = 0;
4302 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4303 		txr = &sc->hn_tx_ring[i];
4304 		stat += *((u_long *)((uint8_t *)txr + ofs));
4305 	}
4306 
4307 	error = sysctl_handle_long(oidp, &stat, 0, req);
4308 	if (error || req->newptr == NULL)
4309 		return error;
4310 
4311 	/* Zero out this stat. */
4312 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4313 		txr = &sc->hn_tx_ring[i];
4314 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4315 	}
4316 	return 0;
4317 }
4318 
4319 static int
4320 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4321 {
4322 	struct hn_softc *sc = arg1;
4323 	int ofs = arg2, i, error, conf;
4324 	struct hn_tx_ring *txr;
4325 
4326 	txr = &sc->hn_tx_ring[0];
4327 	conf = *((int *)((uint8_t *)txr + ofs));
4328 
4329 	error = sysctl_handle_int(oidp, &conf, 0, req);
4330 	if (error || req->newptr == NULL)
4331 		return error;
4332 
4333 	HN_LOCK(sc);
4334 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4335 		txr = &sc->hn_tx_ring[i];
4336 		*((int *)((uint8_t *)txr + ofs)) = conf;
4337 	}
4338 	HN_UNLOCK(sc);
4339 
4340 	return 0;
4341 }
4342 
4343 static int
4344 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4345 {
4346 	struct hn_softc *sc = arg1;
4347 	int error, size;
4348 
4349 	size = sc->hn_agg_size;
4350 	error = sysctl_handle_int(oidp, &size, 0, req);
4351 	if (error || req->newptr == NULL)
4352 		return (error);
4353 
4354 	HN_LOCK(sc);
4355 	sc->hn_agg_size = size;
4356 	hn_set_txagg(sc);
4357 	HN_UNLOCK(sc);
4358 
4359 	return (0);
4360 }
4361 
4362 static int
4363 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4364 {
4365 	struct hn_softc *sc = arg1;
4366 	int error, pkts;
4367 
4368 	pkts = sc->hn_agg_pkts;
4369 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4370 	if (error || req->newptr == NULL)
4371 		return (error);
4372 
4373 	HN_LOCK(sc);
4374 	sc->hn_agg_pkts = pkts;
4375 	hn_set_txagg(sc);
4376 	HN_UNLOCK(sc);
4377 
4378 	return (0);
4379 }
4380 
4381 static int
4382 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4383 {
4384 	struct hn_softc *sc = arg1;
4385 	int pkts;
4386 
4387 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4388 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4389 }
4390 
4391 static int
4392 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4393 {
4394 	struct hn_softc *sc = arg1;
4395 	int align;
4396 
4397 	align = sc->hn_tx_ring[0].hn_agg_align;
4398 	return (sysctl_handle_int(oidp, &align, 0, req));
4399 }
4400 
4401 static void
4402 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4403 {
4404 	if (pollhz == 0)
4405 		vmbus_chan_poll_disable(chan);
4406 	else
4407 		vmbus_chan_poll_enable(chan, pollhz);
4408 }
4409 
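/*
 * Apply the polling frequency (0 disables polling) to the primary
 * channel and all sub-channels.
 */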
4410 static void
4411 hn_polling(struct hn_softc *sc, u_int pollhz)
4412 {
4413 	int nsubch = sc->hn_rx_ring_inuse - 1;
4414 
4415 	HN_LOCK_ASSERT(sc);
4416 
4417 	if (nsubch > 0) {
4418 		struct vmbus_channel **subch;
4419 		int i;
4420 
4421 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4422 		for (i = 0; i < nsubch; ++i)
4423 			hn_chan_polling(subch[i], pollhz);
4424 		vmbus_subchan_rel(subch, nsubch);
4425 	}
4426 	hn_chan_polling(sc->hn_prichan, pollhz);
4427 }
4428 
4429 static int
4430 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4431 {
4432 	struct hn_softc *sc = arg1;
4433 	int pollhz, error;
4434 
4435 	pollhz = sc->hn_pollhz;
4436 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4437 	if (error || req->newptr == NULL)
4438 		return (error);
4439 
4440 	if (pollhz != 0 &&
4441 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4442 		return (EINVAL);
4443 
4444 	HN_LOCK(sc);
4445 	if (sc->hn_pollhz != pollhz) {
4446 		sc->hn_pollhz = pollhz;
4447 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4448 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4449 			hn_polling(sc, sc->hn_pollhz);
4450 	}
4451 	HN_UNLOCK(sc);
4452 
4453 	return (0);
4454 }
4455 
4456 static int
4457 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4458 {
4459 	struct hn_softc *sc = arg1;
4460 	char verstr[16];
4461 
4462 	snprintf(verstr, sizeof(verstr), "%u.%u",
4463 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4464 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4465 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4466 }
4467 
4468 static int
4469 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4470 {
4471 	struct hn_softc *sc = arg1;
4472 	char caps_str[128];
4473 	uint32_t caps;
4474 
4475 	HN_LOCK(sc);
4476 	caps = sc->hn_caps;
4477 	HN_UNLOCK(sc);
4478 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4479 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4480 }
4481 
4482 static int
4483 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4484 {
4485 	struct hn_softc *sc = arg1;
4486 	char assist_str[128];
4487 	uint32_t hwassist;
4488 
4489 	HN_LOCK(sc);
4490 	hwassist = if_gethwassist(sc->hn_ifp);
4491 	HN_UNLOCK(sc);
4492 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4493 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4494 }
4495 
4496 static int
4497 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4498 {
4499 	struct hn_softc *sc = arg1;
4500 	char filter_str[128];
4501 	uint32_t filter;
4502 
4503 	HN_LOCK(sc);
4504 	filter = sc->hn_rx_filter;
4505 	HN_UNLOCK(sc);
4506 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4507 	    NDIS_PACKET_TYPES);
4508 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4509 }
4510 
4511 static int
4512 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4513 {
4514 	struct hn_softc *sc = arg1;
4515 	int rsc_ctrl, mtu;
4516 	int error;
4517 
4518 	rsc_ctrl = sc->hn_rsc_ctrl;
4519 	error = sysctl_handle_int(oidp, &rsc_ctrl, 0, req);
4520 	if (error || req->newptr == NULL)
4521 		return (error);
4522 
4523 	if (sc->hn_rsc_ctrl != rsc_ctrl) {
4524 		HN_LOCK(sc);
4525 		sc->hn_rsc_ctrl = rsc_ctrl;
4526 		mtu = if_getmtu(sc->hn_ifp);
4527 		error = hn_rndis_reconf_offload(sc, mtu);
4528 		HN_UNLOCK(sc);
4529 	}
4530 
4531 	return (error);
4532 }
4533 #ifndef RSS
4534 
4535 static int
4536 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4537 {
4538 	struct hn_softc *sc = arg1;
4539 	int error;
4540 
4541 	HN_LOCK(sc);
4542 
4543 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4544 	if (error || req->newptr == NULL)
4545 		goto back;
4546 
4547 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4548 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4549 		/*
4550 		 * The RSS key is synchronized with the VF's; don't allow
4551 		 * users to change it.
4552 		 */
4553 		error = EBUSY;
4554 		goto back;
4555 	}
4556 
4557 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4558 	if (error)
4559 		goto back;
4560 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4561 
4562 	if (sc->hn_rx_ring_inuse > 1) {
4563 		error = hn_rss_reconfig(sc);
4564 	} else {
4565 		/* Not RSS capable, at least for now; just save the RSS key. */
4566 		error = 0;
4567 	}
4568 back:
4569 	HN_UNLOCK(sc);
4570 	return (error);
4571 }
4572 
4573 static int
4574 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4575 {
4576 	struct hn_softc *sc = arg1;
4577 	int error;
4578 
4579 	HN_LOCK(sc);
4580 
4581 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4582 	if (error || req->newptr == NULL)
4583 		goto back;
4584 
4585 	/*
4586 	 * Don't allow the RSS indirect table to be changed if this
4587 	 * interface is not currently RSS capable.
4588 	 */
4589 	if (sc->hn_rx_ring_inuse == 1) {
4590 		error = EOPNOTSUPP;
4591 		goto back;
4592 	}
4593 
4594 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4595 	if (error)
4596 		goto back;
4597 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4598 
4599 	hn_rss_ind_fixup(sc);
4600 	error = hn_rss_reconfig(sc);
4601 back:
4602 	HN_UNLOCK(sc);
4603 	return (error);
4604 }
4605 
4606 #endif	/* !RSS */
4607 
4608 static int
4609 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4610 {
4611 	struct hn_softc *sc = arg1;
4612 	char hash_str[128];
4613 	uint32_t hash;
4614 
4615 	HN_LOCK(sc);
4616 	hash = sc->hn_rss_hash;
4617 	HN_UNLOCK(sc);
4618 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4619 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4620 }
4621 
4622 static int
4623 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4624 {
4625 	struct hn_softc *sc = arg1;
4626 	char hash_str[128];
4627 	uint32_t hash;
4628 
4629 	HN_LOCK(sc);
4630 	hash = sc->hn_rss_hcap;
4631 	HN_UNLOCK(sc);
4632 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4633 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4634 }
4635 
4636 static int
4637 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4638 {
4639 	struct hn_softc *sc = arg1;
4640 	char hash_str[128];
4641 	uint32_t hash;
4642 
4643 	HN_LOCK(sc);
4644 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4645 	HN_UNLOCK(sc);
4646 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4647 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4648 }
4649 
4650 static int
4651 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4652 {
4653 	struct hn_softc *sc = arg1;
4654 	char vf_name[IFNAMSIZ + 1];
4655 	if_t vf_ifp;
4656 
4657 	HN_LOCK(sc);
4658 	vf_name[0] = '\0';
4659 	vf_ifp = sc->hn_vf_ifp;
4660 	if (vf_ifp != NULL)
4661 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4662 	HN_UNLOCK(sc);
4663 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4664 }
4665 
4666 static int
4667 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4668 {
4669 	struct hn_softc *sc = arg1;
4670 	char vf_name[IFNAMSIZ + 1];
4671 	if_t vf_ifp;
4672 
4673 	HN_LOCK(sc);
4674 	vf_name[0] = '\0';
4675 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4676 	if (vf_ifp != NULL)
4677 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4678 	HN_UNLOCK(sc);
4679 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4680 }
4681 
4682 static int
4683 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4684 {
4685 	struct rm_priotracker pt;
4686 	struct sbuf *sb;
4687 	int error, i;
4688 	bool first;
4689 
4690 	error = sysctl_wire_old_buffer(req, 0);
4691 	if (error != 0)
4692 		return (error);
4693 
4694 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4695 	if (sb == NULL)
4696 		return (ENOMEM);
4697 
4698 	rm_rlock(&hn_vfmap_lock, &pt);
4699 
4700 	first = true;
4701 	for (i = 0; i < hn_vfmap_size; ++i) {
4702 		struct epoch_tracker et;
4703 		if_t ifp;
4704 
4705 		if (hn_vfmap[i] == NULL)
4706 			continue;
4707 
4708 		NET_EPOCH_ENTER(et);
4709 		ifp = ifnet_byindex(i);
4710 		if (ifp != NULL) {
4711 			if (first)
4712 				sbuf_printf(sb, "%s", if_name(ifp));
4713 			else
4714 				sbuf_printf(sb, " %s", if_name(ifp));
4715 			first = false;
4716 		}
4717 		NET_EPOCH_EXIT(et);
4718 	}
4719 
4720 	rm_runlock(&hn_vfmap_lock, &pt);
4721 
4722 	error = sbuf_finish(sb);
4723 	sbuf_delete(sb);
4724 	return (error);
4725 }
4726 
4727 static int
4728 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4729 {
4730 	struct rm_priotracker pt;
4731 	struct sbuf *sb;
4732 	int error, i;
4733 	bool first;
4734 
4735 	error = sysctl_wire_old_buffer(req, 0);
4736 	if (error != 0)
4737 		return (error);
4738 
4739 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4740 	if (sb == NULL)
4741 		return (ENOMEM);
4742 
4743 	rm_rlock(&hn_vfmap_lock, &pt);
4744 
4745 	first = true;
4746 	for (i = 0; i < hn_vfmap_size; ++i) {
4747 		struct epoch_tracker et;
4748 		if_t ifp, hn_ifp;
4749 
4750 		hn_ifp = hn_vfmap[i];
4751 		if (hn_ifp == NULL)
4752 			continue;
4753 
4754 		NET_EPOCH_ENTER(et);
4755 		ifp = ifnet_byindex(i);
4756 		if (ifp != NULL) {
4757 			if (first) {
4758 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4759 				    if_name(hn_ifp));
4760 			} else {
4761 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4762 				    if_name(hn_ifp));
4763 			}
4764 			first = false;
4765 		}
4766 		NET_EPOCH_EXIT(et);
4767 	}
4768 
4769 	rm_runlock(&hn_vfmap_lock, &pt);
4770 
4771 	error = sbuf_finish(sb);
4772 	sbuf_delete(sb);
4773 	return (error);
4774 }
4775 
4776 static int
4777 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4778 {
4779 	struct hn_softc *sc = arg1;
4780 	int error, onoff = 0;
4781 
4782 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4783 		onoff = 1;
4784 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4785 	if (error || req->newptr == NULL)
4786 		return (error);
4787 
4788 	HN_LOCK(sc);
4789 	/* NOTE: hn_vf_lock for hn_transmit() */
4790 	rm_wlock(&sc->hn_vf_lock);
4791 	if (onoff)
4792 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4793 	else
4794 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4795 	rm_wunlock(&sc->hn_vf_lock);
4796 	HN_UNLOCK(sc);
4797 
4798 	return (0);
4799 }
4800 
4801 static int
4802 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4803 {
4804 	struct hn_softc *sc = arg1;
4805 	int enabled = 0;
4806 
4807 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4808 		enabled = 1;
4809 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4810 }
4811 
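/*
 * Sanity check an IPv4 packet starting at byte offset 'hoff' of the
 * first mbuf: the IP header, and the TCP/UDP header if present, must
 * be contiguous and the advertised lengths consistent.  Returns the
 * IP protocol on success, or IPPROTO_DONE for fragments and any
 * packet failing the checks.
 */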
4812 static int
4813 hn_check_iplen(const struct mbuf *m, int hoff)
4814 {
4815 	const struct ip *ip;
4816 	int len, iphlen, iplen;
4817 	const struct tcphdr *th;
4818 	int thoff;				/* TCP data offset */
4819 
4820 	len = hoff + sizeof(struct ip);
4821 
4822 	/* The packet must be at least the size of an IP header. */
4823 	if (m->m_pkthdr.len < len)
4824 		return IPPROTO_DONE;
4825 
4826 	/* The fixed IP header must reside completely in the first mbuf. */
4827 	if (m->m_len < len)
4828 		return IPPROTO_DONE;
4829 
4830 	ip = mtodo(m, hoff);
4831 
4832 	/* Bound check the packet's stated IP header length. */
4833 	iphlen = ip->ip_hl << 2;
4834 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4835 		return IPPROTO_DONE;
4836 
4837 	/* The full IP header must reside completely in the one mbuf. */
4838 	if (m->m_len < hoff + iphlen)
4839 		return IPPROTO_DONE;
4840 
4841 	iplen = ntohs(ip->ip_len);
4842 
4843 	/*
4844 	 * Check that the amount of data in the buffers is at
4845 	 * least as much as the IP header would have us expect.
4846 	 */
4847 	if (m->m_pkthdr.len < hoff + iplen)
4848 		return IPPROTO_DONE;
4849 
4850 	/*
4851 	 * Ignore IP fragments.
4852 	 */
4853 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4854 		return IPPROTO_DONE;
4855 
4856 	/*
4857 	 * The TCP/IP or UDP/IP header must be entirely contained within
4858 	 * the first fragment of a packet.
4859 	 */
4860 	switch (ip->ip_p) {
4861 	case IPPROTO_TCP:
4862 		if (iplen < iphlen + sizeof(struct tcphdr))
4863 			return IPPROTO_DONE;
4864 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4865 			return IPPROTO_DONE;
4866 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4867 		thoff = th->th_off << 2;
4868 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4869 			return IPPROTO_DONE;
4870 		if (m->m_len < hoff + iphlen + thoff)
4871 			return IPPROTO_DONE;
4872 		break;
4873 	case IPPROTO_UDP:
4874 		if (iplen < iphlen + sizeof(struct udphdr))
4875 			return IPPROTO_DONE;
4876 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4877 			return IPPROTO_DONE;
4878 		break;
4879 	default:
4880 		if (iplen < iphlen)
4881 			return IPPROTO_DONE;
4882 		break;
4883 	}
4884 	return ip->ip_p;
4885 }
4886 
4887 static void
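/*
 * Extract the L3 ethertype and L4 IP protocol of a received packet,
 * looking through an 802.1Q VLAN header if present.  *l4proto is set
 * to IPPROTO_DONE for anything that is not a well-formed IPv4 packet.
 */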
4888 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4889 {
4890 	const struct ether_header *eh;
4891 	uint16_t etype;
4892 	int hoff;
4893 
4894 	hoff = sizeof(*eh);
4895 	/* Checked at the beginning of this function. */
4896 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4897 
4898 	eh = mtod(m_new, const struct ether_header *);
4899 	etype = ntohs(eh->ether_type);
4900 	if (etype == ETHERTYPE_VLAN) {
4901 		const struct ether_vlan_header *evl;
4902 
4903 		hoff = sizeof(*evl);
4904 		if (m_new->m_len < hoff)
4905 			return;
4906 		evl = mtod(m_new, const struct ether_vlan_header *);
4907 		etype = ntohs(evl->evl_proto);
4908 	}
4909 	*l3proto = etype;
4910 
4911 	if (etype == ETHERTYPE_IP)
4912 		*l4proto = hn_check_iplen(m_new, hoff);
4913 	else
4914 		*l4proto = IPPROTO_DONE;
4915 }
4916 
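/*
 * Allocate the RXBUF shared by all channels, create 'ring_cnt' RX
 * rings along with their bufrings and LRO state, and attach the
 * dev.hn.UNIT.rx sysctl tree plus the RX statistics and tunables.
 */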
4917 static int
4918 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4919 {
4920 	struct sysctl_oid_list *child;
4921 	struct sysctl_ctx_list *ctx;
4922 	device_t dev = sc->hn_dev;
4923 #if defined(INET) || defined(INET6)
4924 	int lroent_cnt;
4925 #endif
4926 	int i;
4927 
4928 	/*
4929 	 * Create RXBUF for reception.
4930 	 *
4931 	 * NOTE:
4932 	 * - It is shared by all channels.
4933 	 * - A large enough buffer is allocated; certain versions of the
4934 	 *   NVS may further limit the usable space.
4935 	 */
4936 	sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4937 	    0ul, ~0ul, PAGE_SIZE, 0);
4938 	if (sc->hn_rxbuf == NULL) {
4939 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4940 		return (ENOMEM);
4941 	}
4942 
4943 	sc->hn_rx_ring_cnt = ring_cnt;
4944 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4945 
4946 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4947 	    M_DEVBUF, M_WAITOK | M_ZERO);
4948 
4949 #if defined(INET) || defined(INET6)
4950 	lroent_cnt = hn_lro_entry_count;
4951 	if (lroent_cnt < TCP_LRO_ENTRIES)
4952 		lroent_cnt = TCP_LRO_ENTRIES;
4953 	if (bootverbose)
4954 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4955 #endif	/* INET || INET6 */
4956 
4957 	ctx = device_get_sysctl_ctx(dev);
4958 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4959 
4960 	/* Create dev.hn.UNIT.rx sysctl tree */
4961 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4962 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4963 
4964 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4965 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4966 
4967 		rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
4968 		    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
4969 		if (rxr->hn_br == NULL) {
4970 			device_printf(dev, "allocate bufring failed\n");
4971 			return (ENOMEM);
4972 		}
4973 
4974 		if (hn_trust_hosttcp)
4975 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4976 		if (hn_trust_hostudp)
4977 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4978 		if (hn_trust_hostip)
4979 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4980 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4981 		rxr->hn_ifp = sc->hn_ifp;
4982 		if (i < sc->hn_tx_ring_cnt)
4983 			rxr->hn_txr = &sc->hn_tx_ring[i];
4984 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4985 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4986 		rxr->hn_rx_idx = i;
4987 		rxr->hn_rxbuf = sc->hn_rxbuf;
4988 
4989 		/*
4990 		 * Initialize LRO.
4991 		 */
4992 #if defined(INET) || defined(INET6)
4993 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4994 		    hn_lro_mbufq_depth);
4995 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4996 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4997 #endif	/* INET || INET6 */
4998 
4999 		if (sc->hn_rx_sysctl_tree != NULL) {
5000 			char name[16];
5001 
5002 			/*
5003 			 * Create per RX ring sysctl tree:
5004 			 * dev.hn.UNIT.rx.RINGID
5005 			 */
5006 			snprintf(name, sizeof(name), "%d", i);
5007 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5008 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5009 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5010 
5011 			if (rxr->hn_rx_sysctl_tree != NULL) {
5012 				SYSCTL_ADD_ULONG(ctx,
5013 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5014 				    OID_AUTO, "packets",
5015 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5016 				    "# of packets received");
5017 				SYSCTL_ADD_ULONG(ctx,
5018 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5019 				    OID_AUTO, "rss_pkts",
5020 				    CTLFLAG_RW | CTLFLAG_STATS,
5021 				    &rxr->hn_rss_pkts,
5022 				    "# of packets w/ RSS info received");
5023 				SYSCTL_ADD_ULONG(ctx,
5024 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5025 				    OID_AUTO, "rsc_pkts",
5026 				    CTLFLAG_RW | CTLFLAG_STATS,
5027 				    &rxr->hn_rsc_pkts,
5028 				    "# of RSC packets received");
5029 				SYSCTL_ADD_ULONG(ctx,
5030 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5031 				    OID_AUTO, "rsc_drop",
5032 				    CTLFLAG_RW | CTLFLAG_STATS,
5033 				    &rxr->hn_rsc_drop,
5034 				    "# of RSC fragments dropped");
5035 				SYSCTL_ADD_INT(ctx,
5036 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5037 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5038 				    &rxr->hn_pktbuf_len, 0,
5039 				    "Temporary channel packet buffer length");
5040 			}
5041 		}
5042 	}
5043 
5044 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5045 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5046 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5047 	    hn_rx_stat_u64_sysctl,
5048 	    "LU", "LRO queued");
5049 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5050 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5051 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5052 	    hn_rx_stat_u64_sysctl,
5053 	    "LU", "LRO flushed");
5054 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5055 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5056 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5057 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5058 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5059 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5060 	    hn_lro_lenlim_sysctl, "IU",
5061 	    "Max # of data bytes to be aggregated by LRO");
5062 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5063 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5064 	    hn_lro_ackcnt_sysctl, "I",
5065 	    "Max # of ACKs to be aggregated by LRO");
5066 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5067 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5068 	    hn_trust_hcsum_sysctl, "I",
5069 	    "Trust tcp segment verification on host side, "
5070 	    "when csum info is missing");
5071 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5072 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5073 	    hn_trust_hcsum_sysctl, "I",
5074 	    "Trust udp datagram verification on host side, "
5075 	    "when csum info is missing");
5076 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5077 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5078 	    hn_trust_hcsum_sysctl, "I",
5079 	    "Trust ip packet verification on host side, "
5080 	    "when csum info is missing");
5081 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5082 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5083 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5084 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5085 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5086 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5087 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5088 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5089 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5090 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5091 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5092 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5093 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5094 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5096 	    hn_rx_stat_ulong_sysctl, "LU",
5097 	    "# of packets that we trust host's csum verification");
5098 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5099 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5100 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5101 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5103 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5104 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5105 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5106 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5107 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5108 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5109 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5110 
5111 	return (0);
5112 }
5113 
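/*
 * Free the RXBUF, per-ring bufrings, LRO state and packet buffers
 * created by hn_create_rx_data(), skipping any buffer that is still
 * marked as referenced.
 */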
5114 static void
5115 hn_destroy_rx_data(struct hn_softc *sc)
5116 {
5117 	int i;
5118 
5119 	if (sc->hn_rxbuf != NULL) {
5120 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5121 			free(sc->hn_rxbuf, M_DEVBUF);
5122 		else
5123 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5124 		sc->hn_rxbuf = NULL;
5125 	}
5126 
5127 	if (sc->hn_rx_ring_cnt == 0)
5128 		return;
5129 
5130 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5131 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5132 
5133 		if (rxr->hn_br == NULL)
5134 			continue;
5135 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5136 			free(rxr->hn_br, M_DEVBUF);
5137 		} else {
5138 			device_printf(sc->hn_dev,
5139 			    "%dth channel bufring is referenced\n", i);
5140 		}
5141 		rxr->hn_br = NULL;
5142 
5143 #if defined(INET) || defined(INET6)
5144 		tcp_lro_free(&rxr->hn_lro);
5145 #endif
5146 		free(rxr->hn_pktbuf, M_DEVBUF);
5147 	}
5148 	free(sc->hn_rx_ring, M_DEVBUF);
5149 	sc->hn_rx_ring = NULL;
5150 
5151 	sc->hn_rx_ring_cnt = 0;
5152 	sc->hn_rx_ring_inuse = 0;
5153 }
5154 
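/*
 * Set up one TX ring: allocate its descriptors, the DMA tags and maps
 * for RNDIS packet messages and packet data, hook up the TX tasks,
 * and create the per-ring dev.hn.UNIT.tx.RINGID sysctl nodes.
 */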
5155 static int
5156 hn_tx_ring_create(struct hn_softc *sc, int id)
5157 {
5158 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5159 	device_t dev = sc->hn_dev;
5160 	bus_dma_tag_t parent_dtag;
5161 	int error, i;
5162 
5163 	txr->hn_sc = sc;
5164 	txr->hn_tx_idx = id;
5165 
5166 #ifndef HN_USE_TXDESC_BUFRING
5167 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5168 #endif
5169 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5170 
5171 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5172 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5173 	    M_DEVBUF, M_WAITOK | M_ZERO);
5174 #ifndef HN_USE_TXDESC_BUFRING
5175 	SLIST_INIT(&txr->hn_txlist);
5176 #else
5177 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5178 	    M_WAITOK, &txr->hn_tx_lock);
5179 #endif
5180 
5181 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5182 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5183 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5184 	} else {
5185 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5186 	}
5187 
5188 #ifdef HN_IFSTART_SUPPORT
5189 	if (hn_use_if_start) {
5190 		txr->hn_txeof = hn_start_txeof;
5191 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5192 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5193 	} else
5194 #endif
5195 	{
5196 		int br_depth;
5197 
5198 		txr->hn_txeof = hn_xmit_txeof;
5199 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5200 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5201 
5202 		br_depth = hn_get_txswq_depth(txr);
5203 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5204 		    M_WAITOK, &txr->hn_tx_lock);
5205 	}
5206 
5207 	txr->hn_direct_tx_size = hn_direct_tx_size;
5208 
5209 	/*
5210 	 * Always schedule transmission instead of trying to do direct
5211 	 * transmission.  This one gives the best performance so far.
5212 	 */
5213 	txr->hn_sched_tx = 1;
5214 
5215 	parent_dtag = bus_get_dma_tag(dev);
5216 
5217 	/* DMA tag for RNDIS packet messages. */
5218 	error = bus_dma_tag_create(parent_dtag, /* parent */
5219 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5220 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5221 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5222 	    BUS_SPACE_MAXADDR,		/* highaddr */
5223 	    NULL, NULL,			/* filter, filterarg */
5224 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5225 	    1,				/* nsegments */
5226 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5227 	    0,				/* flags */
5228 	    NULL,			/* lockfunc */
5229 	    NULL,			/* lockfuncarg */
5230 	    &txr->hn_tx_rndis_dtag);
5231 	if (error) {
5232 		device_printf(dev, "failed to create rndis dmatag\n");
5233 		return error;
5234 	}
5235 
5236 	/* DMA tag for data. */
5237 	error = bus_dma_tag_create(parent_dtag, /* parent */
5238 	    1,				/* alignment */
5239 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5240 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5241 	    BUS_SPACE_MAXADDR,		/* highaddr */
5242 	    NULL, NULL,			/* filter, filterarg */
5243 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5244 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5245 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5246 	    0,				/* flags */
5247 	    NULL,			/* lockfunc */
5248 	    NULL,			/* lockfuncarg */
5249 	    &txr->hn_tx_data_dtag);
5250 	if (error) {
5251 		device_printf(dev, "failed to create data dmatag\n");
5252 		return error;
5253 	}
5254 
5255 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5256 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5257 
5258 		txd->txr = txr;
5259 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5260 		STAILQ_INIT(&txd->agg_list);
5261 
5262 		/*
5263 		 * Allocate and load RNDIS packet message.
5264 		 */
5265 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5266 		    (void **)&txd->rndis_pkt,
5267 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5268 		    &txd->rndis_pkt_dmap);
5269 		if (error) {
5270 			device_printf(dev,
5271 			    "failed to allocate rndis_packet_msg, %d\n", i);
5272 			return error;
5273 		}
5274 
5275 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5276 		    txd->rndis_pkt_dmap,
5277 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5278 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5279 		    BUS_DMA_NOWAIT);
5280 		if (error) {
5281 			device_printf(dev,
5282 			    "failed to load rndis_packet_msg, %d\n", i);
5283 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5284 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5285 			return error;
5286 		}
5287 
5288 		/* DMA map for TX data. */
5289 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5290 		    &txd->data_dmap);
5291 		if (error) {
5292 			device_printf(dev,
5293 			    "failed to allocate tx data dmamap\n");
5294 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5295 			    txd->rndis_pkt_dmap);
5296 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5297 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5298 			return error;
5299 		}
5300 
5301 		/* All set, put it to list */
5302 		txd->flags |= HN_TXD_FLAG_ONLIST;
5303 #ifndef HN_USE_TXDESC_BUFRING
5304 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5305 #else
5306 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5307 #endif
5308 	}
5309 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5310 
5311 	if (sc->hn_tx_sysctl_tree != NULL) {
5312 		struct sysctl_oid_list *child;
5313 		struct sysctl_ctx_list *ctx;
5314 		char name[16];
5315 
5316 		/*
5317 		 * Create per TX ring sysctl tree:
5318 		 * dev.hn.UNIT.tx.RINGID
5319 		 */
5320 		ctx = device_get_sysctl_ctx(dev);
5321 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5322 
5323 		snprintf(name, sizeof(name), "%d", id);
5324 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5325 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5326 
5327 		if (txr->hn_tx_sysctl_tree != NULL) {
5328 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5329 
5330 #ifdef HN_DEBUG
5331 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5332 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5333 			    "# of available TX descs");
5334 #endif
5335 #ifdef HN_IFSTART_SUPPORT
5336 			if (!hn_use_if_start)
5337 #endif
5338 			{
5339 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5340 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5341 				    "over active");
5342 			}
5343 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5344 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5345 			    "# of packets transmitted");
5346 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5347 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5348 			    "# of sends");
5349 		}
5350 	}
5351 
5352 	return 0;
5353 }
5354 
5355 static void
5356 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5357 {
5358 	struct hn_tx_ring *txr = txd->txr;
5359 
5360 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5361 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5362 
5363 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5364 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5365 	    txd->rndis_pkt_dmap);
5366 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5367 }
5368 
5369 static void
5370 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5371 {
5372 
5373 	KASSERT(txd->refs == 0 || txd->refs == 1,
5374 	    ("invalid txd refs %d", txd->refs));
5375 
5376 	/* Aggregated txds will be freed by their aggregating txd. */
5377 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5378 		int freed __diagused;
5379 
5380 		freed = hn_txdesc_put(txr, txd);
5381 		KASSERT(freed, ("can't free txdesc"));
5382 	}
5383 }
5384 
5385 static void
5386 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5387 {
5388 	int i;
5389 
5390 	if (txr->hn_txdesc == NULL)
5391 		return;
5392 
5393 	/*
5394 	 * NOTE:
5395 	 * Because the freeing of aggregated txds will be deferred
5396 	 * to the aggregating txd, two passes are used here:
5397 	 * - The first pass GCes any pending txds.  This GC is necessary,
5398 	 *   since if the channels are revoked, the hypervisor will not
5399 	 *   deliver send-done for all pending txds.
5400 	 * - The second pass frees the busdma resources, i.e. after all
5401 	 *   txds have been freed.
5402 	 */
5403 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5404 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5405 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5406 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5407 
5408 	if (txr->hn_tx_data_dtag != NULL)
5409 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5410 	if (txr->hn_tx_rndis_dtag != NULL)
5411 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5412 
5413 #ifdef HN_USE_TXDESC_BUFRING
5414 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5415 #endif
5416 
5417 	free(txr->hn_txdesc, M_DEVBUF);
5418 	txr->hn_txdesc = NULL;
5419 
5420 	if (txr->hn_mbuf_br != NULL)
5421 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5422 
5423 #ifndef HN_USE_TXDESC_BUFRING
5424 	mtx_destroy(&txr->hn_txlist_spin);
5425 #endif
5426 	mtx_destroy(&txr->hn_tx_lock);
5427 }
5428 
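/*
 * Allocate the chimney TXBUF shared by all channels, create
 * 'ring_cnt' TX rings, and attach the dev.hn.UNIT.tx sysctl tree
 * with the TX statistics and tunables.
 */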
5429 static int
5430 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5431 {
5432 	struct sysctl_oid_list *child;
5433 	struct sysctl_ctx_list *ctx;
5434 	int i;
5435 
5436 	/*
5437 	 * Create TXBUF for chimney sending.
5438 	 *
5439 	 * NOTE: It is shared by all channels.
5440 	 */
5441 	sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5442 	    0ul, ~0ul, PAGE_SIZE, 0);
5443 	if (sc->hn_chim == NULL) {
5444 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5445 		return (ENOMEM);
5446 	}
5447 
5448 	sc->hn_tx_ring_cnt = ring_cnt;
5449 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5450 
5451 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5452 	    M_DEVBUF, M_WAITOK | M_ZERO);
5453 
5454 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5455 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5456 
5457 	/* Create dev.hn.UNIT.tx sysctl tree */
5458 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5459 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5460 
5461 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5462 		int error;
5463 
5464 		error = hn_tx_ring_create(sc, i);
5465 		if (error)
5466 			return error;
5467 	}
5468 
5469 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5470 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5471 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5472 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5473 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5474 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5475 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5476 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5477 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5478 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5479 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5480 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5481 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5482 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5483 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5484 	    hn_tx_stat_ulong_sysctl, "LU",
5485 	    "# of packet transmission aggregation flush failure");
5486 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5487 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5488 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5489 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5490 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5491 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5492 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5493 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5494 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5495 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5496 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5497 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5498 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5499 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5500 	    "# of total TX descs");
5501 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5502 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5503 	    "Chimney send packet size upper boundary");
5504 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5505 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5506 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5507 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5508 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5509 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5510 	    hn_tx_conf_int_sysctl, "I",
5511 	    "Size of the packet for direct transmission");
5512 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5513 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5514 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5515 	    hn_tx_conf_int_sysctl, "I",
5516 	    "Always schedule transmission "
5517 	    "instead of doing direct transmission");
5518 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5519 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5520 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5521 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5522 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5523 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5524 	    "Applied packet transmission aggregation size");
5525 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5526 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5527 	    hn_txagg_pktmax_sysctl, "I",
5528 	    "Applied packet transmission aggregation packets");
5529 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5530 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5531 	    hn_txagg_align_sysctl, "I",
5532 	    "Applied packet transmission aggregation alignment");
5533 
5534 	return 0;
5535 }
5536 
5537 static void
5538 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5539 {
5540 	int i;
5541 
5542 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5543 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5544 }
5545 
5546 static void
5547 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5548 {
5549 	if_t ifp = sc->hn_ifp;
5550 	u_int hw_tsomax;
5551 	int tso_minlen;
5552 
5553 	HN_LOCK_ASSERT(sc);
5554 
5555 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5556 		return;
5557 
5558 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5559 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5560 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5561 
5562 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5563 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5564 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5565 
5566 	if (tso_maxlen < tso_minlen)
5567 		tso_maxlen = tso_minlen;
5568 	else if (tso_maxlen > IP_MAXPACKET)
5569 		tso_maxlen = IP_MAXPACKET;
5570 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5571 		tso_maxlen = sc->hn_ndis_tso_szmax;
5572 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5573 
5574 	if (hn_xpnt_vf_isready(sc)) {
5575 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5576 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5577 	}
5578 	if_sethwtsomax(ifp, hw_tsomax);
5579 	if (bootverbose)
5580 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5581 }
5582 
5583 static void
5584 hn_fixup_tx_data(struct hn_softc *sc)
5585 {
5586 	uint64_t csum_assist;
5587 	int i;
5588 
5589 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5590 	if (hn_tx_chimney_size > 0 &&
5591 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5592 		hn_set_chim_size(sc, hn_tx_chimney_size);
5593 
5594 	csum_assist = 0;
5595 	if (sc->hn_caps & HN_CAP_IPCS)
5596 		csum_assist |= CSUM_IP;
5597 	if (sc->hn_caps & HN_CAP_TCP4CS)
5598 		csum_assist |= CSUM_IP_TCP;
5599 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5600 		csum_assist |= CSUM_IP_UDP;
5601 	if (sc->hn_caps & HN_CAP_TCP6CS)
5602 		csum_assist |= CSUM_IP6_TCP;
5603 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5604 		csum_assist |= CSUM_IP6_UDP;
5605 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5606 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5607 
5608 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5609 		/*
5610 		 * Support HASHVAL pktinfo on TX path.
5611 		 */
5612 		if (bootverbose)
5613 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5614 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5615 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5616 	}
5617 }
5618 
5619 static void
5620 hn_fixup_rx_data(struct hn_softc *sc)
5621 {
5622 
5623 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5624 		int i;
5625 
5626 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5627 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5628 	}
5629 }
5630 
5631 static void
5632 hn_destroy_tx_data(struct hn_softc *sc)
5633 {
5634 	int i;
5635 
5636 	if (sc->hn_chim != NULL) {
5637 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5638 			free(sc->hn_chim, M_DEVBUF);
5639 		} else {
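			/*
			 * The buffer is deliberately leaked here, since
			 * the host side may still reference it.
			 */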
5640 			device_printf(sc->hn_dev,
5641 			    "chimney sending buffer is referenced");
5642 		}
5643 		sc->hn_chim = NULL;
5644 	}
5645 
5646 	if (sc->hn_tx_ring_cnt == 0)
5647 		return;
5648 
5649 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5650 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5651 
5652 	free(sc->hn_tx_ring, M_DEVBUF);
5653 	sc->hn_tx_ring = NULL;
5654 
5655 	sc->hn_tx_ring_cnt = 0;
5656 	sc->hn_tx_ring_inuse = 0;
5657 }
5658 
5659 #ifdef HN_IFSTART_SUPPORT
5660 
5661 static void
5662 hn_start_taskfunc(void *xtxr, int pending __unused)
5663 {
5664 	struct hn_tx_ring *txr = xtxr;
5665 
5666 	mtx_lock(&txr->hn_tx_lock);
5667 	hn_start_locked(txr, 0);
5668 	mtx_unlock(&txr->hn_tx_lock);
5669 }
5670 
5671 static int
5672 hn_start_locked(struct hn_tx_ring *txr, int len)
5673 {
5674 	struct hn_softc *sc = txr->hn_sc;
5675 	if_t ifp = sc->hn_ifp;
5676 	int sched = 0;
5677 
5678 	KASSERT(hn_use_if_start,
5679 	    ("hn_start_locked is called, when if_start is disabled"));
5680 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5681 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5682 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5683 
5684 	if (__predict_false(txr->hn_suspended))
5685 		return (0);
5686 
5687 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5688 	    IFF_DRV_RUNNING)
5689 		return (0);
5690 
5691 	while (!if_sendq_empty(ifp)) {
5692 		struct hn_txdesc *txd;
5693 		struct mbuf *m_head;
5694 		int error;
5695 
5696 		m_head = if_dequeue(ifp);
5697 		if (m_head == NULL)
5698 			break;
5699 
5700 		if (len > 0 && m_head->m_pkthdr.len > len) {
5701 			/*
5702 			 * This sending could be time consuming; let callers
5703 			 * dispatch this packet sending (and sending of any
5704 			 * follow-up packets) to the TX taskqueue.
5705 			 */
5706 			if_sendq_prepend(ifp, m_head);
5707 			sched = 1;
5708 			break;
5709 		}
5710 
5711 #if defined(INET6) || defined(INET)
5712 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5713 			m_head = hn_tso_fixup(m_head);
5714 			if (__predict_false(m_head == NULL)) {
5715 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5716 				continue;
5717 			}
5718 		} else if (m_head->m_pkthdr.csum_flags &
5719 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5720 			m_head = hn_set_hlen(m_head);
5721 			if (__predict_false(m_head == NULL)) {
5722 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5723 				continue;
5724 			}
5725 		}
5726 #endif
5727 
5728 		txd = hn_txdesc_get(txr);
5729 		if (txd == NULL) {
5730 			txr->hn_no_txdescs++;
5731 			if_sendq_prepend(ifp, m_head);
5732 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5733 			break;
5734 		}
5735 
5736 		error = hn_encap(ifp, txr, txd, &m_head);
5737 		if (error) {
5738 			/* Both txd and m_head are freed */
5739 			KASSERT(txr->hn_agg_txd == NULL,
5740 			    ("encap failed w/ pending aggregating txdesc"));
5741 			continue;
5742 		}
5743 
5744 		if (txr->hn_agg_pktleft == 0) {
5745 			if (txr->hn_agg_txd != NULL) {
5746 				KASSERT(m_head == NULL,
5747 				    ("pending mbuf for aggregating txdesc"));
5748 				error = hn_flush_txagg(ifp, txr);
5749 				if (__predict_false(error)) {
5750 					if_setdrvflagbits(ifp,
5751 					    IFF_DRV_OACTIVE, 0);
5752 					break;
5753 				}
5754 			} else {
5755 				KASSERT(m_head != NULL, ("mbuf was freed"));
5756 				error = hn_txpkt(ifp, txr, txd);
5757 				if (__predict_false(error)) {
5758 					/* txd is freed, but m_head is not */
5759 					if_sendq_prepend(ifp, m_head);
5760 					if_setdrvflagbits(ifp,
5761 					    IFF_DRV_OACTIVE, 0);
5762 					break;
5763 				}
5764 			}
5765 		}
5766 #ifdef INVARIANTS
5767 		else {
5768 			KASSERT(txr->hn_agg_txd != NULL,
5769 			    ("no aggregating txdesc"));
5770 			KASSERT(m_head == NULL,
5771 			    ("pending mbuf for aggregating txdesc"));
5772 		}
5773 #endif
5774 	}
5775 
5776 	/* Flush pending aggregated transmission. */
5777 	if (txr->hn_agg_txd != NULL)
5778 		hn_flush_txagg(ifp, txr);
5779 	return (sched);
5780 }
5781 
5782 static void
5783 hn_start(if_t ifp)
5784 {
5785 	struct hn_softc *sc = if_getsoftc(ifp);
5786 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5787 
5788 	if (txr->hn_sched_tx)
5789 		goto do_sched;
5790 
5791 	if (mtx_trylock(&txr->hn_tx_lock)) {
5792 		int sched;
5793 
5794 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5795 		mtx_unlock(&txr->hn_tx_lock);
5796 		if (!sched)
5797 			return;
5798 	}
5799 do_sched:
5800 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5801 }
5802 
5803 static void
5804 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5805 {
5806 	struct hn_tx_ring *txr = xtxr;
5807 
5808 	mtx_lock(&txr->hn_tx_lock);
5809 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5810 	hn_start_locked(txr, 0);
5811 	mtx_unlock(&txr->hn_tx_lock);
5812 }
5813 
5814 static void
5815 hn_start_txeof(struct hn_tx_ring *txr)
5816 {
5817 	struct hn_softc *sc = txr->hn_sc;
5818 	if_t ifp = sc->hn_ifp;
5819 
5820 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5821 
5822 	if (txr->hn_sched_tx)
5823 		goto do_sched;
5824 
5825 	if (mtx_trylock(&txr->hn_tx_lock)) {
5826 		int sched;
5827 
5828 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5829 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5830 		mtx_unlock(&txr->hn_tx_lock);
5831 		if (sched) {
5832 			taskqueue_enqueue(txr->hn_tx_taskq,
5833 			    &txr->hn_tx_task);
5834 		}
5835 	} else {
5836 do_sched:
5837 		/*
5838 		 * Release OACTIVE early, in the hope that others can
5839 		 * catch up.  The task will clear the flag again while
5840 		 * holding the hn_tx_lock, to avoid possible
5841 		 * races.
5842 		 */
5843 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5844 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5845 	}
5846 }
5847 
5848 #endif	/* HN_IFSTART_SUPPORT */
5849 
5850 static int
5851 hn_xmit(struct hn_tx_ring *txr, int len)
5852 {
5853 	struct hn_softc *sc = txr->hn_sc;
5854 	if_t ifp = sc->hn_ifp;
5855 	struct mbuf *m_head;
5856 	int sched = 0;
5857 
5858 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5859 #ifdef HN_IFSTART_SUPPORT
5860 	KASSERT(hn_use_if_start == 0,
5861 	    ("hn_xmit is called, when if_start is enabled"));
5862 #endif
5863 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5864 
5865 	if (__predict_false(txr->hn_suspended))
5866 		return (0);
5867 
5868 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5869 		return (0);
5870 
5871 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5872 		struct hn_txdesc *txd;
5873 		int error;
5874 
5875 		if (len > 0 && m_head->m_pkthdr.len > len) {
5876 			/*
5877 			 * This sending could be time consuming; let callers
5878 			 * dispatch this packet sending (and sending of any
5879 			 * follow-up packets) to the TX taskqueue.
5880 			 */
5881 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5882 			sched = 1;
5883 			break;
5884 		}
5885 
5886 		txd = hn_txdesc_get(txr);
5887 		if (txd == NULL) {
5888 			txr->hn_no_txdescs++;
5889 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5890 			txr->hn_oactive = 1;
5891 			break;
5892 		}
5893 
5894 		error = hn_encap(ifp, txr, txd, &m_head);
5895 		if (error) {
5896 			/* Both txd and m_head are freed; discard */
5897 			KASSERT(txr->hn_agg_txd == NULL,
5898 			    ("encap failed w/ pending aggregating txdesc"));
5899 			drbr_advance(ifp, txr->hn_mbuf_br);
5900 			continue;
5901 		}
5902 
5903 		if (txr->hn_agg_pktleft == 0) {
5904 			if (txr->hn_agg_txd != NULL) {
5905 				KASSERT(m_head == NULL,
5906 				    ("pending mbuf for aggregating txdesc"));
5907 				error = hn_flush_txagg(ifp, txr);
5908 				if (__predict_false(error)) {
5909 					txr->hn_oactive = 1;
5910 					break;
5911 				}
5912 			} else {
5913 				KASSERT(m_head != NULL, ("mbuf was freed"));
5914 				error = hn_txpkt(ifp, txr, txd);
5915 				if (__predict_false(error)) {
5916 					/* txd is freed, but m_head is not */
5917 					drbr_putback(ifp, txr->hn_mbuf_br,
5918 					    m_head);
5919 					txr->hn_oactive = 1;
5920 					break;
5921 				}
5922 			}
5923 		}
5924 #ifdef INVARIANTS
5925 		else {
5926 			KASSERT(txr->hn_agg_txd != NULL,
5927 			    ("no aggregating txdesc"));
5928 			KASSERT(m_head == NULL,
5929 			    ("pending mbuf for aggregating txdesc"));
5930 		}
5931 #endif
5932 
5933 		/* Sent */
5934 		drbr_advance(ifp, txr->hn_mbuf_br);
5935 	}
5936 
5937 	/* Flush pending aggregated transmission. */
5938 	if (txr->hn_agg_txd != NULL)
5939 		hn_flush_txagg(ifp, txr);
5940 	return (sched);
5941 }
5942 
5943 static int
5944 hn_transmit(if_t ifp, struct mbuf *m)
5945 {
5946 	struct hn_softc *sc = if_getsoftc(ifp);
5947 	struct hn_tx_ring *txr;
5948 	int error, idx = 0;
5949 
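	/*
	 * In transparent VF mode, hand the packet directly to the VF
	 * interface; the counters of the synthetic interface are still
	 * updated, and BPF taps on the synthetic interface still see
	 * the packet.
	 */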
5950 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5951 		struct rm_priotracker pt;
5952 
5953 		rm_rlock(&sc->hn_vf_lock, &pt);
5954 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5955 			struct mbuf *m_bpf = NULL;
5956 			int obytes, omcast;
5957 
5958 			obytes = m->m_pkthdr.len;
5959 			omcast = (m->m_flags & M_MCAST) != 0;
5960 
5961 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5962 				if (bpf_peers_present_if(ifp)) {
5963 					m_bpf = m_copypacket(m, M_NOWAIT);
5964 					if (m_bpf == NULL) {
5965 						/*
5966 						 * Failed to grab a shallow
5967 						 * copy; tap now.
5968 						 */
5969 						ETHER_BPF_MTAP(ifp, m);
5970 					}
5971 				}
5972 			} else {
5973 				ETHER_BPF_MTAP(ifp, m);
5974 			}
5975 
5976 			error = if_transmit(sc->hn_vf_ifp, m);
5977 			rm_runlock(&sc->hn_vf_lock, &pt);
5978 
5979 			if (m_bpf != NULL) {
5980 				if (!error)
5981 					ETHER_BPF_MTAP(ifp, m_bpf);
5982 				m_freem(m_bpf);
5983 			}
5984 
5985 			if (error == ENOBUFS) {
5986 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5987 			} else if (error) {
5988 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5989 			} else {
5990 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5991 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5992 				if (omcast) {
5993 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5994 					    omcast);
5995 				}
5996 			}
5997 			return (error);
5998 		}
5999 		rm_runlock(&sc->hn_vf_lock, &pt);
6000 	}
6001 
6002 #if defined(INET6) || defined(INET)
6003 	/*
6004 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6005 	 * since packet headers should be cache-hot.
6006 	 */
6007 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6008 		m = hn_tso_fixup(m);
6009 		if (__predict_false(m == NULL)) {
6010 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6011 			return EIO;
6012 		}
6013 	} else if (m->m_pkthdr.csum_flags &
6014 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6015 		m = hn_set_hlen(m);
6016 		if (__predict_false(m == NULL)) {
6017 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6018 			return EIO;
6019 		}
6020 	}
6021 #endif
6022 
6023 	/*
6024 	 * Select the TX ring based on flowid
6025 	 */
6026 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6027 #ifdef RSS
6028 		uint32_t bid;
6029 
6030 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6031 		    &bid) == 0)
6032 			idx = bid % sc->hn_tx_ring_inuse;
6033 		else
6034 #endif
6035 		{
6036 #if defined(INET6) || defined(INET)
6037 			int tcpsyn = 0;
6038 
6039 			if (m->m_pkthdr.len < 128 &&
6040 			    (m->m_pkthdr.csum_flags &
6041 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6042 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6043 				m = hn_check_tcpsyn(m, &tcpsyn);
6044 				if (__predict_false(m == NULL)) {
6045 					if_inc_counter(ifp,
6046 					    IFCOUNTER_OERRORS, 1);
6047 					return (EIO);
6048 				}
6049 			}
6050 #else
6051 			const int tcpsyn = 0;
6052 #endif
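			/*
			 * Steer TCP SYN segments to the first TX ring,
			 * i.e. the primary channel.
			 */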
6053 			if (tcpsyn)
6054 				idx = 0;
6055 			else
6056 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6057 		}
6058 	}
6059 	txr = &sc->hn_tx_ring[idx];
6060 
6061 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6062 	if (error) {
6063 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6064 		return error;
6065 	}
6066 
6067 	if (txr->hn_oactive)
6068 		return 0;
6069 
6070 	if (txr->hn_sched_tx)
6071 		goto do_sched;
6072 
6073 	if (mtx_trylock(&txr->hn_tx_lock)) {
6074 		int sched;
6075 
6076 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6077 		mtx_unlock(&txr->hn_tx_lock);
6078 		if (!sched)
6079 			return 0;
6080 	}
6081 do_sched:
6082 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6083 	return 0;
6084 }
6085 
6086 static void
6087 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6088 {
6089 	struct mbuf *m;
6090 
6091 	mtx_lock(&txr->hn_tx_lock);
6092 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6093 		m_freem(m);
6094 	mtx_unlock(&txr->hn_tx_lock);
6095 }
6096 
6097 static void
6098 hn_xmit_qflush(if_t ifp)
6099 {
6100 	struct hn_softc *sc = if_getsoftc(ifp);
6101 	struct rm_priotracker pt;
6102 	int i;
6103 
6104 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6105 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6106 	if_qflush(ifp);
6107 
6108 	rm_rlock(&sc->hn_vf_lock, &pt);
6109 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6110 		if_qflush(sc->hn_vf_ifp);
6111 	rm_runlock(&sc->hn_vf_lock, &pt);
6112 }
6113 
6114 static void
6115 hn_xmit_txeof(struct hn_tx_ring *txr)
6116 {
6117 
6118 	if (txr->hn_sched_tx)
6119 		goto do_sched;
6120 
6121 	if (mtx_trylock(&txr->hn_tx_lock)) {
6122 		int sched;
6123 
6124 		txr->hn_oactive = 0;
6125 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6126 		mtx_unlock(&txr->hn_tx_lock);
6127 		if (sched) {
6128 			taskqueue_enqueue(txr->hn_tx_taskq,
6129 			    &txr->hn_tx_task);
6130 		}
6131 	} else {
6132 do_sched:
6133 		/*
6134 		 * Release oactive early, in the hope that others can
6135 		 * catch up.  The task will clear oactive again while
6136 		 * holding the hn_tx_lock, to avoid possible
6137 		 * races.
6138 		 */
6139 		txr->hn_oactive = 0;
6140 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6141 	}
6142 }
6143 
6144 static void
6145 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6146 {
6147 	struct hn_tx_ring *txr = xtxr;
6148 
6149 	mtx_lock(&txr->hn_tx_lock);
6150 	hn_xmit(txr, 0);
6151 	mtx_unlock(&txr->hn_tx_lock);
6152 }
6153 
6154 static void
6155 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6156 {
6157 	struct hn_tx_ring *txr = xtxr;
6158 
6159 	mtx_lock(&txr->hn_tx_lock);
6160 	txr->hn_oactive = 0;
6161 	hn_xmit(txr, 0);
6162 	mtx_unlock(&txr->hn_tx_lock);
6163 }
6164 
6165 static int
6166 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6167 {
6168 	struct vmbus_chan_br cbr;
6169 	struct hn_rx_ring *rxr;
6170 	struct hn_tx_ring *txr = NULL;
6171 	int idx, error;
6172 
6173 	idx = vmbus_chan_subidx(chan);
6174 
6175 	/*
6176 	 * Link this channel to RX/TX ring.
6177 	 */
6178 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6179 	    ("invalid channel index %d, should > 0 && < %d",
6180 	     idx, sc->hn_rx_ring_inuse));
6181 	rxr = &sc->hn_rx_ring[idx];
6182 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6183 	    ("RX ring %d already attached", idx));
6184 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6185 	rxr->hn_chan = chan;
6186 
6187 	if (bootverbose) {
6188 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6189 		    idx, vmbus_chan_id(chan));
6190 	}
6191 
6192 	if (idx < sc->hn_tx_ring_inuse) {
6193 		txr = &sc->hn_tx_ring[idx];
6194 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6195 		    ("TX ring %d already attached", idx));
6196 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6197 
6198 		txr->hn_chan = chan;
6199 		if (bootverbose) {
6200 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6201 			    idx, vmbus_chan_id(chan));
6202 		}
6203 	}
6204 
6205 	/* Bind this channel to a proper CPU. */
6206 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6207 
6208 	/*
6209 	 * Open this channel
6210 	 */
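	/*
	 * The channel's TX/RX bufrings are carved out of the per-RX-ring
	 * hn_br buffer.
	 */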
6211 	cbr.cbr = rxr->hn_br;
6212 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6213 	cbr.cbr_txsz = HN_TXBR_SIZE;
6214 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6215 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6216 	if (error) {
6217 		if (error == EISCONN) {
6218 			if_printf(sc->hn_ifp, "bufring is connected after "
6219 			    "chan%u open failure\n", vmbus_chan_id(chan));
6220 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6221 		} else {
6222 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6223 			    vmbus_chan_id(chan), error);
6224 		}
6225 	}
6226 	return (error);
6227 }
6228 
6229 static void
6230 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6231 {
6232 	struct hn_rx_ring *rxr;
6233 	int idx, error;
6234 
6235 	idx = vmbus_chan_subidx(chan);
6236 
6237 	/*
6238 	 * Unlink this channel from the RX/TX ring.
6239 	 */
6240 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6241 	    ("invalid channel index %d, should > 0 && < %d",
6242 	     idx, sc->hn_rx_ring_inuse));
6243 	rxr = &sc->hn_rx_ring[idx];
6244 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6245 	    ("RX ring %d is not attached", idx));
6246 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6247 
6248 	if (idx < sc->hn_tx_ring_inuse) {
6249 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6250 
6251 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6252 		    ("TX ring %d is not attached attached", idx));
6253 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6254 	}
6255 
6256 	/*
6257 	 * Close this channel.
6258 	 *
6259 	 * NOTE:
6260 	 * Channel closing does _not_ destroy the target channel.
6261 	 */
6262 	error = vmbus_chan_close_direct(chan);
6263 	if (error == EISCONN) {
6264 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6265 		    "after being closed\n", vmbus_chan_id(chan));
6266 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6267 	} else if (error) {
6268 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6269 		    vmbus_chan_id(chan), error);
6270 	}
6271 }
6272 
6273 static int
6274 hn_attach_subchans(struct hn_softc *sc)
6275 {
6276 	struct vmbus_channel **subchans;
6277 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6278 	int i, error = 0;
6279 
6280 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6281 
6282 	/* Attach the sub-channels. */
6283 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6284 	for (i = 0; i < subchan_cnt; ++i) {
6285 		int error1;
6286 
6287 		error1 = hn_chan_attach(sc, subchans[i]);
6288 		if (error1) {
6289 			error = error1;
6290 			/* Move on; all channels will be detached later. */
6291 		}
6292 	}
6293 	vmbus_subchan_rel(subchans, subchan_cnt);
6294 
6295 	if (error) {
6296 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6297 	} else {
6298 		if (bootverbose) {
6299 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6300 			    subchan_cnt);
6301 		}
6302 	}
6303 	return (error);
6304 }
6305 
6306 static void
6307 hn_detach_allchans(struct hn_softc *sc)
6308 {
6309 	struct vmbus_channel **subchans;
6310 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6311 	int i;
6312 
6313 	if (subchan_cnt == 0)
6314 		goto back;
6315 
6316 	/* Detach the sub-channels. */
6317 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6318 	for (i = 0; i < subchan_cnt; ++i)
6319 		hn_chan_detach(sc, subchans[i]);
6320 	vmbus_subchan_rel(subchans, subchan_cnt);
6321 
6322 back:
6323 	/*
6324 	 * Detach the primary channel, _after_ all sub-channels
6325 	 * are detached.
6326 	 */
6327 	hn_chan_detach(sc, sc->hn_prichan);
6328 
6329 	/* Wait for sub-channels to be destroyed, if any. */
6330 	vmbus_subchan_drain(sc->hn_prichan);
6331 
6332 #ifdef INVARIANTS
6333 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6334 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6335 		    HN_RX_FLAG_ATTACHED) == 0,
6336 		    ("%dth RX ring is still attached", i));
6337 	}
6338 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6339 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6340 		    HN_TX_FLAG_ATTACHED) == 0,
6341 		    ("%dth TX ring is still attached", i));
6342 	}
6343 #endif
6344 }
6345 
6346 static int
6347 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6348 {
6349 	struct vmbus_channel **subchans;
6350 	int nchan, rxr_cnt, error;
6351 
6352 	nchan = *nsubch + 1;
6353 	if (nchan == 1) {
6354 		/*
6355 		 * Multiple RX/TX rings are not requested.
6356 		 */
6357 		*nsubch = 0;
6358 		return (0);
6359 	}
6360 
6361 	/*
6362 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6363 	 * table entries.
6364 	 */
6365 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6366 	if (error) {
6367 		/* No RSS; this is benign. */
6368 		*nsubch = 0;
6369 		return (0);
6370 	}
6371 	if (bootverbose) {
6372 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6373 		    rxr_cnt, nchan);
6374 	}
6375 
6376 	if (nchan > rxr_cnt)
6377 		nchan = rxr_cnt;
6378 	if (nchan == 1) {
6379 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6380 		*nsubch = 0;
6381 		return (0);
6382 	}
6383 
6384 	/*
6385 	 * Allocate sub-channels from NVS.
6386 	 */
6387 	*nsubch = nchan - 1;
6388 	error = hn_nvs_alloc_subchans(sc, nsubch);
6389 	if (error || *nsubch == 0) {
6390 		/* Failed to allocate sub-channels. */
6391 		*nsubch = 0;
6392 		return (0);
6393 	}
6394 
6395 	/*
6396 	 * Wait for all sub-channels to become ready before moving on.
6397 	 */
6398 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6399 	vmbus_subchan_rel(subchans, *nsubch);
6400 	return (0);
6401 }
6402 
6403 static bool
6404 hn_synth_attachable(const struct hn_softc *sc)
6405 {
6406 	int i;
6407 
6408 	if (sc->hn_flags & HN_FLAG_ERRORS)
6409 		return (false);
6410 
6411 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6412 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6413 
6414 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6415 			return (false);
6416 	}
6417 	return (true);
6418 }
6419 
6420 /*
6421  * Make sure that the RX filter is zero after the successful
6422  * RNDIS initialization.
6423  *
6424  * NOTE:
6425  * Under certain conditions on certain versions of Hyper-V,
6426  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6427  * after the successful RNDIS initialization, which breaks
6428  * the assumption of any following code (well, it breaks the
6429  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6430  * explicitly, drain packets sneaking through, and drain the
6431  * interrupt taskqueues scheduled due to the stealth packets.
6432  */
6433 static void
6434 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6435 {
6436 
6437 	hn_disable_rx(sc);
6438 	hn_drain_rxtx(sc, nchan);
6439 }
6440 
6441 static int
6442 hn_synth_attach(struct hn_softc *sc, int mtu)
6443 {
6444 #define ATTACHED_NVS		0x0002
6445 #define ATTACHED_RNDIS		0x0004
6446 
6447 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6448 	int error, nsubch, nchan = 1, i, rndis_inited;
6449 	uint32_t old_caps, attached = 0;
6450 
6451 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6452 	    ("synthetic parts were attached"));
6453 
6454 	if (!hn_synth_attachable(sc))
6455 		return (ENXIO);
6456 
6457 	/* Save capabilities for later verification. */
6458 	old_caps = sc->hn_caps;
6459 	sc->hn_caps = 0;
6460 
6461 	/* Clear RSS stuffs. */
6462 	sc->hn_rss_ind_size = 0;
6463 	sc->hn_rss_hash = 0;
6464 	sc->hn_rss_hcap = 0;
6465 
6466 	/*
6467 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6468 	 */
6469 	error = hn_chan_attach(sc, sc->hn_prichan);
6470 	if (error)
6471 		goto failed;
6472 
6473 	/*
6474 	 * Attach NVS.
6475 	 */
6476 	error = hn_nvs_attach(sc, mtu);
6477 	if (error)
6478 		goto failed;
6479 	attached |= ATTACHED_NVS;
6480 
6481 	/*
6482 	 * Attach RNDIS _after_ NVS is attached.
6483 	 */
6484 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6485 	if (rndis_inited)
6486 		attached |= ATTACHED_RNDIS;
6487 	if (error)
6488 		goto failed;
6489 
6490 	/*
6491 	 * Make sure capabilities are not changed.
6492 	 */
6493 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6494 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6495 		    old_caps, sc->hn_caps);
6496 		error = ENXIO;
6497 		goto failed;
6498 	}
6499 
6500 	/*
6501 	 * Allocate sub-channels for multi-TX/RX rings.
6502 	 *
6503 	 * NOTE:
6504 	 * The # of RX rings that can be used is equivalent to the # of
6505 	 * channels to be requested.
6506 	 */
6507 	nsubch = sc->hn_rx_ring_cnt - 1;
6508 	error = hn_synth_alloc_subchans(sc, &nsubch);
6509 	if (error)
6510 		goto failed;
6511 	/* NOTE: _Full_ synthetic parts detach is required now. */
6512 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6513 
6514 	/*
6515 	 * Set the # of TX/RX rings that could be used according to
6516 	 * the # of channels that NVS offered.
6517 	 */
6518 	nchan = nsubch + 1;
6519 	hn_set_ring_inuse(sc, nchan);
6520 	if (nchan == 1) {
6521 		/* Only the primary channel can be used; done */
6522 		goto back;
6523 	}
6524 
6525 	/*
6526 	 * Attach the sub-channels.
6527 	 *
6528 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6529 	 */
6530 	error = hn_attach_subchans(sc);
6531 	if (error)
6532 		goto failed;
6533 
6534 	/*
6535 	 * Configure RSS key and indirect table _after_ all sub-channels
6536 	 * are attached.
6537 	 */
6538 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6539 		/*
6540 		 * RSS key is not set yet; set it to the default RSS key.
6541 		 */
6542 		if (bootverbose)
6543 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6544 		rss_getkey(rss->rss_key);
6545 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6546 	}
6547 
6548 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6549 		/*
6550 		 * RSS indirect table is not set yet; set it up in round-
6551 		 * robin fashion.
6552 		 */
6553 		if (bootverbose) {
6554 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6555 			    "table\n");
6556 		}
6557 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6558 			uint32_t subidx;
6559 
6560 #ifdef RSS
6561 			subidx = rss_get_indirection_to_bucket(i);
6562 #else
6563 			subidx = i;
6564 #endif
6565 			rss->rss_ind[i] = subidx % nchan;
6566 		}
6567 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6568 	} else {
6569 		/*
6570 		 * # of usable channels may be changed, so we have to
6571 		 * make sure that all entries in RSS indirect table
6572 		 * are valid.
6573 		 *
6574 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6575 		 */
6576 		hn_rss_ind_fixup(sc);
6577 	}
6578 
6579 	sc->hn_rss_hash = sc->hn_rss_hcap;
6580 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6581 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6582 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6583 		hn_vf_rss_fixup(sc, false);
6584 	}
6585 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6586 	if (error)
6587 		goto failed;
6588 back:
6589 	/*
6590 	 * Fixup transmission aggregation setup.
6591 	 */
6592 	hn_set_txagg(sc);
6593 	hn_rndis_init_fixat(sc, nchan);
6594 	return (0);
6595 
6596 failed:
6597 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6598 		hn_rndis_init_fixat(sc, nchan);
6599 		hn_synth_detach(sc);
6600 	} else {
6601 		if (attached & ATTACHED_RNDIS) {
6602 			hn_rndis_init_fixat(sc, nchan);
6603 			hn_rndis_detach(sc);
6604 		}
6605 		if (attached & ATTACHED_NVS)
6606 			hn_nvs_detach(sc);
6607 		hn_chan_detach(sc, sc->hn_prichan);
6608 		/* Restore old capabilities. */
6609 		sc->hn_caps = old_caps;
6610 	}
6611 	return (error);
6612 
6613 #undef ATTACHED_RNDIS
6614 #undef ATTACHED_NVS
6615 }
6616 
6617 /*
6618  * NOTE:
6619  * The interface must have been suspended through hn_suspend(), before
6620  * this function gets called.
6621  */
6622 static void
6623 hn_synth_detach(struct hn_softc *sc)
6624 {
6625 
6626 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6627 	    ("synthetic parts were not attached"));
6628 
6629 	/* Detach the RNDIS first. */
6630 	hn_rndis_detach(sc);
6631 
6632 	/* Detach NVS. */
6633 	hn_nvs_detach(sc);
6634 
6635 	/* Detach all of the channels. */
6636 	hn_detach_allchans(sc);
6637 
6638 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6639 		/*
6640 		 * Host is post-Win2016; disconnect RXBUF from the primary channel here.
6641 		 */
6642 		int error;
6643 
6644 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6645 		    sc->hn_rxbuf_gpadl);
6646 		if (error) {
6647 			if_printf(sc->hn_ifp,
6648 			    "rxbuf gpadl disconn failed: %d\n", error);
6649 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6650 		}
6651 		sc->hn_rxbuf_gpadl = 0;
6652 	}
6653 
6654 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6655 		/*
6656 		 * Host is post-Win2016; disconnect the chimney sending buffer
6657 		 * from the primary channel here.
6658 		 */
6659 		int error;
6660 
6661 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6662 		    sc->hn_chim_gpadl);
6663 		if (error) {
6664 			if_printf(sc->hn_ifp,
6665 			    "chim gpadl disconn failed: %d\n", error);
6666 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6667 		}
6668 		sc->hn_chim_gpadl = 0;
6669 	}
6670 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6671 }
6672 
6673 static void
6674 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6675 {
6676 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6677 	    ("invalid ring count %d", ring_cnt));
6678 
6679 	if (sc->hn_tx_ring_cnt > ring_cnt)
6680 		sc->hn_tx_ring_inuse = ring_cnt;
6681 	else
6682 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6683 	sc->hn_rx_ring_inuse = ring_cnt;
6684 
6685 #ifdef RSS
6686 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6687 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6688 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6689 		    rss_getnumbuckets());
6690 	}
6691 #endif
6692 
6693 	if (bootverbose) {
6694 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6695 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6696 	}
6697 }
6698 
6699 static void
6700 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6701 {
6702 
6703 	/*
6704 	 * NOTE:
6705 	 * The TX bufring will not be drained by the hypervisor
6706 	 * if the primary channel is revoked.
6707 	 */
6708 	while (!vmbus_chan_rx_empty(chan) ||
6709 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6710 	     !vmbus_chan_tx_empty(chan)))
6711 		pause("waitch", 1);
6712 	vmbus_chan_intr_drain(chan);
6713 }
6714 
6715 static void
6716 hn_disable_rx(struct hn_softc *sc)
6717 {
6718 
6719 	/*
6720 	 * Disable RX by clearing RX filter forcefully.
6721 	 */
6722 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6723 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6724 
6725 	/*
6726 	 * Give RNDIS enough time to flush all pending data packets.
6727 	 */
6728 	pause("waitrx", (200 * hz) / 1000);
6729 }
6730 
6731 /*
6732  * NOTE:
6733  * RX/TX _must_ have been suspended/disabled, before this function
6734  * is called.
6735  */
6736 static void
6737 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6738 {
6739 	struct vmbus_channel **subch = NULL;
6740 	int nsubch;
6741 
6742 	/*
6743 	 * Drain RX/TX bufrings and interrupts.
6744 	 */
6745 	nsubch = nchan - 1;
6746 	if (nsubch > 0)
6747 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6748 
6749 	if (subch != NULL) {
6750 		int i;
6751 
6752 		for (i = 0; i < nsubch; ++i)
6753 			hn_chan_drain(sc, subch[i]);
6754 	}
6755 	hn_chan_drain(sc, sc->hn_prichan);
6756 
6757 	if (subch != NULL)
6758 		vmbus_subchan_rel(subch, nsubch);
6759 }
6760 
6761 static void
6762 hn_suspend_data(struct hn_softc *sc)
6763 {
6764 	struct hn_tx_ring *txr;
6765 	int i;
6766 
6767 	HN_LOCK_ASSERT(sc);
6768 
6769 	/*
6770 	 * Suspend TX.
6771 	 */
6772 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6773 		txr = &sc->hn_tx_ring[i];
6774 
6775 		mtx_lock(&txr->hn_tx_lock);
6776 		txr->hn_suspended = 1;
6777 		mtx_unlock(&txr->hn_tx_lock);
6778 		/* No one is able to send more packets now. */
6779 
6780 		/*
6781 		 * Wait for all pending sends to finish.
6782 		 *
6783 		 * NOTE:
6784 		 * We will _not_ receive all pending send-done, if the
6785 		 * primary channel is revoked.
6786 		 */
6787 		while (hn_tx_ring_pending(txr) &&
6788 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6789 			pause("hnwtx", 1 /* 1 tick */);
6790 	}
6791 
6792 	/*
6793 	 * Disable RX.
6794 	 */
6795 	hn_disable_rx(sc);
6796 
6797 	/*
6798 	 * Drain RX/TX.
6799 	 */
6800 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6801 
6802 	/*
6803 	 * Drain any pending TX tasks.
6804 	 *
6805 	 * NOTE:
6806 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6807 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6808 	 */
6809 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6810 		txr = &sc->hn_tx_ring[i];
6811 
6812 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6813 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6814 	}
6815 }
6816 
6817 static void
6818 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6819 {
6820 
6821 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6822 }
6823 
6824 static void
6825 hn_suspend_mgmt(struct hn_softc *sc)
6826 {
6827 	struct task task;
6828 
6829 	HN_LOCK_ASSERT(sc);
6830 
6831 	/*
6832 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6833 	 * through hn_mgmt_taskq.
6834 	 */
6835 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6836 	vmbus_chan_run_task(sc->hn_prichan, &task);
6837 
6838 	/*
6839 	 * Make sure that all pending management tasks are completed.
6840 	 */
6841 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6842 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6843 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6844 }
6845 
6846 static void
6847 hn_suspend(struct hn_softc *sc)
6848 {
6849 
6850 	/* Disable polling. */
6851 	hn_polling(sc, 0);
6852 
6853 	/*
6854 	 * If the non-transparent mode VF is activated, the synthetic
6855 	 * device is receiving packets, so the data path of the
6856 	 * synthetic device must be suspended.
6857 	 */
6858 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6859 	    (sc->hn_flags & HN_FLAG_RXVF))
6860 		hn_suspend_data(sc);
6861 	hn_suspend_mgmt(sc);
6862 }
6863 
6864 static void
6865 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6866 {
6867 	int i;
6868 
6869 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6870 	    ("invalid TX ring count %d", tx_ring_cnt));
6871 
6872 	for (i = 0; i < tx_ring_cnt; ++i) {
6873 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6874 
6875 		mtx_lock(&txr->hn_tx_lock);
6876 		txr->hn_suspended = 0;
6877 		mtx_unlock(&txr->hn_tx_lock);
6878 	}
6879 }
6880 
6881 static void
6882 hn_resume_data(struct hn_softc *sc)
6883 {
6884 	int i;
6885 
6886 	HN_LOCK_ASSERT(sc);
6887 
6888 	/*
6889 	 * Re-enable RX.
6890 	 */
6891 	hn_rxfilter_config(sc);
6892 
6893 	/*
6894 	 * Make sure to clear suspend status on "all" TX rings,
6895 	 * since hn_tx_ring_inuse can be changed after
6896 	 * hn_suspend_data().
6897 	 */
6898 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6899 
6900 #ifdef HN_IFSTART_SUPPORT
6901 	if (!hn_use_if_start)
6902 #endif
6903 	{
6904 		/*
6905 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6906 		 * reduced.
6907 		 */
6908 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6909 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6910 	}
6911 
6912 	/*
6913 	 * Kick start TX.
6914 	 */
6915 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6916 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6917 
6918 		/*
6919 		 * Use txeof task, so that any pending oactive can be
6920 		 * cleared properly.
6921 		 */
6922 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6923 	}
6924 }
6925 
6926 static void
6927 hn_resume_mgmt(struct hn_softc *sc)
6928 {
6929 
6930 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6931 
6932 	/*
6933 	 * Kick off network change detection, if it was pending.
6934 	 * If no network change was pending, start link status
6935 	 * checks, which is more lightweight than network change
6936 	 * detection.
6937 	 */
6938 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6939 		hn_change_network(sc);
6940 	else
6941 		hn_update_link_status(sc);
6942 }
6943 
6944 static void
6945 hn_resume(struct hn_softc *sc)
6946 {
6947 
6948 	/*
6949 	 * If the non-transparent mode VF is activated, the synthetic
6950 	 * device has to receive packets, so the data path of the
6951 	 * synthetic device must be resumed.
6952 	 */
6953 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6954 	    (sc->hn_flags & HN_FLAG_RXVF))
6955 		hn_resume_data(sc);
6956 
6957 	/*
6958 	 * Don't resume link status change if VF is attached/activated.
6959 	 * - In the non-transparent VF mode, the synthetic device marks
6960 	 *   link down until the VF is deactivated; i.e. VF is down.
6961 	 * - In transparent VF mode, VF's media status is used until
6962 	 *   the VF is detached.
6963 	 */
6964 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6965 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6966 		hn_resume_mgmt(sc);
6967 
6968 	/*
6969 	 * Re-enable polling if this interface is running and
6970 	 * the polling is requested.
6971 	 */
6972 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6973 		hn_polling(sc, sc->hn_pollhz);
6974 }
6975 
6976 static void
6977 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6978 {
6979 	const struct rndis_status_msg *msg;
6980 	int ofs;
6981 
6982 	if (dlen < sizeof(*msg)) {
6983 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6984 		return;
6985 	}
6986 	msg = data;
6987 
6988 	switch (msg->rm_status) {
6989 	case RNDIS_STATUS_MEDIA_CONNECT:
6990 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6991 		hn_update_link_status(sc);
6992 		break;
6993 
6994 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6995 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6996 		/* Not really useful; ignore. */
6997 		break;
6998 
6999 	case RNDIS_STATUS_NETWORK_CHANGE:
7000 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7001 		if (dlen < ofs + msg->rm_stbuflen ||
7002 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7003 			if_printf(sc->hn_ifp, "network changed\n");
7004 		} else {
7005 			uint32_t change;
7006 
7007 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7008 			    sizeof(change));
7009 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7010 			    change);
7011 		}
7012 		hn_change_network(sc);
7013 		break;
7014 
7015 	default:
7016 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7017 		    msg->rm_status);
7018 		break;
7019 	}
7020 }
7021 
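/*
 * Walk the RNDIS per-packet-info list and record pointers to the VLAN,
 * checksum, hash value/info and pktinfo-id entries in *info.  Returns
 * EINVAL if any entry is malformed.
 */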
7022 static int
7023 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7024 {
7025 	const struct rndis_pktinfo *pi = info_data;
7026 	uint32_t mask = 0;
7027 
7028 	while (info_dlen != 0) {
7029 		const void *data;
7030 		uint32_t dlen;
7031 
7032 		if (__predict_false(info_dlen < sizeof(*pi)))
7033 			return (EINVAL);
7034 		if (__predict_false(info_dlen < pi->rm_size))
7035 			return (EINVAL);
7036 		info_dlen -= pi->rm_size;
7037 
7038 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7039 			return (EINVAL);
7040 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7041 			return (EINVAL);
7042 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7043 		data = pi->rm_data;
7044 
7045 		if (pi->rm_internal == 1) {
7046 			switch (pi->rm_type) {
7047 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7048 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7049 					return (EINVAL);
7050 				info->pktinfo_id =
7051 				    (const struct packet_info_id *)data;
7052 				mask |= HN_RXINFO_PKTINFO_ID;
7053 				break;
7054 
7055 			default:
7056 				goto next;
7057 			}
7058 		} else {
7059 			switch (pi->rm_type) {
7060 			case NDIS_PKTINFO_TYPE_VLAN:
7061 				if (__predict_false(dlen
7062 				    < NDIS_VLAN_INFO_SIZE))
7063 					return (EINVAL);
7064 				info->vlan_info = (const uint32_t *)data;
7065 				mask |= HN_RXINFO_VLAN;
7066 				break;
7067 
7068 			case NDIS_PKTINFO_TYPE_CSUM:
7069 				if (__predict_false(dlen
7070 				    < NDIS_RXCSUM_INFO_SIZE))
7071 					return (EINVAL);
7072 				info->csum_info = (const uint32_t *)data;
7073 				mask |= HN_RXINFO_CSUM;
7074 				break;
7075 
7076 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7077 				if (__predict_false(dlen
7078 				    < HN_NDIS_HASH_VALUE_SIZE))
7079 					return (EINVAL);
7080 				info->hash_value = (const uint32_t *)data;
7081 				mask |= HN_RXINFO_HASHVAL;
7082 				break;
7083 
7084 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7085 				if (__predict_false(dlen
7086 				    < HN_NDIS_HASH_INFO_SIZE))
7087 					return (EINVAL);
7088 				info->hash_info = (const uint32_t *)data;
7089 				mask |= HN_RXINFO_HASHINF;
7090 				break;
7091 
7092 			default:
7093 				goto next;
7094 			}
7095 		}
7096 
7097 		if (mask == HN_RXINFO_ALL) {
7098 			/* All found; done */
7099 			break;
7100 		}
7101 next:
7102 		pi = (const struct rndis_pktinfo *)
7103 		    ((const uint8_t *)pi + pi->rm_size);
7104 	}
7105 
7106 	/*
7107 	 * Final fixup.
7108 	 * - If there is no hash value, invalidate the hash info.
7109 	 */
7110 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7111 		info->hash_info = NULL;
7112 	return (0);
7113 }
7114 
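/*
 * Return true if [off, off + len) overlaps [check_off, check_off + check_len).
 */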
7115 static __inline bool
7116 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7117 {
7118 
7119 	if (off < check_off) {
7120 		if (__predict_true(off + len <= check_off))
7121 			return (false);
7122 	} else if (off > check_off) {
7123 		if (__predict_true(check_off + check_len <= off))
7124 			return (false);
7125 	}
7126 	return (true);
7127 }
7128 
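/*
 * Append one RSC (receive segment coalescing) fragment to the RX ring's
 * staging area; the first fragment also captures the per-packet metadata
 * (VLAN, checksum and hash info).
 */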
7129 static __inline void
7130 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7131 		uint32_t len, struct hn_rxinfo *info)
7132 {
7133 	uint32_t cnt = rxr->rsc.cnt;
7134 
7135 	if (cnt) {
7136 		rxr->rsc.pktlen += len;
7137 	} else {
7138 		rxr->rsc.vlan_info = info->vlan_info;
7139 		rxr->rsc.csum_info = info->csum_info;
7140 		rxr->rsc.hash_info = info->hash_info;
7141 		rxr->rsc.hash_value = info->hash_value;
7142 		rxr->rsc.pktlen = len;
7143 	}
7144 
7145 	rxr->rsc.frag_data[cnt] = data;
7146 	rxr->rsc.frag_len[cnt] = len;
7147 	rxr->rsc.cnt++;
7148 }
7149 
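/*
 * Validate an RNDIS data message, locate its useful per-packet-info,
 * stage the data as an RSC fragment, and hand a completed packet to
 * hn_rxpkt().
 */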
7150 static void
7151 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7152 {
7153 	const struct rndis_packet_msg *pkt;
7154 	struct hn_rxinfo info;
7155 	int data_off, pktinfo_off, data_len, pktinfo_len;
7156 	bool rsc_more = false;
7157 
7158 	/*
7159 	 * Check length.
7160 	 */
7161 	if (__predict_false(dlen < sizeof(*pkt))) {
7162 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7163 		return;
7164 	}
7165 	pkt = data;
7166 
7167 	if (__predict_false(dlen < pkt->rm_len)) {
7168 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7169 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7170 		return;
7171 	}
7172 	if (__predict_false(pkt->rm_len <
7173 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7174 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7175 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7176 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7177 		    pkt->rm_pktinfolen);
7178 		return;
7179 	}
7180 	if (__predict_false(pkt->rm_datalen == 0)) {
7181 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7182 		return;
7183 	}
7184 
7185 	/*
7186 	 * Check offsets.
7187 	 */
7188 #define IS_OFFSET_INVALID(ofs)			\
7189 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7190 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7191 
7192 	/* XXX Hyper-V does not meet data offset alignment requirement */
7193 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7194 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7195 		    "data offset %u\n", pkt->rm_dataoffset);
7196 		return;
7197 	}
7198 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7199 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7200 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7201 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7202 		return;
7203 	}
7204 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7205 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7206 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7207 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7208 		return;
7209 	}
7210 
7211 #undef IS_OFFSET_INVALID
7212 
7213 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7214 	data_len = pkt->rm_datalen;
7215 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7216 	pktinfo_len = pkt->rm_pktinfolen;
7217 
7218 	/*
7219 	 * Check OOB coverage.
7220 	 */
7221 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7222 		int oob_off, oob_len;
7223 
7224 		if_printf(rxr->hn_ifp, "got oobdata\n");
7225 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7226 		oob_len = pkt->rm_oobdatalen;
7227 
7228 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7229 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7230 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7231 			    pkt->rm_len, oob_off, oob_len);
7232 			return;
7233 		}
7234 
7235 		/*
7236 		 * Check against data.
7237 		 */
7238 		if (hn_rndis_check_overlap(oob_off, oob_len,
7239 		    data_off, data_len)) {
7240 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7241 			    "oob overlaps data, oob abs %d len %d, "
7242 			    "data abs %d len %d\n",
7243 			    oob_off, oob_len, data_off, data_len);
7244 			return;
7245 		}
7246 
7247 		/*
7248 		 * Check against pktinfo.
7249 		 */
7250 		if (pktinfo_len != 0 &&
7251 		    hn_rndis_check_overlap(oob_off, oob_len,
7252 		    pktinfo_off, pktinfo_len)) {
7253 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7254 			    "oob overlaps pktinfo, oob abs %d len %d, "
7255 			    "pktinfo abs %d len %d\n",
7256 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7257 			return;
7258 		}
7259 	}
7260 
7261 	/*
7262 	 * Check per-packet-info coverage and find useful per-packet-info.
7263 	 */
7264 	info.vlan_info = NULL;
7265 	info.csum_info = NULL;
7266 	info.hash_info = NULL;
7267 	info.pktinfo_id = NULL;
7268 
7269 	if (__predict_true(pktinfo_len != 0)) {
7270 		bool overlap;
7271 		int error;
7272 
7273 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7274 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7275 			    "pktinfo overflow, msglen %u, "
7276 			    "pktinfo abs %d len %d\n",
7277 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7278 			return;
7279 		}
7280 
7281 		/*
7282 		 * Check packet info coverage.
7283 		 */
7284 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7285 		    data_off, data_len);
7286 		if (__predict_false(overlap)) {
7287 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7288 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7289 			    "data abs %d len %d\n",
7290 			    pktinfo_off, pktinfo_len, data_off, data_len);
7291 			return;
7292 		}
7293 
7294 		/*
7295 		 * Find useful per-packet-info.
7296 		 */
7297 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7298 		    pktinfo_len, &info);
7299 		if (__predict_false(error)) {
7300 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7301 			    "pktinfo\n");
7302 			return;
7303 		}
7304 	}
7305 
7306 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7307 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7308 		    "data overflow, msglen %u, data abs %d len %d\n",
7309 		    pkt->rm_len, data_off, data_len);
7310 		return;
7311 	}
7312 
7313 	/* Identify RSC fragments, drop invalid packets */
7314 	if ((info.pktinfo_id != NULL) &&
7315 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7316 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7317 			rxr->rsc.cnt = 0;
7318 			rxr->hn_rsc_pkts++;
7319 		} else if (rxr->rsc.cnt == 0)
7320 			goto drop;
7321 
7322 		rsc_more = true;
7323 
7324 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7325 			rsc_more = false;
7326 
7327 		if (rsc_more && rxr->rsc.is_last)
7328 			goto drop;
7329 	} else {
7330 		rxr->rsc.cnt = 0;
7331 	}
7332 
7333 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7334 		goto drop;
7335 
7336 	/* Store the data in the per-RX-ring RSC structure */
7337 	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7338 	    data_len, &info);
7339 
7340 	if (rsc_more)
7341 		return;
7342 
7343 	hn_rxpkt(rxr);
7344 	rxr->rsc.cnt = 0;
7345 	return;
7346 drop:
7347 	rxr->hn_rsc_drop++;
7348 	return;
7349 }
7350 
7351 static __inline void
7352 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7353 {
7354 	const struct rndis_msghdr *hdr;
7355 
7356 	if (__predict_false(dlen < sizeof(*hdr))) {
7357 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7358 		return;
7359 	}
7360 	hdr = data;
7361 
7362 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7363 		/* Hot data path. */
7364 		hn_rndis_rx_data(rxr, data, dlen);
7365 		/* Done! */
7366 		return;
7367 	}
7368 
7369 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7370 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7371 	else
7372 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7373 }
7374 
7375 static void
7376 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7377 {
7378 	const struct hn_nvs_hdr *hdr;
7379 
7380 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7381 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7382 		return;
7383 	}
7384 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7385 
7386 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7387 		/* Useless; ignore */
7388 		return;
7389 	}
7390 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7391 }
7392 
7393 static void
7394 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7395     const struct vmbus_chanpkt_hdr *pkt)
7396 {
7397 	struct hn_nvs_sendctx *sndc;
7398 
7399 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7400 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7401 	    VMBUS_CHANPKT_DATALEN(pkt));
7402 	/*
7403 	 * NOTE:
7404 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7405 	 * its callback.
7406 	 */
7407 }
7408 
7409 static void
7410 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7411     const struct vmbus_chanpkt_hdr *pkthdr)
7412 {
7413 	struct epoch_tracker et;
7414 	const struct vmbus_chanpkt_rxbuf *pkt;
7415 	const struct hn_nvs_hdr *nvs_hdr;
7416 	int count, i, hlen;
7417 
7418 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7419 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7420 		return;
7421 	}
7422 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7423 
7424 	/* Make sure that this is a RNDIS message. */
7425 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7426 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7427 		    nvs_hdr->nvs_type);
7428 		return;
7429 	}
7430 
7431 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7432 	if (__predict_false(hlen < sizeof(*pkt))) {
7433 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7434 		return;
7435 	}
7436 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7437 
7438 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7439 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7440 		    pkt->cp_rxbuf_id);
7441 		return;
7442 	}
7443 
7444 	count = pkt->cp_rxbuf_cnt;
7445 	if (__predict_false(hlen <
7446 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7447 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7448 		return;
7449 	}
7450 
7451 	NET_EPOCH_ENTER(et);
7452 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7453 	for (i = 0; i < count; ++i) {
7454 		int ofs, len;
7455 
7456 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7457 		len = pkt->cp_rxbuf[i].rb_len;
7458 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7459 			if_printf(rxr->hn_ifp, "RNDIS msg %d overflows rxbuf, "
7460 			    "ofs %d, len %d\n", i, ofs, len);
7461 			continue;
7462 		}
7463 
7464 		rxr->rsc.is_last = (i == (count - 1));
7465 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7466 	}
7467 	NET_EPOCH_EXIT(et);
7468 
7469 	/*
7470 	 * Ack the consumed RXBUF associated w/ this channel packet,
7471 	 * so that this RXBUF can be recycled by the hypervisor.
7472 	 */
7473 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7474 }
7475 
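/*
 * Ack a consumed RXBUF back to the host.  EAGAIN from vmbus_chan_send()
 * indicates the channel's TX bufring is full; retry a bounded number of
 * times before giving up, which leaks the RXBUF.
 */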
7476 static void
7477 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7478     uint64_t tid)
7479 {
7480 	struct hn_nvs_rndis_ack ack;
7481 	int retries, error;
7482 
7483 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7484 	ack.nvs_status = HN_NVS_STATUS_OK;
7485 
7486 	retries = 0;
7487 again:
7488 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7489 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7490 	if (__predict_false(error == EAGAIN)) {
7491 		/*
7492 		 * NOTE:
7493 		 * This should _not_ happen in the real world, since the
7494 		 * consumption of the TX bufring from the TX path is
7495 		 * controlled.
7496 		 */
7497 		if (rxr->hn_ack_failed == 0)
7498 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7499 		rxr->hn_ack_failed++;
7500 		retries++;
7501 		if (retries < 10) {
7502 			DELAY(100);
7503 			goto again;
7504 		}
7505 		/* RXBUF leaks! */
7506 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7507 	}
7508 }
7509 
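/*
 * Per-channel receive callback.  Drain the channel: grow the per-ring
 * packet buffer on ENOBUFS, stop on EAGAIN (no more packets), and
 * dispatch each received packet by type (completion, RXBUF or inband
 * notification).
 */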
7510 static void
7511 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7512 {
7513 	struct hn_rx_ring *rxr = xrxr;
7514 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7515 
7516 	for (;;) {
7517 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7518 		int error, pktlen;
7519 
7520 		pktlen = rxr->hn_pktbuf_len;
7521 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7522 		if (__predict_false(error == ENOBUFS)) {
7523 			void *nbuf;
7524 			int nlen;
7525 
7526 			/*
7527 			 * Expand channel packet buffer.
7528 			 *
7529 			 * XXX
7530 			 * Use M_WAITOK here, since allocation failure
7531 			 * is fatal.
7532 			 */
7533 			nlen = rxr->hn_pktbuf_len * 2;
7534 			while (nlen < pktlen)
7535 				nlen *= 2;
7536 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7537 
7538 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7539 			    rxr->hn_pktbuf_len, nlen);
7540 
7541 			free(rxr->hn_pktbuf, M_DEVBUF);
7542 			rxr->hn_pktbuf = nbuf;
7543 			rxr->hn_pktbuf_len = nlen;
7544 			/* Retry! */
7545 			continue;
7546 		} else if (__predict_false(error == EAGAIN)) {
7547 			/* No more channel packets; done! */
7548 			break;
7549 		}
7550 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7551 
7552 		switch (pkt->cph_type) {
7553 		case VMBUS_CHANPKT_TYPE_COMP:
7554 			hn_nvs_handle_comp(sc, chan, pkt);
7555 			break;
7556 
7557 		case VMBUS_CHANPKT_TYPE_RXBUF:
7558 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7559 			break;
7560 
7561 		case VMBUS_CHANPKT_TYPE_INBAND:
7562 			hn_nvs_handle_notify(sc, pkt);
7563 			break;
7564 
7565 		default:
7566 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7567 			    pkt->cph_type);
7568 			break;
7569 		}
7570 	}
7571 	hn_chan_rollup(rxr, rxr->hn_txr);
7572 }
7573 
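/*
 * Driver-wide initialization: allocate the hn_udpcs_fixup counter,
 * sanity check the if_start/transparent-VF and VF attach wait tunables,
 * set up the VF map, clamp the TX taskqueue count and mode, and create
 * the global TX taskqueues when that mode is selected on a Hyper-V guest.
 */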
7574 static void
7575 hn_sysinit(void *arg __unused)
7576 {
7577 	int i;
7578 
7579 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7580 
7581 #ifdef HN_IFSTART_SUPPORT
7582 	/*
7583 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7584 	 * mainly due to the IFF_DRV_OACTIVE flag.
7585 	 */
7586 	if (hn_xpnt_vf && hn_use_if_start) {
7587 		hn_use_if_start = 0;
7588 		printf("hn: transparent VF mode, if_transmit will be used, "
7589 		    "instead of if_start\n");
7590 	}
7591 #endif
7592 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7593 		printf("hn: invalid transparent VF attach routing "
7594 		    "wait timeout %d, reset to %d\n",
7595 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7596 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7597 	}
7598 
7599 	/*
7600 	 * Initialize VF map.
7601 	 */
7602 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7603 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7604 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7605 	    M_WAITOK | M_ZERO);
7606 
7607 	/*
7608 	 * Fix the # of TX taskqueues.
7609 	 */
7610 	if (hn_tx_taskq_cnt <= 0)
7611 		hn_tx_taskq_cnt = 1;
7612 	else if (hn_tx_taskq_cnt > mp_ncpus)
7613 		hn_tx_taskq_cnt = mp_ncpus;
7614 
7615 	/*
7616 	 * Fix the TX taskqueue mode.
7617 	 */
7618 	switch (hn_tx_taskq_mode) {
7619 	case HN_TX_TASKQ_M_INDEP:
7620 	case HN_TX_TASKQ_M_GLOBAL:
7621 	case HN_TX_TASKQ_M_EVTTQ:
7622 		break;
7623 	default:
7624 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7625 		break;
7626 	}
7627 
7628 	if (vm_guest != VM_GUEST_HV)
7629 		return;
7630 
7631 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7632 		return;
7633 
7634 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7635 	    M_DEVBUF, M_WAITOK);
7636 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7637 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7638 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7639 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7640 		    "hn tx%d", i);
7641 	}
7642 }
7643 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7644 
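/*
 * Driver-wide teardown: free the global TX taskqueues, the VF map and
 * its lock, and the hn_udpcs_fixup counter.
 */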
7645 static void
7646 hn_sysuninit(void *arg __unused)
7647 {
7648 
7649 	if (hn_tx_taskque != NULL) {
7650 		int i;
7651 
7652 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7653 			taskqueue_free(hn_tx_taskque[i]);
7654 		free(hn_tx_taskque, M_DEVBUF);
7655 	}
7656 
7657 	if (hn_vfmap != NULL)
7658 		free(hn_vfmap, M_DEVBUF);
7659 	rm_destroy(&hn_vfmap_lock);
7660 
7661 	counter_u64_free(hn_udpcs_fixup);
7662 }
7663 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7664