xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision f2b7bf8afcfd630e0fbd8417f1ce974de79feaf0)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 #include <sys/eventhandler.h>
81 
82 #include <machine/atomic.h>
83 #include <machine/in_cksum.h>
84 
85 #include <net/bpf.h>
86 #include <net/ethernet.h>
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_media.h>
90 #include <net/if_types.h>
91 #include <net/if_var.h>
92 #include <net/rndis.h>
93 #ifdef RSS
94 #include <net/rss_config.h>
95 #endif
96 
97 #include <netinet/in_systm.h>
98 #include <netinet/in.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip6.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_lro.h>
103 #include <netinet/udp.h>
104 
105 #include <dev/hyperv/include/hyperv.h>
106 #include <dev/hyperv/include/hyperv_busdma.h>
107 #include <dev/hyperv/include/vmbus.h>
108 #include <dev/hyperv/include/vmbus_xact.h>
109 
110 #include <dev/hyperv/netvsc/ndis.h>
111 #include <dev/hyperv/netvsc/if_hnreg.h>
112 #include <dev/hyperv/netvsc/if_hnvar.h>
113 #include <dev/hyperv/netvsc/hn_nvs.h>
114 #include <dev/hyperv/netvsc/hn_rndis.h>
115 
116 #include "vmbus_if.h"
117 
/*
 * Compile in the if_start based transmit path.  The hn_start*()
 * functions and the hw.hn.use_if_start tunable in this file are
 * guarded by this macro.
 */
#define HN_IFSTART_SUPPORT

/* Cap on the default ring count -- NOTE(review): inferred from the name. */
#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

/*
 * Bytes reserved for the RNDIS packet message that precedes each
 * packet: the fixed message header plus one per-packet-info slot each
 * for the hash value, VLAN, LSO2 and TX checksum information.
 */
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

/*
 * Boundary, total size and per-segment size for TX packet data --
 * NOTE(review): presumably busdma tag parameters; confirm at the
 * tag creation site.
 */
#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

/* Default for the hw.hn.direct_tx_size tunable. */
#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

/* Default for the hw.hn.lro_entry_count tunable. */
#define HN_LROENT_CNT_DEF		128

/* LRO aggregation length limits, expressed in multiples of the MTU. */
#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
154 
/*
 * Softc-wide lock: an sx lock kept in hn_lock and named after the
 * device's name and unit.
 */
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
/* Assert that the lock is held exclusively. */
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
/*
 * Take the lock exclusively.  This does not block inside the sx lock;
 * it retries sx_try_xlock() and calls DELAY(1000) between attempts
 * until the lock is obtained.
 * NOTE(review): the reason for polling instead of sleeping is not
 * visible in this part of the file -- confirm before changing it.
 */
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
165 
/* Checksum offload request bits, grouped by address family. */
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
/*
 * Checksum assist bits of each family, as recorded on TX ring 0.
 * NOTE(review): assumes all TX rings carry the same hn_csum_assist.
 */
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

/*
 * Size of the smallest packet (minimum Ethernet frame plus a VLAN tag,
 * without the CRC) including its RNDIS packet message, rounded up to
 * 'align'.
 * NOTE(review): roundup2() presumably requires 'align' to be a power
 * of two -- confirm.
 */
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
/* Same computation for the packet held in mbuf chain 'm'. */
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

/*
 * Map ring index 'idx' to a CPU: through the RSS bucket table when the
 * kernel is built with RSS, otherwise round-robin over all CPUs
 * starting at the softc's hn_cpu.
 */
#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
184 
/*
 * TX descriptor: the driver's bookkeeping for one packet being sent
 * (or for a group of packets when transmission aggregation is used).
 */
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	/* Linkage for the SLIST based descriptor list (non-buf_ring build). */
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	/* Linkage on another descriptor's agg_list. */
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;	/* TX ring this descriptor is tied to */
	int				refs;	/* presumably a reference count -- TODO confirm */
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx; /* passed to hn_nvs_send*() with the packet */
	/*
	 * Chimney sending buffer slot and the number of bytes used in
	 * it.  HN_NVS_CHIM_IDX_INVALID and 0 when the packet is sent
	 * through a scatter/gather list instead.
	 */
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	/* RNDIS packet message buffer: bus address, pointer and DMA map. */
	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

/*
 * hn_txdesc.flags bits.
 * NOTE(review): meanings below are inferred from the names -- on the
 * free list, data DMA map loaded, linked on an agg_list.
 */
#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
213 
/*
 * Per-packet information of a received packet, one 32-bit word per
 * item; filled in by hn_rndis_rxinfo() and consumed by hn_rxpkt()
 * according to their prototypes.
 * NOTE(review): presumably the raw NDIS per-packet-info values --
 * confirm against hn_rndis_rxinfo().
 */
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};
220 
/*
 * Argument of hn_update_vf_task(): the RX ring whose hn_vf field is to
 * be set, and the ifnet pointer to store there.
 */
struct hn_update_vf {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf;
};
225 
/*
 * One bit per struct hn_rxinfo item.
 * NOTE(review): presumably used as a "which items were found" mask
 * while parsing per-packet info -- confirm in hn_rndis_rxinfo().
 */
#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
/* All of the above. */
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

/* Values standing for "this item is not present". */
#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
239 
240 static int			hn_probe(device_t);
241 static int			hn_attach(device_t);
242 static int			hn_detach(device_t);
243 static int			hn_shutdown(device_t);
244 static void			hn_chan_callback(struct vmbus_channel *,
245 				    void *);
246 
247 static void			hn_init(void *);
248 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
249 #ifdef HN_IFSTART_SUPPORT
250 static void			hn_start(struct ifnet *);
251 #endif
252 static int			hn_transmit(struct ifnet *, struct mbuf *);
253 static void			hn_xmit_qflush(struct ifnet *);
254 static int			hn_ifmedia_upd(struct ifnet *);
255 static void			hn_ifmedia_sts(struct ifnet *,
256 				    struct ifmediareq *);
257 
258 static int			hn_rndis_rxinfo(const void *, int,
259 				    struct hn_rxinfo *);
260 static void			hn_rndis_rx_data(struct hn_rx_ring *,
261 				    const void *, int);
262 static void			hn_rndis_rx_status(struct hn_softc *,
263 				    const void *, int);
264 static void			hn_rndis_init_fixat(struct hn_softc *, int);
265 
266 static void			hn_nvs_handle_notify(struct hn_softc *,
267 				    const struct vmbus_chanpkt_hdr *);
268 static void			hn_nvs_handle_comp(struct hn_softc *,
269 				    struct vmbus_channel *,
270 				    const struct vmbus_chanpkt_hdr *);
271 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
272 				    struct vmbus_channel *,
273 				    const struct vmbus_chanpkt_hdr *);
274 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
275 				    struct vmbus_channel *, uint64_t);
276 
277 #if __FreeBSD_version >= 1100099
278 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
279 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
280 #endif
281 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
282 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
283 #if __FreeBSD_version < 1100095
284 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
285 #else
286 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
287 #endif
288 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
290 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
291 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
294 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
295 #ifndef RSS
296 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
297 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
298 #endif
299 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
300 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
301 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
302 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
303 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
304 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
305 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
306 
307 static void			hn_stop(struct hn_softc *, bool);
308 static void			hn_init_locked(struct hn_softc *);
309 static int			hn_chan_attach(struct hn_softc *,
310 				    struct vmbus_channel *);
311 static void			hn_chan_detach(struct hn_softc *,
312 				    struct vmbus_channel *);
313 static int			hn_attach_subchans(struct hn_softc *);
314 static void			hn_detach_allchans(struct hn_softc *);
315 static void			hn_chan_rollup(struct hn_rx_ring *,
316 				    struct hn_tx_ring *);
317 static void			hn_set_ring_inuse(struct hn_softc *, int);
318 static int			hn_synth_attach(struct hn_softc *, int);
319 static void			hn_synth_detach(struct hn_softc *);
320 static int			hn_synth_alloc_subchans(struct hn_softc *,
321 				    int *);
322 static bool			hn_synth_attachable(const struct hn_softc *);
323 static void			hn_suspend(struct hn_softc *);
324 static void			hn_suspend_data(struct hn_softc *);
325 static void			hn_suspend_mgmt(struct hn_softc *);
326 static void			hn_resume(struct hn_softc *);
327 static void			hn_resume_data(struct hn_softc *);
328 static void			hn_resume_mgmt(struct hn_softc *);
329 static void			hn_suspend_mgmt_taskfunc(void *, int);
330 static void			hn_chan_drain(struct hn_softc *,
331 				    struct vmbus_channel *);
332 static void			hn_disable_rx(struct hn_softc *);
333 static void			hn_drain_rxtx(struct hn_softc *, int);
334 static void			hn_polling(struct hn_softc *, u_int);
335 static void			hn_chan_polling(struct vmbus_channel *, u_int);
336 
337 static void			hn_update_link_status(struct hn_softc *);
338 static void			hn_change_network(struct hn_softc *);
339 static void			hn_link_taskfunc(void *, int);
340 static void			hn_netchg_init_taskfunc(void *, int);
341 static void			hn_netchg_status_taskfunc(void *, int);
342 static void			hn_link_status(struct hn_softc *);
343 
344 static int			hn_create_rx_data(struct hn_softc *, int);
345 static void			hn_destroy_rx_data(struct hn_softc *);
346 static int			hn_check_iplen(const struct mbuf *, int);
347 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
348 static int			hn_rxfilter_config(struct hn_softc *);
349 #ifndef RSS
350 static int			hn_rss_reconfig(struct hn_softc *);
351 #endif
352 static void			hn_rss_ind_fixup(struct hn_softc *);
353 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
354 				    int, const struct hn_rxinfo *);
355 
356 static int			hn_tx_ring_create(struct hn_softc *, int);
357 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
358 static int			hn_create_tx_data(struct hn_softc *, int);
359 static void			hn_fixup_tx_data(struct hn_softc *);
360 static void			hn_destroy_tx_data(struct hn_softc *);
361 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
362 static void			hn_txdesc_gc(struct hn_tx_ring *,
363 				    struct hn_txdesc *);
364 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
365 				    struct hn_txdesc *, struct mbuf **);
366 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
367 				    struct hn_txdesc *);
368 static void			hn_set_chim_size(struct hn_softc *, int);
369 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
370 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
371 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
372 static void			hn_resume_tx(struct hn_softc *, int);
373 static void			hn_set_txagg(struct hn_softc *);
374 static void			*hn_try_txagg(struct ifnet *,
375 				    struct hn_tx_ring *, struct hn_txdesc *,
376 				    int);
377 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
378 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
379 				    struct hn_softc *, struct vmbus_channel *,
380 				    const void *, int);
381 static int			hn_txpkt_sglist(struct hn_tx_ring *,
382 				    struct hn_txdesc *);
383 static int			hn_txpkt_chim(struct hn_tx_ring *,
384 				    struct hn_txdesc *);
385 static int			hn_xmit(struct hn_tx_ring *, int);
386 static void			hn_xmit_taskfunc(void *, int);
387 static void			hn_xmit_txeof(struct hn_tx_ring *);
388 static void			hn_xmit_txeof_taskfunc(void *, int);
389 #ifdef HN_IFSTART_SUPPORT
390 static int			hn_start_locked(struct hn_tx_ring *, int);
391 static void			hn_start_taskfunc(void *, int);
392 static void			hn_start_txeof(struct hn_tx_ring *);
393 static void			hn_start_txeof_taskfunc(void *, int);
394 #endif
395 
396 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
397     "Hyper-V network interface");
398 
/* Trust TCP segment verification on the host side. */
400 static int			hn_trust_hosttcp = 1;
401 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
402     &hn_trust_hosttcp, 0,
403     "Trust tcp segement verification on host side, "
404     "when csum info is missing (global setting)");
405 
406 /* Trust udp datagrams verification on host side. */
407 static int			hn_trust_hostudp = 1;
408 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
409     &hn_trust_hostudp, 0,
410     "Trust udp datagram verification on host side, "
411     "when csum info is missing (global setting)");
412 
413 /* Trust ip packets verification on host side. */
414 static int			hn_trust_hostip = 1;
415 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
416     &hn_trust_hostip, 0,
417     "Trust ip packet verification on host side, "
418     "when csum info is missing (global setting)");
419 
420 /* Limit TSO burst size */
421 static int			hn_tso_maxlen = IP_MAXPACKET;
422 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
423     &hn_tso_maxlen, 0, "TSO burst limit");
424 
425 /* Limit chimney send size */
426 static int			hn_tx_chimney_size = 0;
427 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
428     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
429 
430 /* Limit the size of packet for direct transmission */
431 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
432 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
433     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
434 
435 /* # of LRO entries per RX ring */
436 #if defined(INET) || defined(INET6)
437 #if __FreeBSD_version >= 1100095
438 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
439 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
440     &hn_lro_entry_count, 0, "LRO entry count");
441 #endif
442 #endif
443 
444 static int			hn_tx_taskq_cnt = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
446     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
447 
448 #define HN_TX_TASKQ_M_INDEP	0
449 #define HN_TX_TASKQ_M_GLOBAL	1
450 #define HN_TX_TASKQ_M_EVTTQ	2
451 
452 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
453 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
454     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
455     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
456 
457 #ifndef HN_USE_TXDESC_BUFRING
458 static int			hn_use_txdesc_bufring = 0;
459 #else
460 static int			hn_use_txdesc_bufring = 1;
461 #endif
462 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
463     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
464 
465 #ifdef HN_IFSTART_SUPPORT
466 /* Use ifnet.if_start instead of ifnet.if_transmit */
467 static int			hn_use_if_start = 0;
468 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
469     &hn_use_if_start, 0, "Use if_start TX method");
470 #endif
471 
472 /* # of channels to use */
473 static int			hn_chan_cnt = 0;
474 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
475     &hn_chan_cnt, 0,
476     "# of channels to use; each channel has one RX ring and one TX ring");
477 
478 /* # of transmit rings to use */
479 static int			hn_tx_ring_cnt = 0;
480 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
481     &hn_tx_ring_cnt, 0, "# of TX rings to use");
482 
/* Software TX ring depth */
484 static int			hn_tx_swq_depth = 0;
485 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
486     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
487 
488 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
489 #if __FreeBSD_version >= 1100095
490 static u_int			hn_lro_mbufq_depth = 0;
491 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
492     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
493 #endif
494 
495 /* Packet transmission aggregation size limit */
496 static int			hn_tx_agg_size = -1;
497 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
498     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
499 
500 /* Packet transmission aggregation count limit */
501 static int			hn_tx_agg_pkts = -1;
502 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
503     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
504 
505 static u_int			hn_cpu_index;	/* next CPU for channel */
506 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
507 
#ifndef RSS
/*
 * Built-in Toeplitz hash key (NDIS_HASH_KEYSIZE_TOEPLITZ bytes).  Only
 * compiled when the kernel RSS option is not enabled.
 * NOTE(review): presumably the driver's default key in that
 * configuration -- confirm where it is copied into the softc.
 */
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */
518 
/* Device methods implemented by this driver. */
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

/* Driver description: name "hn", method table and per-device softc size. */
static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

/*
 * Register "hn" on the vmbus bus, and declare the module version and
 * its dependency on the vmbus module.
 */
DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
539 
#if __FreeBSD_version >= 1100099
/*
 * Store 'lenlim' as the LRO length limit of every RX ring that has
 * been created on 'sc'.
 */
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int ring = 0;

	while (ring < sc->hn_rx_ring_cnt) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[ring];

		rxr->hn_lro.lro_length_lim = lenlim;
		++ring;
	}
}
#endif
550 
551 static int
552 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
553 {
554 
555 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
556 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
557 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
558 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
559 }
560 
561 static int
562 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
563 {
564 	struct hn_nvs_rndis rndis;
565 
566 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
567 	    txd->chim_size > 0, ("invalid rndis chim txd"));
568 
569 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
570 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
571 	rndis.nvs_chim_idx = txd->chim_index;
572 	rndis.nvs_chim_sz = txd->chim_size;
573 
574 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
575 	    &rndis, sizeof(rndis), &txd->send_ctx));
576 }
577 
578 static __inline uint32_t
579 hn_chim_alloc(struct hn_softc *sc)
580 {
581 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
582 	u_long *bmap = sc->hn_chim_bmap;
583 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
584 
585 	for (i = 0; i < bmap_cnt; ++i) {
586 		int idx;
587 
588 		idx = ffsl(~bmap[i]);
589 		if (idx == 0)
590 			continue;
591 
592 		--idx; /* ffsl is 1-based */
593 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
594 		    ("invalid i %d and idx %d", i, idx));
595 
596 		if (atomic_testandset_long(&bmap[i], idx))
597 			continue;
598 
599 		ret = i * LONG_BIT + idx;
600 		break;
601 	}
602 	return (ret);
603 }
604 
605 static __inline void
606 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
607 {
608 	u_long mask;
609 	uint32_t idx;
610 
611 	idx = chim_idx / LONG_BIT;
612 	KASSERT(idx < sc->hn_chim_bmap_cnt,
613 	    ("invalid chimney index 0x%x", chim_idx));
614 
615 	mask = 1UL << (chim_idx % LONG_BIT);
616 	KASSERT(sc->hn_chim_bmap[idx] & mask,
617 	    ("index bitmap 0x%lx, chimney index %u, "
618 	     "bitmap idx %d, bitmask 0x%lx",
619 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
620 
621 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
622 }
623 
624 #if defined(INET6) || defined(INET)
625 
/*
 * Make sure the first 'len' bytes of mbuf chain 'm' are contiguous in
 * its first mbuf, calling m_pullup() only when they are not already.
 *
 * WARNING: this macro has hidden control flow and side effects:
 * - 'm' is reassigned to whatever m_pullup() returns, so pointers
 *   obtained from the chain before this call must not be assumed to
 *   still be valid afterwards;
 * - if m_pullup() returns NULL, the macro executes 'return (NULL)' in
 *   the enclosing function.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
634 
635 /*
636  * NOTE: If this function failed, the m_head would be freed.
637  */
638 static __inline struct mbuf *
639 hn_tso_fixup(struct mbuf *m_head)
640 {
641 	struct ether_vlan_header *evl;
642 	struct tcphdr *th;
643 	int ehlen;
644 
645 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
646 
647 	PULLUP_HDR(m_head, sizeof(*evl));
648 	evl = mtod(m_head, struct ether_vlan_header *);
649 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
650 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
651 	else
652 		ehlen = ETHER_HDR_LEN;
653 
654 #ifdef INET
655 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
656 		struct ip *ip;
657 		int iphlen;
658 
659 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
660 		ip = mtodo(m_head, ehlen);
661 		iphlen = ip->ip_hl << 2;
662 
663 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
664 		th = mtodo(m_head, ehlen + iphlen);
665 
666 		ip->ip_len = 0;
667 		ip->ip_sum = 0;
668 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
669 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
670 	}
671 #endif
672 #if defined(INET6) && defined(INET)
673 	else
674 #endif
675 #ifdef INET6
676 	{
677 		struct ip6_hdr *ip6;
678 
679 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
680 		ip6 = mtodo(m_head, ehlen);
681 		if (ip6->ip6_nxt != IPPROTO_TCP) {
682 			m_freem(m_head);
683 			return (NULL);
684 		}
685 
686 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
687 		th = mtodo(m_head, ehlen + sizeof(*ip6));
688 
689 		ip6->ip6_plen = 0;
690 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
691 	}
692 #endif
693 	return (m_head);
694 
695 }
696 
697 /*
698  * NOTE: If this function failed, the m_head would be freed.
699  */
700 static __inline struct mbuf *
701 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
702 {
703 	const struct ether_vlan_header *evl;
704 	const struct tcphdr *th;
705 	int ehlen;
706 
707 	*tcpsyn = 0;
708 
709 	PULLUP_HDR(m_head, sizeof(*evl));
710 	evl = mtod(m_head, const struct ether_vlan_header *);
711 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
712 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
713 	else
714 		ehlen = ETHER_HDR_LEN;
715 
716 #ifdef INET
717 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
718 		const struct ip *ip;
719 		int iphlen;
720 
721 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
722 		ip = mtodo(m_head, ehlen);
723 		iphlen = ip->ip_hl << 2;
724 
725 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
726 		th = mtodo(m_head, ehlen + iphlen);
727 		if (th->th_flags & TH_SYN)
728 			*tcpsyn = 1;
729 	}
730 #endif
731 #if defined(INET6) && defined(INET)
732 	else
733 #endif
734 #ifdef INET6
735 	{
736 		const struct ip6_hdr *ip6;
737 
738 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
739 		ip6 = mtodo(m_head, ehlen);
740 		if (ip6->ip6_nxt != IPPROTO_TCP)
741 			return (m_head);
742 
743 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
744 		th = mtodo(m_head, ehlen + sizeof(*ip6));
745 		if (th->th_flags & TH_SYN)
746 			*tcpsyn = 1;
747 	}
748 #endif
749 	return (m_head);
750 }
751 
752 #undef PULLUP_HDR
753 
754 #endif	/* INET6 || INET */
755 
756 static int
757 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
758 {
759 	int error = 0;
760 
761 	HN_LOCK_ASSERT(sc);
762 
763 	if (sc->hn_rx_filter != filter) {
764 		error = hn_rndis_set_rxfilter(sc, filter);
765 		if (!error)
766 			sc->hn_rx_filter = filter;
767 	}
768 	return (error);
769 }
770 
771 static int
772 hn_rxfilter_config(struct hn_softc *sc)
773 {
774 	struct ifnet *ifp = sc->hn_ifp;
775 	uint32_t filter;
776 
777 	HN_LOCK_ASSERT(sc);
778 
779 	if ((ifp->if_flags & IFF_PROMISC) ||
780 	    (sc->hn_flags & HN_FLAG_VF)) {
781 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
782 	} else {
783 		filter = NDIS_PACKET_TYPE_DIRECTED;
784 		if (ifp->if_flags & IFF_BROADCAST)
785 			filter |= NDIS_PACKET_TYPE_BROADCAST;
786 		/* TODO: support multicast list */
787 		if ((ifp->if_flags & IFF_ALLMULTI) ||
788 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
789 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
790 	}
791 	return (hn_set_rxfilter(sc, filter));
792 }
793 
794 static void
795 hn_set_txagg(struct hn_softc *sc)
796 {
797 	uint32_t size, pkts;
798 	int i;
799 
800 	/*
801 	 * Setup aggregation size.
802 	 */
803 	if (sc->hn_agg_size < 0)
804 		size = UINT32_MAX;
805 	else
806 		size = sc->hn_agg_size;
807 
808 	if (sc->hn_rndis_agg_size < size)
809 		size = sc->hn_rndis_agg_size;
810 
811 	/* NOTE: We only aggregate packets using chimney sending buffers. */
812 	if (size > (uint32_t)sc->hn_chim_szmax)
813 		size = sc->hn_chim_szmax;
814 
815 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
816 		/* Disable */
817 		size = 0;
818 		pkts = 0;
819 		goto done;
820 	}
821 
822 	/* NOTE: Type of the per TX ring setting is 'int'. */
823 	if (size > INT_MAX)
824 		size = INT_MAX;
825 
826 	/*
827 	 * Setup aggregation packet count.
828 	 */
829 	if (sc->hn_agg_pkts < 0)
830 		pkts = UINT32_MAX;
831 	else
832 		pkts = sc->hn_agg_pkts;
833 
834 	if (sc->hn_rndis_agg_pkts < pkts)
835 		pkts = sc->hn_rndis_agg_pkts;
836 
837 	if (pkts <= 1) {
838 		/* Disable */
839 		size = 0;
840 		pkts = 0;
841 		goto done;
842 	}
843 
844 	/* NOTE: Type of the per TX ring setting is 'short'. */
845 	if (pkts > SHRT_MAX)
846 		pkts = SHRT_MAX;
847 
848 done:
849 	/* NOTE: Type of the per TX ring setting is 'short'. */
850 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
851 		/* Disable */
852 		size = 0;
853 		pkts = 0;
854 	}
855 
856 	if (bootverbose) {
857 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
858 		    size, pkts, sc->hn_rndis_agg_align);
859 	}
860 
861 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
862 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
863 
864 		mtx_lock(&txr->hn_tx_lock);
865 		txr->hn_agg_szmax = size;
866 		txr->hn_agg_pktmax = pkts;
867 		txr->hn_agg_align = sc->hn_rndis_agg_align;
868 		mtx_unlock(&txr->hn_tx_lock);
869 	}
870 }
871 
872 static int
873 hn_get_txswq_depth(const struct hn_tx_ring *txr)
874 {
875 
876 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
877 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
878 		return txr->hn_txdesc_cnt;
879 	return hn_tx_swq_depth;
880 }
881 
882 #ifndef RSS
883 static int
884 hn_rss_reconfig(struct hn_softc *sc)
885 {
886 	int error;
887 
888 	HN_LOCK_ASSERT(sc);
889 
890 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
891 		return (ENXIO);
892 
893 	/*
894 	 * Disable RSS first.
895 	 *
896 	 * NOTE:
897 	 * Direct reconfiguration by setting the UNCHG flags does
898 	 * _not_ work properly.
899 	 */
900 	if (bootverbose)
901 		if_printf(sc->hn_ifp, "disable RSS\n");
902 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
903 	if (error) {
904 		if_printf(sc->hn_ifp, "RSS disable failed\n");
905 		return (error);
906 	}
907 
908 	/*
909 	 * Reenable the RSS w/ the updated RSS key or indirect
910 	 * table.
911 	 */
912 	if (bootverbose)
913 		if_printf(sc->hn_ifp, "reconfig RSS\n");
914 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
915 	if (error) {
916 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
917 		return (error);
918 	}
919 	return (0);
920 }
921 #endif	/* !RSS */
922 
923 static void
924 hn_rss_ind_fixup(struct hn_softc *sc)
925 {
926 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
927 	int i, nchan;
928 
929 	nchan = sc->hn_rx_ring_inuse;
930 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
931 
932 	/*
933 	 * Check indirect table to make sure that all channels in it
934 	 * can be used.
935 	 */
936 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
937 		if (rss->rss_ind[i] >= nchan) {
938 			if_printf(sc->hn_ifp,
939 			    "RSS indirect table %d fixup: %u -> %d\n",
940 			    i, rss->rss_ind[i], nchan - 1);
941 			rss->rss_ind[i] = nchan - 1;
942 		}
943 	}
944 }
945 
946 static int
947 hn_ifmedia_upd(struct ifnet *ifp __unused)
948 {
949 
950 	return EOPNOTSUPP;
951 }
952 
953 static void
954 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
955 {
956 	struct hn_softc *sc = ifp->if_softc;
957 
958 	ifmr->ifm_status = IFM_AVALID;
959 	ifmr->ifm_active = IFM_ETHER;
960 
961 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
962 		ifmr->ifm_active |= IFM_NONE;
963 		return;
964 	}
965 	ifmr->ifm_status |= IFM_ACTIVE;
966 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
967 }
968 
969 static void
970 hn_update_vf_task(void *arg, int pending __unused)
971 {
972 	struct hn_update_vf *uv = arg;
973 
974 	uv->rxr->hn_vf = uv->vf;
975 }
976 
977 static void
978 hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
979 {
980 	struct hn_rx_ring *rxr;
981 	struct hn_update_vf uv;
982 	struct task task;
983 	int i;
984 
985 	HN_LOCK_ASSERT(sc);
986 
987 	TASK_INIT(&task, 0, hn_update_vf_task, &uv);
988 
989 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
990 		rxr = &sc->hn_rx_ring[i];
991 
992 		if (i < sc->hn_rx_ring_inuse) {
993 			uv.rxr = rxr;
994 			uv.vf = vf;
995 			vmbus_chan_run_task(rxr->hn_chan, &task);
996 		} else {
997 			rxr->hn_vf = vf;
998 		}
999 	}
1000 }
1001 
1002 static void
1003 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
1004 {
1005 	struct ifnet *hn_ifp;
1006 
1007 	HN_LOCK(sc);
1008 
1009 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1010 		goto out;
1011 
1012 	hn_ifp = sc->hn_ifp;
1013 
1014 	if (ifp == hn_ifp)
1015 		goto out;
1016 
1017 	if (ifp->if_alloctype != IFT_ETHER)
1018 		goto out;
1019 
1020 	/* Ignore lagg/vlan interfaces */
1021 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1022 	    strcmp(ifp->if_dname, "vlan") == 0)
1023 		goto out;
1024 
1025 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1026 		goto out;
1027 
1028 	/* Now we're sure 'ifp' is a real VF device. */
1029 	if (vf) {
1030 		if (sc->hn_flags & HN_FLAG_VF)
1031 			goto out;
1032 
1033 		sc->hn_flags |= HN_FLAG_VF;
1034 		hn_rxfilter_config(sc);
1035 	} else {
1036 		if (!(sc->hn_flags & HN_FLAG_VF))
1037 			goto out;
1038 
1039 		sc->hn_flags &= ~HN_FLAG_VF;
1040 		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1041 			hn_rxfilter_config(sc);
1042 		else
1043 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1044 	}
1045 
1046 	hn_nvs_set_datapath(sc,
1047 	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
1048 
1049 	hn_update_vf(sc, vf ? ifp : NULL);
1050 
1051 	if (vf) {
1052 		hn_suspend_mgmt(sc);
1053 		sc->hn_link_flags &=
1054 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1055 		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1056 	} else {
1057 		hn_resume_mgmt(sc);
1058 	}
1059 
1060 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1061 	    vf ? "VF_UP" : "VF_DOWN", NULL);
1062 
1063 	if (bootverbose)
1064 		if_printf(hn_ifp, "Data path is switched %s %s\n",
1065 		    vf ? "to" : "from", if_name(ifp));
1066 out:
1067 	HN_UNLOCK(sc);
1068 }
1069 
1070 static void
1071 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1072 {
1073 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1074 		return;
1075 
1076 	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
1077 }
1078 
1079 static void
1080 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1081 {
1082 	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
1083 }
1084 
1085 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1086 static const struct hyperv_guid g_net_vsc_device_type = {
1087 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1088 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1089 };
1090 
1091 static int
1092 hn_probe(device_t dev)
1093 {
1094 
1095 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1096 	    &g_net_vsc_device_type) == 0) {
1097 		device_set_desc(dev, "Hyper-V Network Interface");
1098 		return BUS_PROBE_DEFAULT;
1099 	}
1100 	return ENXIO;
1101 }
1102 
/*
 * Device attach method.
 *
 * In order, this:
 *  - initializes the softc lock and copies the aggregation tunables;
 *  - sets up the TX taskqueues (per-device or the global ones,
 *    depending on hn_tx_taskq_mode) and the management taskqueue;
 *  - allocates and names the ifnet, and initializes ifmedia;
 *  - decides how many RX/TX rings to create and creates them;
 *  - creates the NVS/RNDIS transaction context and installs the
 *    primary channel's orphan handler;
 *  - attaches the synthetic parts and fetches the Ethernet address;
 *  - registers the per-device sysctl nodes;
 *  - fills in the ifnet methods and capabilities, and attaches the
 *    Ethernet interface;
 *  - kicks off a link status check and registers the ifnet/ifaddr
 *    event handlers.
 *
 * Returns 0 on success.  On any failure the synthetic parts are
 * detached, if they were attached, hn_detach() is called to release
 * whatever has been set up so far, and the error is returned.
 */
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	/*
	 * Register the per-device sysctl nodes.
	 */
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	/*
	 * Derive the TX checksum/TSO capabilities from the first TX
	 * ring's checksum assist bits and the device capabilities.
	 */
	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);

	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	/*
	 * Common failure path: undo the synthetic attach, if it was
	 * done, then let hn_detach() release everything else.
	 */
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
1426 
1427 static int
1428 hn_detach(device_t dev)
1429 {
1430 	struct hn_softc *sc = device_get_softc(dev);
1431 	struct ifnet *ifp = sc->hn_ifp;
1432 
1433 	if (sc->hn_ifaddr_evthand != NULL)
1434 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
1435 	if (sc->hn_ifnet_evthand != NULL)
1436 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
1437 
1438 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1439 		/*
1440 		 * In case that the vmbus missed the orphan handler
1441 		 * installation.
1442 		 */
1443 		vmbus_xact_ctx_orphan(sc->hn_xact);
1444 	}
1445 
1446 	if (device_is_attached(dev)) {
1447 		HN_LOCK(sc);
1448 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1449 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1450 				hn_stop(sc, true);
1451 			/*
1452 			 * NOTE:
1453 			 * hn_stop() only suspends data, so managment
1454 			 * stuffs have to be suspended manually here.
1455 			 */
1456 			hn_suspend_mgmt(sc);
1457 			hn_synth_detach(sc);
1458 		}
1459 		HN_UNLOCK(sc);
1460 		ether_ifdetach(ifp);
1461 	}
1462 
1463 	ifmedia_removeall(&sc->hn_media);
1464 	hn_destroy_rx_data(sc);
1465 	hn_destroy_tx_data(sc);
1466 
1467 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1468 		int i;
1469 
1470 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1471 			taskqueue_free(sc->hn_tx_taskqs[i]);
1472 		free(sc->hn_tx_taskqs, M_DEVBUF);
1473 	}
1474 	taskqueue_free(sc->hn_mgmt_taskq0);
1475 
1476 	if (sc->hn_xact != NULL) {
1477 		/*
1478 		 * Uninstall the orphan handler _before_ the xact is
1479 		 * destructed.
1480 		 */
1481 		vmbus_chan_unset_orphan(sc->hn_prichan);
1482 		vmbus_xact_ctx_destroy(sc->hn_xact);
1483 	}
1484 
1485 	if_free(ifp);
1486 
1487 	HN_LOCK_DESTROY(sc);
1488 	return (0);
1489 }
1490 
1491 static int
1492 hn_shutdown(device_t dev)
1493 {
1494 
1495 	return (0);
1496 }
1497 
1498 static void
1499 hn_link_status(struct hn_softc *sc)
1500 {
1501 	uint32_t link_status;
1502 	int error;
1503 
1504 	error = hn_rndis_get_linkstatus(sc, &link_status);
1505 	if (error) {
1506 		/* XXX what to do? */
1507 		return;
1508 	}
1509 
1510 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1511 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1512 	else
1513 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1514 	if_link_state_change(sc->hn_ifp,
1515 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1516 	    LINK_STATE_UP : LINK_STATE_DOWN);
1517 }
1518 
1519 static void
1520 hn_link_taskfunc(void *xsc, int pending __unused)
1521 {
1522 	struct hn_softc *sc = xsc;
1523 
1524 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1525 		return;
1526 	hn_link_status(sc);
1527 }
1528 
1529 static void
1530 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1531 {
1532 	struct hn_softc *sc = xsc;
1533 
1534 	/* Prevent any link status checks from running. */
1535 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1536 
1537 	/*
1538 	 * Fake up a [link down --> link up] state change; 5 seconds
1539 	 * delay is used, which closely simulates miibus reaction
1540 	 * upon link down event.
1541 	 */
1542 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1543 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1544 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1545 	    &sc->hn_netchg_status, 5 * hz);
1546 }
1547 
1548 static void
1549 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1550 {
1551 	struct hn_softc *sc = xsc;
1552 
1553 	/* Re-allow link status checks. */
1554 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1555 	hn_link_status(sc);
1556 }
1557 
1558 static void
1559 hn_update_link_status(struct hn_softc *sc)
1560 {
1561 
1562 	if (sc->hn_mgmt_taskq != NULL)
1563 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1564 }
1565 
1566 static void
1567 hn_change_network(struct hn_softc *sc)
1568 {
1569 
1570 	if (sc->hn_mgmt_taskq != NULL)
1571 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1572 }
1573 
1574 static __inline int
1575 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1576     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1577 {
1578 	struct mbuf *m = *m_head;
1579 	int error;
1580 
1581 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1582 
1583 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1584 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1585 	if (error == EFBIG) {
1586 		struct mbuf *m_new;
1587 
1588 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1589 		if (m_new == NULL)
1590 			return ENOBUFS;
1591 		else
1592 			*m_head = m = m_new;
1593 		txr->hn_tx_collapsed++;
1594 
1595 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1596 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1597 	}
1598 	if (!error) {
1599 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1600 		    BUS_DMASYNC_PREWRITE);
1601 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1602 	}
1603 	return error;
1604 }
1605 
1606 static __inline int
1607 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1608 {
1609 
1610 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1611 	    ("put an onlist txd %#x", txd->flags));
1612 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1613 	    ("put an onagg txd %#x", txd->flags));
1614 
1615 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1616 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1617 		return 0;
1618 
1619 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1620 		struct hn_txdesc *tmp_txd;
1621 
1622 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1623 			int freed;
1624 
1625 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1626 			    ("resursive aggregation on aggregated txdesc"));
1627 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1628 			    ("not aggregated txdesc"));
1629 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1630 			    ("aggregated txdesc uses dmamap"));
1631 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1632 			    ("aggregated txdesc consumes "
1633 			     "chimney sending buffer"));
1634 			KASSERT(tmp_txd->chim_size == 0,
1635 			    ("aggregated txdesc has non-zero "
1636 			     "chimney sending size"));
1637 
1638 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1639 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1640 			freed = hn_txdesc_put(txr, tmp_txd);
1641 			KASSERT(freed, ("failed to free aggregated txdesc"));
1642 		}
1643 	}
1644 
1645 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1646 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1647 		    ("chim txd uses dmamap"));
1648 		hn_chim_free(txr->hn_sc, txd->chim_index);
1649 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1650 		txd->chim_size = 0;
1651 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1652 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1653 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1654 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1655 		    txd->data_dmap);
1656 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1657 	}
1658 
1659 	if (txd->m != NULL) {
1660 		m_freem(txd->m);
1661 		txd->m = NULL;
1662 	}
1663 
1664 	txd->flags |= HN_TXD_FLAG_ONLIST;
1665 #ifndef HN_USE_TXDESC_BUFRING
1666 	mtx_lock_spin(&txr->hn_txlist_spin);
1667 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1668 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1669 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1670 	txr->hn_txdesc_avail++;
1671 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1672 	mtx_unlock_spin(&txr->hn_txlist_spin);
1673 #else	/* HN_USE_TXDESC_BUFRING */
1674 #ifdef HN_DEBUG
1675 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1676 #endif
1677 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1678 #endif	/* !HN_USE_TXDESC_BUFRING */
1679 
1680 	return 1;
1681 }
1682 
1683 static __inline struct hn_txdesc *
1684 hn_txdesc_get(struct hn_tx_ring *txr)
1685 {
1686 	struct hn_txdesc *txd;
1687 
1688 #ifndef HN_USE_TXDESC_BUFRING
1689 	mtx_lock_spin(&txr->hn_txlist_spin);
1690 	txd = SLIST_FIRST(&txr->hn_txlist);
1691 	if (txd != NULL) {
1692 		KASSERT(txr->hn_txdesc_avail > 0,
1693 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1694 		txr->hn_txdesc_avail--;
1695 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1696 	}
1697 	mtx_unlock_spin(&txr->hn_txlist_spin);
1698 #else
1699 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1700 #endif
1701 
1702 	if (txd != NULL) {
1703 #ifdef HN_USE_TXDESC_BUFRING
1704 #ifdef HN_DEBUG
1705 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1706 #endif
1707 #endif	/* HN_USE_TXDESC_BUFRING */
1708 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1709 		    STAILQ_EMPTY(&txd->agg_list) &&
1710 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1711 		    txd->chim_size == 0 &&
1712 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1713 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1714 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1715 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1716 		txd->refs = 1;
1717 	}
1718 	return txd;
1719 }
1720 
1721 static __inline void
1722 hn_txdesc_hold(struct hn_txdesc *txd)
1723 {
1724 
1725 	/* 0->1 transition will never work */
1726 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1727 	atomic_add_int(&txd->refs, 1);
1728 }
1729 
1730 static __inline void
1731 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1732 {
1733 
1734 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1735 	    ("recursive aggregation on aggregating txdesc"));
1736 
1737 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1738 	    ("already aggregated"));
1739 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1740 	    ("recursive aggregation on to-be-aggregated txdesc"));
1741 
1742 	txd->flags |= HN_TXD_FLAG_ONAGG;
1743 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1744 }
1745 
1746 static bool
1747 hn_tx_ring_pending(struct hn_tx_ring *txr)
1748 {
1749 	bool pending = false;
1750 
1751 #ifndef HN_USE_TXDESC_BUFRING
1752 	mtx_lock_spin(&txr->hn_txlist_spin);
1753 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1754 		pending = true;
1755 	mtx_unlock_spin(&txr->hn_txlist_spin);
1756 #else
1757 	if (!buf_ring_full(txr->hn_txdesc_br))
1758 		pending = true;
1759 #endif
1760 	return (pending);
1761 }
1762 
1763 static __inline void
1764 hn_txeof(struct hn_tx_ring *txr)
1765 {
1766 	txr->hn_has_txeof = 0;
1767 	txr->hn_txeof(txr);
1768 }
1769 
1770 static void
1771 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1772     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1773 {
1774 	struct hn_txdesc *txd = sndc->hn_cbarg;
1775 	struct hn_tx_ring *txr;
1776 
1777 	txr = txd->txr;
1778 	KASSERT(txr->hn_chan == chan,
1779 	    ("channel mismatch, on chan%u, should be chan%u",
1780 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1781 
1782 	txr->hn_has_txeof = 1;
1783 	hn_txdesc_put(txr, txd);
1784 
1785 	++txr->hn_txdone_cnt;
1786 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1787 		txr->hn_txdone_cnt = 0;
1788 		if (txr->hn_oactive)
1789 			hn_txeof(txr);
1790 	}
1791 }
1792 
1793 static void
1794 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1795 {
1796 #if defined(INET) || defined(INET6)
1797 	tcp_lro_flush_all(&rxr->hn_lro);
1798 #endif
1799 
1800 	/*
1801 	 * NOTE:
1802 	 * 'txr' could be NULL, if multiple channels and
1803 	 * ifnet.if_start method are enabled.
1804 	 */
1805 	if (txr == NULL || !txr->hn_has_txeof)
1806 		return;
1807 
1808 	txr->hn_txdone_cnt = 0;
1809 	hn_txeof(txr);
1810 }
1811 
1812 static __inline uint32_t
1813 hn_rndis_pktmsg_offset(uint32_t ofs)
1814 {
1815 
1816 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1817 	    ("invalid RNDIS packet msg offset %u", ofs));
1818 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1819 }
1820 
1821 static __inline void *
1822 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1823     size_t pi_dlen, uint32_t pi_type)
1824 {
1825 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1826 	struct rndis_pktinfo *pi;
1827 
1828 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1829 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1830 
1831 	/*
1832 	 * Per-packet-info does not move; it only grows.
1833 	 *
1834 	 * NOTE:
1835 	 * rm_pktinfooffset in this phase counts from the beginning
1836 	 * of rndis_packet_msg.
1837 	 */
1838 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1839 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1840 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1841 	    pkt->rm_pktinfolen);
1842 	pkt->rm_pktinfolen += pi_size;
1843 
1844 	pi->rm_size = pi_size;
1845 	pi->rm_type = pi_type;
1846 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1847 
1848 	return (pi->rm_data);
1849 }
1850 
1851 static __inline int
1852 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1853 {
1854 	struct hn_txdesc *txd;
1855 	struct mbuf *m;
1856 	int error, pkts;
1857 
1858 	txd = txr->hn_agg_txd;
1859 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1860 
1861 	/*
1862 	 * Since hn_txpkt() will reset this temporary stat, save
1863 	 * it now, so that oerrors can be updated properly, if
1864 	 * hn_txpkt() ever fails.
1865 	 */
1866 	pkts = txr->hn_stat_pkts;
1867 
1868 	/*
1869 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1870 	 * failure, save it for later freeing, if hn_txpkt() ever
1871 	 * fails.
1872 	 */
1873 	m = txd->m;
1874 	error = hn_txpkt(ifp, txr, txd);
1875 	if (__predict_false(error)) {
1876 		/* txd is freed, but m is not. */
1877 		m_freem(m);
1878 
1879 		txr->hn_flush_failed++;
1880 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1881 	}
1882 
1883 	/* Reset all aggregation states. */
1884 	txr->hn_agg_txd = NULL;
1885 	txr->hn_agg_szleft = 0;
1886 	txr->hn_agg_pktleft = 0;
1887 	txr->hn_agg_prevpkt = NULL;
1888 
1889 	return (error);
1890 }
1891 
1892 static void *
1893 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1894     int pktsize)
1895 {
1896 	void *chim;
1897 
1898 	if (txr->hn_agg_txd != NULL) {
1899 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1900 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1901 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1902 			int olen;
1903 
1904 			/*
1905 			 * Update the previous RNDIS packet's total length,
1906 			 * it can be increased due to the mandatory alignment
1907 			 * padding for this RNDIS packet.  And update the
1908 			 * aggregating txdesc's chimney sending buffer size
1909 			 * accordingly.
1910 			 *
1911 			 * XXX
1912 			 * Zero-out the padding, as required by the RNDIS spec.
1913 			 */
1914 			olen = pkt->rm_len;
1915 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1916 			agg_txd->chim_size += pkt->rm_len - olen;
1917 
1918 			/* Link this txdesc to the parent. */
1919 			hn_txdesc_agg(agg_txd, txd);
1920 
1921 			chim = (uint8_t *)pkt + pkt->rm_len;
1922 			/* Save the current packet for later fixup. */
1923 			txr->hn_agg_prevpkt = chim;
1924 
1925 			txr->hn_agg_pktleft--;
1926 			txr->hn_agg_szleft -= pktsize;
1927 			if (txr->hn_agg_szleft <=
1928 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1929 				/*
1930 				 * Probably can't aggregate more packets,
1931 				 * flush this aggregating txdesc proactively.
1932 				 */
1933 				txr->hn_agg_pktleft = 0;
1934 			}
1935 			/* Done! */
1936 			return (chim);
1937 		}
1938 		hn_flush_txagg(ifp, txr);
1939 	}
1940 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1941 
1942 	txr->hn_tx_chimney_tried++;
1943 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1944 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1945 		return (NULL);
1946 	txr->hn_tx_chimney++;
1947 
1948 	chim = txr->hn_sc->hn_chim +
1949 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1950 
1951 	if (txr->hn_agg_pktmax > 1 &&
1952 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1953 		txr->hn_agg_txd = txd;
1954 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1955 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1956 		txr->hn_agg_prevpkt = chim;
1957 	}
1958 	return (chim);
1959 }
1960 
/*
 * Encapsulate the mbuf chain *m_head0 into an RNDIS packet message and
 * set up txr/txd so that the packet can later be handed to
 * txr->hn_sendpkt.
 *
 * Two transmit paths are prepared here:
 * - Chimney sending: if the aligned packet size is below
 *   txr->hn_chim_size and hn_try_txagg() supplies a chimney buffer,
 *   the RNDIS message is built in, and the packet data is copied into,
 *   that buffer; txr->hn_sendpkt is set to hn_txpkt_chim.
 * - Page buffers (sglist): otherwise the RNDIS message stays in
 *   txd->rndis_pkt and the mbuf chain is DMA loaded; txr->hn_gpa[] is
 *   filled with the RNDIS message followed by the data segments and
 *   txr->hn_sendpkt is set to hn_txpkt_sglist.
 *
 * Returns 0 on success, or the error from hn_txdesc_dmamap_load().
 *
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	/*
	 * Start with the txdesc's own RNDIS packet message; switch to the
	 * chimney sending buffer if one could be obtained.
	 */
	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		/* Too large for chimney sending; flush pending aggregation. */
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	/*
	 * Fill in the RNDIS packet message.  rm_len, rm_dataoffset and
	 * rm_pktinfooffset are provisional here; they are fixed up below,
	 * after all of the per-packet-info has been appended.
	 */
	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = m_head->m_pkthdr.len;
	pkt->rm_dataoffset = 0;
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	/* Pass the VLAN tag carried in the mbuf header as per-packet-info. */
	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	/*
	 * Offload per-packet-info: TSO takes precedence; otherwise request
	 * checksum offload if the mbuf asks for any of the checksums in
	 * txr->hn_csum_assist.
	 */
	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	/* Header length: fixed RNDIS message plus appended pktinfo. */
	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Fixup RNDIS packet message total length */
	pkt->rm_len += pkt_hlen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		/*
		 * When aggregating, the chimney buffer usage is accounted
		 * on the aggregating txdesc instead of on this txd.
		 */
		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
#ifdef INVARIANTS
			/*
			 * NOTE(review): presumably clears the caller's
			 * pointer to catch later use of an aggregated
			 * mbuf on INVARIANTS kernels; confirm with callers.
			 */
			*m_head0 = NULL;
#endif
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		/* Copy the packet data right behind the RNDIS header. */
		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	/*
	 * Slow path: send the RNDIS message and the mbuf data through
	 * page buffers.
	 */
	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	/*
	 * hn_txdesc_dmamap_load() is handed &m_head, so m_head may differ
	 * from the caller's chain afterwards; *m_head0 is updated below.
	 */
	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	/* Common tail for both paths: link the mbuf to the txdesc. */
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}
2155 
2156 /*
2157  * NOTE:
2158  * If this function fails, then txd will be freed, but the mbuf
2159  * associated w/ the txd will _not_ be freed.
2160  */
2161 static int
2162 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2163 {
2164 	int error, send_failed = 0, has_bpf;
2165 
2166 again:
2167 	has_bpf = bpf_peers_present(ifp->if_bpf);
2168 	if (has_bpf) {
2169 		/*
2170 		 * Make sure that this txd and any aggregated txds are not
2171 		 * freed before ETHER_BPF_MTAP.
2172 		 */
2173 		hn_txdesc_hold(txd);
2174 	}
2175 	error = txr->hn_sendpkt(txr, txd);
2176 	if (!error) {
2177 		if (has_bpf) {
2178 			const struct hn_txdesc *tmp_txd;
2179 
2180 			ETHER_BPF_MTAP(ifp, txd->m);
2181 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2182 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2183 		}
2184 
2185 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2186 #ifdef HN_IFSTART_SUPPORT
2187 		if (!hn_use_if_start)
2188 #endif
2189 		{
2190 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2191 			    txr->hn_stat_size);
2192 			if (txr->hn_stat_mcasts != 0) {
2193 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2194 				    txr->hn_stat_mcasts);
2195 			}
2196 		}
2197 		txr->hn_pkts += txr->hn_stat_pkts;
2198 		txr->hn_sends++;
2199 	}
2200 	if (has_bpf)
2201 		hn_txdesc_put(txr, txd);
2202 
2203 	if (__predict_false(error)) {
2204 		int freed;
2205 
2206 		/*
2207 		 * This should "really rarely" happen.
2208 		 *
2209 		 * XXX Too many RX to be acked or too many sideband
2210 		 * commands to run?  Ask netvsc_channel_rollup()
2211 		 * to kick start later.
2212 		 */
2213 		txr->hn_has_txeof = 1;
2214 		if (!send_failed) {
2215 			txr->hn_send_failed++;
2216 			send_failed = 1;
2217 			/*
2218 			 * Try sending again after set hn_has_txeof;
2219 			 * in case that we missed the last
2220 			 * netvsc_channel_rollup().
2221 			 */
2222 			goto again;
2223 		}
2224 		if_printf(ifp, "send failed\n");
2225 
2226 		/*
2227 		 * Caller will perform further processing on the
2228 		 * associated mbuf, so don't free it in hn_txdesc_put();
2229 		 * only unload it from the DMA map in hn_txdesc_put(),
2230 		 * if it was loaded.
2231 		 */
2232 		txd->m = NULL;
2233 		freed = hn_txdesc_put(txr, txd);
2234 		KASSERT(freed != 0,
2235 		    ("fail to free txd upon send error"));
2236 
2237 		txr->hn_send_failed++;
2238 	}
2239 
2240 	/* Reset temporary stats, after this sending is done. */
2241 	txr->hn_stat_size = 0;
2242 	txr->hn_stat_pkts = 0;
2243 	txr->hn_stat_mcasts = 0;
2244 
2245 	return (error);
2246 }
2247 
2248 /*
2249  * Append the specified data to the indicated mbuf chain,
2250  * Extend the mbuf chain if the new data does not fit in
2251  * existing space.
2252  *
2253  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2254  * There should be an equivalent in the kernel mbuf code,
2255  * but there does not appear to be one yet.
2256  *
2257  * Differs from m_append() in that additional mbufs are
2258  * allocated with cluster size MJUMPAGESIZE, and filled
2259  * accordingly.
2260  *
2261  * Return 1 if able to complete the job; otherwise 0.
2262  */
2263 static int
2264 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2265 {
2266 	struct mbuf *m, *n;
2267 	int remainder, space;
2268 
2269 	for (m = m0; m->m_next != NULL; m = m->m_next)
2270 		;
2271 	remainder = len;
2272 	space = M_TRAILINGSPACE(m);
2273 	if (space > 0) {
2274 		/*
2275 		 * Copy into available space.
2276 		 */
2277 		if (space > remainder)
2278 			space = remainder;
2279 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2280 		m->m_len += space;
2281 		cp += space;
2282 		remainder -= space;
2283 	}
2284 	while (remainder > 0) {
2285 		/*
2286 		 * Allocate a new mbuf; could check space
2287 		 * and allocate a cluster instead.
2288 		 */
2289 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2290 		if (n == NULL)
2291 			break;
2292 		n->m_len = min(MJUMPAGESIZE, remainder);
2293 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2294 		cp += n->m_len;
2295 		remainder -= n->m_len;
2296 		m->m_next = n;
2297 		m = n;
2298 	}
2299 	if (m0->m_flags & M_PKTHDR)
2300 		m0->m_pkthdr.len += len - remainder;
2301 
2302 	return (remainder == 0);
2303 }
2304 
#if defined(INET) || defined(INET6)
/*
 * Feed a received mbuf to LRO.  Where tcp_lro_queue_mbuf() is available
 * and hn_lro_mbufq_depth is non-zero the mbuf is queued and 0 is
 * returned; otherwise the result of tcp_lro_rx() is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (!hn_lro_mbufq_depth)
		return (tcp_lro_rx(lc, m, 0));

	tcp_lro_queue_mbuf(lc, m);
	return (0);
#else
	return (tcp_lro_rx(lc, m, 0));
#endif
}
#endif
2318 
2319 static int
2320 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2321     const struct hn_rxinfo *info)
2322 {
2323 	struct ifnet *ifp;
2324 	struct mbuf *m_new;
2325 	int size, do_lro = 0, do_csum = 1;
2326 	int hash_type;
2327 
2328 	/* If the VF is active, inject the packet through the VF */
2329 	ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp;
2330 
2331 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2332 		/*
2333 		 * NOTE:
2334 		 * See the NOTE of hn_rndis_init_fixat().  This
2335 		 * function can be reached, immediately after the
2336 		 * RNDIS is initialized but before the ifnet is
2337 		 * setup on the hn_attach() path; drop the unexpected
2338 		 * packets.
2339 		 */
2340 		return (0);
2341 	}
2342 
2343 	if (dlen <= MHLEN) {
2344 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2345 		if (m_new == NULL) {
2346 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2347 			return (0);
2348 		}
2349 		memcpy(mtod(m_new, void *), data, dlen);
2350 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2351 		rxr->hn_small_pkts++;
2352 	} else {
2353 		/*
2354 		 * Get an mbuf with a cluster.  For packets 2K or less,
2355 		 * get a standard 2K cluster.  For anything larger, get a
2356 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2357 		 * if looped around to the Hyper-V TX channel, so avoid them.
2358 		 */
2359 		size = MCLBYTES;
2360 		if (dlen > MCLBYTES) {
2361 			/* 4096 */
2362 			size = MJUMPAGESIZE;
2363 		}
2364 
2365 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2366 		if (m_new == NULL) {
2367 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2368 			return (0);
2369 		}
2370 
2371 		hv_m_append(m_new, dlen, data);
2372 	}
2373 	m_new->m_pkthdr.rcvif = ifp;
2374 
2375 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2376 		do_csum = 0;
2377 
2378 	/* receive side checksum offload */
2379 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2380 		/* IP csum offload */
2381 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2382 			m_new->m_pkthdr.csum_flags |=
2383 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2384 			rxr->hn_csum_ip++;
2385 		}
2386 
2387 		/* TCP/UDP csum offload */
2388 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2389 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2390 			m_new->m_pkthdr.csum_flags |=
2391 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2392 			m_new->m_pkthdr.csum_data = 0xffff;
2393 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2394 				rxr->hn_csum_tcp++;
2395 			else
2396 				rxr->hn_csum_udp++;
2397 		}
2398 
2399 		/*
2400 		 * XXX
2401 		 * As of this write (Oct 28th, 2016), host side will turn
2402 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2403 		 * the do_lro setting here is actually _not_ accurate.  We
2404 		 * depend on the RSS hash type check to reset do_lro.
2405 		 */
2406 		if ((info->csum_info &
2407 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2408 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2409 			do_lro = 1;
2410 	} else {
2411 		const struct ether_header *eh;
2412 		uint16_t etype;
2413 		int hoff;
2414 
2415 		hoff = sizeof(*eh);
2416 		if (m_new->m_len < hoff)
2417 			goto skip;
2418 		eh = mtod(m_new, struct ether_header *);
2419 		etype = ntohs(eh->ether_type);
2420 		if (etype == ETHERTYPE_VLAN) {
2421 			const struct ether_vlan_header *evl;
2422 
2423 			hoff = sizeof(*evl);
2424 			if (m_new->m_len < hoff)
2425 				goto skip;
2426 			evl = mtod(m_new, struct ether_vlan_header *);
2427 			etype = ntohs(evl->evl_proto);
2428 		}
2429 
2430 		if (etype == ETHERTYPE_IP) {
2431 			int pr;
2432 
2433 			pr = hn_check_iplen(m_new, hoff);
2434 			if (pr == IPPROTO_TCP) {
2435 				if (do_csum &&
2436 				    (rxr->hn_trust_hcsum &
2437 				     HN_TRUST_HCSUM_TCP)) {
2438 					rxr->hn_csum_trusted++;
2439 					m_new->m_pkthdr.csum_flags |=
2440 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2441 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2442 					m_new->m_pkthdr.csum_data = 0xffff;
2443 				}
2444 				do_lro = 1;
2445 			} else if (pr == IPPROTO_UDP) {
2446 				if (do_csum &&
2447 				    (rxr->hn_trust_hcsum &
2448 				     HN_TRUST_HCSUM_UDP)) {
2449 					rxr->hn_csum_trusted++;
2450 					m_new->m_pkthdr.csum_flags |=
2451 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2452 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2453 					m_new->m_pkthdr.csum_data = 0xffff;
2454 				}
2455 			} else if (pr != IPPROTO_DONE && do_csum &&
2456 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2457 				rxr->hn_csum_trusted++;
2458 				m_new->m_pkthdr.csum_flags |=
2459 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2460 			}
2461 		}
2462 	}
2463 skip:
2464 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2465 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2466 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2467 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2468 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2469 		m_new->m_flags |= M_VLANTAG;
2470 	}
2471 
2472 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2473 		rxr->hn_rss_pkts++;
2474 		m_new->m_pkthdr.flowid = info->hash_value;
2475 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2476 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2477 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2478 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2479 
2480 			/*
2481 			 * NOTE:
2482 			 * do_lro is resetted, if the hash types are not TCP
2483 			 * related.  See the comment in the above csum_flags
2484 			 * setup section.
2485 			 */
2486 			switch (type) {
2487 			case NDIS_HASH_IPV4:
2488 				hash_type = M_HASHTYPE_RSS_IPV4;
2489 				do_lro = 0;
2490 				break;
2491 
2492 			case NDIS_HASH_TCP_IPV4:
2493 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2494 				break;
2495 
2496 			case NDIS_HASH_IPV6:
2497 				hash_type = M_HASHTYPE_RSS_IPV6;
2498 				do_lro = 0;
2499 				break;
2500 
2501 			case NDIS_HASH_IPV6_EX:
2502 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2503 				do_lro = 0;
2504 				break;
2505 
2506 			case NDIS_HASH_TCP_IPV6:
2507 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2508 				break;
2509 
2510 			case NDIS_HASH_TCP_IPV6_EX:
2511 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2512 				break;
2513 			}
2514 		}
2515 	} else {
2516 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2517 		hash_type = M_HASHTYPE_OPAQUE;
2518 	}
2519 	M_HASHTYPE_SET(m_new, hash_type);
2520 
2521 	/*
2522 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2523 	 * messages (not just data messages) will trigger a response.
2524 	 */
2525 
2526 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2527 	rxr->hn_pkts++;
2528 
2529 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2530 #if defined(INET) || defined(INET6)
2531 		struct lro_ctrl *lro = &rxr->hn_lro;
2532 
2533 		if (lro->lro_cnt) {
2534 			rxr->hn_lro_tried++;
2535 			if (hn_lro_rx(lro, m_new) == 0) {
2536 				/* DONE! */
2537 				return 0;
2538 			}
2539 		}
2540 #endif
2541 	}
2542 
2543 	/* We're not holding the lock here, so don't release it */
2544 	(*ifp->if_input)(ifp, m_new);
2545 
2546 	return (0);
2547 }
2548 
/*
 * ifnet ioctl handler.
 *
 * SIOCSIFMTU, SIOCSIFFLAGS, SIOCSIFCAP, SIOC{ADD,DEL}MULTI and
 * SIOC{S,G}IFMEDIA are handled here; everything else is passed on to
 * ether_ioctl().  Returns 0 or an errno value.
 *
 * The MTU, flags and multicast requests do nothing and return 0, if
 * the synthetic parts are not attached.
 */
static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct hn_softc *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int mask, error = 0;

	switch (cmd) {
	case SIOCSIFMTU:
		/* Only the upper bound of the requested MTU is checked. */
		if (ifr->ifr_mtu > HN_MTU_MAX) {
			error = EINVAL;
			break;
		}

		HN_LOCK(sc);

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
			HN_UNLOCK(sc);
			break;
		}

		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
			/* Can't change MTU */
			HN_UNLOCK(sc);
			error = EOPNOTSUPP;
			break;
		}

		/* Nothing to do, if the MTU does not change. */
		if (ifp->if_mtu == ifr->ifr_mtu) {
			HN_UNLOCK(sc);
			break;
		}

		/*
		 * Suspend this interface before the synthetic parts
		 * are ripped.
		 */
		hn_suspend(sc);

		/*
		 * Detach the synthetic parts, i.e. NVS and RNDIS.
		 */
		hn_synth_detach(sc);

		/*
		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
		 * with the new MTU setting.
		 */
		error = hn_synth_attach(sc, ifr->ifr_mtu);
		if (error) {
			/*
			 * NOTE(review): hn_resume() is not called on this
			 * path and if_mtu is left unchanged.
			 */
			HN_UNLOCK(sc);
			break;
		}

		/*
		 * Commit the requested MTU, after the synthetic parts
		 * have been successfully attached.
		 */
		ifp->if_mtu = ifr->ifr_mtu;

		/*
		 * Make sure that various parameters based on MTU are
		 * still valid, after the MTU change.
		 */
		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
			hn_set_chim_size(sc, sc->hn_chim_szmax);
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
		    HN_LRO_LENLIM_MIN(ifp))
			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif

		/*
		 * All done!  Resume the interface now.
		 */
		hn_resume(sc);

		HN_UNLOCK(sc);
		break;

	case SIOCSIFFLAGS:
		HN_LOCK(sc);

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
			HN_UNLOCK(sc);
			break;
		}

		/*
		 * UP and already running: only refresh the RX filter.
		 * UP but not running: initialize.  Not UP but running: stop.
		 */
		if (ifp->if_flags & IFF_UP) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				/*
				 * Caller might hold mutex, e.g.
				 * bpf; use busy-wait for the RNDIS
				 * reply.
				 */
				HN_NO_SLEEPING(sc);
				hn_rxfilter_config(sc);
				HN_SLEEPING_OK(sc);
			} else {
				hn_init_locked(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, false);
		}
		/* Remember the interface flags that were just processed. */
		sc->hn_if_flags = ifp->if_flags;

		HN_UNLOCK(sc);
		break;

	case SIOCSIFCAP:
		HN_LOCK(sc);
		/* Bits set in mask are the capabilities to be toggled. */
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;

		if (mask & IFCAP_TXCSUM) {
			ifp->if_capenable ^= IFCAP_TXCSUM;
			if (ifp->if_capenable & IFCAP_TXCSUM)
				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
			else
				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
		}
		if (mask & IFCAP_TXCSUM_IPV6) {
			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
			else
				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
		}

		/* TODO: flip RNDIS offload parameters for RXCSUM. */
		if (mask & IFCAP_RXCSUM)
			ifp->if_capenable ^= IFCAP_RXCSUM;
#ifdef foo
		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
		if (mask & IFCAP_RXCSUM_IPV6)
			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
#endif

		if (mask & IFCAP_LRO)
			ifp->if_capenable ^= IFCAP_LRO;

		if (mask & IFCAP_TSO4) {
			ifp->if_capenable ^= IFCAP_TSO4;
			if (ifp->if_capenable & IFCAP_TSO4)
				ifp->if_hwassist |= CSUM_IP_TSO;
			else
				ifp->if_hwassist &= ~CSUM_IP_TSO;
		}
		if (mask & IFCAP_TSO6) {
			ifp->if_capenable ^= IFCAP_TSO6;
			if (ifp->if_capenable & IFCAP_TSO6)
				ifp->if_hwassist |= CSUM_IP6_TSO;
			else
				ifp->if_hwassist &= ~CSUM_IP6_TSO;
		}

		HN_UNLOCK(sc);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		HN_LOCK(sc);

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
			HN_UNLOCK(sc);
			break;
		}
		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
			/*
			 * Multicast uses mutex; use busy-wait for
			 * the RNDIS reply.
			 */
			HN_NO_SLEEPING(sc);
			hn_rxfilter_config(sc);
			HN_SLEEPING_OK(sc);
		}

		HN_UNLOCK(sc);
		break;

	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return (error);
}
2741 
2742 static void
2743 hn_stop(struct hn_softc *sc, bool detaching)
2744 {
2745 	struct ifnet *ifp = sc->hn_ifp;
2746 	int i;
2747 
2748 	HN_LOCK_ASSERT(sc);
2749 
2750 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2751 	    ("synthetic parts were not attached"));
2752 
2753 	/* Disable polling. */
2754 	hn_polling(sc, 0);
2755 
2756 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2757 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2758 	hn_suspend_data(sc);
2759 
2760 	/* Clear OACTIVE bit. */
2761 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2762 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2763 		sc->hn_tx_ring[i].hn_oactive = 0;
2764 
2765 	/*
2766 	 * If the VF is active, make sure the filter is not 0, even if
2767 	 * the synthetic NIC is down.
2768 	 */
2769 	if (!detaching && (sc->hn_flags & HN_FLAG_VF))
2770 		hn_rxfilter_config(sc);
2771 }
2772 
2773 static void
2774 hn_init_locked(struct hn_softc *sc)
2775 {
2776 	struct ifnet *ifp = sc->hn_ifp;
2777 	int i;
2778 
2779 	HN_LOCK_ASSERT(sc);
2780 
2781 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2782 		return;
2783 
2784 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2785 		return;
2786 
2787 	/* Configure RX filter */
2788 	hn_rxfilter_config(sc);
2789 
2790 	/* Clear OACTIVE bit. */
2791 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2792 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2793 		sc->hn_tx_ring[i].hn_oactive = 0;
2794 
2795 	/* Clear TX 'suspended' bit. */
2796 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2797 
2798 	/* Everything is ready; unleash! */
2799 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2800 
2801 	/* Re-enable polling if requested. */
2802 	if (sc->hn_pollhz > 0)
2803 		hn_polling(sc, sc->hn_pollhz);
2804 }
2805 
/*
 * Unlocked wrapper around hn_init_locked(); xsc is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *sc;

	sc = xsc;

	HN_LOCK(sc);
	hn_init_locked(sc);
	HN_UNLOCK(sc);
}
2815 
2816 #if __FreeBSD_version >= 1100099
2817 
2818 static int
2819 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2820 {
2821 	struct hn_softc *sc = arg1;
2822 	unsigned int lenlim;
2823 	int error;
2824 
2825 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2826 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2827 	if (error || req->newptr == NULL)
2828 		return error;
2829 
2830 	HN_LOCK(sc);
2831 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2832 	    lenlim > TCP_LRO_LENGTH_MAX) {
2833 		HN_UNLOCK(sc);
2834 		return EINVAL;
2835 	}
2836 	hn_set_lro_lenlim(sc, lenlim);
2837 	HN_UNLOCK(sc);
2838 
2839 	return 0;
2840 }
2841 
2842 static int
2843 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2844 {
2845 	struct hn_softc *sc = arg1;
2846 	int ackcnt, error, i;
2847 
2848 	/*
2849 	 * lro_ackcnt_lim is append count limit,
2850 	 * +1 to turn it into aggregation limit.
2851 	 */
2852 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2853 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2854 	if (error || req->newptr == NULL)
2855 		return error;
2856 
2857 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2858 		return EINVAL;
2859 
2860 	/*
2861 	 * Convert aggregation limit back to append
2862 	 * count limit.
2863 	 */
2864 	--ackcnt;
2865 	HN_LOCK(sc);
2866 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2867 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2868 	HN_UNLOCK(sc);
2869 	return 0;
2870 }
2871 
2872 #endif
2873 
2874 static int
2875 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2876 {
2877 	struct hn_softc *sc = arg1;
2878 	int hcsum = arg2;
2879 	int on, error, i;
2880 
2881 	on = 0;
2882 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2883 		on = 1;
2884 
2885 	error = sysctl_handle_int(oidp, &on, 0, req);
2886 	if (error || req->newptr == NULL)
2887 		return error;
2888 
2889 	HN_LOCK(sc);
2890 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2891 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2892 
2893 		if (on)
2894 			rxr->hn_trust_hcsum |= hcsum;
2895 		else
2896 			rxr->hn_trust_hcsum &= ~hcsum;
2897 	}
2898 	HN_UNLOCK(sc);
2899 	return 0;
2900 }
2901 
2902 static int
2903 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2904 {
2905 	struct hn_softc *sc = arg1;
2906 	int chim_size, error;
2907 
2908 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2909 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2910 	if (error || req->newptr == NULL)
2911 		return error;
2912 
2913 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2914 		return EINVAL;
2915 
2916 	HN_LOCK(sc);
2917 	hn_set_chim_size(sc, chim_size);
2918 	HN_UNLOCK(sc);
2919 	return 0;
2920 }
2921 
2922 #if __FreeBSD_version < 1100095
2923 static int
2924 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2925 {
2926 	struct hn_softc *sc = arg1;
2927 	int ofs = arg2, i, error;
2928 	struct hn_rx_ring *rxr;
2929 	uint64_t stat;
2930 
2931 	stat = 0;
2932 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2933 		rxr = &sc->hn_rx_ring[i];
2934 		stat += *((int *)((uint8_t *)rxr + ofs));
2935 	}
2936 
2937 	error = sysctl_handle_64(oidp, &stat, 0, req);
2938 	if (error || req->newptr == NULL)
2939 		return error;
2940 
2941 	/* Zero out this stat. */
2942 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2943 		rxr = &sc->hn_rx_ring[i];
2944 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2945 	}
2946 	return 0;
2947 }
2948 #else
2949 static int
2950 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2951 {
2952 	struct hn_softc *sc = arg1;
2953 	int ofs = arg2, i, error;
2954 	struct hn_rx_ring *rxr;
2955 	uint64_t stat;
2956 
2957 	stat = 0;
2958 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2959 		rxr = &sc->hn_rx_ring[i];
2960 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2961 	}
2962 
2963 	error = sysctl_handle_64(oidp, &stat, 0, req);
2964 	if (error || req->newptr == NULL)
2965 		return error;
2966 
2967 	/* Zero out this stat. */
2968 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2969 		rxr = &sc->hn_rx_ring[i];
2970 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2971 	}
2972 	return 0;
2973 }
2974 
2975 #endif
2976 
2977 static int
2978 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2979 {
2980 	struct hn_softc *sc = arg1;
2981 	int ofs = arg2, i, error;
2982 	struct hn_rx_ring *rxr;
2983 	u_long stat;
2984 
2985 	stat = 0;
2986 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2987 		rxr = &sc->hn_rx_ring[i];
2988 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2989 	}
2990 
2991 	error = sysctl_handle_long(oidp, &stat, 0, req);
2992 	if (error || req->newptr == NULL)
2993 		return error;
2994 
2995 	/* Zero out this stat. */
2996 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2997 		rxr = &sc->hn_rx_ring[i];
2998 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2999 	}
3000 	return 0;
3001 }
3002 
3003 static int
3004 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3005 {
3006 	struct hn_softc *sc = arg1;
3007 	int ofs = arg2, i, error;
3008 	struct hn_tx_ring *txr;
3009 	u_long stat;
3010 
3011 	stat = 0;
3012 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3013 		txr = &sc->hn_tx_ring[i];
3014 		stat += *((u_long *)((uint8_t *)txr + ofs));
3015 	}
3016 
3017 	error = sysctl_handle_long(oidp, &stat, 0, req);
3018 	if (error || req->newptr == NULL)
3019 		return error;
3020 
3021 	/* Zero out this stat. */
3022 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3023 		txr = &sc->hn_tx_ring[i];
3024 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
3025 	}
3026 	return 0;
3027 }
3028 
3029 static int
3030 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3031 {
3032 	struct hn_softc *sc = arg1;
3033 	int ofs = arg2, i, error, conf;
3034 	struct hn_tx_ring *txr;
3035 
3036 	txr = &sc->hn_tx_ring[0];
3037 	conf = *((int *)((uint8_t *)txr + ofs));
3038 
3039 	error = sysctl_handle_int(oidp, &conf, 0, req);
3040 	if (error || req->newptr == NULL)
3041 		return error;
3042 
3043 	HN_LOCK(sc);
3044 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3045 		txr = &sc->hn_tx_ring[i];
3046 		*((int *)((uint8_t *)txr + ofs)) = conf;
3047 	}
3048 	HN_UNLOCK(sc);
3049 
3050 	return 0;
3051 }
3052 
3053 static int
3054 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3055 {
3056 	struct hn_softc *sc = arg1;
3057 	int error, size;
3058 
3059 	size = sc->hn_agg_size;
3060 	error = sysctl_handle_int(oidp, &size, 0, req);
3061 	if (error || req->newptr == NULL)
3062 		return (error);
3063 
3064 	HN_LOCK(sc);
3065 	sc->hn_agg_size = size;
3066 	hn_set_txagg(sc);
3067 	HN_UNLOCK(sc);
3068 
3069 	return (0);
3070 }
3071 
3072 static int
3073 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3074 {
3075 	struct hn_softc *sc = arg1;
3076 	int error, pkts;
3077 
3078 	pkts = sc->hn_agg_pkts;
3079 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3080 	if (error || req->newptr == NULL)
3081 		return (error);
3082 
3083 	HN_LOCK(sc);
3084 	sc->hn_agg_pkts = pkts;
3085 	hn_set_txagg(sc);
3086 	HN_UNLOCK(sc);
3087 
3088 	return (0);
3089 }
3090 
3091 static int
3092 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3093 {
3094 	struct hn_softc *sc = arg1;
3095 	int pkts;
3096 
3097 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3098 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3099 }
3100 
3101 static int
3102 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3103 {
3104 	struct hn_softc *sc = arg1;
3105 	int align;
3106 
3107 	align = sc->hn_tx_ring[0].hn_agg_align;
3108 	return (sysctl_handle_int(oidp, &align, 0, req));
3109 }
3110 
3111 static void
3112 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3113 {
3114 	if (pollhz == 0)
3115 		vmbus_chan_poll_disable(chan);
3116 	else
3117 		vmbus_chan_poll_enable(chan, pollhz);
3118 }
3119 
3120 static void
3121 hn_polling(struct hn_softc *sc, u_int pollhz)
3122 {
3123 	int nsubch = sc->hn_rx_ring_inuse - 1;
3124 
3125 	HN_LOCK_ASSERT(sc);
3126 
3127 	if (nsubch > 0) {
3128 		struct vmbus_channel **subch;
3129 		int i;
3130 
3131 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3132 		for (i = 0; i < nsubch; ++i)
3133 			hn_chan_polling(subch[i], pollhz);
3134 		vmbus_subchan_rel(subch, nsubch);
3135 	}
3136 	hn_chan_polling(sc->hn_prichan, pollhz);
3137 }
3138 
3139 static int
3140 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3141 {
3142 	struct hn_softc *sc = arg1;
3143 	int pollhz, error;
3144 
3145 	pollhz = sc->hn_pollhz;
3146 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3147 	if (error || req->newptr == NULL)
3148 		return (error);
3149 
3150 	if (pollhz != 0 &&
3151 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3152 		return (EINVAL);
3153 
3154 	HN_LOCK(sc);
3155 	if (sc->hn_pollhz != pollhz) {
3156 		sc->hn_pollhz = pollhz;
3157 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3158 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3159 			hn_polling(sc, sc->hn_pollhz);
3160 	}
3161 	HN_UNLOCK(sc);
3162 
3163 	return (0);
3164 }
3165 
3166 static int
3167 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3168 {
3169 	struct hn_softc *sc = arg1;
3170 	char verstr[16];
3171 
3172 	snprintf(verstr, sizeof(verstr), "%u.%u",
3173 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3174 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3175 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3176 }
3177 
3178 static int
3179 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3180 {
3181 	struct hn_softc *sc = arg1;
3182 	char caps_str[128];
3183 	uint32_t caps;
3184 
3185 	HN_LOCK(sc);
3186 	caps = sc->hn_caps;
3187 	HN_UNLOCK(sc);
3188 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3189 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3190 }
3191 
3192 static int
3193 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3194 {
3195 	struct hn_softc *sc = arg1;
3196 	char assist_str[128];
3197 	uint32_t hwassist;
3198 
3199 	HN_LOCK(sc);
3200 	hwassist = sc->hn_ifp->if_hwassist;
3201 	HN_UNLOCK(sc);
3202 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3203 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3204 }
3205 
3206 static int
3207 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3208 {
3209 	struct hn_softc *sc = arg1;
3210 	char filter_str[128];
3211 	uint32_t filter;
3212 
3213 	HN_LOCK(sc);
3214 	filter = sc->hn_rx_filter;
3215 	HN_UNLOCK(sc);
3216 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3217 	    NDIS_PACKET_TYPES);
3218 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3219 }
3220 
3221 #ifndef RSS
3222 
3223 static int
3224 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3225 {
3226 	struct hn_softc *sc = arg1;
3227 	int error;
3228 
3229 	HN_LOCK(sc);
3230 
3231 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3232 	if (error || req->newptr == NULL)
3233 		goto back;
3234 
3235 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3236 	if (error)
3237 		goto back;
3238 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3239 
3240 	if (sc->hn_rx_ring_inuse > 1) {
3241 		error = hn_rss_reconfig(sc);
3242 	} else {
3243 		/* Not RSS capable, at least for now; just save the RSS key. */
3244 		error = 0;
3245 	}
3246 back:
3247 	HN_UNLOCK(sc);
3248 	return (error);
3249 }
3250 
3251 static int
3252 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3253 {
3254 	struct hn_softc *sc = arg1;
3255 	int error;
3256 
3257 	HN_LOCK(sc);
3258 
3259 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3260 	if (error || req->newptr == NULL)
3261 		goto back;
3262 
3263 	/*
3264 	 * Don't allow RSS indirect table change, if this interface is not
3265 	 * RSS capable currently.
3266 	 */
3267 	if (sc->hn_rx_ring_inuse == 1) {
3268 		error = EOPNOTSUPP;
3269 		goto back;
3270 	}
3271 
3272 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3273 	if (error)
3274 		goto back;
3275 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3276 
3277 	hn_rss_ind_fixup(sc);
3278 	error = hn_rss_reconfig(sc);
3279 back:
3280 	HN_UNLOCK(sc);
3281 	return (error);
3282 }
3283 
3284 #endif	/* !RSS */
3285 
3286 static int
3287 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3288 {
3289 	struct hn_softc *sc = arg1;
3290 	char hash_str[128];
3291 	uint32_t hash;
3292 
3293 	HN_LOCK(sc);
3294 	hash = sc->hn_rss_hash;
3295 	HN_UNLOCK(sc);
3296 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3297 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3298 }
3299 
3300 static int
3301 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3302 {
3303 	struct hn_softc *sc = arg1;
3304 	char vf_name[128];
3305 	struct ifnet *vf;
3306 
3307 	HN_LOCK(sc);
3308 	vf_name[0] = '\0';
3309 	vf = sc->hn_rx_ring[0].hn_vf;
3310 	if (vf != NULL)
3311 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3312 	HN_UNLOCK(sc);
3313 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3314 }
3315 
3316 static int
3317 hn_check_iplen(const struct mbuf *m, int hoff)
3318 {
3319 	const struct ip *ip;
3320 	int len, iphlen, iplen;
3321 	const struct tcphdr *th;
3322 	int thoff;				/* TCP data offset */
3323 
3324 	len = hoff + sizeof(struct ip);
3325 
3326 	/* The packet must be at least the size of an IP header. */
3327 	if (m->m_pkthdr.len < len)
3328 		return IPPROTO_DONE;
3329 
3330 	/* The fixed IP header must reside completely in the first mbuf. */
3331 	if (m->m_len < len)
3332 		return IPPROTO_DONE;
3333 
3334 	ip = mtodo(m, hoff);
3335 
3336 	/* Bound check the packet's stated IP header length. */
3337 	iphlen = ip->ip_hl << 2;
3338 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3339 		return IPPROTO_DONE;
3340 
3341 	/* The full IP header must reside completely in the one mbuf. */
3342 	if (m->m_len < hoff + iphlen)
3343 		return IPPROTO_DONE;
3344 
3345 	iplen = ntohs(ip->ip_len);
3346 
3347 	/*
3348 	 * Check that the amount of data in the buffers is as
3349 	 * at least much as the IP header would have us expect.
3350 	 */
3351 	if (m->m_pkthdr.len < hoff + iplen)
3352 		return IPPROTO_DONE;
3353 
3354 	/*
3355 	 * Ignore IP fragments.
3356 	 */
3357 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3358 		return IPPROTO_DONE;
3359 
3360 	/*
3361 	 * The TCP/IP or UDP/IP header must be entirely contained within
3362 	 * the first fragment of a packet.
3363 	 */
3364 	switch (ip->ip_p) {
3365 	case IPPROTO_TCP:
3366 		if (iplen < iphlen + sizeof(struct tcphdr))
3367 			return IPPROTO_DONE;
3368 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3369 			return IPPROTO_DONE;
3370 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3371 		thoff = th->th_off << 2;
3372 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3373 			return IPPROTO_DONE;
3374 		if (m->m_len < hoff + iphlen + thoff)
3375 			return IPPROTO_DONE;
3376 		break;
3377 	case IPPROTO_UDP:
3378 		if (iplen < iphlen + sizeof(struct udphdr))
3379 			return IPPROTO_DONE;
3380 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3381 			return IPPROTO_DONE;
3382 		break;
3383 	default:
3384 		if (iplen < iphlen)
3385 			return IPPROTO_DONE;
3386 		break;
3387 	}
3388 	return ip->ip_p;
3389 }
3390 
3391 static int
3392 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3393 {
3394 	struct sysctl_oid_list *child;
3395 	struct sysctl_ctx_list *ctx;
3396 	device_t dev = sc->hn_dev;
3397 #if defined(INET) || defined(INET6)
3398 #if __FreeBSD_version >= 1100095
3399 	int lroent_cnt;
3400 #endif
3401 #endif
3402 	int i;
3403 
3404 	/*
3405 	 * Create RXBUF for reception.
3406 	 *
3407 	 * NOTE:
3408 	 * - It is shared by all channels.
3409 	 * - A large enough buffer is allocated, certain version of NVSes
3410 	 *   may further limit the usable space.
3411 	 */
3412 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3413 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3414 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3415 	if (sc->hn_rxbuf == NULL) {
3416 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3417 		return (ENOMEM);
3418 	}
3419 
3420 	sc->hn_rx_ring_cnt = ring_cnt;
3421 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3422 
3423 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3424 	    M_DEVBUF, M_WAITOK | M_ZERO);
3425 
3426 #if defined(INET) || defined(INET6)
3427 #if __FreeBSD_version >= 1100095
3428 	lroent_cnt = hn_lro_entry_count;
3429 	if (lroent_cnt < TCP_LRO_ENTRIES)
3430 		lroent_cnt = TCP_LRO_ENTRIES;
3431 	if (bootverbose)
3432 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3433 #endif
3434 #endif	/* INET || INET6 */
3435 
3436 	ctx = device_get_sysctl_ctx(dev);
3437 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3438 
3439 	/* Create dev.hn.UNIT.rx sysctl tree */
3440 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3441 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3442 
3443 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3444 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3445 
3446 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3447 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3448 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3449 		if (rxr->hn_br == NULL) {
3450 			device_printf(dev, "allocate bufring failed\n");
3451 			return (ENOMEM);
3452 		}
3453 
3454 		if (hn_trust_hosttcp)
3455 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3456 		if (hn_trust_hostudp)
3457 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3458 		if (hn_trust_hostip)
3459 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3460 		rxr->hn_ifp = sc->hn_ifp;
3461 		if (i < sc->hn_tx_ring_cnt)
3462 			rxr->hn_txr = &sc->hn_tx_ring[i];
3463 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3464 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3465 		rxr->hn_rx_idx = i;
3466 		rxr->hn_rxbuf = sc->hn_rxbuf;
3467 
3468 		/*
3469 		 * Initialize LRO.
3470 		 */
3471 #if defined(INET) || defined(INET6)
3472 #if __FreeBSD_version >= 1100095
3473 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3474 		    hn_lro_mbufq_depth);
3475 #else
3476 		tcp_lro_init(&rxr->hn_lro);
3477 		rxr->hn_lro.ifp = sc->hn_ifp;
3478 #endif
3479 #if __FreeBSD_version >= 1100099
3480 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3481 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3482 #endif
3483 #endif	/* INET || INET6 */
3484 
3485 		if (sc->hn_rx_sysctl_tree != NULL) {
3486 			char name[16];
3487 
3488 			/*
3489 			 * Create per RX ring sysctl tree:
3490 			 * dev.hn.UNIT.rx.RINGID
3491 			 */
3492 			snprintf(name, sizeof(name), "%d", i);
3493 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3494 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3495 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3496 
3497 			if (rxr->hn_rx_sysctl_tree != NULL) {
3498 				SYSCTL_ADD_ULONG(ctx,
3499 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3500 				    OID_AUTO, "packets", CTLFLAG_RW,
3501 				    &rxr->hn_pkts, "# of packets received");
3502 				SYSCTL_ADD_ULONG(ctx,
3503 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3504 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3505 				    &rxr->hn_rss_pkts,
3506 				    "# of packets w/ RSS info received");
3507 				SYSCTL_ADD_INT(ctx,
3508 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3509 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3510 				    &rxr->hn_pktbuf_len, 0,
3511 				    "Temporary channel packet buffer length");
3512 			}
3513 		}
3514 	}
3515 
3516 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3517 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3518 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3519 #if __FreeBSD_version < 1100095
3520 	    hn_rx_stat_int_sysctl,
3521 #else
3522 	    hn_rx_stat_u64_sysctl,
3523 #endif
3524 	    "LU", "LRO queued");
3525 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3526 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3527 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3528 #if __FreeBSD_version < 1100095
3529 	    hn_rx_stat_int_sysctl,
3530 #else
3531 	    hn_rx_stat_u64_sysctl,
3532 #endif
3533 	    "LU", "LRO flushed");
3534 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3535 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3536 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3537 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3538 #if __FreeBSD_version >= 1100099
3539 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3540 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3541 	    hn_lro_lenlim_sysctl, "IU",
3542 	    "Max # of data bytes to be aggregated by LRO");
3543 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3544 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3545 	    hn_lro_ackcnt_sysctl, "I",
3546 	    "Max # of ACKs to be aggregated by LRO");
3547 #endif
3548 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3549 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3550 	    hn_trust_hcsum_sysctl, "I",
3551 	    "Trust tcp segement verification on host side, "
3552 	    "when csum info is missing");
3553 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3554 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3555 	    hn_trust_hcsum_sysctl, "I",
3556 	    "Trust udp datagram verification on host side, "
3557 	    "when csum info is missing");
3558 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3559 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3560 	    hn_trust_hcsum_sysctl, "I",
3561 	    "Trust ip packet verification on host side, "
3562 	    "when csum info is missing");
3563 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3564 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3565 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3566 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3567 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3568 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3569 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3570 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3571 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3572 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3573 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3574 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3575 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3576 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3577 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3578 	    hn_rx_stat_ulong_sysctl, "LU",
3579 	    "# of packets that we trust host's csum verification");
3580 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3581 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3582 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3583 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3584 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3585 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3586 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3587 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3588 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3589 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3590 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3591 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3592 
3593 	return (0);
3594 }
3595 
3596 static void
3597 hn_destroy_rx_data(struct hn_softc *sc)
3598 {
3599 	int i;
3600 
3601 	if (sc->hn_rxbuf != NULL) {
3602 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3603 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3604 		else
3605 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3606 		sc->hn_rxbuf = NULL;
3607 	}
3608 
3609 	if (sc->hn_rx_ring_cnt == 0)
3610 		return;
3611 
3612 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3613 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3614 
3615 		if (rxr->hn_br == NULL)
3616 			continue;
3617 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3618 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3619 		} else {
3620 			device_printf(sc->hn_dev,
3621 			    "%dth channel bufring is referenced", i);
3622 		}
3623 		rxr->hn_br = NULL;
3624 
3625 #if defined(INET) || defined(INET6)
3626 		tcp_lro_free(&rxr->hn_lro);
3627 #endif
3628 		free(rxr->hn_pktbuf, M_DEVBUF);
3629 	}
3630 	free(sc->hn_rx_ring, M_DEVBUF);
3631 	sc->hn_rx_ring = NULL;
3632 
3633 	sc->hn_rx_ring_cnt = 0;
3634 	sc->hn_rx_ring_inuse = 0;
3635 }
3636 
3637 static int
3638 hn_tx_ring_create(struct hn_softc *sc, int id)
3639 {
3640 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3641 	device_t dev = sc->hn_dev;
3642 	bus_dma_tag_t parent_dtag;
3643 	int error, i;
3644 
3645 	txr->hn_sc = sc;
3646 	txr->hn_tx_idx = id;
3647 
3648 #ifndef HN_USE_TXDESC_BUFRING
3649 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3650 #endif
3651 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3652 
3653 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3654 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3655 	    M_DEVBUF, M_WAITOK | M_ZERO);
3656 #ifndef HN_USE_TXDESC_BUFRING
3657 	SLIST_INIT(&txr->hn_txlist);
3658 #else
3659 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3660 	    M_WAITOK, &txr->hn_tx_lock);
3661 #endif
3662 
3663 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3664 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3665 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3666 	} else {
3667 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3668 	}
3669 
3670 #ifdef HN_IFSTART_SUPPORT
3671 	if (hn_use_if_start) {
3672 		txr->hn_txeof = hn_start_txeof;
3673 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3674 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3675 	} else
3676 #endif
3677 	{
3678 		int br_depth;
3679 
3680 		txr->hn_txeof = hn_xmit_txeof;
3681 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3682 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3683 
3684 		br_depth = hn_get_txswq_depth(txr);
3685 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3686 		    M_WAITOK, &txr->hn_tx_lock);
3687 	}
3688 
3689 	txr->hn_direct_tx_size = hn_direct_tx_size;
3690 
3691 	/*
3692 	 * Always schedule transmission instead of trying to do direct
3693 	 * transmission.  This one gives the best performance so far.
3694 	 */
3695 	txr->hn_sched_tx = 1;
3696 
3697 	parent_dtag = bus_get_dma_tag(dev);
3698 
3699 	/* DMA tag for RNDIS packet messages. */
3700 	error = bus_dma_tag_create(parent_dtag, /* parent */
3701 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3702 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3703 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3704 	    BUS_SPACE_MAXADDR,		/* highaddr */
3705 	    NULL, NULL,			/* filter, filterarg */
3706 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3707 	    1,				/* nsegments */
3708 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3709 	    0,				/* flags */
3710 	    NULL,			/* lockfunc */
3711 	    NULL,			/* lockfuncarg */
3712 	    &txr->hn_tx_rndis_dtag);
3713 	if (error) {
3714 		device_printf(dev, "failed to create rndis dmatag\n");
3715 		return error;
3716 	}
3717 
3718 	/* DMA tag for data. */
3719 	error = bus_dma_tag_create(parent_dtag, /* parent */
3720 	    1,				/* alignment */
3721 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3722 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3723 	    BUS_SPACE_MAXADDR,		/* highaddr */
3724 	    NULL, NULL,			/* filter, filterarg */
3725 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3726 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3727 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3728 	    0,				/* flags */
3729 	    NULL,			/* lockfunc */
3730 	    NULL,			/* lockfuncarg */
3731 	    &txr->hn_tx_data_dtag);
3732 	if (error) {
3733 		device_printf(dev, "failed to create data dmatag\n");
3734 		return error;
3735 	}
3736 
3737 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3738 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3739 
3740 		txd->txr = txr;
3741 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3742 		STAILQ_INIT(&txd->agg_list);
3743 
3744 		/*
3745 		 * Allocate and load RNDIS packet message.
3746 		 */
3747         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3748 		    (void **)&txd->rndis_pkt,
3749 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3750 		    &txd->rndis_pkt_dmap);
3751 		if (error) {
3752 			device_printf(dev,
3753 			    "failed to allocate rndis_packet_msg, %d\n", i);
3754 			return error;
3755 		}
3756 
3757 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3758 		    txd->rndis_pkt_dmap,
3759 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3760 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3761 		    BUS_DMA_NOWAIT);
3762 		if (error) {
3763 			device_printf(dev,
3764 			    "failed to load rndis_packet_msg, %d\n", i);
3765 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3766 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3767 			return error;
3768 		}
3769 
3770 		/* DMA map for TX data. */
3771 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3772 		    &txd->data_dmap);
3773 		if (error) {
3774 			device_printf(dev,
3775 			    "failed to allocate tx data dmamap\n");
3776 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3777 			    txd->rndis_pkt_dmap);
3778 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3779 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3780 			return error;
3781 		}
3782 
3783 		/* All set, put it to list */
3784 		txd->flags |= HN_TXD_FLAG_ONLIST;
3785 #ifndef HN_USE_TXDESC_BUFRING
3786 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3787 #else
3788 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3789 #endif
3790 	}
3791 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3792 
3793 	if (sc->hn_tx_sysctl_tree != NULL) {
3794 		struct sysctl_oid_list *child;
3795 		struct sysctl_ctx_list *ctx;
3796 		char name[16];
3797 
3798 		/*
3799 		 * Create per TX ring sysctl tree:
3800 		 * dev.hn.UNIT.tx.RINGID
3801 		 */
3802 		ctx = device_get_sysctl_ctx(dev);
3803 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3804 
3805 		snprintf(name, sizeof(name), "%d", id);
3806 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3807 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3808 
3809 		if (txr->hn_tx_sysctl_tree != NULL) {
3810 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3811 
3812 #ifdef HN_DEBUG
3813 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3814 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3815 			    "# of available TX descs");
3816 #endif
3817 #ifdef HN_IFSTART_SUPPORT
3818 			if (!hn_use_if_start)
3819 #endif
3820 			{
3821 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3822 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3823 				    "over active");
3824 			}
3825 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3826 			    CTLFLAG_RW, &txr->hn_pkts,
3827 			    "# of packets transmitted");
3828 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3829 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3830 		}
3831 	}
3832 
3833 	return 0;
3834 }
3835 
3836 static void
3837 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3838 {
3839 	struct hn_tx_ring *txr = txd->txr;
3840 
3841 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3842 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3843 
3844 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3845 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3846 	    txd->rndis_pkt_dmap);
3847 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3848 }
3849 
3850 static void
3851 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3852 {
3853 
3854 	KASSERT(txd->refs == 0 || txd->refs == 1,
3855 	    ("invalid txd refs %d", txd->refs));
3856 
3857 	/* Aggregated txds will be freed by their aggregating txd. */
3858 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3859 		int freed;
3860 
3861 		freed = hn_txdesc_put(txr, txd);
3862 		KASSERT(freed, ("can't free txdesc"));
3863 	}
3864 }
3865 
3866 static void
3867 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3868 {
3869 	int i;
3870 
3871 	if (txr->hn_txdesc == NULL)
3872 		return;
3873 
3874 	/*
3875 	 * NOTE:
3876 	 * Because the freeing of aggregated txds will be deferred
3877 	 * to the aggregating txd, two passes are used here:
3878 	 * - The first pass GCes any pending txds.  This GC is necessary,
3879 	 *   since if the channels are revoked, hypervisor will not
3880 	 *   deliver send-done for all pending txds.
3881 	 * - The second pass frees the busdma stuffs, i.e. after all txds
3882 	 *   were freed.
3883 	 */
3884 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3885 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3886 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3887 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3888 
3889 	if (txr->hn_tx_data_dtag != NULL)
3890 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3891 	if (txr->hn_tx_rndis_dtag != NULL)
3892 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3893 
3894 #ifdef HN_USE_TXDESC_BUFRING
3895 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3896 #endif
3897 
3898 	free(txr->hn_txdesc, M_DEVBUF);
3899 	txr->hn_txdesc = NULL;
3900 
3901 	if (txr->hn_mbuf_br != NULL)
3902 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3903 
3904 #ifndef HN_USE_TXDESC_BUFRING
3905 	mtx_destroy(&txr->hn_txlist_spin);
3906 #endif
3907 	mtx_destroy(&txr->hn_tx_lock);
3908 }
3909 
3910 static int
3911 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3912 {
3913 	struct sysctl_oid_list *child;
3914 	struct sysctl_ctx_list *ctx;
3915 	int i;
3916 
3917 	/*
3918 	 * Create TXBUF for chimney sending.
3919 	 *
3920 	 * NOTE: It is shared by all channels.
3921 	 */
3922 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3923 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3924 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3925 	if (sc->hn_chim == NULL) {
3926 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3927 		return (ENOMEM);
3928 	}
3929 
3930 	sc->hn_tx_ring_cnt = ring_cnt;
3931 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3932 
3933 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3934 	    M_DEVBUF, M_WAITOK | M_ZERO);
3935 
3936 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3937 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3938 
3939 	/* Create dev.hn.UNIT.tx sysctl tree */
3940 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3941 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3942 
3943 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3944 		int error;
3945 
3946 		error = hn_tx_ring_create(sc, i);
3947 		if (error)
3948 			return error;
3949 	}
3950 
3951 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3952 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3953 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3954 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3955 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3956 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3957 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3958 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3959 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3960 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3961 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3962 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3963 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3964 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3965 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3966 	    hn_tx_stat_ulong_sysctl, "LU",
3967 	    "# of packet transmission aggregation flush failure");
3968 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3969 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3970 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3971 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3972 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3973 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3974 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3975 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3976 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3977 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3978 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3979 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3980 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3981 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3982 	    "# of total TX descs");
3983 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3984 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3985 	    "Chimney send packet size upper boundary");
3986 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3987 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3988 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3989 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3990 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3991 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3992 	    hn_tx_conf_int_sysctl, "I",
3993 	    "Size of the packet for direct transmission");
3994 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3995 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3996 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3997 	    hn_tx_conf_int_sysctl, "I",
3998 	    "Always schedule transmission "
3999 	    "instead of doing direct transmission");
4000 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
4001 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
4002 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
4003 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
4004 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
4005 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
4006 	    "Applied packet transmission aggregation size");
4007 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
4008 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4009 	    hn_txagg_pktmax_sysctl, "I",
4010 	    "Applied packet transmission aggregation packets");
4011 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
4012 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4013 	    hn_txagg_align_sysctl, "I",
4014 	    "Applied packet transmission aggregation alignment");
4015 
4016 	return 0;
4017 }
4018 
4019 static void
4020 hn_set_chim_size(struct hn_softc *sc, int chim_size)
4021 {
4022 	int i;
4023 
4024 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4025 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
4026 }
4027 
4028 static void
4029 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
4030 {
4031 	struct ifnet *ifp = sc->hn_ifp;
4032 	int tso_minlen;
4033 
4034 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
4035 		return;
4036 
4037 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
4038 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
4039 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
4040 
4041 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
4042 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
4043 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
4044 
4045 	if (tso_maxlen < tso_minlen)
4046 		tso_maxlen = tso_minlen;
4047 	else if (tso_maxlen > IP_MAXPACKET)
4048 		tso_maxlen = IP_MAXPACKET;
4049 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
4050 		tso_maxlen = sc->hn_ndis_tso_szmax;
4051 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4052 	if (bootverbose)
4053 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
4054 }
4055 
4056 static void
4057 hn_fixup_tx_data(struct hn_softc *sc)
4058 {
4059 	uint64_t csum_assist;
4060 	int i;
4061 
4062 	hn_set_chim_size(sc, sc->hn_chim_szmax);
4063 	if (hn_tx_chimney_size > 0 &&
4064 	    hn_tx_chimney_size < sc->hn_chim_szmax)
4065 		hn_set_chim_size(sc, hn_tx_chimney_size);
4066 
4067 	csum_assist = 0;
4068 	if (sc->hn_caps & HN_CAP_IPCS)
4069 		csum_assist |= CSUM_IP;
4070 	if (sc->hn_caps & HN_CAP_TCP4CS)
4071 		csum_assist |= CSUM_IP_TCP;
4072 	if (sc->hn_caps & HN_CAP_UDP4CS)
4073 		csum_assist |= CSUM_IP_UDP;
4074 	if (sc->hn_caps & HN_CAP_TCP6CS)
4075 		csum_assist |= CSUM_IP6_TCP;
4076 	if (sc->hn_caps & HN_CAP_UDP6CS)
4077 		csum_assist |= CSUM_IP6_UDP;
4078 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4079 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4080 
4081 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4082 		/*
4083 		 * Support HASHVAL pktinfo on TX path.
4084 		 */
4085 		if (bootverbose)
4086 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4087 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4088 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4089 	}
4090 }
4091 
4092 static void
4093 hn_destroy_tx_data(struct hn_softc *sc)
4094 {
4095 	int i;
4096 
4097 	if (sc->hn_chim != NULL) {
4098 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4099 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4100 		} else {
4101 			device_printf(sc->hn_dev,
4102 			    "chimney sending buffer is referenced");
4103 		}
4104 		sc->hn_chim = NULL;
4105 	}
4106 
4107 	if (sc->hn_tx_ring_cnt == 0)
4108 		return;
4109 
4110 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4111 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4112 
4113 	free(sc->hn_tx_ring, M_DEVBUF);
4114 	sc->hn_tx_ring = NULL;
4115 
4116 	sc->hn_tx_ring_cnt = 0;
4117 	sc->hn_tx_ring_inuse = 0;
4118 }
4119 
4120 #ifdef HN_IFSTART_SUPPORT
4121 
4122 static void
4123 hn_start_taskfunc(void *xtxr, int pending __unused)
4124 {
4125 	struct hn_tx_ring *txr = xtxr;
4126 
4127 	mtx_lock(&txr->hn_tx_lock);
4128 	hn_start_locked(txr, 0);
4129 	mtx_unlock(&txr->hn_tx_lock);
4130 }
4131 
4132 static int
4133 hn_start_locked(struct hn_tx_ring *txr, int len)
4134 {
4135 	struct hn_softc *sc = txr->hn_sc;
4136 	struct ifnet *ifp = sc->hn_ifp;
4137 	int sched = 0;
4138 
4139 	KASSERT(hn_use_if_start,
4140 	    ("hn_start_locked is called, when if_start is disabled"));
4141 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4142 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4143 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4144 
4145 	if (__predict_false(txr->hn_suspended))
4146 		return (0);
4147 
4148 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4149 	    IFF_DRV_RUNNING)
4150 		return (0);
4151 
4152 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4153 		struct hn_txdesc *txd;
4154 		struct mbuf *m_head;
4155 		int error;
4156 
4157 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4158 		if (m_head == NULL)
4159 			break;
4160 
4161 		if (len > 0 && m_head->m_pkthdr.len > len) {
4162 			/*
4163 			 * This sending could be time consuming; let callers
4164 			 * dispatch this packet sending (and sending of any
4165 			 * following up packets) to tx taskqueue.
4166 			 */
4167 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4168 			sched = 1;
4169 			break;
4170 		}
4171 
4172 #if defined(INET6) || defined(INET)
4173 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4174 			m_head = hn_tso_fixup(m_head);
4175 			if (__predict_false(m_head == NULL)) {
4176 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4177 				continue;
4178 			}
4179 		}
4180 #endif
4181 
4182 		txd = hn_txdesc_get(txr);
4183 		if (txd == NULL) {
4184 			txr->hn_no_txdescs++;
4185 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4186 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4187 			break;
4188 		}
4189 
4190 		error = hn_encap(ifp, txr, txd, &m_head);
4191 		if (error) {
4192 			/* Both txd and m_head are freed */
4193 			KASSERT(txr->hn_agg_txd == NULL,
4194 			    ("encap failed w/ pending aggregating txdesc"));
4195 			continue;
4196 		}
4197 
4198 		if (txr->hn_agg_pktleft == 0) {
4199 			if (txr->hn_agg_txd != NULL) {
4200 				KASSERT(m_head == NULL,
4201 				    ("pending mbuf for aggregating txdesc"));
4202 				error = hn_flush_txagg(ifp, txr);
4203 				if (__predict_false(error)) {
4204 					atomic_set_int(&ifp->if_drv_flags,
4205 					    IFF_DRV_OACTIVE);
4206 					break;
4207 				}
4208 			} else {
4209 				KASSERT(m_head != NULL, ("mbuf was freed"));
4210 				error = hn_txpkt(ifp, txr, txd);
4211 				if (__predict_false(error)) {
4212 					/* txd is freed, but m_head is not */
4213 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4214 					atomic_set_int(&ifp->if_drv_flags,
4215 					    IFF_DRV_OACTIVE);
4216 					break;
4217 				}
4218 			}
4219 		}
4220 #ifdef INVARIANTS
4221 		else {
4222 			KASSERT(txr->hn_agg_txd != NULL,
4223 			    ("no aggregating txdesc"));
4224 			KASSERT(m_head == NULL,
4225 			    ("pending mbuf for aggregating txdesc"));
4226 		}
4227 #endif
4228 	}
4229 
4230 	/* Flush pending aggerated transmission. */
4231 	if (txr->hn_agg_txd != NULL)
4232 		hn_flush_txagg(ifp, txr);
4233 	return (sched);
4234 }
4235 
4236 static void
4237 hn_start(struct ifnet *ifp)
4238 {
4239 	struct hn_softc *sc = ifp->if_softc;
4240 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4241 
4242 	if (txr->hn_sched_tx)
4243 		goto do_sched;
4244 
4245 	if (mtx_trylock(&txr->hn_tx_lock)) {
4246 		int sched;
4247 
4248 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4249 		mtx_unlock(&txr->hn_tx_lock);
4250 		if (!sched)
4251 			return;
4252 	}
4253 do_sched:
4254 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4255 }
4256 
4257 static void
4258 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4259 {
4260 	struct hn_tx_ring *txr = xtxr;
4261 
4262 	mtx_lock(&txr->hn_tx_lock);
4263 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4264 	hn_start_locked(txr, 0);
4265 	mtx_unlock(&txr->hn_tx_lock);
4266 }
4267 
4268 static void
4269 hn_start_txeof(struct hn_tx_ring *txr)
4270 {
4271 	struct hn_softc *sc = txr->hn_sc;
4272 	struct ifnet *ifp = sc->hn_ifp;
4273 
4274 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4275 
4276 	if (txr->hn_sched_tx)
4277 		goto do_sched;
4278 
4279 	if (mtx_trylock(&txr->hn_tx_lock)) {
4280 		int sched;
4281 
4282 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4283 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4284 		mtx_unlock(&txr->hn_tx_lock);
4285 		if (sched) {
4286 			taskqueue_enqueue(txr->hn_tx_taskq,
4287 			    &txr->hn_tx_task);
4288 		}
4289 	} else {
4290 do_sched:
4291 		/*
4292 		 * Release the OACTIVE earlier, with the hope, that
4293 		 * others could catch up.  The task will clear the
4294 		 * flag again with the hn_tx_lock to avoid possible
4295 		 * races.
4296 		 */
4297 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4298 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4299 	}
4300 }
4301 
4302 #endif	/* HN_IFSTART_SUPPORT */
4303 
4304 static int
4305 hn_xmit(struct hn_tx_ring *txr, int len)
4306 {
4307 	struct hn_softc *sc = txr->hn_sc;
4308 	struct ifnet *ifp = sc->hn_ifp;
4309 	struct mbuf *m_head;
4310 	int sched = 0;
4311 
4312 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4313 #ifdef HN_IFSTART_SUPPORT
4314 	KASSERT(hn_use_if_start == 0,
4315 	    ("hn_xmit is called, when if_start is enabled"));
4316 #endif
4317 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4318 
4319 	if (__predict_false(txr->hn_suspended))
4320 		return (0);
4321 
4322 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4323 		return (0);
4324 
4325 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4326 		struct hn_txdesc *txd;
4327 		int error;
4328 
4329 		if (len > 0 && m_head->m_pkthdr.len > len) {
4330 			/*
4331 			 * This sending could be time consuming; let callers
4332 			 * dispatch this packet sending (and sending of any
4333 			 * following up packets) to tx taskqueue.
4334 			 */
4335 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4336 			sched = 1;
4337 			break;
4338 		}
4339 
4340 		txd = hn_txdesc_get(txr);
4341 		if (txd == NULL) {
4342 			txr->hn_no_txdescs++;
4343 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4344 			txr->hn_oactive = 1;
4345 			break;
4346 		}
4347 
4348 		error = hn_encap(ifp, txr, txd, &m_head);
4349 		if (error) {
4350 			/* Both txd and m_head are freed; discard */
4351 			KASSERT(txr->hn_agg_txd == NULL,
4352 			    ("encap failed w/ pending aggregating txdesc"));
4353 			drbr_advance(ifp, txr->hn_mbuf_br);
4354 			continue;
4355 		}
4356 
4357 		if (txr->hn_agg_pktleft == 0) {
4358 			if (txr->hn_agg_txd != NULL) {
4359 				KASSERT(m_head == NULL,
4360 				    ("pending mbuf for aggregating txdesc"));
4361 				error = hn_flush_txagg(ifp, txr);
4362 				if (__predict_false(error)) {
4363 					txr->hn_oactive = 1;
4364 					break;
4365 				}
4366 			} else {
4367 				KASSERT(m_head != NULL, ("mbuf was freed"));
4368 				error = hn_txpkt(ifp, txr, txd);
4369 				if (__predict_false(error)) {
4370 					/* txd is freed, but m_head is not */
4371 					drbr_putback(ifp, txr->hn_mbuf_br,
4372 					    m_head);
4373 					txr->hn_oactive = 1;
4374 					break;
4375 				}
4376 			}
4377 		}
4378 #ifdef INVARIANTS
4379 		else {
4380 			KASSERT(txr->hn_agg_txd != NULL,
4381 			    ("no aggregating txdesc"));
4382 			KASSERT(m_head == NULL,
4383 			    ("pending mbuf for aggregating txdesc"));
4384 		}
4385 #endif
4386 
4387 		/* Sent */
4388 		drbr_advance(ifp, txr->hn_mbuf_br);
4389 	}
4390 
4391 	/* Flush pending aggerated transmission. */
4392 	if (txr->hn_agg_txd != NULL)
4393 		hn_flush_txagg(ifp, txr);
4394 	return (sched);
4395 }
4396 
4397 static int
4398 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4399 {
4400 	struct hn_softc *sc = ifp->if_softc;
4401 	struct hn_tx_ring *txr;
4402 	int error, idx = 0;
4403 
4404 #if defined(INET6) || defined(INET)
4405 	/*
4406 	 * Perform TSO packet header fixup now, since the TSO
4407 	 * packet header should be cache-hot.
4408 	 */
4409 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4410 		m = hn_tso_fixup(m);
4411 		if (__predict_false(m == NULL)) {
4412 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4413 			return EIO;
4414 		}
4415 	}
4416 #endif
4417 
4418 	/*
4419 	 * Select the TX ring based on flowid
4420 	 */
4421 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4422 #ifdef RSS
4423 		uint32_t bid;
4424 
4425 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4426 		    &bid) == 0)
4427 			idx = bid % sc->hn_tx_ring_inuse;
4428 		else
4429 #endif
4430 		{
4431 #if defined(INET6) || defined(INET)
4432 			int tcpsyn = 0;
4433 
4434 			if (m->m_pkthdr.len < 128 &&
4435 			    (m->m_pkthdr.csum_flags &
4436 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
4437 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
4438 				m = hn_check_tcpsyn(m, &tcpsyn);
4439 				if (__predict_false(m == NULL)) {
4440 					if_inc_counter(ifp,
4441 					    IFCOUNTER_OERRORS, 1);
4442 					return (EIO);
4443 				}
4444 			}
4445 #else
4446 			const int tcpsyn = 0;
4447 #endif
4448 			if (tcpsyn)
4449 				idx = 0;
4450 			else
4451 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4452 		}
4453 	}
4454 	txr = &sc->hn_tx_ring[idx];
4455 
4456 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4457 	if (error) {
4458 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4459 		return error;
4460 	}
4461 
4462 	if (txr->hn_oactive)
4463 		return 0;
4464 
4465 	if (txr->hn_sched_tx)
4466 		goto do_sched;
4467 
4468 	if (mtx_trylock(&txr->hn_tx_lock)) {
4469 		int sched;
4470 
4471 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4472 		mtx_unlock(&txr->hn_tx_lock);
4473 		if (!sched)
4474 			return 0;
4475 	}
4476 do_sched:
4477 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4478 	return 0;
4479 }
4480 
4481 static void
4482 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4483 {
4484 	struct mbuf *m;
4485 
4486 	mtx_lock(&txr->hn_tx_lock);
4487 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4488 		m_freem(m);
4489 	mtx_unlock(&txr->hn_tx_lock);
4490 }
4491 
4492 static void
4493 hn_xmit_qflush(struct ifnet *ifp)
4494 {
4495 	struct hn_softc *sc = ifp->if_softc;
4496 	int i;
4497 
4498 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4499 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4500 	if_qflush(ifp);
4501 }
4502 
4503 static void
4504 hn_xmit_txeof(struct hn_tx_ring *txr)
4505 {
4506 
4507 	if (txr->hn_sched_tx)
4508 		goto do_sched;
4509 
4510 	if (mtx_trylock(&txr->hn_tx_lock)) {
4511 		int sched;
4512 
4513 		txr->hn_oactive = 0;
4514 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4515 		mtx_unlock(&txr->hn_tx_lock);
4516 		if (sched) {
4517 			taskqueue_enqueue(txr->hn_tx_taskq,
4518 			    &txr->hn_tx_task);
4519 		}
4520 	} else {
4521 do_sched:
4522 		/*
4523 		 * Release the oactive earlier, with the hope, that
4524 		 * others could catch up.  The task will clear the
4525 		 * oactive again with the hn_tx_lock to avoid possible
4526 		 * races.
4527 		 */
4528 		txr->hn_oactive = 0;
4529 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4530 	}
4531 }
4532 
4533 static void
4534 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4535 {
4536 	struct hn_tx_ring *txr = xtxr;
4537 
4538 	mtx_lock(&txr->hn_tx_lock);
4539 	hn_xmit(txr, 0);
4540 	mtx_unlock(&txr->hn_tx_lock);
4541 }
4542 
4543 static void
4544 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4545 {
4546 	struct hn_tx_ring *txr = xtxr;
4547 
4548 	mtx_lock(&txr->hn_tx_lock);
4549 	txr->hn_oactive = 0;
4550 	hn_xmit(txr, 0);
4551 	mtx_unlock(&txr->hn_tx_lock);
4552 }
4553 
4554 static int
4555 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4556 {
4557 	struct vmbus_chan_br cbr;
4558 	struct hn_rx_ring *rxr;
4559 	struct hn_tx_ring *txr = NULL;
4560 	int idx, error;
4561 
4562 	idx = vmbus_chan_subidx(chan);
4563 
4564 	/*
4565 	 * Link this channel to RX/TX ring.
4566 	 */
4567 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4568 	    ("invalid channel index %d, should > 0 && < %d",
4569 	     idx, sc->hn_rx_ring_inuse));
4570 	rxr = &sc->hn_rx_ring[idx];
4571 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4572 	    ("RX ring %d already attached", idx));
4573 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4574 	rxr->hn_chan = chan;
4575 
4576 	if (bootverbose) {
4577 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4578 		    idx, vmbus_chan_id(chan));
4579 	}
4580 
4581 	if (idx < sc->hn_tx_ring_inuse) {
4582 		txr = &sc->hn_tx_ring[idx];
4583 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4584 		    ("TX ring %d already attached", idx));
4585 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4586 
4587 		txr->hn_chan = chan;
4588 		if (bootverbose) {
4589 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4590 			    idx, vmbus_chan_id(chan));
4591 		}
4592 	}
4593 
4594 	/* Bind this channel to a proper CPU. */
4595 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4596 
4597 	/*
4598 	 * Open this channel
4599 	 */
4600 	cbr.cbr = rxr->hn_br;
4601 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4602 	cbr.cbr_txsz = HN_TXBR_SIZE;
4603 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4604 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4605 	if (error) {
4606 		if (error == EISCONN) {
4607 			if_printf(sc->hn_ifp, "bufring is connected after "
4608 			    "chan%u open failure\n", vmbus_chan_id(chan));
4609 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4610 		} else {
4611 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4612 			    vmbus_chan_id(chan), error);
4613 		}
4614 	}
4615 	return (error);
4616 }
4617 
4618 static void
4619 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4620 {
4621 	struct hn_rx_ring *rxr;
4622 	int idx, error;
4623 
4624 	idx = vmbus_chan_subidx(chan);
4625 
4626 	/*
4627 	 * Link this channel to RX/TX ring.
4628 	 */
4629 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4630 	    ("invalid channel index %d, should > 0 && < %d",
4631 	     idx, sc->hn_rx_ring_inuse));
4632 	rxr = &sc->hn_rx_ring[idx];
4633 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4634 	    ("RX ring %d is not attached", idx));
4635 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4636 
4637 	if (idx < sc->hn_tx_ring_inuse) {
4638 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4639 
4640 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4641 		    ("TX ring %d is not attached attached", idx));
4642 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4643 	}
4644 
4645 	/*
4646 	 * Close this channel.
4647 	 *
4648 	 * NOTE:
4649 	 * Channel closing does _not_ destroy the target channel.
4650 	 */
4651 	error = vmbus_chan_close_direct(chan);
4652 	if (error == EISCONN) {
4653 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4654 		    "after being closed\n", vmbus_chan_id(chan));
4655 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4656 	} else if (error) {
4657 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4658 		    vmbus_chan_id(chan), error);
4659 	}
4660 }
4661 
4662 static int
4663 hn_attach_subchans(struct hn_softc *sc)
4664 {
4665 	struct vmbus_channel **subchans;
4666 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4667 	int i, error = 0;
4668 
4669 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4670 
4671 	/* Attach the sub-channels. */
4672 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4673 	for (i = 0; i < subchan_cnt; ++i) {
4674 		int error1;
4675 
4676 		error1 = hn_chan_attach(sc, subchans[i]);
4677 		if (error1) {
4678 			error = error1;
4679 			/* Move on; all channels will be detached later. */
4680 		}
4681 	}
4682 	vmbus_subchan_rel(subchans, subchan_cnt);
4683 
4684 	if (error) {
4685 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4686 	} else {
4687 		if (bootverbose) {
4688 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4689 			    subchan_cnt);
4690 		}
4691 	}
4692 	return (error);
4693 }
4694 
4695 static void
4696 hn_detach_allchans(struct hn_softc *sc)
4697 {
4698 	struct vmbus_channel **subchans;
4699 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4700 	int i;
4701 
4702 	if (subchan_cnt == 0)
4703 		goto back;
4704 
4705 	/* Detach the sub-channels. */
4706 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4707 	for (i = 0; i < subchan_cnt; ++i)
4708 		hn_chan_detach(sc, subchans[i]);
4709 	vmbus_subchan_rel(subchans, subchan_cnt);
4710 
4711 back:
4712 	/*
4713 	 * Detach the primary channel, _after_ all sub-channels
4714 	 * are detached.
4715 	 */
4716 	hn_chan_detach(sc, sc->hn_prichan);
4717 
4718 	/* Wait for sub-channels to be destroyed, if any. */
4719 	vmbus_subchan_drain(sc->hn_prichan);
4720 
4721 #ifdef INVARIANTS
4722 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4723 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4724 		    HN_RX_FLAG_ATTACHED) == 0,
4725 		    ("%dth RX ring is still attached", i));
4726 	}
4727 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4728 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4729 		    HN_TX_FLAG_ATTACHED) == 0,
4730 		    ("%dth TX ring is still attached", i));
4731 	}
4732 #endif
4733 }
4734 
4735 static int
4736 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4737 {
4738 	struct vmbus_channel **subchans;
4739 	int nchan, rxr_cnt, error;
4740 
4741 	nchan = *nsubch + 1;
4742 	if (nchan == 1) {
4743 		/*
4744 		 * Multiple RX/TX rings are not requested.
4745 		 */
4746 		*nsubch = 0;
4747 		return (0);
4748 	}
4749 
4750 	/*
4751 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4752 	 * table entries.
4753 	 */
4754 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4755 	if (error) {
4756 		/* No RSS; this is benign. */
4757 		*nsubch = 0;
4758 		return (0);
4759 	}
4760 	if (bootverbose) {
4761 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4762 		    rxr_cnt, nchan);
4763 	}
4764 
4765 	if (nchan > rxr_cnt)
4766 		nchan = rxr_cnt;
4767 	if (nchan == 1) {
4768 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4769 		*nsubch = 0;
4770 		return (0);
4771 	}
4772 
4773 	/*
4774 	 * Allocate sub-channels from NVS.
4775 	 */
4776 	*nsubch = nchan - 1;
4777 	error = hn_nvs_alloc_subchans(sc, nsubch);
4778 	if (error || *nsubch == 0) {
4779 		/* Failed to allocate sub-channels. */
4780 		*nsubch = 0;
4781 		return (0);
4782 	}
4783 
4784 	/*
4785 	 * Wait for all sub-channels to become ready before moving on.
4786 	 */
4787 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4788 	vmbus_subchan_rel(subchans, *nsubch);
4789 	return (0);
4790 }
4791 
4792 static bool
4793 hn_synth_attachable(const struct hn_softc *sc)
4794 {
4795 	int i;
4796 
4797 	if (sc->hn_flags & HN_FLAG_ERRORS)
4798 		return (false);
4799 
4800 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4801 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4802 
4803 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4804 			return (false);
4805 	}
4806 	return (true);
4807 }
4808 
/*
 * Force the RX filter to zero after a successful RNDIS initialization.
 *
 * NOTE:
 * On certain versions of Hyper-V, under certain conditions, the RNDIS
 * rxfilter on the hypervisor side is _not_ zero once RNDIS has been
 * initialized successfully.  That breaks the assumption of the code
 * that follows (and, in fact, the RNDIS API contract).  So the RNDIS
 * rxfilter is cleared explicitly, packets that sneaked through are
 * drained, and so are the interrupt taskqueues that those stealth
 * packets scheduled.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the RX filter first, then drain nchan channels. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
4829 
4830 static int
4831 hn_synth_attach(struct hn_softc *sc, int mtu)
4832 {
4833 #define ATTACHED_NVS		0x0002
4834 #define ATTACHED_RNDIS		0x0004
4835 
4836 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4837 	int error, nsubch, nchan = 1, i, rndis_inited;
4838 	uint32_t old_caps, attached = 0;
4839 
4840 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4841 	    ("synthetic parts were attached"));
4842 
4843 	if (!hn_synth_attachable(sc))
4844 		return (ENXIO);
4845 
4846 	/* Save capabilities for later verification. */
4847 	old_caps = sc->hn_caps;
4848 	sc->hn_caps = 0;
4849 
4850 	/* Clear RSS stuffs. */
4851 	sc->hn_rss_ind_size = 0;
4852 	sc->hn_rss_hash = 0;
4853 
4854 	/*
4855 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4856 	 */
4857 	error = hn_chan_attach(sc, sc->hn_prichan);
4858 	if (error)
4859 		goto failed;
4860 
4861 	/*
4862 	 * Attach NVS.
4863 	 */
4864 	error = hn_nvs_attach(sc, mtu);
4865 	if (error)
4866 		goto failed;
4867 	attached |= ATTACHED_NVS;
4868 
4869 	/*
4870 	 * Attach RNDIS _after_ NVS is attached.
4871 	 */
4872 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
4873 	if (rndis_inited)
4874 		attached |= ATTACHED_RNDIS;
4875 	if (error)
4876 		goto failed;
4877 
4878 	/*
4879 	 * Make sure capabilities are not changed.
4880 	 */
4881 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4882 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4883 		    old_caps, sc->hn_caps);
4884 		error = ENXIO;
4885 		goto failed;
4886 	}
4887 
4888 	/*
4889 	 * Allocate sub-channels for multi-TX/RX rings.
4890 	 *
4891 	 * NOTE:
4892 	 * The # of RX rings that can be used is equivalent to the # of
4893 	 * channels to be requested.
4894 	 */
4895 	nsubch = sc->hn_rx_ring_cnt - 1;
4896 	error = hn_synth_alloc_subchans(sc, &nsubch);
4897 	if (error)
4898 		goto failed;
4899 	/* NOTE: _Full_ synthetic parts detach is required now. */
4900 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4901 
4902 	/*
4903 	 * Set the # of TX/RX rings that could be used according to
4904 	 * the # of channels that NVS offered.
4905 	 */
4906 	nchan = nsubch + 1;
4907 	hn_set_ring_inuse(sc, nchan);
4908 	if (nchan == 1) {
4909 		/* Only the primary channel can be used; done */
4910 		goto back;
4911 	}
4912 
4913 	/*
4914 	 * Attach the sub-channels.
4915 	 *
4916 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4917 	 */
4918 	error = hn_attach_subchans(sc);
4919 	if (error)
4920 		goto failed;
4921 
4922 	/*
4923 	 * Configure RSS key and indirect table _after_ all sub-channels
4924 	 * are attached.
4925 	 */
4926 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4927 		/*
4928 		 * RSS key is not set yet; set it to the default RSS key.
4929 		 */
4930 		if (bootverbose)
4931 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4932 #ifdef RSS
4933 		rss_getkey(rss->rss_key);
4934 #else
4935 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4936 #endif
4937 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4938 	}
4939 
4940 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4941 		/*
4942 		 * RSS indirect table is not set yet; set it up in round-
4943 		 * robin fashion.
4944 		 */
4945 		if (bootverbose) {
4946 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4947 			    "table\n");
4948 		}
4949 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4950 			uint32_t subidx;
4951 
4952 #ifdef RSS
4953 			subidx = rss_get_indirection_to_bucket(i);
4954 #else
4955 			subidx = i;
4956 #endif
4957 			rss->rss_ind[i] = subidx % nchan;
4958 		}
4959 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4960 	} else {
4961 		/*
4962 		 * # of usable channels may be changed, so we have to
4963 		 * make sure that all entries in RSS indirect table
4964 		 * are valid.
4965 		 *
4966 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4967 		 */
4968 		hn_rss_ind_fixup(sc);
4969 	}
4970 
4971 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4972 	if (error)
4973 		goto failed;
4974 back:
4975 	/*
4976 	 * Fixup transmission aggregation setup.
4977 	 */
4978 	hn_set_txagg(sc);
4979 	hn_rndis_init_fixat(sc, nchan);
4980 	return (0);
4981 
4982 failed:
4983 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4984 		hn_rndis_init_fixat(sc, nchan);
4985 		hn_synth_detach(sc);
4986 	} else {
4987 		if (attached & ATTACHED_RNDIS) {
4988 			hn_rndis_init_fixat(sc, nchan);
4989 			hn_rndis_detach(sc);
4990 		}
4991 		if (attached & ATTACHED_NVS)
4992 			hn_nvs_detach(sc);
4993 		hn_chan_detach(sc, sc->hn_prichan);
4994 		/* Restore old capabilities. */
4995 		sc->hn_caps = old_caps;
4996 	}
4997 	return (error);
4998 
4999 #undef ATTACHED_RNDIS
5000 #undef ATTACHED_NVS
5001 }
5002 
5003 /*
5004  * NOTE:
5005  * The interface must have been suspended though hn_suspend(), before
5006  * this function get called.
5007  */
5008 static void
5009 hn_synth_detach(struct hn_softc *sc)
5010 {
5011 
5012 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
5013 	    ("synthetic parts were not attached"));
5014 
5015 	/* Detach the RNDIS first. */
5016 	hn_rndis_detach(sc);
5017 
5018 	/* Detach NVS. */
5019 	hn_nvs_detach(sc);
5020 
5021 	/* Detach all of the channels. */
5022 	hn_detach_allchans(sc);
5023 
5024 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
5025 }
5026 
5027 static void
5028 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
5029 {
5030 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
5031 	    ("invalid ring count %d", ring_cnt));
5032 
5033 	if (sc->hn_tx_ring_cnt > ring_cnt)
5034 		sc->hn_tx_ring_inuse = ring_cnt;
5035 	else
5036 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5037 	sc->hn_rx_ring_inuse = ring_cnt;
5038 
5039 #ifdef RSS
5040 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
5041 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
5042 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
5043 		    rss_getnumbuckets());
5044 	}
5045 #endif
5046 
5047 	if (bootverbose) {
5048 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
5049 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
5050 	}
5051 }
5052 
5053 static void
5054 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
5055 {
5056 
5057 	/*
5058 	 * NOTE:
5059 	 * The TX bufring will not be drained by the hypervisor,
5060 	 * if the primary channel is revoked.
5061 	 */
5062 	while (!vmbus_chan_rx_empty(chan) ||
5063 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
5064 	     !vmbus_chan_tx_empty(chan)))
5065 		pause("waitch", 1);
5066 	vmbus_chan_intr_drain(chan);
5067 }
5068 
5069 static void
5070 hn_disable_rx(struct hn_softc *sc)
5071 {
5072 
5073 	/*
5074 	 * Disable RX by clearing RX filter forcefully.
5075 	 */
5076 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
5077 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
5078 
5079 	/*
5080 	 * Give RNDIS enough time to flush all pending data packets.
5081 	 */
5082 	pause("waitrx", (200 * hz) / 1000);
5083 }
5084 
5085 /*
5086  * NOTE:
5087  * RX/TX _must_ have been suspended/disabled, before this function
5088  * is called.
5089  */
5090 static void
5091 hn_drain_rxtx(struct hn_softc *sc, int nchan)
5092 {
5093 	struct vmbus_channel **subch = NULL;
5094 	int nsubch;
5095 
5096 	/*
5097 	 * Drain RX/TX bufrings and interrupts.
5098 	 */
5099 	nsubch = nchan - 1;
5100 	if (nsubch > 0)
5101 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5102 
5103 	if (subch != NULL) {
5104 		int i;
5105 
5106 		for (i = 0; i < nsubch; ++i)
5107 			hn_chan_drain(sc, subch[i]);
5108 	}
5109 	hn_chan_drain(sc, sc->hn_prichan);
5110 
5111 	if (subch != NULL)
5112 		vmbus_subchan_rel(subch, nsubch);
5113 }
5114 
5115 static void
5116 hn_suspend_data(struct hn_softc *sc)
5117 {
5118 	struct hn_tx_ring *txr;
5119 	int i;
5120 
5121 	HN_LOCK_ASSERT(sc);
5122 
5123 	/*
5124 	 * Suspend TX.
5125 	 */
5126 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5127 		txr = &sc->hn_tx_ring[i];
5128 
5129 		mtx_lock(&txr->hn_tx_lock);
5130 		txr->hn_suspended = 1;
5131 		mtx_unlock(&txr->hn_tx_lock);
5132 		/* No one is able send more packets now. */
5133 
5134 		/*
5135 		 * Wait for all pending sends to finish.
5136 		 *
5137 		 * NOTE:
5138 		 * We will _not_ receive all pending send-done, if the
5139 		 * primary channel is revoked.
5140 		 */
5141 		while (hn_tx_ring_pending(txr) &&
5142 		    !vmbus_chan_is_revoked(sc->hn_prichan))
5143 			pause("hnwtx", 1 /* 1 tick */);
5144 	}
5145 
5146 	/*
5147 	 * Disable RX.
5148 	 */
5149 	hn_disable_rx(sc);
5150 
5151 	/*
5152 	 * Drain RX/TX.
5153 	 */
5154 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
5155 
5156 	/*
5157 	 * Drain any pending TX tasks.
5158 	 *
5159 	 * NOTE:
5160 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
5161 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
5162 	 */
5163 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5164 		txr = &sc->hn_tx_ring[i];
5165 
5166 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5167 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5168 	}
5169 }
5170 
5171 static void
5172 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5173 {
5174 
5175 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5176 }
5177 
5178 static void
5179 hn_suspend_mgmt(struct hn_softc *sc)
5180 {
5181 	struct task task;
5182 
5183 	HN_LOCK_ASSERT(sc);
5184 
5185 	/*
5186 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
5187 	 * through hn_mgmt_taskq.
5188 	 */
5189 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5190 	vmbus_chan_run_task(sc->hn_prichan, &task);
5191 
5192 	/*
5193 	 * Make sure that all pending management tasks are completed.
5194 	 */
5195 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5196 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5197 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
5198 }
5199 
5200 static void
5201 hn_suspend(struct hn_softc *sc)
5202 {
5203 
5204 	/* Disable polling. */
5205 	hn_polling(sc, 0);
5206 
5207 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5208 	    (sc->hn_flags & HN_FLAG_VF))
5209 		hn_suspend_data(sc);
5210 	hn_suspend_mgmt(sc);
5211 }
5212 
5213 static void
5214 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5215 {
5216 	int i;
5217 
5218 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5219 	    ("invalid TX ring count %d", tx_ring_cnt));
5220 
5221 	for (i = 0; i < tx_ring_cnt; ++i) {
5222 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5223 
5224 		mtx_lock(&txr->hn_tx_lock);
5225 		txr->hn_suspended = 0;
5226 		mtx_unlock(&txr->hn_tx_lock);
5227 	}
5228 }
5229 
5230 static void
5231 hn_resume_data(struct hn_softc *sc)
5232 {
5233 	int i;
5234 
5235 	HN_LOCK_ASSERT(sc);
5236 
5237 	/*
5238 	 * Re-enable RX.
5239 	 */
5240 	hn_rxfilter_config(sc);
5241 
5242 	/*
5243 	 * Make sure to clear suspend status on "all" TX rings,
5244 	 * since hn_tx_ring_inuse can be changed after
5245 	 * hn_suspend_data().
5246 	 */
5247 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5248 
5249 #ifdef HN_IFSTART_SUPPORT
5250 	if (!hn_use_if_start)
5251 #endif
5252 	{
5253 		/*
5254 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
5255 		 * reduced.
5256 		 */
5257 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5258 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5259 	}
5260 
5261 	/*
5262 	 * Kick start TX.
5263 	 */
5264 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5265 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5266 
5267 		/*
5268 		 * Use txeof task, so that any pending oactive can be
5269 		 * cleared properly.
5270 		 */
5271 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5272 	}
5273 }
5274 
5275 static void
5276 hn_resume_mgmt(struct hn_softc *sc)
5277 {
5278 
5279 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5280 
5281 	/*
5282 	 * Kick off network change detection, if it was pending.
5283 	 * If no network change was pending, start link status
5284 	 * checks, which is more lightweight than network change
5285 	 * detection.
5286 	 */
5287 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5288 		hn_change_network(sc);
5289 	else
5290 		hn_update_link_status(sc);
5291 }
5292 
5293 static void
5294 hn_resume(struct hn_softc *sc)
5295 {
5296 
5297 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5298 	    (sc->hn_flags & HN_FLAG_VF))
5299 		hn_resume_data(sc);
5300 
5301 	/*
5302 	 * When the VF is activated, the synthetic interface is changed
5303 	 * to DOWN in hn_set_vf(). Here, if the VF is still active, we
5304 	 * don't call hn_resume_mgmt() until the VF is deactivated in
5305 	 * hn_set_vf().
5306 	 */
5307 	if (!(sc->hn_flags & HN_FLAG_VF))
5308 		hn_resume_mgmt(sc);
5309 
5310 	/*
5311 	 * Re-enable polling if this interface is running and
5312 	 * the polling is requested.
5313 	 */
5314 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5315 		hn_polling(sc, sc->hn_pollhz);
5316 }
5317 
5318 static void
5319 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5320 {
5321 	const struct rndis_status_msg *msg;
5322 	int ofs;
5323 
5324 	if (dlen < sizeof(*msg)) {
5325 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5326 		return;
5327 	}
5328 	msg = data;
5329 
5330 	switch (msg->rm_status) {
5331 	case RNDIS_STATUS_MEDIA_CONNECT:
5332 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5333 		hn_update_link_status(sc);
5334 		break;
5335 
5336 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5337 		/* Not really useful; ignore. */
5338 		break;
5339 
5340 	case RNDIS_STATUS_NETWORK_CHANGE:
5341 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5342 		if (dlen < ofs + msg->rm_stbuflen ||
5343 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5344 			if_printf(sc->hn_ifp, "network changed\n");
5345 		} else {
5346 			uint32_t change;
5347 
5348 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5349 			    sizeof(change));
5350 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5351 			    change);
5352 		}
5353 		hn_change_network(sc);
5354 		break;
5355 
5356 	default:
5357 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5358 		    msg->rm_status);
5359 		break;
5360 	}
5361 }
5362 
5363 static int
5364 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5365 {
5366 	const struct rndis_pktinfo *pi = info_data;
5367 	uint32_t mask = 0;
5368 
5369 	while (info_dlen != 0) {
5370 		const void *data;
5371 		uint32_t dlen;
5372 
5373 		if (__predict_false(info_dlen < sizeof(*pi)))
5374 			return (EINVAL);
5375 		if (__predict_false(info_dlen < pi->rm_size))
5376 			return (EINVAL);
5377 		info_dlen -= pi->rm_size;
5378 
5379 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5380 			return (EINVAL);
5381 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5382 			return (EINVAL);
5383 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5384 		data = pi->rm_data;
5385 
5386 		switch (pi->rm_type) {
5387 		case NDIS_PKTINFO_TYPE_VLAN:
5388 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5389 				return (EINVAL);
5390 			info->vlan_info = *((const uint32_t *)data);
5391 			mask |= HN_RXINFO_VLAN;
5392 			break;
5393 
5394 		case NDIS_PKTINFO_TYPE_CSUM:
5395 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5396 				return (EINVAL);
5397 			info->csum_info = *((const uint32_t *)data);
5398 			mask |= HN_RXINFO_CSUM;
5399 			break;
5400 
5401 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5402 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5403 				return (EINVAL);
5404 			info->hash_value = *((const uint32_t *)data);
5405 			mask |= HN_RXINFO_HASHVAL;
5406 			break;
5407 
5408 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5409 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5410 				return (EINVAL);
5411 			info->hash_info = *((const uint32_t *)data);
5412 			mask |= HN_RXINFO_HASHINF;
5413 			break;
5414 
5415 		default:
5416 			goto next;
5417 		}
5418 
5419 		if (mask == HN_RXINFO_ALL) {
5420 			/* All found; done */
5421 			break;
5422 		}
5423 next:
5424 		pi = (const struct rndis_pktinfo *)
5425 		    ((const uint8_t *)pi + pi->rm_size);
5426 	}
5427 
5428 	/*
5429 	 * Final fixup.
5430 	 * - If there is no hash value, invalidate the hash info.
5431 	 */
5432 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5433 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5434 	return (0);
5435 }
5436 
/*
 * Tell whether the range [off, off + len) overlaps the range
 * [check_off, check_off + check_len).
 *
 * Returns true on overlap.  Two ranges that start at the same offset
 * are always reported as overlapping, whatever their lengths are.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off == check_off)
		return (true);

	if (off < check_off) {
		/* The first range has to end before the second starts. */
		return (!__predict_true(off + len <= check_off));
	}

	/* The second range has to end before the first starts. */
	return (!__predict_true(check_off + check_len <= off));
}
5450 
5451 static void
5452 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5453 {
5454 	const struct rndis_packet_msg *pkt;
5455 	struct hn_rxinfo info;
5456 	int data_off, pktinfo_off, data_len, pktinfo_len;
5457 
5458 	/*
5459 	 * Check length.
5460 	 */
5461 	if (__predict_false(dlen < sizeof(*pkt))) {
5462 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5463 		return;
5464 	}
5465 	pkt = data;
5466 
5467 	if (__predict_false(dlen < pkt->rm_len)) {
5468 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5469 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5470 		return;
5471 	}
5472 	if (__predict_false(pkt->rm_len <
5473 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5474 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5475 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5476 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5477 		    pkt->rm_pktinfolen);
5478 		return;
5479 	}
5480 	if (__predict_false(pkt->rm_datalen == 0)) {
5481 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5482 		return;
5483 	}
5484 
5485 	/*
5486 	 * Check offests.
5487 	 */
5488 #define IS_OFFSET_INVALID(ofs)			\
5489 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5490 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5491 
5492 	/* XXX Hyper-V does not meet data offset alignment requirement */
5493 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5494 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5495 		    "data offset %u\n", pkt->rm_dataoffset);
5496 		return;
5497 	}
5498 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5499 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5500 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5501 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5502 		return;
5503 	}
5504 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5505 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5506 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5507 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5508 		return;
5509 	}
5510 
5511 #undef IS_OFFSET_INVALID
5512 
5513 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5514 	data_len = pkt->rm_datalen;
5515 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5516 	pktinfo_len = pkt->rm_pktinfolen;
5517 
5518 	/*
5519 	 * Check OOB coverage.
5520 	 */
5521 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5522 		int oob_off, oob_len;
5523 
5524 		if_printf(rxr->hn_ifp, "got oobdata\n");
5525 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5526 		oob_len = pkt->rm_oobdatalen;
5527 
5528 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5529 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5530 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5531 			    pkt->rm_len, oob_off, oob_len);
5532 			return;
5533 		}
5534 
5535 		/*
5536 		 * Check against data.
5537 		 */
5538 		if (hn_rndis_check_overlap(oob_off, oob_len,
5539 		    data_off, data_len)) {
5540 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5541 			    "oob overlaps data, oob abs %d len %d, "
5542 			    "data abs %d len %d\n",
5543 			    oob_off, oob_len, data_off, data_len);
5544 			return;
5545 		}
5546 
5547 		/*
5548 		 * Check against pktinfo.
5549 		 */
5550 		if (pktinfo_len != 0 &&
5551 		    hn_rndis_check_overlap(oob_off, oob_len,
5552 		    pktinfo_off, pktinfo_len)) {
5553 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5554 			    "oob overlaps pktinfo, oob abs %d len %d, "
5555 			    "pktinfo abs %d len %d\n",
5556 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5557 			return;
5558 		}
5559 	}
5560 
5561 	/*
5562 	 * Check per-packet-info coverage and find useful per-packet-info.
5563 	 */
5564 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5565 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5566 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5567 	if (__predict_true(pktinfo_len != 0)) {
5568 		bool overlap;
5569 		int error;
5570 
5571 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5572 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5573 			    "pktinfo overflow, msglen %u, "
5574 			    "pktinfo abs %d len %d\n",
5575 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5576 			return;
5577 		}
5578 
5579 		/*
5580 		 * Check packet info coverage.
5581 		 */
5582 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5583 		    data_off, data_len);
5584 		if (__predict_false(overlap)) {
5585 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5586 			    "pktinfo overlap data, pktinfo abs %d len %d, "
5587 			    "data abs %d len %d\n",
5588 			    pktinfo_off, pktinfo_len, data_off, data_len);
5589 			return;
5590 		}
5591 
5592 		/*
5593 		 * Find useful per-packet-info.
5594 		 */
5595 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5596 		    pktinfo_len, &info);
5597 		if (__predict_false(error)) {
5598 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5599 			    "pktinfo\n");
5600 			return;
5601 		}
5602 	}
5603 
5604 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5605 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5606 		    "data overflow, msglen %u, data abs %d len %d\n",
5607 		    pkt->rm_len, data_off, data_len);
5608 		return;
5609 	}
5610 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5611 }
5612 
5613 static __inline void
5614 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5615 {
5616 	const struct rndis_msghdr *hdr;
5617 
5618 	if (__predict_false(dlen < sizeof(*hdr))) {
5619 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5620 		return;
5621 	}
5622 	hdr = data;
5623 
5624 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5625 		/* Hot data path. */
5626 		hn_rndis_rx_data(rxr, data, dlen);
5627 		/* Done! */
5628 		return;
5629 	}
5630 
5631 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5632 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5633 	else
5634 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5635 }
5636 
5637 static void
5638 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5639 {
5640 	const struct hn_nvs_hdr *hdr;
5641 
5642 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5643 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5644 		return;
5645 	}
5646 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5647 
5648 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5649 		/* Useless; ignore */
5650 		return;
5651 	}
5652 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5653 }
5654 
5655 static void
5656 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5657     const struct vmbus_chanpkt_hdr *pkt)
5658 {
5659 	struct hn_nvs_sendctx *sndc;
5660 
5661 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5662 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5663 	    VMBUS_CHANPKT_DATALEN(pkt));
5664 	/*
5665 	 * NOTE:
5666 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5667 	 * its callback.
5668 	 */
5669 }
5670 
5671 static void
5672 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5673     const struct vmbus_chanpkt_hdr *pkthdr)
5674 {
5675 	const struct vmbus_chanpkt_rxbuf *pkt;
5676 	const struct hn_nvs_hdr *nvs_hdr;
5677 	int count, i, hlen;
5678 
5679 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5680 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5681 		return;
5682 	}
5683 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5684 
5685 	/* Make sure that this is a RNDIS message. */
5686 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5687 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5688 		    nvs_hdr->nvs_type);
5689 		return;
5690 	}
5691 
5692 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5693 	if (__predict_false(hlen < sizeof(*pkt))) {
5694 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5695 		return;
5696 	}
5697 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5698 
5699 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5700 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5701 		    pkt->cp_rxbuf_id);
5702 		return;
5703 	}
5704 
5705 	count = pkt->cp_rxbuf_cnt;
5706 	if (__predict_false(hlen <
5707 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5708 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5709 		return;
5710 	}
5711 
5712 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5713 	for (i = 0; i < count; ++i) {
5714 		int ofs, len;
5715 
5716 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5717 		len = pkt->cp_rxbuf[i].rb_len;
5718 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5719 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5720 			    "ofs %d, len %d\n", i, ofs, len);
5721 			continue;
5722 		}
5723 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5724 	}
5725 
5726 	/*
5727 	 * Ack the consumed RXBUF associated w/ this channel packet,
5728 	 * so that this RXBUF can be recycled by the hypervisor.
5729 	 */
5730 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5731 }
5732 
5733 static void
5734 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5735     uint64_t tid)
5736 {
5737 	struct hn_nvs_rndis_ack ack;
5738 	int retries, error;
5739 
5740 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5741 	ack.nvs_status = HN_NVS_STATUS_OK;
5742 
5743 	retries = 0;
5744 again:
5745 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5746 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5747 	if (__predict_false(error == EAGAIN)) {
5748 		/*
5749 		 * NOTE:
5750 		 * This should _not_ happen in real world, since the
5751 		 * consumption of the TX bufring from the TX path is
5752 		 * controlled.
5753 		 */
5754 		if (rxr->hn_ack_failed == 0)
5755 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5756 		rxr->hn_ack_failed++;
5757 		retries++;
5758 		if (retries < 10) {
5759 			DELAY(100);
5760 			goto again;
5761 		}
5762 		/* RXBUF leaks! */
5763 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5764 	}
5765 }
5766 
5767 static void
5768 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5769 {
5770 	struct hn_rx_ring *rxr = xrxr;
5771 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5772 
5773 	for (;;) {
5774 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5775 		int error, pktlen;
5776 
5777 		pktlen = rxr->hn_pktbuf_len;
5778 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5779 		if (__predict_false(error == ENOBUFS)) {
5780 			void *nbuf;
5781 			int nlen;
5782 
5783 			/*
5784 			 * Expand channel packet buffer.
5785 			 *
5786 			 * XXX
5787 			 * Use M_WAITOK here, since allocation failure
5788 			 * is fatal.
5789 			 */
5790 			nlen = rxr->hn_pktbuf_len * 2;
5791 			while (nlen < pktlen)
5792 				nlen *= 2;
5793 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5794 
5795 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5796 			    rxr->hn_pktbuf_len, nlen);
5797 
5798 			free(rxr->hn_pktbuf, M_DEVBUF);
5799 			rxr->hn_pktbuf = nbuf;
5800 			rxr->hn_pktbuf_len = nlen;
5801 			/* Retry! */
5802 			continue;
5803 		} else if (__predict_false(error == EAGAIN)) {
5804 			/* No more channel packets; done! */
5805 			break;
5806 		}
5807 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5808 
5809 		switch (pkt->cph_type) {
5810 		case VMBUS_CHANPKT_TYPE_COMP:
5811 			hn_nvs_handle_comp(sc, chan, pkt);
5812 			break;
5813 
5814 		case VMBUS_CHANPKT_TYPE_RXBUF:
5815 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5816 			break;
5817 
5818 		case VMBUS_CHANPKT_TYPE_INBAND:
5819 			hn_nvs_handle_notify(sc, pkt);
5820 			break;
5821 
5822 		default:
5823 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5824 			    pkt->cph_type);
5825 			break;
5826 		}
5827 	}
5828 	hn_chan_rollup(rxr, rxr->hn_txr);
5829 }
5830 
5831 static void
5832 hn_tx_taskq_create(void *arg __unused)
5833 {
5834 	int i;
5835 
5836 	/*
5837 	 * Fix the # of TX taskqueues.
5838 	 */
5839 	if (hn_tx_taskq_cnt <= 0)
5840 		hn_tx_taskq_cnt = 1;
5841 	else if (hn_tx_taskq_cnt > mp_ncpus)
5842 		hn_tx_taskq_cnt = mp_ncpus;
5843 
5844 	/*
5845 	 * Fix the TX taskqueue mode.
5846 	 */
5847 	switch (hn_tx_taskq_mode) {
5848 	case HN_TX_TASKQ_M_INDEP:
5849 	case HN_TX_TASKQ_M_GLOBAL:
5850 	case HN_TX_TASKQ_M_EVTTQ:
5851 		break;
5852 	default:
5853 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5854 		break;
5855 	}
5856 
5857 	if (vm_guest != VM_GUEST_HV)
5858 		return;
5859 
5860 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5861 		return;
5862 
5863 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5864 	    M_DEVBUF, M_WAITOK);
5865 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5866 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5867 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5868 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5869 		    "hn tx%d", i);
5870 	}
5871 }
5872 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5873     hn_tx_taskq_create, NULL);
5874 
5875 static void
5876 hn_tx_taskq_destroy(void *arg __unused)
5877 {
5878 
5879 	if (hn_tx_taskque != NULL) {
5880 		int i;
5881 
5882 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5883 			taskqueue_free(hn_tx_taskque[i]);
5884 		free(hn_tx_taskque, M_DEVBUF);
5885 	}
5886 }
5887 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5888     hn_tx_taskq_destroy, NULL);
5889