xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 93a065e7496dfbfbd0a5b0208ef763f37ea975c7)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 #include <sys/eventhandler.h>
81 
82 #include <machine/atomic.h>
83 #include <machine/in_cksum.h>
84 
85 #include <net/bpf.h>
86 #include <net/ethernet.h>
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_media.h>
90 #include <net/if_types.h>
91 #include <net/if_var.h>
92 #include <net/rndis.h>
93 #ifdef RSS
94 #include <net/rss_config.h>
95 #endif
96 
97 #include <netinet/in_systm.h>
98 #include <netinet/in.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip6.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_lro.h>
103 #include <netinet/udp.h>
104 
105 #include <dev/hyperv/include/hyperv.h>
106 #include <dev/hyperv/include/hyperv_busdma.h>
107 #include <dev/hyperv/include/vmbus.h>
108 #include <dev/hyperv/include/vmbus_xact.h>
109 
110 #include <dev/hyperv/netvsc/ndis.h>
111 #include <dev/hyperv/netvsc/if_hnreg.h>
112 #include <dev/hyperv/netvsc/if_hnvar.h>
113 #include <dev/hyperv/netvsc/hn_nvs.h>
114 #include <dev/hyperv/netvsc/hn_rndis.h>
115 
116 #include "vmbus_if.h"
117 
118 #define HN_IFSTART_SUPPORT
119 
120 #define HN_RING_CNT_DEF_MAX		8
121 
122 /* YYY should get it from the underlying channel */
123 #define HN_TX_DESC_CNT			512
124 
125 #define HN_RNDIS_PKT_LEN					\
126 	(sizeof(struct rndis_packet_msg) +			\
127 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
129 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
130 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
131 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
132 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
133 
134 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
135 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
136 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
137 /* -1 for RNDIS packet message */
138 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
139 
140 #define HN_DIRECT_TX_SIZE_DEF		128
141 
142 #define HN_EARLY_TXEOF_THRESH		8
143 
144 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
145 
146 #define HN_LROENT_CNT_DEF		128
147 
148 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
149 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
150 /* YYY 2*MTU is a bit rough, but should be good enough. */
151 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
152 
153 #define HN_LRO_ACKCNT_DEF		1
154 
155 #define HN_LOCK_INIT(sc)		\
156 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
157 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
158 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
159 #define HN_LOCK(sc)					\
160 do {							\
161 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
162 		DELAY(1000);				\
163 } while (0)
164 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
165 
166 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
167 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
168 #define HN_CSUM_IP_HWASSIST(sc)		\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
170 #define HN_CSUM_IP6_HWASSIST(sc)	\
171 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
172 
173 #define HN_PKTSIZE_MIN(align)		\
174 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
175 	    HN_RNDIS_PKT_LEN, (align))
176 #define HN_PKTSIZE(m, align)		\
177 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
178 
179 #ifdef RSS
180 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
181 #else
182 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
183 #endif
184 
185 struct hn_txdesc {
186 #ifndef HN_USE_TXDESC_BUFRING
187 	SLIST_ENTRY(hn_txdesc)		link;
188 #endif
189 	STAILQ_ENTRY(hn_txdesc)		agg_link;
190 
191 	/* Aggregated txdescs, in sending order. */
192 	STAILQ_HEAD(, hn_txdesc)	agg_list;
193 
194 	/* The oldest packet, if transmission aggregation happens. */
195 	struct mbuf			*m;
196 	struct hn_tx_ring		*txr;
197 	int				refs;
198 	uint32_t			flags;	/* HN_TXD_FLAG_ */
199 	struct hn_nvs_sendctx		send_ctx;
200 	uint32_t			chim_index;
201 	int				chim_size;
202 
203 	bus_dmamap_t			data_dmap;
204 
205 	bus_addr_t			rndis_pkt_paddr;
206 	struct rndis_packet_msg		*rndis_pkt;
207 	bus_dmamap_t			rndis_pkt_dmap;
208 };
209 
210 #define HN_TXD_FLAG_ONLIST		0x0001
211 #define HN_TXD_FLAG_DMAMAP		0x0002
212 #define HN_TXD_FLAG_ONAGG		0x0004
213 
214 struct hn_rxinfo {
215 	uint32_t			vlan_info;
216 	uint32_t			csum_info;
217 	uint32_t			hash_info;
218 	uint32_t			hash_value;
219 };
220 
221 struct hn_update_vf {
222 	struct hn_rx_ring	*rxr;
223 	struct ifnet		*vf;
224 };
225 
226 #define HN_RXINFO_VLAN			0x0001
227 #define HN_RXINFO_CSUM			0x0002
228 #define HN_RXINFO_HASHINF		0x0004
229 #define HN_RXINFO_HASHVAL		0x0008
230 #define HN_RXINFO_ALL			\
231 	(HN_RXINFO_VLAN |		\
232 	 HN_RXINFO_CSUM |		\
233 	 HN_RXINFO_HASHINF |		\
234 	 HN_RXINFO_HASHVAL)
235 
236 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
237 #define HN_NDIS_RXCSUM_INFO_INVALID	0
238 #define HN_NDIS_HASH_INFO_INVALID	0
239 
240 static int			hn_probe(device_t);
241 static int			hn_attach(device_t);
242 static int			hn_detach(device_t);
243 static int			hn_shutdown(device_t);
244 static void			hn_chan_callback(struct vmbus_channel *,
245 				    void *);
246 
247 static void			hn_init(void *);
248 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
249 #ifdef HN_IFSTART_SUPPORT
250 static void			hn_start(struct ifnet *);
251 #endif
252 static int			hn_transmit(struct ifnet *, struct mbuf *);
253 static void			hn_xmit_qflush(struct ifnet *);
254 static int			hn_ifmedia_upd(struct ifnet *);
255 static void			hn_ifmedia_sts(struct ifnet *,
256 				    struct ifmediareq *);
257 
258 static int			hn_rndis_rxinfo(const void *, int,
259 				    struct hn_rxinfo *);
260 static void			hn_rndis_rx_data(struct hn_rx_ring *,
261 				    const void *, int);
262 static void			hn_rndis_rx_status(struct hn_softc *,
263 				    const void *, int);
264 
265 static void			hn_nvs_handle_notify(struct hn_softc *,
266 				    const struct vmbus_chanpkt_hdr *);
267 static void			hn_nvs_handle_comp(struct hn_softc *,
268 				    struct vmbus_channel *,
269 				    const struct vmbus_chanpkt_hdr *);
270 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
271 				    struct vmbus_channel *,
272 				    const struct vmbus_chanpkt_hdr *);
273 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
274 				    struct vmbus_channel *, uint64_t);
275 
276 #if __FreeBSD_version >= 1100099
277 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
278 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
279 #endif
280 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
282 #if __FreeBSD_version < 1100095
283 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
284 #else
285 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
286 #endif
287 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
288 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
290 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
291 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
294 #ifndef RSS
295 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
296 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
297 #endif
298 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
299 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
300 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
301 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
302 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
303 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
304 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
305 
306 static void			hn_stop(struct hn_softc *, bool);
307 static void			hn_init_locked(struct hn_softc *);
308 static int			hn_chan_attach(struct hn_softc *,
309 				    struct vmbus_channel *);
310 static void			hn_chan_detach(struct hn_softc *,
311 				    struct vmbus_channel *);
312 static int			hn_attach_subchans(struct hn_softc *);
313 static void			hn_detach_allchans(struct hn_softc *);
314 static void			hn_chan_rollup(struct hn_rx_ring *,
315 				    struct hn_tx_ring *);
316 static void			hn_set_ring_inuse(struct hn_softc *, int);
317 static int			hn_synth_attach(struct hn_softc *, int);
318 static void			hn_synth_detach(struct hn_softc *);
319 static int			hn_synth_alloc_subchans(struct hn_softc *,
320 				    int *);
321 static bool			hn_synth_attachable(const struct hn_softc *);
322 static void			hn_suspend(struct hn_softc *);
323 static void			hn_suspend_data(struct hn_softc *);
324 static void			hn_suspend_mgmt(struct hn_softc *);
325 static void			hn_resume(struct hn_softc *);
326 static void			hn_resume_data(struct hn_softc *);
327 static void			hn_resume_mgmt(struct hn_softc *);
328 static void			hn_suspend_mgmt_taskfunc(void *, int);
329 static void			hn_chan_drain(struct hn_softc *,
330 				    struct vmbus_channel *);
331 static void			hn_polling(struct hn_softc *, u_int);
332 static void			hn_chan_polling(struct vmbus_channel *, u_int);
333 
334 static void			hn_update_link_status(struct hn_softc *);
335 static void			hn_change_network(struct hn_softc *);
336 static void			hn_link_taskfunc(void *, int);
337 static void			hn_netchg_init_taskfunc(void *, int);
338 static void			hn_netchg_status_taskfunc(void *, int);
339 static void			hn_link_status(struct hn_softc *);
340 
341 static int			hn_create_rx_data(struct hn_softc *, int);
342 static void			hn_destroy_rx_data(struct hn_softc *);
343 static int			hn_check_iplen(const struct mbuf *, int);
344 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
345 static int			hn_rxfilter_config(struct hn_softc *);
346 #ifndef RSS
347 static int			hn_rss_reconfig(struct hn_softc *);
348 #endif
349 static void			hn_rss_ind_fixup(struct hn_softc *);
350 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
351 				    int, const struct hn_rxinfo *);
352 
353 static int			hn_tx_ring_create(struct hn_softc *, int);
354 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
355 static int			hn_create_tx_data(struct hn_softc *, int);
356 static void			hn_fixup_tx_data(struct hn_softc *);
357 static void			hn_destroy_tx_data(struct hn_softc *);
358 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
359 static void			hn_txdesc_gc(struct hn_tx_ring *,
360 				    struct hn_txdesc *);
361 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
362 				    struct hn_txdesc *, struct mbuf **);
363 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
364 				    struct hn_txdesc *);
365 static void			hn_set_chim_size(struct hn_softc *, int);
366 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
367 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
368 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
369 static void			hn_resume_tx(struct hn_softc *, int);
370 static void			hn_set_txagg(struct hn_softc *);
371 static void			*hn_try_txagg(struct ifnet *,
372 				    struct hn_tx_ring *, struct hn_txdesc *,
373 				    int);
374 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
375 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
376 				    struct hn_softc *, struct vmbus_channel *,
377 				    const void *, int);
378 static int			hn_txpkt_sglist(struct hn_tx_ring *,
379 				    struct hn_txdesc *);
380 static int			hn_txpkt_chim(struct hn_tx_ring *,
381 				    struct hn_txdesc *);
382 static int			hn_xmit(struct hn_tx_ring *, int);
383 static void			hn_xmit_taskfunc(void *, int);
384 static void			hn_xmit_txeof(struct hn_tx_ring *);
385 static void			hn_xmit_txeof_taskfunc(void *, int);
386 #ifdef HN_IFSTART_SUPPORT
387 static int			hn_start_locked(struct hn_tx_ring *, int);
388 static void			hn_start_taskfunc(void *, int);
389 static void			hn_start_txeof(struct hn_tx_ring *);
390 static void			hn_start_txeof_taskfunc(void *, int);
391 #endif
392 
393 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
394     "Hyper-V network interface");
395 
396 /* Trust tcp segment verification on host side. */
397 static int			hn_trust_hosttcp = 1;
398 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
399     &hn_trust_hosttcp, 0,
400     "Trust tcp segment verification on host side, "
401     "when csum info is missing (global setting)");
402 
403 /* Trust udp datagram verification on host side. */
404 static int			hn_trust_hostudp = 1;
405 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
406     &hn_trust_hostudp, 0,
407     "Trust udp datagram verification on host side, "
408     "when csum info is missing (global setting)");
409 
410 /* Trust ip packet verification on host side. */
411 static int			hn_trust_hostip = 1;
412 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
413     &hn_trust_hostip, 0,
414     "Trust ip packet verification on host side, "
415     "when csum info is missing (global setting)");
416 
417 /* Limit TSO burst size */
418 static int			hn_tso_maxlen = IP_MAXPACKET;
419 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
420     &hn_tso_maxlen, 0, "TSO burst limit");
421 
422 /* Limit chimney send size */
423 static int			hn_tx_chimney_size = 0;
424 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
425     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
426 
427 /* Limit the packet size for direct transmission */
428 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
429 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
430     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
431 
432 /* # of LRO entries per RX ring */
433 #if defined(INET) || defined(INET6)
434 #if __FreeBSD_version >= 1100095
435 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
436 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
437     &hn_lro_entry_count, 0, "LRO entry count");
438 #endif
439 #endif
440 
441 static int			hn_tx_taskq_cnt = 1;
442 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
443     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
444 
445 #define HN_TX_TASKQ_M_INDEP	0
446 #define HN_TX_TASKQ_M_GLOBAL	1
447 #define HN_TX_TASKQ_M_EVTTQ	2
448 
449 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
450 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
451     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
452     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
453 
454 #ifndef HN_USE_TXDESC_BUFRING
455 static int			hn_use_txdesc_bufring = 0;
456 #else
457 static int			hn_use_txdesc_bufring = 1;
458 #endif
459 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
460     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
461 
462 #ifdef HN_IFSTART_SUPPORT
463 /* Use ifnet.if_start instead of ifnet.if_transmit */
464 static int			hn_use_if_start = 0;
465 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
466     &hn_use_if_start, 0, "Use if_start TX method");
467 #endif
468 
469 /* # of channels to use */
470 static int			hn_chan_cnt = 0;
471 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
472     &hn_chan_cnt, 0,
473     "# of channels to use; each channel has one RX ring and one TX ring");
474 
475 /* # of transmit rings to use */
476 static int			hn_tx_ring_cnt = 0;
477 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
478     &hn_tx_ring_cnt, 0, "# of TX rings to use");
479 
480 /* Software TX ring depth */
481 static int			hn_tx_swq_depth = 0;
482 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
483     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
484 
485 /* Enable sorted LRO and set the depth of the per-channel mbuf queue */
486 #if __FreeBSD_version >= 1100095
487 static u_int			hn_lro_mbufq_depth = 0;
488 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
489     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
490 #endif
491 
492 /* Packet transmission aggregation size limit */
493 static int			hn_tx_agg_size = -1;
494 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
495     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
496 
497 /* Packet transmission aggregation count limit */
498 static int			hn_tx_agg_pkts = -1;
499 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
500     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
501 
502 static u_int			hn_cpu_index;	/* next CPU for channel */
503 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
504 
505 #ifndef RSS
506 static const uint8_t
507 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
508 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
509 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
510 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
511 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
512 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
513 };
514 #endif	/* !RSS */
515 
516 static device_method_t hn_methods[] = {
517 	/* Device interface */
518 	DEVMETHOD(device_probe,		hn_probe),
519 	DEVMETHOD(device_attach,	hn_attach),
520 	DEVMETHOD(device_detach,	hn_detach),
521 	DEVMETHOD(device_shutdown,	hn_shutdown),
522 	DEVMETHOD_END
523 };
524 
525 static driver_t hn_driver = {
526 	"hn",
527 	hn_methods,
528 	sizeof(struct hn_softc)
529 };
530 
531 static devclass_t hn_devclass;
532 
533 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
534 MODULE_VERSION(hn, 1);
535 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
536 
537 #if __FreeBSD_version >= 1100099
538 static void
539 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
540 {
541 	int i;
542 
543 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
544 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
545 }
546 #endif
547 
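/*
 * Send an RNDIS data packet described by the TX ring's GPA
 * (guest physical address) list, i.e. the scatter/gather path.
 */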
548 static int
549 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
550 {
551 
552 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
553 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
554 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
555 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
556 }
557 
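/*
 * Send an RNDIS data packet that has already been copied into
 * the chimney sending buffer slot recorded in the txdesc.
 */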
558 static int
559 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
560 {
561 	struct hn_nvs_rndis rndis;
562 
563 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
564 	    txd->chim_size > 0, ("invalid rndis chim txd"));
565 
566 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
567 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
568 	rndis.nvs_chim_idx = txd->chim_index;
569 	rndis.nvs_chim_sz = txd->chim_size;
570 
571 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
572 	    &rndis, sizeof(rndis), &txd->send_ctx));
573 }
574 
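/*
 * Allocate a chimney sending buffer slot without holding any lock:
 * scan the bitmap for a clear bit and claim it with an atomic
 * test-and-set.  Returns HN_NVS_CHIM_IDX_INVALID if none is free.
 */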
575 static __inline uint32_t
576 hn_chim_alloc(struct hn_softc *sc)
577 {
578 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
579 	u_long *bmap = sc->hn_chim_bmap;
580 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
581 
582 	for (i = 0; i < bmap_cnt; ++i) {
583 		int idx;
584 
585 		idx = ffsl(~bmap[i]);
586 		if (idx == 0)
587 			continue;
588 
589 		--idx; /* ffsl is 1-based */
590 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
591 		    ("invalid i %d and idx %d", i, idx));
592 
593 		if (atomic_testandset_long(&bmap[i], idx))
594 			continue;
595 
596 		ret = i * LONG_BIT + idx;
597 		break;
598 	}
599 	return (ret);
600 }
601 
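/* Release a chimney sending buffer slot by clearing its bitmap bit. */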
602 static __inline void
603 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
604 {
605 	u_long mask;
606 	uint32_t idx;
607 
608 	idx = chim_idx / LONG_BIT;
609 	KASSERT(idx < sc->hn_chim_bmap_cnt,
610 	    ("invalid chimney index 0x%x", chim_idx));
611 
612 	mask = 1UL << (chim_idx % LONG_BIT);
613 	KASSERT(sc->hn_chim_bmap[idx] & mask,
614 	    ("index bitmap 0x%lx, chimney index %u, "
615 	     "bitmap idx %d, bitmask 0x%lx",
616 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
617 
618 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
619 }
620 
621 #if defined(INET6) || defined(INET)
622 /*
623  * NOTE: If this function fails, the m_head is freed.
624  */
625 static __inline struct mbuf *
626 hn_tso_fixup(struct mbuf *m_head)
627 {
628 	struct ether_vlan_header *evl;
629 	struct tcphdr *th;
630 	int ehlen;
631 
632 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
633 
634 #define PULLUP_HDR(m, len)				\
635 do {							\
636 	if (__predict_false((m)->m_len < (len))) {	\
637 		(m) = m_pullup((m), (len));		\
638 		if ((m) == NULL)			\
639 			return (NULL);			\
640 	}						\
641 } while (0)
642 
643 	PULLUP_HDR(m_head, sizeof(*evl));
644 	evl = mtod(m_head, struct ether_vlan_header *);
645 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
646 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
647 	else
648 		ehlen = ETHER_HDR_LEN;
649 
650 #ifdef INET
651 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
652 		struct ip *ip;
653 		int iphlen;
654 
655 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
656 		ip = mtodo(m_head, ehlen);
657 		iphlen = ip->ip_hl << 2;
658 
659 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
660 		th = mtodo(m_head, ehlen + iphlen);
661 
662 		ip->ip_len = 0;
663 		ip->ip_sum = 0;
664 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
665 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
666 	}
667 #endif
668 #if defined(INET6) && defined(INET)
669 	else
670 #endif
671 #ifdef INET6
672 	{
673 		struct ip6_hdr *ip6;
674 
675 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
676 		ip6 = mtodo(m_head, ehlen);
677 		if (ip6->ip6_nxt != IPPROTO_TCP) {
678 			m_freem(m_head);
679 			return (NULL);
680 		}
681 
682 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
683 		th = mtodo(m_head, ehlen + sizeof(*ip6));
684 
685 		ip6->ip6_plen = 0;
686 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
687 	}
688 #endif
689 	return (m_head);
690 
691 #undef PULLUP_HDR
692 }
693 #endif	/* INET6 || INET */
694 
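/*
 * Program the RNDIS RX filter on the host, but only if it differs
 * from the currently cached filter.
 */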
695 static int
696 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
697 {
698 	int error = 0;
699 
700 	HN_LOCK_ASSERT(sc);
701 
702 	if (sc->hn_rx_filter != filter) {
703 		error = hn_rndis_set_rxfilter(sc, filter);
704 		if (!error)
705 			sc->hn_rx_filter = filter;
706 	}
707 	return (error);
708 }
709 
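/*
 * Derive the RNDIS packet filter from the ifnet flags (promiscuous,
 * broadcast, multicast) and the VF state, then program it.
 */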
710 static int
711 hn_rxfilter_config(struct hn_softc *sc)
712 {
713 	struct ifnet *ifp = sc->hn_ifp;
714 	uint32_t filter;
715 
716 	HN_LOCK_ASSERT(sc);
717 
718 	if ((ifp->if_flags & IFF_PROMISC) ||
719 	    (sc->hn_flags & HN_FLAG_VF)) {
720 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
721 	} else {
722 		filter = NDIS_PACKET_TYPE_DIRECTED;
723 		if (ifp->if_flags & IFF_BROADCAST)
724 			filter |= NDIS_PACKET_TYPE_BROADCAST;
725 		/* TODO: support multicast list */
726 		if ((ifp->if_flags & IFF_ALLMULTI) ||
727 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
728 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
729 	}
730 	return (hn_set_rxfilter(sc, filter));
731 }
732 
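/*
 * Compute the transmission aggregation limits from the saved tunables,
 * the limits offered by RNDIS and the chimney buffer size, then
 * propagate the result to all TX rings.
 */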
733 static void
734 hn_set_txagg(struct hn_softc *sc)
735 {
736 	uint32_t size, pkts;
737 	int i;
738 
739 	/*
740 	 * Setup aggregation size.
741 	 */
742 	if (sc->hn_agg_size < 0)
743 		size = UINT32_MAX;
744 	else
745 		size = sc->hn_agg_size;
746 
747 	if (sc->hn_rndis_agg_size < size)
748 		size = sc->hn_rndis_agg_size;
749 
750 	/* NOTE: We only aggregate packets using chimney sending buffers. */
751 	if (size > (uint32_t)sc->hn_chim_szmax)
752 		size = sc->hn_chim_szmax;
753 
754 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
755 		/* Disable */
756 		size = 0;
757 		pkts = 0;
758 		goto done;
759 	}
760 
761 	/* NOTE: Type of the per TX ring setting is 'int'. */
762 	if (size > INT_MAX)
763 		size = INT_MAX;
764 
765 	/*
766 	 * Setup aggregation packet count.
767 	 */
768 	if (sc->hn_agg_pkts < 0)
769 		pkts = UINT32_MAX;
770 	else
771 		pkts = sc->hn_agg_pkts;
772 
773 	if (sc->hn_rndis_agg_pkts < pkts)
774 		pkts = sc->hn_rndis_agg_pkts;
775 
776 	if (pkts <= 1) {
777 		/* Disable */
778 		size = 0;
779 		pkts = 0;
780 		goto done;
781 	}
782 
783 	/* NOTE: Type of the per TX ring setting is 'short'. */
784 	if (pkts > SHRT_MAX)
785 		pkts = SHRT_MAX;
786 
787 done:
788 	/* NOTE: Type of the per TX ring setting is 'short'. */
789 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
790 		/* Disable */
791 		size = 0;
792 		pkts = 0;
793 	}
794 
795 	if (bootverbose) {
796 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
797 		    size, pkts, sc->hn_rndis_agg_align);
798 	}
799 
800 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
801 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
802 
803 		mtx_lock(&txr->hn_tx_lock);
804 		txr->hn_agg_szmax = size;
805 		txr->hn_agg_pktmax = pkts;
806 		txr->hn_agg_align = sc->hn_rndis_agg_align;
807 		mtx_unlock(&txr->hn_tx_lock);
808 	}
809 }
810 
811 static int
812 hn_get_txswq_depth(const struct hn_tx_ring *txr)
813 {
814 
815 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
816 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
817 		return txr->hn_txdesc_cnt;
818 	return hn_tx_swq_depth;
819 }
820 
821 #ifndef RSS
822 static int
823 hn_rss_reconfig(struct hn_softc *sc)
824 {
825 	int error;
826 
827 	HN_LOCK_ASSERT(sc);
828 
829 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
830 		return (ENXIO);
831 
832 	/*
833 	 * Disable RSS first.
834 	 *
835 	 * NOTE:
836 	 * Direct reconfiguration by setting the UNCHG flags does
837 	 * _not_ work properly.
838 	 */
839 	if (bootverbose)
840 		if_printf(sc->hn_ifp, "disable RSS\n");
841 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
842 	if (error) {
843 		if_printf(sc->hn_ifp, "RSS disable failed\n");
844 		return (error);
845 	}
846 
847 	/*
848 	 * Reenable the RSS w/ the updated RSS key or indirect
849 	 * table.
850 	 */
851 	if (bootverbose)
852 		if_printf(sc->hn_ifp, "reconfig RSS\n");
853 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
854 	if (error) {
855 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
856 		return (error);
857 	}
858 	return (0);
859 }
860 #endif	/* !RSS */
861 
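/*
 * Clamp RSS indirect table entries that reference channels beyond
 * the number of channels currently in use.
 */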
862 static void
863 hn_rss_ind_fixup(struct hn_softc *sc)
864 {
865 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
866 	int i, nchan;
867 
868 	nchan = sc->hn_rx_ring_inuse;
869 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
870 
871 	/*
872 	 * Check indirect table to make sure that all channels in it
873 	 * can be used.
874 	 */
875 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
876 		if (rss->rss_ind[i] >= nchan) {
877 			if_printf(sc->hn_ifp,
878 			    "RSS indirect table %d fixup: %u -> %d\n",
879 			    i, rss->rss_ind[i], nchan - 1);
880 			rss->rss_ind[i] = nchan - 1;
881 		}
882 	}
883 }
884 
885 static int
886 hn_ifmedia_upd(struct ifnet *ifp __unused)
887 {
888 
889 	return EOPNOTSUPP;
890 }
891 
892 static void
893 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
894 {
895 	struct hn_softc *sc = ifp->if_softc;
896 
897 	ifmr->ifm_status = IFM_AVALID;
898 	ifmr->ifm_active = IFM_ETHER;
899 
900 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
901 		ifmr->ifm_active |= IFM_NONE;
902 		return;
903 	}
904 	ifmr->ifm_status |= IFM_ACTIVE;
905 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
906 }
907 
908 static void
909 hn_update_vf_task(void *arg, int pending __unused)
910 {
911 	struct hn_update_vf *uv = arg;
912 
913 	uv->rxr->hn_vf = uv->vf;
914 }
915 
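/*
 * Point every RX ring at the given VF ifnet (or NULL).  Rings that are
 * currently in use are updated from their channel's task context, so
 * the update is serialized with RX processing.
 */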
916 static void
917 hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
918 {
919 	struct hn_rx_ring *rxr;
920 	struct hn_update_vf uv;
921 	struct task task;
922 	int i;
923 
924 	HN_LOCK_ASSERT(sc);
925 
926 	TASK_INIT(&task, 0, hn_update_vf_task, &uv);
927 
928 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
929 		rxr = &sc->hn_rx_ring[i];
930 
931 		if (i < sc->hn_rx_ring_inuse) {
932 			uv.rxr = rxr;
933 			uv.vf = vf;
934 			vmbus_chan_run_task(rxr->hn_chan, &task);
935 		} else {
936 			rxr->hn_vf = vf;
937 		}
938 	}
939 }
940 
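/*
 * Handle a VF interface coming up or going down: verify that 'ifp'
 * really is this device's VF, switch the NVS data path, reprogram the
 * RX filter and suspend/resume the management tasks accordingly.
 */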
941 static void
942 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
943 {
944 	struct ifnet *hn_ifp;
945 
946 	HN_LOCK(sc);
947 
948 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
949 		goto out;
950 
951 	hn_ifp = sc->hn_ifp;
952 
953 	if (ifp == hn_ifp)
954 		goto out;
955 
956 	if (ifp->if_alloctype != IFT_ETHER)
957 		goto out;
958 
959 	/* Ignore lagg/vlan interfaces */
960 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
961 	    strcmp(ifp->if_dname, "vlan") == 0)
962 		goto out;
963 
964 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
965 		goto out;
966 
967 	/* Now we're sure 'ifp' is a real VF device. */
968 	if (vf) {
969 		if (sc->hn_flags & HN_FLAG_VF)
970 			goto out;
971 
972 		sc->hn_flags |= HN_FLAG_VF;
973 		hn_rxfilter_config(sc);
974 	} else {
975 		if (!(sc->hn_flags & HN_FLAG_VF))
976 			goto out;
977 
978 		sc->hn_flags &= ~HN_FLAG_VF;
979 		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
980 			hn_rxfilter_config(sc);
981 		else
982 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
983 	}
984 
985 	hn_nvs_set_datapath(sc,
986 	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
987 
988 	hn_update_vf(sc, vf ? ifp : NULL);
989 
990 	if (vf) {
991 		hn_suspend_mgmt(sc);
992 		sc->hn_link_flags &=
993 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
994 		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
995 	} else {
996 		hn_resume_mgmt(sc);
997 	}
998 
999 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1000 	    vf ? "VF_UP" : "VF_DOWN", NULL);
1001 
1002 	if (bootverbose)
1003 		if_printf(hn_ifp, "Data path is switched %s %s\n",
1004 		    vf ? "to" : "from", if_name(ifp));
1005 out:
1006 	HN_UNLOCK(sc);
1007 }
1008 
1009 static void
1010 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1011 {
1012 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1013 		return;
1014 
1015 	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
1016 }
1017 
1018 static void
1019 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1020 {
1021 	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
1022 }
1023 
1024 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1025 static const struct hyperv_guid g_net_vsc_device_type = {
1026 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1027 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1028 };
1029 
1030 static int
1031 hn_probe(device_t dev)
1032 {
1033 
1034 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1035 	    &g_net_vsc_device_type) == 0) {
1036 		device_set_desc(dev, "Hyper-V Network Interface");
1037 		return BUS_PROBE_DEFAULT;
1038 	}
1039 	return ENXIO;
1040 }
1041 
1042 static int
1043 hn_attach(device_t dev)
1044 {
1045 	struct hn_softc *sc = device_get_softc(dev);
1046 	struct sysctl_oid_list *child;
1047 	struct sysctl_ctx_list *ctx;
1048 	uint8_t eaddr[ETHER_ADDR_LEN];
1049 	struct ifnet *ifp = NULL;
1050 	int error, ring_cnt, tx_ring_cnt;
1051 
1052 	sc->hn_dev = dev;
1053 	sc->hn_prichan = vmbus_get_channel(dev);
1054 	HN_LOCK_INIT(sc);
1055 
1056 	/*
1057 	 * Initialize these tunables once.
1058 	 */
1059 	sc->hn_agg_size = hn_tx_agg_size;
1060 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1061 
1062 	/*
1063 	 * Setup taskqueue for transmission.
1064 	 */
1065 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1066 		int i;
1067 
1068 		sc->hn_tx_taskqs =
1069 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1070 		    M_DEVBUF, M_WAITOK);
1071 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1072 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1073 			    M_WAITOK, taskqueue_thread_enqueue,
1074 			    &sc->hn_tx_taskqs[i]);
1075 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1076 			    "%s tx%d", device_get_nameunit(dev), i);
1077 		}
1078 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1079 		sc->hn_tx_taskqs = hn_tx_taskque;
1080 	}
1081 
1082 	/*
1083 	 * Setup taskqueue for management tasks, e.g. link status.
1084 	 */
1085 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1086 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1087 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1088 	    device_get_nameunit(dev));
1089 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1090 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1091 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1092 	    hn_netchg_status_taskfunc, sc);
1093 
1094 	/*
1095 	 * Allocate ifnet and setup its name earlier, so that if_printf
1096 	 * can be used by functions, which will be called after
1097 	 * ether_ifattach().
1098 	 */
1099 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1100 	ifp->if_softc = sc;
1101 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1102 
1103 	/*
1104 	 * Initialize ifmedia earlier so that it can be unconditionally
1105 	 * destroyed, if an error happens later on.
1106 	 */
1107 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1108 
1109 	/*
1110 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1111 	 * to use (tx_ring_cnt).
1112 	 *
1113 	 * NOTE:
1114 	 * The # of RX rings to use is the same as the # of channels to use.
1115 	 */
1116 	ring_cnt = hn_chan_cnt;
1117 	if (ring_cnt <= 0) {
1118 		/* Default */
1119 		ring_cnt = mp_ncpus;
1120 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1121 			ring_cnt = HN_RING_CNT_DEF_MAX;
1122 	} else if (ring_cnt > mp_ncpus) {
1123 		ring_cnt = mp_ncpus;
1124 	}
1125 #ifdef RSS
1126 	if (ring_cnt > rss_getnumbuckets())
1127 		ring_cnt = rss_getnumbuckets();
1128 #endif
1129 
1130 	tx_ring_cnt = hn_tx_ring_cnt;
1131 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1132 		tx_ring_cnt = ring_cnt;
1133 #ifdef HN_IFSTART_SUPPORT
1134 	if (hn_use_if_start) {
1135 		/* ifnet.if_start only needs one TX ring. */
1136 		tx_ring_cnt = 1;
1137 	}
1138 #endif
1139 
1140 	/*
1141 	 * Set the leader CPU for channels.
1142 	 */
1143 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1144 
1145 	/*
1146 	 * Create enough TX/RX rings, even if only limited number of
1147 	 * channels can be allocated.
1148 	 */
1149 	error = hn_create_tx_data(sc, tx_ring_cnt);
1150 	if (error)
1151 		goto failed;
1152 	error = hn_create_rx_data(sc, ring_cnt);
1153 	if (error)
1154 		goto failed;
1155 
1156 	/*
1157 	 * Create transaction context for NVS and RNDIS transactions.
1158 	 */
1159 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1160 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1161 	if (sc->hn_xact == NULL) {
1162 		error = ENXIO;
1163 		goto failed;
1164 	}
1165 
1166 	/*
1167 	 * Install orphan handler for the revocation of this device's
1168 	 * primary channel.
1169 	 *
1170 	 * NOTE:
1171 	 * The processing order is critical here:
1172 	 * Install the orphan handler, _before_ testing whether this
1173 	 * device's primary channel has been revoked or not.
1174 	 */
1175 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1176 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1177 		error = ENXIO;
1178 		goto failed;
1179 	}
1180 
1181 	/*
1182 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1183 	 */
1184 	error = hn_synth_attach(sc, ETHERMTU);
1185 	if (error)
1186 		goto failed;
1187 
1188 	error = hn_rndis_get_eaddr(sc, eaddr);
1189 	if (error)
1190 		goto failed;
1191 
1192 #if __FreeBSD_version >= 1100099
1193 	if (sc->hn_rx_ring_inuse > 1) {
1194 		/*
1195 		 * Reduce TCP segment aggregation limit for multiple
1196 		 * RX rings to increase ACK timeliness.
1197 		 */
1198 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1199 	}
1200 #endif
1201 
1202 	/*
1203 	 * Fixup TX settings after the synthetic parts are attached.
1204 	 */
1205 	hn_fixup_tx_data(sc);
1206 
1207 	ctx = device_get_sysctl_ctx(dev);
1208 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1209 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1210 	    &sc->hn_nvs_ver, 0, "NVS version");
1211 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1212 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1213 	    hn_ndis_version_sysctl, "A", "NDIS version");
1214 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1215 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1216 	    hn_caps_sysctl, "A", "capabilities");
1217 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1218 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1219 	    hn_hwassist_sysctl, "A", "hwassist");
1220 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1221 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1222 	    hn_rxfilter_sysctl, "A", "rxfilter");
1223 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1224 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1225 	    hn_rss_hash_sysctl, "A", "RSS hash");
1226 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1227 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1228 #ifndef RSS
1229 	/*
1230 	 * Don't allow RSS key/indirect table changes if RSS is defined.
1231 	 */
1232 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1233 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1234 	    hn_rss_key_sysctl, "IU", "RSS key");
1235 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1236 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1237 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1238 #endif
1239 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1240 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1241 	    "RNDIS offered packet transmission aggregation size limit");
1242 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1243 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1244 	    "RNDIS offered packet transmission aggregation count limit");
1245 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1246 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1247 	    "RNDIS packet transmission aggregation alignment");
1248 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1249 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1250 	    hn_txagg_size_sysctl, "I",
1251 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1252 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1253 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1254 	    hn_txagg_pkts_sysctl, "I",
1255 	    "Packet transmission aggregation packets, "
1256 	    "0 -- disable, -1 -- auto");
1257 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1258 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1259 	    hn_polling_sysctl, "I",
1260 	    "Polling frequency: [100,1000000], 0 disable polling");
1261 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1262 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1263 	    hn_vf_sysctl, "A", "Virtual Function's name");
1264 
1265 	/*
1266 	 * Setup the ifmedia, which has been initialized earlier.
1267 	 */
1268 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1269 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1270 	/* XXX ifmedia_set really should do this for us */
1271 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1272 
1273 	/*
1274 	 * Setup the ifnet for this interface.
1275 	 */
1276 
1277 	ifp->if_baudrate = IF_Gbps(10);
1278 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1279 	ifp->if_ioctl = hn_ioctl;
1280 	ifp->if_init = hn_init;
1281 #ifdef HN_IFSTART_SUPPORT
1282 	if (hn_use_if_start) {
1283 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1284 
1285 		ifp->if_start = hn_start;
1286 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1287 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1288 		IFQ_SET_READY(&ifp->if_snd);
1289 	} else
1290 #endif
1291 	{
1292 		ifp->if_transmit = hn_transmit;
1293 		ifp->if_qflush = hn_xmit_qflush;
1294 	}
1295 
1296 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1297 #ifdef foo
1298 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1299 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1300 #endif
1301 	if (sc->hn_caps & HN_CAP_VLAN) {
1302 		/* XXX not sure about VLAN_MTU. */
1303 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1304 	}
1305 
1306 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1307 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1308 		ifp->if_capabilities |= IFCAP_TXCSUM;
1309 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1310 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1311 	if (sc->hn_caps & HN_CAP_TSO4) {
1312 		ifp->if_capabilities |= IFCAP_TSO4;
1313 		ifp->if_hwassist |= CSUM_IP_TSO;
1314 	}
1315 	if (sc->hn_caps & HN_CAP_TSO6) {
1316 		ifp->if_capabilities |= IFCAP_TSO6;
1317 		ifp->if_hwassist |= CSUM_IP6_TSO;
1318 	}
1319 
1320 	/* Enable all available capabilities by default. */
1321 	ifp->if_capenable = ifp->if_capabilities;
1322 
1323 	/*
1324 	 * Disable IPv6 TSO and TXCSUM by default, they still can
1325 	 * be enabled through SIOCSIFCAP.
1326 	 */
1327 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1328 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1329 
1330 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1331 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1332 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1333 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1334 	}
1335 
1336 	ether_ifattach(ifp, eaddr);
1337 
1338 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1339 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1340 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1341 	}
1342 
1343 	/* Inform the upper layer about the long frame support. */
1344 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1345 
1346 	/*
1347 	 * Kick off link status check.
1348 	 */
1349 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1350 	hn_update_link_status(sc);
1351 
1352 	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
1353 	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
1354 
1355 	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
1356 	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
1357 
1358 	return (0);
1359 failed:
1360 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1361 		hn_synth_detach(sc);
1362 	hn_detach(dev);
1363 	return (error);
1364 }
1365 
1366 static int
1367 hn_detach(device_t dev)
1368 {
1369 	struct hn_softc *sc = device_get_softc(dev);
1370 	struct ifnet *ifp = sc->hn_ifp;
1371 
1372 	if (sc->hn_ifaddr_evthand != NULL)
1373 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
1374 	if (sc->hn_ifnet_evthand != NULL)
1375 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
1376 
1377 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1378 		/*
1379 		 * In case the vmbus missed the orphan handler
1380 		 * installation.
1381 		 */
1382 		vmbus_xact_ctx_orphan(sc->hn_xact);
1383 	}
1384 
1385 	if (device_is_attached(dev)) {
1386 		HN_LOCK(sc);
1387 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1388 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1389 				hn_stop(sc, true);
1390 			/*
1391 			 * NOTE:
1392 			 * hn_stop() only suspends data, so management
1393 			 * stuff has to be suspended manually here.
1394 			 */
1395 			hn_suspend_mgmt(sc);
1396 			hn_synth_detach(sc);
1397 		}
1398 		HN_UNLOCK(sc);
1399 		ether_ifdetach(ifp);
1400 	}
1401 
1402 	ifmedia_removeall(&sc->hn_media);
1403 	hn_destroy_rx_data(sc);
1404 	hn_destroy_tx_data(sc);
1405 
1406 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1407 		int i;
1408 
1409 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1410 			taskqueue_free(sc->hn_tx_taskqs[i]);
1411 		free(sc->hn_tx_taskqs, M_DEVBUF);
1412 	}
1413 	taskqueue_free(sc->hn_mgmt_taskq0);
1414 
1415 	if (sc->hn_xact != NULL) {
1416 		/*
1417 		 * Uninstall the orphan handler _before_ the xact is
1418 		 * destructed.
1419 		 */
1420 		vmbus_chan_unset_orphan(sc->hn_prichan);
1421 		vmbus_xact_ctx_destroy(sc->hn_xact);
1422 	}
1423 
1424 	if_free(ifp);
1425 
1426 	HN_LOCK_DESTROY(sc);
1427 	return (0);
1428 }
1429 
1430 static int
1431 hn_shutdown(device_t dev)
1432 {
1433 
1434 	return (0);
1435 }
1436 
1437 static void
1438 hn_link_status(struct hn_softc *sc)
1439 {
1440 	uint32_t link_status;
1441 	int error;
1442 
1443 	error = hn_rndis_get_linkstatus(sc, &link_status);
1444 	if (error) {
1445 		/* XXX what to do? */
1446 		return;
1447 	}
1448 
1449 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1450 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1451 	else
1452 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1453 	if_link_state_change(sc->hn_ifp,
1454 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1455 	    LINK_STATE_UP : LINK_STATE_DOWN);
1456 }
1457 
1458 static void
1459 hn_link_taskfunc(void *xsc, int pending __unused)
1460 {
1461 	struct hn_softc *sc = xsc;
1462 
1463 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1464 		return;
1465 	hn_link_status(sc);
1466 }
1467 
1468 static void
1469 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1470 {
1471 	struct hn_softc *sc = xsc;
1472 
1473 	/* Prevent any link status checks from running. */
1474 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1475 
1476 	/*
1477 	 * Fake up a [link down --> link up] state change; 5 seconds
1478 	 * delay is used, which closely simulates miibus reaction
1479 	 * upon link down event.
1480 	 */
1481 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1482 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1483 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1484 	    &sc->hn_netchg_status, 5 * hz);
1485 }
1486 
1487 static void
1488 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1489 {
1490 	struct hn_softc *sc = xsc;
1491 
1492 	/* Re-allow link status checks. */
1493 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1494 	hn_link_status(sc);
1495 }
1496 
1497 static void
1498 hn_update_link_status(struct hn_softc *sc)
1499 {
1500 
1501 	if (sc->hn_mgmt_taskq != NULL)
1502 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1503 }
1504 
1505 static void
1506 hn_change_network(struct hn_softc *sc)
1507 {
1508 
1509 	if (sc->hn_mgmt_taskq != NULL)
1510 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1511 }
1512 
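/*
 * Load the mbuf chain for DMA.  If it has too many segments (EFBIG),
 * collapse it down to HN_TX_DATA_SEGCNT_MAX segments and retry once.
 */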
1513 static __inline int
1514 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1515     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1516 {
1517 	struct mbuf *m = *m_head;
1518 	int error;
1519 
1520 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1521 
1522 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1523 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1524 	if (error == EFBIG) {
1525 		struct mbuf *m_new;
1526 
1527 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1528 		if (m_new == NULL)
1529 			return ENOBUFS;
1530 		else
1531 			*m_head = m = m_new;
1532 		txr->hn_tx_collapsed++;
1533 
1534 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1535 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1536 	}
1537 	if (!error) {
1538 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1539 		    BUS_DMASYNC_PREWRITE);
1540 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1541 	}
1542 	return error;
1543 }
1544 
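/*
 * Drop a reference on the txdesc.  On the last reference, free any
 * aggregated txdescs, release the chimney slot or DMA map, free the
 * mbuf and return the txdesc to the free list (or buf_ring).
 * Returns 1 if the txdesc was actually freed, 0 otherwise.
 */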
1545 static __inline int
1546 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1547 {
1548 
1549 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1550 	    ("put an onlist txd %#x", txd->flags));
1551 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1552 	    ("put an onagg txd %#x", txd->flags));
1553 
1554 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1555 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1556 		return 0;
1557 
1558 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1559 		struct hn_txdesc *tmp_txd;
1560 
1561 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1562 			int freed;
1563 
1564 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1565 			    ("recursive aggregation on aggregated txdesc"));
1566 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1567 			    ("not aggregated txdesc"));
1568 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1569 			    ("aggregated txdesc uses dmamap"));
1570 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1571 			    ("aggregated txdesc consumes "
1572 			     "chimney sending buffer"));
1573 			KASSERT(tmp_txd->chim_size == 0,
1574 			    ("aggregated txdesc has non-zero "
1575 			     "chimney sending size"));
1576 
1577 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1578 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1579 			freed = hn_txdesc_put(txr, tmp_txd);
1580 			KASSERT(freed, ("failed to free aggregated txdesc"));
1581 		}
1582 	}
1583 
1584 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1585 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1586 		    ("chim txd uses dmamap"));
1587 		hn_chim_free(txr->hn_sc, txd->chim_index);
1588 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1589 		txd->chim_size = 0;
1590 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1591 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1592 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1593 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1594 		    txd->data_dmap);
1595 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1596 	}
1597 
1598 	if (txd->m != NULL) {
1599 		m_freem(txd->m);
1600 		txd->m = NULL;
1601 	}
1602 
1603 	txd->flags |= HN_TXD_FLAG_ONLIST;
1604 #ifndef HN_USE_TXDESC_BUFRING
1605 	mtx_lock_spin(&txr->hn_txlist_spin);
1606 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1607 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1608 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1609 	txr->hn_txdesc_avail++;
1610 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1611 	mtx_unlock_spin(&txr->hn_txlist_spin);
1612 #else	/* HN_USE_TXDESC_BUFRING */
1613 #ifdef HN_DEBUG
1614 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1615 #endif
1616 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1617 #endif	/* !HN_USE_TXDESC_BUFRING */
1618 
1619 	return 1;
1620 }
1621 
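/*
 * Fetch a free txdesc from the free list (or buf_ring) and hand the
 * caller the initial reference.
 */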
1622 static __inline struct hn_txdesc *
1623 hn_txdesc_get(struct hn_tx_ring *txr)
1624 {
1625 	struct hn_txdesc *txd;
1626 
1627 #ifndef HN_USE_TXDESC_BUFRING
1628 	mtx_lock_spin(&txr->hn_txlist_spin);
1629 	txd = SLIST_FIRST(&txr->hn_txlist);
1630 	if (txd != NULL) {
1631 		KASSERT(txr->hn_txdesc_avail > 0,
1632 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1633 		txr->hn_txdesc_avail--;
1634 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1635 	}
1636 	mtx_unlock_spin(&txr->hn_txlist_spin);
1637 #else
1638 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1639 #endif
1640 
1641 	if (txd != NULL) {
1642 #ifdef HN_USE_TXDESC_BUFRING
1643 #ifdef HN_DEBUG
1644 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1645 #endif
1646 #endif	/* HN_USE_TXDESC_BUFRING */
1647 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1648 		    STAILQ_EMPTY(&txd->agg_list) &&
1649 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1650 		    txd->chim_size == 0 &&
1651 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1652 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1653 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1654 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1655 		txd->refs = 1;
1656 	}
1657 	return txd;
1658 }
1659 
1660 static __inline void
1661 hn_txdesc_hold(struct hn_txdesc *txd)
1662 {
1663 
1664 	/* 0->1 transition will never work */
1665 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1666 	atomic_add_int(&txd->refs, 1);
1667 }
1668 
1669 static __inline void
1670 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1671 {
1672 
1673 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1674 	    ("recursive aggregation on aggregating txdesc"));
1675 
1676 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1677 	    ("already aggregated"));
1678 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1679 	    ("recursive aggregation on to-be-aggregated txdesc"));
1680 
1681 	txd->flags |= HN_TXD_FLAG_ONAGG;
1682 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1683 }
1684 
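/*
 * Return true if any txdesc of this ring is still outstanding, i.e.
 * not all descriptors have been returned to the free list/buf_ring.
 */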
1685 static bool
1686 hn_tx_ring_pending(struct hn_tx_ring *txr)
1687 {
1688 	bool pending = false;
1689 
1690 #ifndef HN_USE_TXDESC_BUFRING
1691 	mtx_lock_spin(&txr->hn_txlist_spin);
1692 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1693 		pending = true;
1694 	mtx_unlock_spin(&txr->hn_txlist_spin);
1695 #else
1696 	if (!buf_ring_full(txr->hn_txdesc_br))
1697 		pending = true;
1698 #endif
1699 	return (pending);
1700 }
1701 
1702 static __inline void
1703 hn_txeof(struct hn_tx_ring *txr)
1704 {
1705 	txr->hn_has_txeof = 0;
1706 	txr->hn_txeof(txr);
1707 }
1708 
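/*
 * NVS send-completion callback: release the txdesc and, once enough
 * completions have accumulated on an oactive ring, run TX EOF
 * processing.
 */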
1709 static void
1710 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1711     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1712 {
1713 	struct hn_txdesc *txd = sndc->hn_cbarg;
1714 	struct hn_tx_ring *txr;
1715 
1716 	txr = txd->txr;
1717 	KASSERT(txr->hn_chan == chan,
1718 	    ("channel mismatch, on chan%u, should be chan%u",
1719 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1720 
1721 	txr->hn_has_txeof = 1;
1722 	hn_txdesc_put(txr, txd);
1723 
1724 	++txr->hn_txdone_cnt;
1725 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1726 		txr->hn_txdone_cnt = 0;
1727 		if (txr->hn_oactive)
1728 			hn_txeof(txr);
1729 	}
1730 }
1731 
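/*
 * Per-channel RX/TX post-processing: flush LRO and, if needed, reap
 * completed transmissions on the associated TX ring.
 */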
1732 static void
1733 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1734 {
1735 #if defined(INET) || defined(INET6)
1736 	tcp_lro_flush_all(&rxr->hn_lro);
1737 #endif
1738 
1739 	/*
1740 	 * NOTE:
1741 	 * 'txr' could be NULL, if multiple channels and the
1742 	 * ifnet.if_start method are enabled.
1743 	 */
1744 	if (txr == NULL || !txr->hn_has_txeof)
1745 		return;
1746 
1747 	txr->hn_txdone_cnt = 0;
1748 	hn_txeof(txr);
1749 }
1750 
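/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into one counted from the rm_dataoffset field; the offsets
 * in a finalized RNDIS packet message are expressed this way.
 */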
1751 static __inline uint32_t
1752 hn_rndis_pktmsg_offset(uint32_t ofs)
1753 {
1754 
1755 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1756 	    ("invalid RNDIS packet msg offset %u", ofs));
1757 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1758 }
1759 
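/*
 * Append a per-packet-info record of the given type and data length
 * to the RNDIS packet message, growing rm_pktinfolen and rm_len
 * accordingly, and return a pointer to the record's data area for
 * the caller to fill in.
 */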
1760 static __inline void *
1761 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1762     size_t pi_dlen, uint32_t pi_type)
1763 {
1764 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1765 	struct rndis_pktinfo *pi;
1766 
1767 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1768 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1769 
1770 	/*
1771 	 * Per-packet-info does not move; it only grows.
1772 	 *
1773 	 * NOTE:
1774 	 * rm_pktinfooffset in this phase counts from the beginning
1775 	 * of rndis_packet_msg.
1776 	 */
1777 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1778 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1779 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1780 	    pkt->rm_pktinfolen);
1781 	pkt->rm_pktinfolen += pi_size;
1782 
1783 	pi->rm_size = pi_size;
1784 	pi->rm_type = pi_type;
1785 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1786 
1787 	/* Data immediately follow per-packet-info. */
1788 	pkt->rm_dataoffset += pi_size;
1789 
1790 	/* Update RNDIS packet msg length */
1791 	pkt->rm_len += pi_size;
1792 
1793 	return (pi->rm_data);
1794 }
1795 
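/*
 * Send the aggregating txdesc and reset the TX ring's aggregation
 * state.  On failure the aggregated mbuf is freed here and oerrors
 * is bumped by the number of aggregated packets.
 */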
1796 static __inline int
1797 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1798 {
1799 	struct hn_txdesc *txd;
1800 	struct mbuf *m;
1801 	int error, pkts;
1802 
1803 	txd = txr->hn_agg_txd;
1804 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1805 
1806 	/*
1807 	 * Since hn_txpkt() will reset this temporary stat, save
1808 	 * it now, so that oerrors can be updated properly, if
1809 	 * hn_txpkt() ever fails.
1810 	 */
1811 	pkts = txr->hn_stat_pkts;
1812 
1813 	/*
1814 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1815 	 * failure, save it for later freeing, if hn_txpkt() ever
1816 	 * fails.
1817 	 */
1818 	m = txd->m;
1819 	error = hn_txpkt(ifp, txr, txd);
1820 	if (__predict_false(error)) {
1821 		/* txd is freed, but m is not. */
1822 		m_freem(m);
1823 
1824 		txr->hn_flush_failed++;
1825 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1826 	}
1827 
1828 	/* Reset all aggregation states. */
1829 	txr->hn_agg_txd = NULL;
1830 	txr->hn_agg_szleft = 0;
1831 	txr->hn_agg_pktleft = 0;
1832 	txr->hn_agg_prevpkt = NULL;
1833 
1834 	return (error);
1835 }
1836 
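/*
 * Reserve chimney sending buffer space for a packet of the given
 * size, aggregating it into the pending aggregating txdesc when
 * possible.  Return a pointer into the chimney buffer where the
 * RNDIS packet message should be built, or NULL if no chimney
 * buffer is available and the caller must fall back to sglist
 * transmission.
 */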
1837 static void *
1838 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1839     int pktsize)
1840 {
1841 	void *chim;
1842 
1843 	if (txr->hn_agg_txd != NULL) {
1844 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1845 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1846 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1847 			int olen;
1848 
1849 			/*
1850 			 * Update the previous RNDIS packet's total length;
1851 			 * it can be increased due to the mandatory alignment
1852 			 * padding for this RNDIS packet.  Also update the
1853 			 * aggregating txdesc's chimney sending buffer size
1854 			 * accordingly.
1855 			 *
1856 			 * XXX
1857 			 * Zero-out the padding, as required by the RNDIS spec.
1858 			 */
1859 			olen = pkt->rm_len;
1860 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1861 			agg_txd->chim_size += pkt->rm_len - olen;
1862 
1863 			/* Link this txdesc to the parent. */
1864 			hn_txdesc_agg(agg_txd, txd);
1865 
1866 			chim = (uint8_t *)pkt + pkt->rm_len;
1867 			/* Save the current packet for later fixup. */
1868 			txr->hn_agg_prevpkt = chim;
1869 
1870 			txr->hn_agg_pktleft--;
1871 			txr->hn_agg_szleft -= pktsize;
1872 			if (txr->hn_agg_szleft <=
1873 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1874 				/*
1875 				 * Probably can't aggregate more packets,
1876 				 * flush this aggregating txdesc proactively.
1877 				 */
1878 				txr->hn_agg_pktleft = 0;
1879 			}
1880 			/* Done! */
1881 			return (chim);
1882 		}
1883 		hn_flush_txagg(ifp, txr);
1884 	}
1885 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1886 
1887 	txr->hn_tx_chimney_tried++;
1888 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1889 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1890 		return (NULL);
1891 	txr->hn_tx_chimney++;
1892 
1893 	chim = txr->hn_sc->hn_chim +
1894 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1895 
1896 	if (txr->hn_agg_pktmax > 1 &&
1897 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1898 		txr->hn_agg_txd = txd;
1899 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1900 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1901 		txr->hn_agg_prevpkt = chim;
1902 	}
1903 	return (chim);
1904 }
1905 
1906 /*
1907  * NOTE:
1908  * If this function fails, then both txd and m_head0 will be freed.
1909  */
1910 static int
1911 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1912     struct mbuf **m_head0)
1913 {
1914 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1915 	int error, nsegs, i;
1916 	struct mbuf *m_head = *m_head0;
1917 	struct rndis_packet_msg *pkt;
1918 	uint32_t *pi_data;
1919 	void *chim = NULL;
1920 	int pkt_hlen, pkt_size;
1921 
1922 	pkt = txd->rndis_pkt;
1923 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1924 	if (pkt_size < txr->hn_chim_size) {
1925 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1926 		if (chim != NULL)
1927 			pkt = chim;
1928 	} else {
1929 		if (txr->hn_agg_txd != NULL)
1930 			hn_flush_txagg(ifp, txr);
1931 	}
1932 
1933 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1934 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1935 	pkt->rm_dataoffset = sizeof(*pkt);
1936 	pkt->rm_datalen = m_head->m_pkthdr.len;
1937 	pkt->rm_oobdataoffset = 0;
1938 	pkt->rm_oobdatalen = 0;
1939 	pkt->rm_oobdataelements = 0;
1940 	pkt->rm_pktinfooffset = sizeof(*pkt);
1941 	pkt->rm_pktinfolen = 0;
1942 	pkt->rm_vchandle = 0;
1943 	pkt->rm_reserved = 0;
1944 
1945 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1946 		/*
1947 		 * Set the hash value for this packet, so that the host could
1948 		 * dispatch the TX done event for this packet back to this TX
1949 		 * ring's channel.
1950 		 */
1951 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1952 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1953 		*pi_data = txr->hn_tx_idx;
1954 	}
1955 
1956 	if (m_head->m_flags & M_VLANTAG) {
1957 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1958 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1959 		*pi_data = NDIS_VLAN_INFO_MAKE(
1960 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1961 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1962 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1963 	}
1964 
1965 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1966 #if defined(INET6) || defined(INET)
1967 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1968 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1969 #ifdef INET
1970 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1971 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1972 			    m_head->m_pkthdr.tso_segsz);
1973 		}
1974 #endif
1975 #if defined(INET6) && defined(INET)
1976 		else
1977 #endif
1978 #ifdef INET6
1979 		{
1980 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1981 			    m_head->m_pkthdr.tso_segsz);
1982 		}
1983 #endif
1984 #endif	/* INET6 || INET */
1985 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1986 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1987 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1988 		if (m_head->m_pkthdr.csum_flags &
1989 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1990 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1991 		} else {
1992 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1993 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1994 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1995 		}
1996 
1997 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1998 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1999 		else if (m_head->m_pkthdr.csum_flags &
2000 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
2001 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2002 	}
2003 
2004 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2005 	/* Convert RNDIS packet message offsets */
2006 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
2007 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2008 
2009 	/*
2010 	 * Fast path: Chimney sending.
2011 	 */
2012 	if (chim != NULL) {
2013 		struct hn_txdesc *tgt_txd = txd;
2014 
2015 		if (txr->hn_agg_txd != NULL) {
2016 			tgt_txd = txr->hn_agg_txd;
2017 #ifdef INVARIANTS
2018 			*m_head0 = NULL;
2019 #endif
2020 		}
2021 
2022 		KASSERT(pkt == chim,
2023 		    ("RNDIS pkt not in chimney sending buffer"));
2024 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2025 		    ("chimney sending buffer is not used"));
2026 		tgt_txd->chim_size += pkt->rm_len;
2027 
2028 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2029 		    ((uint8_t *)chim) + pkt_hlen);
2030 
2031 		txr->hn_gpa_cnt = 0;
2032 		txr->hn_sendpkt = hn_txpkt_chim;
2033 		goto done;
2034 	}
2035 
2036 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2037 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2038 	    ("chimney buffer is used"));
2039 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2040 
2041 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2042 	if (__predict_false(error)) {
2043 		int freed;
2044 
2045 		/*
2046 		 * This mbuf is not linked w/ the txd yet, so free it now.
2047 		 */
2048 		m_freem(m_head);
2049 		*m_head0 = NULL;
2050 
2051 		freed = hn_txdesc_put(txr, txd);
2052 		KASSERT(freed != 0,
2053 		    ("fail to free txd upon txdma error"));
2054 
2055 		txr->hn_txdma_failed++;
2056 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2057 		return error;
2058 	}
2059 	*m_head0 = m_head;
2060 
2061 	/* +1 RNDIS packet message */
2062 	txr->hn_gpa_cnt = nsegs + 1;
2063 
2064 	/* send packet with page buffer */
2065 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2066 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2067 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2068 
2069 	/*
2070 	 * Fill the page buffers with mbuf info after the page
2071 	 * buffer for RNDIS packet message.
2072 	 */
2073 	for (i = 0; i < nsegs; ++i) {
2074 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2075 
2076 		gpa->gpa_page = atop(segs[i].ds_addr);
2077 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2078 		gpa->gpa_len = segs[i].ds_len;
2079 	}
2080 
2081 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2082 	txd->chim_size = 0;
2083 	txr->hn_sendpkt = hn_txpkt_sglist;
2084 done:
2085 	txd->m = m_head;
2086 
2087 	/* Set the completion routine */
2088 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2089 
2090 	/* Update temporary stats for later use. */
2091 	txr->hn_stat_pkts++;
2092 	txr->hn_stat_size += m_head->m_pkthdr.len;
2093 	if (m_head->m_flags & M_MCAST)
2094 		txr->hn_stat_mcasts++;
2095 
2096 	return 0;
2097 }
2098 
2099 /*
2100  * NOTE:
2101  * If this function fails, then txd will be freed, but the mbuf
2102  * associated w/ the txd will _not_ be freed.
2103  */
2104 static int
2105 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2106 {
2107 	int error, send_failed = 0, has_bpf;
2108 
2109 again:
2110 	has_bpf = bpf_peers_present(ifp->if_bpf);
2111 	if (has_bpf) {
2112 		/*
2113 		 * Make sure that this txd and any aggregated txds are not
2114 		 * freed before ETHER_BPF_MTAP.
2115 		 */
2116 		hn_txdesc_hold(txd);
2117 	}
2118 	error = txr->hn_sendpkt(txr, txd);
2119 	if (!error) {
2120 		if (has_bpf) {
2121 			const struct hn_txdesc *tmp_txd;
2122 
2123 			ETHER_BPF_MTAP(ifp, txd->m);
2124 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2125 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2126 		}
2127 
2128 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2129 #ifdef HN_IFSTART_SUPPORT
2130 		if (!hn_use_if_start)
2131 #endif
2132 		{
2133 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2134 			    txr->hn_stat_size);
2135 			if (txr->hn_stat_mcasts != 0) {
2136 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2137 				    txr->hn_stat_mcasts);
2138 			}
2139 		}
2140 		txr->hn_pkts += txr->hn_stat_pkts;
2141 		txr->hn_sends++;
2142 	}
2143 	if (has_bpf)
2144 		hn_txdesc_put(txr, txd);
2145 
2146 	if (__predict_false(error)) {
2147 		int freed;
2148 
2149 		/*
2150 		 * This should "really rarely" happen.
2151 		 *
2152 		 * XXX Too many RX to be acked or too many sideband
2153 		 * commands to run?  Ask netvsc_channel_rollup()
2154 		 * to kick start later.
2155 		 */
2156 		txr->hn_has_txeof = 1;
2157 		if (!send_failed) {
2158 			txr->hn_send_failed++;
2159 			send_failed = 1;
2160 			/*
2161 			 * Try sending again after setting hn_has_txeof,
2162 			 * in case we missed the last
2163 			 * netvsc_channel_rollup().
2164 			 */
2165 			goto again;
2166 		}
2167 		if_printf(ifp, "send failed\n");
2168 
2169 		/*
2170 		 * Caller will perform further processing on the
2171 		 * associated mbuf, so don't free it in hn_txdesc_put();
2172 		 * only unload it from the DMA map in hn_txdesc_put(),
2173 		 * if it was loaded.
2174 		 */
2175 		txd->m = NULL;
2176 		freed = hn_txdesc_put(txr, txd);
2177 		KASSERT(freed != 0,
2178 		    ("fail to free txd upon send error"));
2179 
2180 		txr->hn_send_failed++;
2181 	}
2182 
2183 	/* Reset temporary stats, after this sending is done. */
2184 	txr->hn_stat_size = 0;
2185 	txr->hn_stat_pkts = 0;
2186 	txr->hn_stat_mcasts = 0;
2187 
2188 	return (error);
2189 }
2190 
2191 /*
2192  * Append the specified data to the indicated mbuf chain.
2193  * Extend the mbuf chain if the new data does not fit in
2194  * existing space.
2195  *
2196  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2197  * There should be an equivalent in the kernel mbuf code,
2198  * but there does not appear to be one yet.
2199  *
2200  * Differs from m_append() in that additional mbufs are
2201  * allocated with cluster size MJUMPAGESIZE, and filled
2202  * accordingly.
2203  *
2204  * Return 1 if able to complete the job; otherwise 0.
2205  */
2206 static int
2207 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2208 {
2209 	struct mbuf *m, *n;
2210 	int remainder, space;
2211 
2212 	for (m = m0; m->m_next != NULL; m = m->m_next)
2213 		;
2214 	remainder = len;
2215 	space = M_TRAILINGSPACE(m);
2216 	if (space > 0) {
2217 		/*
2218 		 * Copy into available space.
2219 		 */
2220 		if (space > remainder)
2221 			space = remainder;
2222 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2223 		m->m_len += space;
2224 		cp += space;
2225 		remainder -= space;
2226 	}
2227 	while (remainder > 0) {
2228 		/*
2229 		 * Allocate a new mbuf; could check space
2230 		 * and allocate a cluster instead.
2231 		 */
2232 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2233 		if (n == NULL)
2234 			break;
2235 		n->m_len = min(MJUMPAGESIZE, remainder);
2236 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2237 		cp += n->m_len;
2238 		remainder -= n->m_len;
2239 		m->m_next = n;
2240 		m = n;
2241 	}
2242 	if (m0->m_flags & M_PKTHDR)
2243 		m0->m_pkthdr.len += len - remainder;
2244 
2245 	return (remainder == 0);
2246 }
2247 
2248 #if defined(INET) || defined(INET6)
2249 static __inline int
2250 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2251 {
2252 #if __FreeBSD_version >= 1100095
2253 	if (hn_lro_mbufq_depth) {
2254 		tcp_lro_queue_mbuf(lc, m);
2255 		return 0;
2256 	}
2257 #endif
2258 	return tcp_lro_rx(lc, m, 0);
2259 }
2260 #endif
2261 
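/*
 * Convert one received packet into an mbuf, apply the host supplied
 * checksum/VLAN/RSS metadata, and hand the mbuf to LRO or directly
 * to if_input().  If a VF is active, the packet is injected through
 * the VF's ifnet instead of the synthetic one.
 */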
2262 static int
2263 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2264     const struct hn_rxinfo *info)
2265 {
2266 	struct ifnet *ifp;
2267 	struct mbuf *m_new;
2268 	int size, do_lro = 0, do_csum = 1;
2269 	int hash_type;
2270 
2271 	/* If the VF is active, inject the packet through the VF */
2272 	ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp;
2273 
2274 	if (dlen <= MHLEN) {
2275 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2276 		if (m_new == NULL) {
2277 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2278 			return (0);
2279 		}
2280 		memcpy(mtod(m_new, void *), data, dlen);
2281 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2282 		rxr->hn_small_pkts++;
2283 	} else {
2284 		/*
2285 		 * Get an mbuf with a cluster.  For packets 2K or less,
2286 		 * get a standard 2K cluster.  For anything larger, get a
2287 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2288 		 * if looped around to the Hyper-V TX channel, so avoid them.
2289 		 */
2290 		size = MCLBYTES;
2291 		if (dlen > MCLBYTES) {
2292 			/* 4096 */
2293 			size = MJUMPAGESIZE;
2294 		}
2295 
2296 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2297 		if (m_new == NULL) {
2298 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2299 			return (0);
2300 		}
2301 
2302 		hv_m_append(m_new, dlen, data);
2303 	}
2304 	m_new->m_pkthdr.rcvif = ifp;
2305 
2306 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2307 		do_csum = 0;
2308 
2309 	/* receive side checksum offload */
2310 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2311 		/* IP csum offload */
2312 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2313 			m_new->m_pkthdr.csum_flags |=
2314 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2315 			rxr->hn_csum_ip++;
2316 		}
2317 
2318 		/* TCP/UDP csum offload */
2319 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2320 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2321 			m_new->m_pkthdr.csum_flags |=
2322 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2323 			m_new->m_pkthdr.csum_data = 0xffff;
2324 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2325 				rxr->hn_csum_tcp++;
2326 			else
2327 				rxr->hn_csum_udp++;
2328 		}
2329 
2330 		/*
2331 		 * XXX
2332 		 * As of this writing (Oct 28th, 2016), the host side will turn
2333 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2334 		 * the do_lro setting here is actually _not_ accurate.  We
2335 		 * depend on the RSS hash type check to reset do_lro.
2336 		 */
2337 		if ((info->csum_info &
2338 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2339 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2340 			do_lro = 1;
2341 	} else {
2342 		const struct ether_header *eh;
2343 		uint16_t etype;
2344 		int hoff;
2345 
2346 		hoff = sizeof(*eh);
2347 		if (m_new->m_len < hoff)
2348 			goto skip;
2349 		eh = mtod(m_new, struct ether_header *);
2350 		etype = ntohs(eh->ether_type);
2351 		if (etype == ETHERTYPE_VLAN) {
2352 			const struct ether_vlan_header *evl;
2353 
2354 			hoff = sizeof(*evl);
2355 			if (m_new->m_len < hoff)
2356 				goto skip;
2357 			evl = mtod(m_new, struct ether_vlan_header *);
2358 			etype = ntohs(evl->evl_proto);
2359 		}
2360 
2361 		if (etype == ETHERTYPE_IP) {
2362 			int pr;
2363 
2364 			pr = hn_check_iplen(m_new, hoff);
2365 			if (pr == IPPROTO_TCP) {
2366 				if (do_csum &&
2367 				    (rxr->hn_trust_hcsum &
2368 				     HN_TRUST_HCSUM_TCP)) {
2369 					rxr->hn_csum_trusted++;
2370 					m_new->m_pkthdr.csum_flags |=
2371 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2372 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2373 					m_new->m_pkthdr.csum_data = 0xffff;
2374 				}
2375 				do_lro = 1;
2376 			} else if (pr == IPPROTO_UDP) {
2377 				if (do_csum &&
2378 				    (rxr->hn_trust_hcsum &
2379 				     HN_TRUST_HCSUM_UDP)) {
2380 					rxr->hn_csum_trusted++;
2381 					m_new->m_pkthdr.csum_flags |=
2382 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2383 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2384 					m_new->m_pkthdr.csum_data = 0xffff;
2385 				}
2386 			} else if (pr != IPPROTO_DONE && do_csum &&
2387 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2388 				rxr->hn_csum_trusted++;
2389 				m_new->m_pkthdr.csum_flags |=
2390 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2391 			}
2392 		}
2393 	}
2394 skip:
2395 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2396 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2397 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2398 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2399 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2400 		m_new->m_flags |= M_VLANTAG;
2401 	}
2402 
2403 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2404 		rxr->hn_rss_pkts++;
2405 		m_new->m_pkthdr.flowid = info->hash_value;
2406 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2407 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2408 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2409 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2410 
2411 			/*
2412 			 * NOTE:
2413 			 * do_lro is reset if the hash types are not TCP
2414 			 * related.  See the comment in the above csum_flags
2415 			 * setup section.
2416 			 */
2417 			switch (type) {
2418 			case NDIS_HASH_IPV4:
2419 				hash_type = M_HASHTYPE_RSS_IPV4;
2420 				do_lro = 0;
2421 				break;
2422 
2423 			case NDIS_HASH_TCP_IPV4:
2424 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2425 				break;
2426 
2427 			case NDIS_HASH_IPV6:
2428 				hash_type = M_HASHTYPE_RSS_IPV6;
2429 				do_lro = 0;
2430 				break;
2431 
2432 			case NDIS_HASH_IPV6_EX:
2433 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2434 				do_lro = 0;
2435 				break;
2436 
2437 			case NDIS_HASH_TCP_IPV6:
2438 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2439 				break;
2440 
2441 			case NDIS_HASH_TCP_IPV6_EX:
2442 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2443 				break;
2444 			}
2445 		}
2446 	} else {
2447 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2448 		hash_type = M_HASHTYPE_OPAQUE;
2449 	}
2450 	M_HASHTYPE_SET(m_new, hash_type);
2451 
2452 	/*
2453 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2454 	 * messages (not just data messages) will trigger a response.
2455 	 */
2456 
2457 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2458 	rxr->hn_pkts++;
2459 
2460 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2461 #if defined(INET) || defined(INET6)
2462 		struct lro_ctrl *lro = &rxr->hn_lro;
2463 
2464 		if (lro->lro_cnt) {
2465 			rxr->hn_lro_tried++;
2466 			if (hn_lro_rx(lro, m_new) == 0) {
2467 				/* DONE! */
2468 				return 0;
2469 			}
2470 		}
2471 #endif
2472 	}
2473 
2474 	/* We're not holding the lock here, so don't release it */
2475 	(*ifp->if_input)(ifp, m_new);
2476 
2477 	return (0);
2478 }
2479 
2480 static int
2481 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2482 {
2483 	struct hn_softc *sc = ifp->if_softc;
2484 	struct ifreq *ifr = (struct ifreq *)data;
2485 	int mask, error = 0;
2486 
2487 	switch (cmd) {
2488 	case SIOCSIFMTU:
2489 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2490 			error = EINVAL;
2491 			break;
2492 		}
2493 
2494 		HN_LOCK(sc);
2495 
2496 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2497 			HN_UNLOCK(sc);
2498 			break;
2499 		}
2500 
2501 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2502 			/* Can't change MTU */
2503 			HN_UNLOCK(sc);
2504 			error = EOPNOTSUPP;
2505 			break;
2506 		}
2507 
2508 		if (ifp->if_mtu == ifr->ifr_mtu) {
2509 			HN_UNLOCK(sc);
2510 			break;
2511 		}
2512 
2513 		/*
2514 		 * Suspend this interface before the synthetic parts
2515 		 * are ripped.
2516 		 */
2517 		hn_suspend(sc);
2518 
2519 		/*
2520 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2521 		 */
2522 		hn_synth_detach(sc);
2523 
2524 		/*
2525 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2526 		 * with the new MTU setting.
2527 		 */
2528 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2529 		if (error) {
2530 			HN_UNLOCK(sc);
2531 			break;
2532 		}
2533 
2534 		/*
2535 		 * Commit the requested MTU, after the synthetic parts
2536 		 * have been successfully attached.
2537 		 */
2538 		ifp->if_mtu = ifr->ifr_mtu;
2539 
2540 		/*
2541 		 * Make sure that various parameters based on MTU are
2542 		 * still valid, after the MTU change.
2543 		 */
2544 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2545 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2546 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2547 #if __FreeBSD_version >= 1100099
2548 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2549 		    HN_LRO_LENLIM_MIN(ifp))
2550 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2551 #endif
2552 
2553 		/*
2554 		 * All done!  Resume the interface now.
2555 		 */
2556 		hn_resume(sc);
2557 
2558 		HN_UNLOCK(sc);
2559 		break;
2560 
2561 	case SIOCSIFFLAGS:
2562 		HN_LOCK(sc);
2563 
2564 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2565 			HN_UNLOCK(sc);
2566 			break;
2567 		}
2568 
2569 		if (ifp->if_flags & IFF_UP) {
2570 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2571 				/*
2572 				 * Caller might hold a mutex, e.g.
2573 				 * bpf; use busy-wait for the RNDIS
2574 				 * reply.
2575 				 */
2576 				HN_NO_SLEEPING(sc);
2577 				hn_rxfilter_config(sc);
2578 				HN_SLEEPING_OK(sc);
2579 			} else {
2580 				hn_init_locked(sc);
2581 			}
2582 		} else {
2583 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2584 				hn_stop(sc, false);
2585 		}
2586 		sc->hn_if_flags = ifp->if_flags;
2587 
2588 		HN_UNLOCK(sc);
2589 		break;
2590 
2591 	case SIOCSIFCAP:
2592 		HN_LOCK(sc);
2593 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2594 
2595 		if (mask & IFCAP_TXCSUM) {
2596 			ifp->if_capenable ^= IFCAP_TXCSUM;
2597 			if (ifp->if_capenable & IFCAP_TXCSUM)
2598 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2599 			else
2600 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2601 		}
2602 		if (mask & IFCAP_TXCSUM_IPV6) {
2603 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2604 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2605 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2606 			else
2607 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2608 		}
2609 
2610 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2611 		if (mask & IFCAP_RXCSUM)
2612 			ifp->if_capenable ^= IFCAP_RXCSUM;
2613 #ifdef foo
2614 		/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2615 		if (mask & IFCAP_RXCSUM_IPV6)
2616 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2617 #endif
2618 
2619 		if (mask & IFCAP_LRO)
2620 			ifp->if_capenable ^= IFCAP_LRO;
2621 
2622 		if (mask & IFCAP_TSO4) {
2623 			ifp->if_capenable ^= IFCAP_TSO4;
2624 			if (ifp->if_capenable & IFCAP_TSO4)
2625 				ifp->if_hwassist |= CSUM_IP_TSO;
2626 			else
2627 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2628 		}
2629 		if (mask & IFCAP_TSO6) {
2630 			ifp->if_capenable ^= IFCAP_TSO6;
2631 			if (ifp->if_capenable & IFCAP_TSO6)
2632 				ifp->if_hwassist |= CSUM_IP6_TSO;
2633 			else
2634 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2635 		}
2636 
2637 		HN_UNLOCK(sc);
2638 		break;
2639 
2640 	case SIOCADDMULTI:
2641 	case SIOCDELMULTI:
2642 		HN_LOCK(sc);
2643 
2644 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2645 			HN_UNLOCK(sc);
2646 			break;
2647 		}
2648 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2649 			/*
2650 			 * Multicast uses mutex; use busy-wait for
2651 			 * the RNDIS reply.
2652 			 */
2653 			HN_NO_SLEEPING(sc);
2654 			hn_rxfilter_config(sc);
2655 			HN_SLEEPING_OK(sc);
2656 		}
2657 
2658 		HN_UNLOCK(sc);
2659 		break;
2660 
2661 	case SIOCSIFMEDIA:
2662 	case SIOCGIFMEDIA:
2663 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2664 		break;
2665 
2666 	default:
2667 		error = ether_ioctl(ifp, cmd, data);
2668 		break;
2669 	}
2670 	return (error);
2671 }
2672 
2673 static void
2674 hn_stop(struct hn_softc *sc, bool detaching)
2675 {
2676 	struct ifnet *ifp = sc->hn_ifp;
2677 	int i;
2678 
2679 	HN_LOCK_ASSERT(sc);
2680 
2681 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2682 	    ("synthetic parts were not attached"));
2683 
2684 	/* Disable polling. */
2685 	hn_polling(sc, 0);
2686 
2687 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2688 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2689 	hn_suspend_data(sc);
2690 
2691 	/* Clear OACTIVE bit. */
2692 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2693 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2694 		sc->hn_tx_ring[i].hn_oactive = 0;
2695 
2696 	/*
2697 	 * If the VF is active, make sure the filter is not 0, even if
2698 	 * the synthetic NIC is down.
2699 	 */
2700 	if (!detaching && (sc->hn_flags & HN_FLAG_VF))
2701 		hn_rxfilter_config(sc);
2702 }
2703 
2704 static void
2705 hn_init_locked(struct hn_softc *sc)
2706 {
2707 	struct ifnet *ifp = sc->hn_ifp;
2708 	int i;
2709 
2710 	HN_LOCK_ASSERT(sc);
2711 
2712 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2713 		return;
2714 
2715 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2716 		return;
2717 
2718 	/* Configure RX filter */
2719 	hn_rxfilter_config(sc);
2720 
2721 	/* Clear OACTIVE bit. */
2722 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2723 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2724 		sc->hn_tx_ring[i].hn_oactive = 0;
2725 
2726 	/* Clear TX 'suspended' bit. */
2727 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2728 
2729 	/* Everything is ready; unleash! */
2730 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2731 
2732 	/* Re-enable polling if requested. */
2733 	if (sc->hn_pollhz > 0)
2734 		hn_polling(sc, sc->hn_pollhz);
2735 }
2736 
2737 static void
2738 hn_init(void *xsc)
2739 {
2740 	struct hn_softc *sc = xsc;
2741 
2742 	HN_LOCK(sc);
2743 	hn_init_locked(sc);
2744 	HN_UNLOCK(sc);
2745 }
2746 
2747 #if __FreeBSD_version >= 1100099
2748 
2749 static int
2750 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2751 {
2752 	struct hn_softc *sc = arg1;
2753 	unsigned int lenlim;
2754 	int error;
2755 
2756 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2757 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2758 	if (error || req->newptr == NULL)
2759 		return error;
2760 
2761 	HN_LOCK(sc);
2762 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2763 	    lenlim > TCP_LRO_LENGTH_MAX) {
2764 		HN_UNLOCK(sc);
2765 		return EINVAL;
2766 	}
2767 	hn_set_lro_lenlim(sc, lenlim);
2768 	HN_UNLOCK(sc);
2769 
2770 	return 0;
2771 }
2772 
2773 static int
2774 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2775 {
2776 	struct hn_softc *sc = arg1;
2777 	int ackcnt, error, i;
2778 
2779 	/*
2780 	 * lro_ackcnt_lim is the append count limit;
2781 	 * +1 turns it into the aggregation limit.
2782 	 */
2783 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2784 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2785 	if (error || req->newptr == NULL)
2786 		return error;
2787 
2788 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2789 		return EINVAL;
2790 
2791 	/*
2792 	 * Convert aggregation limit back to append
2793 	 * count limit.
2794 	 */
2795 	--ackcnt;
2796 	HN_LOCK(sc);
2797 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2798 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2799 	HN_UNLOCK(sc);
2800 	return 0;
2801 }
2802 
2803 #endif
2804 
2805 static int
2806 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2807 {
2808 	struct hn_softc *sc = arg1;
2809 	int hcsum = arg2;
2810 	int on, error, i;
2811 
2812 	on = 0;
2813 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2814 		on = 1;
2815 
2816 	error = sysctl_handle_int(oidp, &on, 0, req);
2817 	if (error || req->newptr == NULL)
2818 		return error;
2819 
2820 	HN_LOCK(sc);
2821 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2822 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2823 
2824 		if (on)
2825 			rxr->hn_trust_hcsum |= hcsum;
2826 		else
2827 			rxr->hn_trust_hcsum &= ~hcsum;
2828 	}
2829 	HN_UNLOCK(sc);
2830 	return 0;
2831 }
2832 
2833 static int
2834 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2835 {
2836 	struct hn_softc *sc = arg1;
2837 	int chim_size, error;
2838 
2839 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2840 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2841 	if (error || req->newptr == NULL)
2842 		return error;
2843 
2844 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2845 		return EINVAL;
2846 
2847 	HN_LOCK(sc);
2848 	hn_set_chim_size(sc, chim_size);
2849 	HN_UNLOCK(sc);
2850 	return 0;
2851 }
2852 
2853 #if __FreeBSD_version < 1100095
2854 static int
2855 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2856 {
2857 	struct hn_softc *sc = arg1;
2858 	int ofs = arg2, i, error;
2859 	struct hn_rx_ring *rxr;
2860 	uint64_t stat;
2861 
2862 	stat = 0;
2863 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2864 		rxr = &sc->hn_rx_ring[i];
2865 		stat += *((int *)((uint8_t *)rxr + ofs));
2866 	}
2867 
2868 	error = sysctl_handle_64(oidp, &stat, 0, req);
2869 	if (error || req->newptr == NULL)
2870 		return error;
2871 
2872 	/* Zero out this stat. */
2873 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2874 		rxr = &sc->hn_rx_ring[i];
2875 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2876 	}
2877 	return 0;
2878 }
2879 #else
2880 static int
2881 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2882 {
2883 	struct hn_softc *sc = arg1;
2884 	int ofs = arg2, i, error;
2885 	struct hn_rx_ring *rxr;
2886 	uint64_t stat;
2887 
2888 	stat = 0;
2889 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2890 		rxr = &sc->hn_rx_ring[i];
2891 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2892 	}
2893 
2894 	error = sysctl_handle_64(oidp, &stat, 0, req);
2895 	if (error || req->newptr == NULL)
2896 		return error;
2897 
2898 	/* Zero out this stat. */
2899 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2900 		rxr = &sc->hn_rx_ring[i];
2901 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2902 	}
2903 	return 0;
2904 }
2905 
2906 #endif
2907 
2908 static int
2909 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2910 {
2911 	struct hn_softc *sc = arg1;
2912 	int ofs = arg2, i, error;
2913 	struct hn_rx_ring *rxr;
2914 	u_long stat;
2915 
2916 	stat = 0;
2917 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2918 		rxr = &sc->hn_rx_ring[i];
2919 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2920 	}
2921 
2922 	error = sysctl_handle_long(oidp, &stat, 0, req);
2923 	if (error || req->newptr == NULL)
2924 		return error;
2925 
2926 	/* Zero out this stat. */
2927 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2928 		rxr = &sc->hn_rx_ring[i];
2929 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2930 	}
2931 	return 0;
2932 }
2933 
2934 static int
2935 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2936 {
2937 	struct hn_softc *sc = arg1;
2938 	int ofs = arg2, i, error;
2939 	struct hn_tx_ring *txr;
2940 	u_long stat;
2941 
2942 	stat = 0;
2943 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2944 		txr = &sc->hn_tx_ring[i];
2945 		stat += *((u_long *)((uint8_t *)txr + ofs));
2946 	}
2947 
2948 	error = sysctl_handle_long(oidp, &stat, 0, req);
2949 	if (error || req->newptr == NULL)
2950 		return error;
2951 
2952 	/* Zero out this stat. */
2953 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2954 		txr = &sc->hn_tx_ring[i];
2955 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2956 	}
2957 	return 0;
2958 }
2959 
2960 static int
2961 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2962 {
2963 	struct hn_softc *sc = arg1;
2964 	int ofs = arg2, i, error, conf;
2965 	struct hn_tx_ring *txr;
2966 
2967 	txr = &sc->hn_tx_ring[0];
2968 	conf = *((int *)((uint8_t *)txr + ofs));
2969 
2970 	error = sysctl_handle_int(oidp, &conf, 0, req);
2971 	if (error || req->newptr == NULL)
2972 		return error;
2973 
2974 	HN_LOCK(sc);
2975 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2976 		txr = &sc->hn_tx_ring[i];
2977 		*((int *)((uint8_t *)txr + ofs)) = conf;
2978 	}
2979 	HN_UNLOCK(sc);
2980 
2981 	return 0;
2982 }
2983 
2984 static int
2985 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2986 {
2987 	struct hn_softc *sc = arg1;
2988 	int error, size;
2989 
2990 	size = sc->hn_agg_size;
2991 	error = sysctl_handle_int(oidp, &size, 0, req);
2992 	if (error || req->newptr == NULL)
2993 		return (error);
2994 
2995 	HN_LOCK(sc);
2996 	sc->hn_agg_size = size;
2997 	hn_set_txagg(sc);
2998 	HN_UNLOCK(sc);
2999 
3000 	return (0);
3001 }
3002 
3003 static int
3004 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3005 {
3006 	struct hn_softc *sc = arg1;
3007 	int error, pkts;
3008 
3009 	pkts = sc->hn_agg_pkts;
3010 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3011 	if (error || req->newptr == NULL)
3012 		return (error);
3013 
3014 	HN_LOCK(sc);
3015 	sc->hn_agg_pkts = pkts;
3016 	hn_set_txagg(sc);
3017 	HN_UNLOCK(sc);
3018 
3019 	return (0);
3020 }
3021 
3022 static int
3023 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3024 {
3025 	struct hn_softc *sc = arg1;
3026 	int pkts;
3027 
3028 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3029 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3030 }
3031 
3032 static int
3033 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3034 {
3035 	struct hn_softc *sc = arg1;
3036 	int align;
3037 
3038 	align = sc->hn_tx_ring[0].hn_agg_align;
3039 	return (sysctl_handle_int(oidp, &align, 0, req));
3040 }
3041 
3042 static void
3043 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3044 {
3045 	if (pollhz == 0)
3046 		vmbus_chan_poll_disable(chan);
3047 	else
3048 		vmbus_chan_poll_enable(chan, pollhz);
3049 }
3050 
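/*
 * Apply the requested polling frequency to the primary channel and
 * all sub-channels; pollhz == 0 disables polling on them.
 */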
3051 static void
3052 hn_polling(struct hn_softc *sc, u_int pollhz)
3053 {
3054 	int nsubch = sc->hn_rx_ring_inuse - 1;
3055 
3056 	HN_LOCK_ASSERT(sc);
3057 
3058 	if (nsubch > 0) {
3059 		struct vmbus_channel **subch;
3060 		int i;
3061 
3062 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3063 		for (i = 0; i < nsubch; ++i)
3064 			hn_chan_polling(subch[i], pollhz);
3065 		vmbus_subchan_rel(subch, nsubch);
3066 	}
3067 	hn_chan_polling(sc->hn_prichan, pollhz);
3068 }
3069 
3070 static int
3071 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3072 {
3073 	struct hn_softc *sc = arg1;
3074 	int pollhz, error;
3075 
3076 	pollhz = sc->hn_pollhz;
3077 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3078 	if (error || req->newptr == NULL)
3079 		return (error);
3080 
3081 	if (pollhz != 0 &&
3082 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3083 		return (EINVAL);
3084 
3085 	HN_LOCK(sc);
3086 	if (sc->hn_pollhz != pollhz) {
3087 		sc->hn_pollhz = pollhz;
3088 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3089 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3090 			hn_polling(sc, sc->hn_pollhz);
3091 	}
3092 	HN_UNLOCK(sc);
3093 
3094 	return (0);
3095 }
3096 
3097 static int
3098 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3099 {
3100 	struct hn_softc *sc = arg1;
3101 	char verstr[16];
3102 
3103 	snprintf(verstr, sizeof(verstr), "%u.%u",
3104 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3105 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3106 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3107 }
3108 
3109 static int
3110 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3111 {
3112 	struct hn_softc *sc = arg1;
3113 	char caps_str[128];
3114 	uint32_t caps;
3115 
3116 	HN_LOCK(sc);
3117 	caps = sc->hn_caps;
3118 	HN_UNLOCK(sc);
3119 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3120 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3121 }
3122 
3123 static int
3124 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3125 {
3126 	struct hn_softc *sc = arg1;
3127 	char assist_str[128];
3128 	uint32_t hwassist;
3129 
3130 	HN_LOCK(sc);
3131 	hwassist = sc->hn_ifp->if_hwassist;
3132 	HN_UNLOCK(sc);
3133 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3134 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3135 }
3136 
3137 static int
3138 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3139 {
3140 	struct hn_softc *sc = arg1;
3141 	char filter_str[128];
3142 	uint32_t filter;
3143 
3144 	HN_LOCK(sc);
3145 	filter = sc->hn_rx_filter;
3146 	HN_UNLOCK(sc);
3147 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3148 	    NDIS_PACKET_TYPES);
3149 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3150 }
3151 
3152 #ifndef RSS
3153 
3154 static int
3155 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3156 {
3157 	struct hn_softc *sc = arg1;
3158 	int error;
3159 
3160 	HN_LOCK(sc);
3161 
3162 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3163 	if (error || req->newptr == NULL)
3164 		goto back;
3165 
3166 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3167 	if (error)
3168 		goto back;
3169 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3170 
3171 	if (sc->hn_rx_ring_inuse > 1) {
3172 		error = hn_rss_reconfig(sc);
3173 	} else {
3174 		/* Not RSS capable, at least for now; just save the RSS key. */
3175 		error = 0;
3176 	}
3177 back:
3178 	HN_UNLOCK(sc);
3179 	return (error);
3180 }
3181 
3182 static int
3183 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3184 {
3185 	struct hn_softc *sc = arg1;
3186 	int error;
3187 
3188 	HN_LOCK(sc);
3189 
3190 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3191 	if (error || req->newptr == NULL)
3192 		goto back;
3193 
3194 	/*
3195 	 * Don't allow RSS indirect table changes, if this interface is not
3196 	 * currently RSS capable.
3197 	 */
3198 	if (sc->hn_rx_ring_inuse == 1) {
3199 		error = EOPNOTSUPP;
3200 		goto back;
3201 	}
3202 
3203 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3204 	if (error)
3205 		goto back;
3206 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3207 
3208 	hn_rss_ind_fixup(sc);
3209 	error = hn_rss_reconfig(sc);
3210 back:
3211 	HN_UNLOCK(sc);
3212 	return (error);
3213 }
3214 
3215 #endif	/* !RSS */
3216 
3217 static int
3218 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3219 {
3220 	struct hn_softc *sc = arg1;
3221 	char hash_str[128];
3222 	uint32_t hash;
3223 
3224 	HN_LOCK(sc);
3225 	hash = sc->hn_rss_hash;
3226 	HN_UNLOCK(sc);
3227 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3228 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3229 }
3230 
3231 static int
3232 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3233 {
3234 	struct hn_softc *sc = arg1;
3235 	char vf_name[128];
3236 	struct ifnet *vf;
3237 
3238 	HN_LOCK(sc);
3239 	vf_name[0] = '\0';
3240 	vf = sc->hn_rx_ring[0].hn_vf;
3241 	if (vf != NULL)
3242 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3243 	HN_UNLOCK(sc);
3244 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3245 }
3246 
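/*
 * Sanity check an IPv4 packet before its host-verified checksum is
 * trusted or it is considered for LRO: the IP header, and the
 * TCP/UDP header if any, must be complete and reside in the first
 * mbuf.  Return the IP protocol on success, or IPPROTO_DONE if the
 * packet should not be trusted.
 */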
3247 static int
3248 hn_check_iplen(const struct mbuf *m, int hoff)
3249 {
3250 	const struct ip *ip;
3251 	int len, iphlen, iplen;
3252 	const struct tcphdr *th;
3253 	int thoff;				/* TCP data offset */
3254 
3255 	len = hoff + sizeof(struct ip);
3256 
3257 	/* The packet must be at least the size of an IP header. */
3258 	if (m->m_pkthdr.len < len)
3259 		return IPPROTO_DONE;
3260 
3261 	/* The fixed IP header must reside completely in the first mbuf. */
3262 	if (m->m_len < len)
3263 		return IPPROTO_DONE;
3264 
3265 	ip = mtodo(m, hoff);
3266 
3267 	/* Bound check the packet's stated IP header length. */
3268 	iphlen = ip->ip_hl << 2;
3269 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3270 		return IPPROTO_DONE;
3271 
3272 	/* The full IP header must reside completely in the one mbuf. */
3273 	if (m->m_len < hoff + iphlen)
3274 		return IPPROTO_DONE;
3275 
3276 	iplen = ntohs(ip->ip_len);
3277 
3278 	/*
3279 	 * Check that the amount of data in the buffers is at
3280 	 * least as much as the IP header would have us expect.
3281 	 */
3282 	if (m->m_pkthdr.len < hoff + iplen)
3283 		return IPPROTO_DONE;
3284 
3285 	/*
3286 	 * Ignore IP fragments.
3287 	 */
3288 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3289 		return IPPROTO_DONE;
3290 
3291 	/*
3292 	 * The TCP/IP or UDP/IP header must be entirely contained within
3293 	 * the first fragment of a packet.
3294 	 */
3295 	switch (ip->ip_p) {
3296 	case IPPROTO_TCP:
3297 		if (iplen < iphlen + sizeof(struct tcphdr))
3298 			return IPPROTO_DONE;
3299 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3300 			return IPPROTO_DONE;
3301 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3302 		thoff = th->th_off << 2;
3303 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3304 			return IPPROTO_DONE;
3305 		if (m->m_len < hoff + iphlen + thoff)
3306 			return IPPROTO_DONE;
3307 		break;
3308 	case IPPROTO_UDP:
3309 		if (iplen < iphlen + sizeof(struct udphdr))
3310 			return IPPROTO_DONE;
3311 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3312 			return IPPROTO_DONE;
3313 		break;
3314 	default:
3315 		if (iplen < iphlen)
3316 			return IPPROTO_DONE;
3317 		break;
3318 	}
3319 	return ip->ip_p;
3320 }
3321 
3322 static int
3323 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3324 {
3325 	struct sysctl_oid_list *child;
3326 	struct sysctl_ctx_list *ctx;
3327 	device_t dev = sc->hn_dev;
3328 #if defined(INET) || defined(INET6)
3329 #if __FreeBSD_version >= 1100095
3330 	int lroent_cnt;
3331 #endif
3332 #endif
3333 	int i;
3334 
3335 	/*
3336 	 * Create RXBUF for reception.
3337 	 *
3338 	 * NOTE:
3339 	 * - It is shared by all channels.
3340 	 * - A large enough buffer is allocated; certain versions of the NVS
3341 	 *   may further limit the usable space.
3342 	 */
3343 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3344 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3345 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3346 	if (sc->hn_rxbuf == NULL) {
3347 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3348 		return (ENOMEM);
3349 	}
3350 
3351 	sc->hn_rx_ring_cnt = ring_cnt;
3352 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3353 
3354 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3355 	    M_DEVBUF, M_WAITOK | M_ZERO);
3356 
3357 #if defined(INET) || defined(INET6)
3358 #if __FreeBSD_version >= 1100095
3359 	lroent_cnt = hn_lro_entry_count;
3360 	if (lroent_cnt < TCP_LRO_ENTRIES)
3361 		lroent_cnt = TCP_LRO_ENTRIES;
3362 	if (bootverbose)
3363 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3364 #endif
3365 #endif	/* INET || INET6 */
3366 
3367 	ctx = device_get_sysctl_ctx(dev);
3368 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3369 
3370 	/* Create dev.hn.UNIT.rx sysctl tree */
3371 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3372 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3373 
3374 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3375 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3376 
3377 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3378 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3379 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3380 		if (rxr->hn_br == NULL) {
3381 			device_printf(dev, "allocate bufring failed\n");
3382 			return (ENOMEM);
3383 		}
3384 
3385 		if (hn_trust_hosttcp)
3386 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3387 		if (hn_trust_hostudp)
3388 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3389 		if (hn_trust_hostip)
3390 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3391 		rxr->hn_ifp = sc->hn_ifp;
3392 		if (i < sc->hn_tx_ring_cnt)
3393 			rxr->hn_txr = &sc->hn_tx_ring[i];
3394 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3395 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3396 		rxr->hn_rx_idx = i;
3397 		rxr->hn_rxbuf = sc->hn_rxbuf;
3398 
3399 		/*
3400 		 * Initialize LRO.
3401 		 */
3402 #if defined(INET) || defined(INET6)
3403 #if __FreeBSD_version >= 1100095
3404 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3405 		    hn_lro_mbufq_depth);
3406 #else
3407 		tcp_lro_init(&rxr->hn_lro);
3408 		rxr->hn_lro.ifp = sc->hn_ifp;
3409 #endif
3410 #if __FreeBSD_version >= 1100099
3411 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3412 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3413 #endif
3414 #endif	/* INET || INET6 */
3415 
3416 		if (sc->hn_rx_sysctl_tree != NULL) {
3417 			char name[16];
3418 
3419 			/*
3420 			 * Create per RX ring sysctl tree:
3421 			 * dev.hn.UNIT.rx.RINGID
3422 			 */
3423 			snprintf(name, sizeof(name), "%d", i);
3424 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3425 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3426 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3427 
3428 			if (rxr->hn_rx_sysctl_tree != NULL) {
3429 				SYSCTL_ADD_ULONG(ctx,
3430 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3431 				    OID_AUTO, "packets", CTLFLAG_RW,
3432 				    &rxr->hn_pkts, "# of packets received");
3433 				SYSCTL_ADD_ULONG(ctx,
3434 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3435 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3436 				    &rxr->hn_rss_pkts,
3437 				    "# of packets w/ RSS info received");
3438 				SYSCTL_ADD_INT(ctx,
3439 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3440 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3441 				    &rxr->hn_pktbuf_len, 0,
3442 				    "Temporary channel packet buffer length");
3443 			}
3444 		}
3445 	}
3446 
3447 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3448 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3449 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3450 #if __FreeBSD_version < 1100095
3451 	    hn_rx_stat_int_sysctl,
3452 #else
3453 	    hn_rx_stat_u64_sysctl,
3454 #endif
3455 	    "LU", "LRO queued");
3456 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3457 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3458 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3459 #if __FreeBSD_version < 1100095
3460 	    hn_rx_stat_int_sysctl,
3461 #else
3462 	    hn_rx_stat_u64_sysctl,
3463 #endif
3464 	    "LU", "LRO flushed");
3465 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3466 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3467 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3468 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3469 #if __FreeBSD_version >= 1100099
3470 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3471 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3472 	    hn_lro_lenlim_sysctl, "IU",
3473 	    "Max # of data bytes to be aggregated by LRO");
3474 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3475 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3476 	    hn_lro_ackcnt_sysctl, "I",
3477 	    "Max # of ACKs to be aggregated by LRO");
3478 #endif
3479 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3480 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3481 	    hn_trust_hcsum_sysctl, "I",
3482 	    "Trust tcp segment verification on host side, "
3483 	    "when csum info is missing");
3484 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3485 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3486 	    hn_trust_hcsum_sysctl, "I",
3487 	    "Trust udp datagram verification on host side, "
3488 	    "when csum info is missing");
3489 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3490 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3491 	    hn_trust_hcsum_sysctl, "I",
3492 	    "Trust ip packet verification on host side, "
3493 	    "when csum info is missing");
3494 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3495 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3496 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3497 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3498 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3499 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3500 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3501 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3502 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3503 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3504 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3505 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3506 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3507 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3508 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3509 	    hn_rx_stat_ulong_sysctl, "LU",
3510 	    "# of packets for which we trusted the host's csum verification");
3511 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3512 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3513 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3514 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3515 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3516 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3517 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3518 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3519 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3520 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3521 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3522 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3523 
3524 	return (0);
3525 }
3526 
3527 static void
3528 hn_destroy_rx_data(struct hn_softc *sc)
3529 {
3530 	int i;
3531 
3532 	if (sc->hn_rxbuf != NULL) {
3533 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3534 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3535 		else
3536 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3537 		sc->hn_rxbuf = NULL;
3538 	}
3539 
3540 	if (sc->hn_rx_ring_cnt == 0)
3541 		return;
3542 
3543 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3544 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3545 
3546 		if (rxr->hn_br == NULL)
3547 			continue;
3548 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3549 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3550 		} else {
3551 			device_printf(sc->hn_dev,
3552 			    "%dth channel bufring is referenced\n", i);
3553 		}
3554 		rxr->hn_br = NULL;
3555 
3556 #if defined(INET) || defined(INET6)
3557 		tcp_lro_free(&rxr->hn_lro);
3558 #endif
3559 		free(rxr->hn_pktbuf, M_DEVBUF);
3560 	}
3561 	free(sc->hn_rx_ring, M_DEVBUF);
3562 	sc->hn_rx_ring = NULL;
3563 
3564 	sc->hn_rx_ring_cnt = 0;
3565 	sc->hn_rx_ring_inuse = 0;
3566 }
3567 
3568 static int
3569 hn_tx_ring_create(struct hn_softc *sc, int id)
3570 {
3571 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3572 	device_t dev = sc->hn_dev;
3573 	bus_dma_tag_t parent_dtag;
3574 	int error, i;
3575 
3576 	txr->hn_sc = sc;
3577 	txr->hn_tx_idx = id;
3578 
3579 #ifndef HN_USE_TXDESC_BUFRING
3580 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3581 #endif
3582 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3583 
3584 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3585 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3586 	    M_DEVBUF, M_WAITOK | M_ZERO);
3587 #ifndef HN_USE_TXDESC_BUFRING
3588 	SLIST_INIT(&txr->hn_txlist);
3589 #else
3590 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3591 	    M_WAITOK, &txr->hn_tx_lock);
3592 #endif
3593 
3594 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3595 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3596 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3597 	} else {
3598 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3599 	}
3600 
3601 #ifdef HN_IFSTART_SUPPORT
3602 	if (hn_use_if_start) {
3603 		txr->hn_txeof = hn_start_txeof;
3604 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3605 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3606 	} else
3607 #endif
3608 	{
3609 		int br_depth;
3610 
3611 		txr->hn_txeof = hn_xmit_txeof;
3612 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3613 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3614 
3615 		br_depth = hn_get_txswq_depth(txr);
3616 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3617 		    M_WAITOK, &txr->hn_tx_lock);
3618 	}
3619 
3620 	txr->hn_direct_tx_size = hn_direct_tx_size;
3621 
3622 	/*
3623 	 * Always schedule transmission instead of trying to do direct
3624 	 * transmission.  This one gives the best performance so far.
3625 	 */
3626 	txr->hn_sched_tx = 1;
3627 
3628 	parent_dtag = bus_get_dma_tag(dev);
3629 
3630 	/* DMA tag for RNDIS packet messages. */
3631 	error = bus_dma_tag_create(parent_dtag, /* parent */
3632 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3633 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3634 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3635 	    BUS_SPACE_MAXADDR,		/* highaddr */
3636 	    NULL, NULL,			/* filter, filterarg */
3637 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3638 	    1,				/* nsegments */
3639 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3640 	    0,				/* flags */
3641 	    NULL,			/* lockfunc */
3642 	    NULL,			/* lockfuncarg */
3643 	    &txr->hn_tx_rndis_dtag);
3644 	if (error) {
3645 		device_printf(dev, "failed to create rndis dmatag\n");
3646 		return error;
3647 	}
3648 
3649 	/* DMA tag for data. */
3650 	error = bus_dma_tag_create(parent_dtag, /* parent */
3651 	    1,				/* alignment */
3652 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3653 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3654 	    BUS_SPACE_MAXADDR,		/* highaddr */
3655 	    NULL, NULL,			/* filter, filterarg */
3656 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3657 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3658 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3659 	    0,				/* flags */
3660 	    NULL,			/* lockfunc */
3661 	    NULL,			/* lockfuncarg */
3662 	    &txr->hn_tx_data_dtag);
3663 	if (error) {
3664 		device_printf(dev, "failed to create data dmatag\n");
3665 		return error;
3666 	}
3667 
3668 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3669 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3670 
3671 		txd->txr = txr;
3672 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3673 		STAILQ_INIT(&txd->agg_list);
3674 
3675 		/*
3676 		 * Allocate and load RNDIS packet message.
3677 		 */
3678 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3679 		    (void **)&txd->rndis_pkt,
3680 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3681 		    &txd->rndis_pkt_dmap);
3682 		if (error) {
3683 			device_printf(dev,
3684 			    "failed to allocate rndis_packet_msg, %d\n", i);
3685 			return error;
3686 		}
3687 
3688 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3689 		    txd->rndis_pkt_dmap,
3690 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3691 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3692 		    BUS_DMA_NOWAIT);
3693 		if (error) {
3694 			device_printf(dev,
3695 			    "failed to load rndis_packet_msg, %d\n", i);
3696 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3697 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3698 			return error;
3699 		}
3700 
3701 		/* DMA map for TX data. */
3702 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3703 		    &txd->data_dmap);
3704 		if (error) {
3705 			device_printf(dev,
3706 			    "failed to allocate tx data dmamap\n");
3707 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3708 			    txd->rndis_pkt_dmap);
3709 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3710 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3711 			return error;
3712 		}
3713 
3714 		/* All set, put it on the list. */
3715 		txd->flags |= HN_TXD_FLAG_ONLIST;
3716 #ifndef HN_USE_TXDESC_BUFRING
3717 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3718 #else
3719 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3720 #endif
3721 	}
3722 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3723 
3724 	if (sc->hn_tx_sysctl_tree != NULL) {
3725 		struct sysctl_oid_list *child;
3726 		struct sysctl_ctx_list *ctx;
3727 		char name[16];
3728 
3729 		/*
3730 		 * Create per TX ring sysctl tree:
3731 		 * dev.hn.UNIT.tx.RINGID
3732 		 */
3733 		ctx = device_get_sysctl_ctx(dev);
3734 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3735 
3736 		snprintf(name, sizeof(name), "%d", id);
3737 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3738 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3739 
3740 		if (txr->hn_tx_sysctl_tree != NULL) {
3741 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3742 
3743 #ifdef HN_DEBUG
3744 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3745 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3746 			    "# of available TX descs");
3747 #endif
3748 #ifdef HN_IFSTART_SUPPORT
3749 			if (!hn_use_if_start)
3750 #endif
3751 			{
3752 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3753 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3754 				    "over active");
3755 			}
3756 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3757 			    CTLFLAG_RW, &txr->hn_pkts,
3758 			    "# of packets transmitted");
3759 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3760 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3761 		}
3762 	}
3763 
3764 	return 0;
3765 }
3766 
3767 static void
3768 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3769 {
3770 	struct hn_tx_ring *txr = txd->txr;
3771 
3772 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3773 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3774 
3775 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3776 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3777 	    txd->rndis_pkt_dmap);
3778 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3779 }
3780 
3781 static void
3782 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3783 {
3784 
3785 	KASSERT(txd->refs == 0 || txd->refs == 1,
3786 	    ("invalid txd refs %d", txd->refs));
3787 
3788 	/* Aggregated txds will be freed by their aggregating txd. */
3789 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3790 		int freed;
3791 
3792 		freed = hn_txdesc_put(txr, txd);
3793 		KASSERT(freed, ("can't free txdesc"));
3794 	}
3795 }
3796 
3797 static void
3798 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3799 {
3800 	int i;
3801 
3802 	if (txr->hn_txdesc == NULL)
3803 		return;
3804 
3805 	/*
3806 	 * NOTE:
3807 	 * Because the freeing of aggregated txds will be deferred
3808 	 * to the aggregating txd, two passes are used here:
3809 	 * - The first pass GCs any pending txds.  This GC is necessary,
3810 	 *   since if the channels are revoked, the hypervisor will not
3811 	 *   deliver send-done for all pending txds.
3812 	 * - The second pass frees the busdma resources, i.e. after all
3813 	 *   txds have been freed.
3814 	 */
3815 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3816 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3817 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3818 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3819 
3820 	if (txr->hn_tx_data_dtag != NULL)
3821 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3822 	if (txr->hn_tx_rndis_dtag != NULL)
3823 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3824 
3825 #ifdef HN_USE_TXDESC_BUFRING
3826 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3827 #endif
3828 
3829 	free(txr->hn_txdesc, M_DEVBUF);
3830 	txr->hn_txdesc = NULL;
3831 
3832 	if (txr->hn_mbuf_br != NULL)
3833 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3834 
3835 #ifndef HN_USE_TXDESC_BUFRING
3836 	mtx_destroy(&txr->hn_txlist_spin);
3837 #endif
3838 	mtx_destroy(&txr->hn_tx_lock);
3839 }
3840 
3841 static int
3842 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3843 {
3844 	struct sysctl_oid_list *child;
3845 	struct sysctl_ctx_list *ctx;
3846 	int i;
3847 
3848 	/*
3849 	 * Create TXBUF for chimney sending.
3850 	 *
3851 	 * NOTE: It is shared by all channels.
3852 	 */
3853 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3854 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3855 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3856 	if (sc->hn_chim == NULL) {
3857 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3858 		return (ENOMEM);
3859 	}
3860 
3861 	sc->hn_tx_ring_cnt = ring_cnt;
3862 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3863 
3864 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3865 	    M_DEVBUF, M_WAITOK | M_ZERO);
3866 
3867 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3868 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3869 
3870 	/* Create dev.hn.UNIT.tx sysctl tree */
3871 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3872 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3873 
3874 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3875 		int error;
3876 
3877 		error = hn_tx_ring_create(sc, i);
3878 		if (error)
3879 			return error;
3880 	}
3881 
3882 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3883 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3884 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3885 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3886 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3887 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3888 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3889 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3890 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3891 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3892 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3893 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3894 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3895 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3896 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3897 	    hn_tx_stat_ulong_sysctl, "LU",
3898 	    "# of packet transmission aggregation flush failures");
3899 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3900 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3901 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3902 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
3903 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3904 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3905 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3906 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
3907 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3908 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3909 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3910 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3911 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3912 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3913 	    "# of total TX descs");
3914 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3915 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3916 	    "Chimney send packet size upper boundary");
3917 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3918 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3919 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3920 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3921 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3922 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3923 	    hn_tx_conf_int_sysctl, "I",
3924 	    "Size of the packet for direct transmission");
3925 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3926 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3927 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3928 	    hn_tx_conf_int_sysctl, "I",
3929 	    "Always schedule transmission "
3930 	    "instead of doing direct transmission");
3931 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3932 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3933 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3934 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3935 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3936 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3937 	    "Applied packet transmission aggregation size");
3938 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3939 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3940 	    hn_txagg_pktmax_sysctl, "I",
3941 	    "Applied packet transmission aggregation packets");
3942 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3943 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3944 	    hn_txagg_align_sysctl, "I",
3945 	    "Applied packet transmission aggregation alignment");
3946 
3947 	return 0;
3948 }
3949 
3950 static void
3951 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3952 {
3953 	int i;
3954 
3955 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3956 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3957 }
3958 
3959 static void
3960 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3961 {
3962 	struct ifnet *ifp = sc->hn_ifp;
3963 	int tso_minlen;
3964 
3965 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3966 		return;
3967 
3968 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3969 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3970 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3971 
3972 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3973 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3974 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3975 
3976 	if (tso_maxlen < tso_minlen)
3977 		tso_maxlen = tso_minlen;
3978 	else if (tso_maxlen > IP_MAXPACKET)
3979 		tso_maxlen = IP_MAXPACKET;
3980 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3981 		tso_maxlen = sc->hn_ndis_tso_szmax;
3982 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3983 	if (bootverbose)
3984 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3985 }
3986 
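/*
 * Propagate the negotiated TX parameters to every TX ring: the chimney
 * send size limit, the checksum offload (csum_assist) flags, and the
 * HASHVAL pktinfo flag when that capability is present.
 */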
3987 static void
3988 hn_fixup_tx_data(struct hn_softc *sc)
3989 {
3990 	uint64_t csum_assist;
3991 	int i;
3992 
3993 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3994 	if (hn_tx_chimney_size > 0 &&
3995 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3996 		hn_set_chim_size(sc, hn_tx_chimney_size);
3997 
3998 	csum_assist = 0;
3999 	if (sc->hn_caps & HN_CAP_IPCS)
4000 		csum_assist |= CSUM_IP;
4001 	if (sc->hn_caps & HN_CAP_TCP4CS)
4002 		csum_assist |= CSUM_IP_TCP;
4003 	if (sc->hn_caps & HN_CAP_UDP4CS)
4004 		csum_assist |= CSUM_IP_UDP;
4005 	if (sc->hn_caps & HN_CAP_TCP6CS)
4006 		csum_assist |= CSUM_IP6_TCP;
4007 	if (sc->hn_caps & HN_CAP_UDP6CS)
4008 		csum_assist |= CSUM_IP6_UDP;
4009 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4010 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4011 
4012 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4013 		/*
4014 		 * Support HASHVAL pktinfo on TX path.
4015 		 */
4016 		if (bootverbose)
4017 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4018 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4019 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4020 	}
4021 }
4022 
4023 static void
4024 hn_destroy_tx_data(struct hn_softc *sc)
4025 {
4026 	int i;
4027 
4028 	if (sc->hn_chim != NULL) {
4029 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4030 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4031 		} else {
4032 			device_printf(sc->hn_dev,
4033 			    "chimney sending buffer is referenced\n");
4034 		}
4035 		sc->hn_chim = NULL;
4036 	}
4037 
4038 	if (sc->hn_tx_ring_cnt == 0)
4039 		return;
4040 
4041 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4042 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4043 
4044 	free(sc->hn_tx_ring, M_DEVBUF);
4045 	sc->hn_tx_ring = NULL;
4046 
4047 	sc->hn_tx_ring_cnt = 0;
4048 	sc->hn_tx_ring_inuse = 0;
4049 }
4050 
4051 #ifdef HN_IFSTART_SUPPORT
4052 
4053 static void
4054 hn_start_taskfunc(void *xtxr, int pending __unused)
4055 {
4056 	struct hn_tx_ring *txr = xtxr;
4057 
4058 	mtx_lock(&txr->hn_tx_lock);
4059 	hn_start_locked(txr, 0);
4060 	mtx_unlock(&txr->hn_tx_lock);
4061 }
4062 
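/*
 * Dequeue packets from if_snd and transmit them.  If `len' is non-zero,
 * a packet longer than `len' is put back on the queue and 1 is returned,
 * so that the caller can dispatch the remaining work to the TX taskqueue;
 * otherwise 0 is returned.  Called with hn_tx_lock held.
 */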
4063 static int
4064 hn_start_locked(struct hn_tx_ring *txr, int len)
4065 {
4066 	struct hn_softc *sc = txr->hn_sc;
4067 	struct ifnet *ifp = sc->hn_ifp;
4068 	int sched = 0;
4069 
4070 	KASSERT(hn_use_if_start,
4071 	    ("hn_start_locked is called when if_start is disabled"));
4072 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4073 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4074 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4075 
4076 	if (__predict_false(txr->hn_suspended))
4077 		return (0);
4078 
4079 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4080 	    IFF_DRV_RUNNING)
4081 		return (0);
4082 
4083 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4084 		struct hn_txdesc *txd;
4085 		struct mbuf *m_head;
4086 		int error;
4087 
4088 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4089 		if (m_head == NULL)
4090 			break;
4091 
4092 		if (len > 0 && m_head->m_pkthdr.len > len) {
4093 			/*
4094 			 * This send could be time consuming; let the caller
4095 			 * dispatch this packet (and any follow-up packets)
4096 			 * to the TX taskqueue.
4097 			 */
4098 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4099 			sched = 1;
4100 			break;
4101 		}
4102 
4103 #if defined(INET6) || defined(INET)
4104 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4105 			m_head = hn_tso_fixup(m_head);
4106 			if (__predict_false(m_head == NULL)) {
4107 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4108 				continue;
4109 			}
4110 		}
4111 #endif
4112 
4113 		txd = hn_txdesc_get(txr);
4114 		if (txd == NULL) {
4115 			txr->hn_no_txdescs++;
4116 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4117 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4118 			break;
4119 		}
4120 
4121 		error = hn_encap(ifp, txr, txd, &m_head);
4122 		if (error) {
4123 			/* Both txd and m_head are freed */
4124 			KASSERT(txr->hn_agg_txd == NULL,
4125 			    ("encap failed w/ pending aggregating txdesc"));
4126 			continue;
4127 		}
4128 
4129 		if (txr->hn_agg_pktleft == 0) {
4130 			if (txr->hn_agg_txd != NULL) {
4131 				KASSERT(m_head == NULL,
4132 				    ("pending mbuf for aggregating txdesc"));
4133 				error = hn_flush_txagg(ifp, txr);
4134 				if (__predict_false(error)) {
4135 					atomic_set_int(&ifp->if_drv_flags,
4136 					    IFF_DRV_OACTIVE);
4137 					break;
4138 				}
4139 			} else {
4140 				KASSERT(m_head != NULL, ("mbuf was freed"));
4141 				error = hn_txpkt(ifp, txr, txd);
4142 				if (__predict_false(error)) {
4143 					/* txd is freed, but m_head is not */
4144 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4145 					atomic_set_int(&ifp->if_drv_flags,
4146 					    IFF_DRV_OACTIVE);
4147 					break;
4148 				}
4149 			}
4150 		}
4151 #ifdef INVARIANTS
4152 		else {
4153 			KASSERT(txr->hn_agg_txd != NULL,
4154 			    ("no aggregating txdesc"));
4155 			KASSERT(m_head == NULL,
4156 			    ("pending mbuf for aggregating txdesc"));
4157 		}
4158 #endif
4159 	}
4160 
4161 	/* Flush pending aggregated transmission. */
4162 	if (txr->hn_agg_txd != NULL)
4163 		hn_flush_txagg(ifp, txr);
4164 	return (sched);
4165 }
4166 
4167 static void
4168 hn_start(struct ifnet *ifp)
4169 {
4170 	struct hn_softc *sc = ifp->if_softc;
4171 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4172 
4173 	if (txr->hn_sched_tx)
4174 		goto do_sched;
4175 
4176 	if (mtx_trylock(&txr->hn_tx_lock)) {
4177 		int sched;
4178 
4179 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4180 		mtx_unlock(&txr->hn_tx_lock);
4181 		if (!sched)
4182 			return;
4183 	}
4184 do_sched:
4185 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4186 }
4187 
4188 static void
4189 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4190 {
4191 	struct hn_tx_ring *txr = xtxr;
4192 
4193 	mtx_lock(&txr->hn_tx_lock);
4194 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4195 	hn_start_locked(txr, 0);
4196 	mtx_unlock(&txr->hn_tx_lock);
4197 }
4198 
4199 static void
4200 hn_start_txeof(struct hn_tx_ring *txr)
4201 {
4202 	struct hn_softc *sc = txr->hn_sc;
4203 	struct ifnet *ifp = sc->hn_ifp;
4204 
4205 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4206 
4207 	if (txr->hn_sched_tx)
4208 		goto do_sched;
4209 
4210 	if (mtx_trylock(&txr->hn_tx_lock)) {
4211 		int sched;
4212 
4213 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4214 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4215 		mtx_unlock(&txr->hn_tx_lock);
4216 		if (sched) {
4217 			taskqueue_enqueue(txr->hn_tx_taskq,
4218 			    &txr->hn_tx_task);
4219 		}
4220 	} else {
4221 do_sched:
4222 		/*
4223 		 * Release OACTIVE earlier, in the hope that
4224 		 * others can catch up.  The task will clear the
4225 		 * flag again with the hn_tx_lock to avoid possible
4226 		 * races.
4227 		 */
4228 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4229 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4230 	}
4231 }
4232 
4233 #endif	/* HN_IFSTART_SUPPORT */
4234 
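/*
 * Drain the per-ring mbuf buf_ring (drbr) and transmit the packets.
 * Like hn_start_locked(), a non-zero `len' makes packets longer than
 * `len' stay on the ring and 1 be returned, so that the caller can
 * defer the rest to the TX taskqueue.  Called with hn_tx_lock held.
 */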
4235 static int
4236 hn_xmit(struct hn_tx_ring *txr, int len)
4237 {
4238 	struct hn_softc *sc = txr->hn_sc;
4239 	struct ifnet *ifp = sc->hn_ifp;
4240 	struct mbuf *m_head;
4241 	int sched = 0;
4242 
4243 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4244 #ifdef HN_IFSTART_SUPPORT
4245 	KASSERT(hn_use_if_start == 0,
4246 	    ("hn_xmit is called when if_start is enabled"));
4247 #endif
4248 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4249 
4250 	if (__predict_false(txr->hn_suspended))
4251 		return (0);
4252 
4253 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4254 		return (0);
4255 
4256 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4257 		struct hn_txdesc *txd;
4258 		int error;
4259 
4260 		if (len > 0 && m_head->m_pkthdr.len > len) {
4261 			/*
4262 			 * This send could be time consuming; let the caller
4263 			 * dispatch this packet (and any follow-up packets)
4264 			 * to the TX taskqueue.
4265 			 */
4266 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4267 			sched = 1;
4268 			break;
4269 		}
4270 
4271 		txd = hn_txdesc_get(txr);
4272 		if (txd == NULL) {
4273 			txr->hn_no_txdescs++;
4274 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4275 			txr->hn_oactive = 1;
4276 			break;
4277 		}
4278 
4279 		error = hn_encap(ifp, txr, txd, &m_head);
4280 		if (error) {
4281 			/* Both txd and m_head are freed; discard */
4282 			KASSERT(txr->hn_agg_txd == NULL,
4283 			    ("encap failed w/ pending aggregating txdesc"));
4284 			drbr_advance(ifp, txr->hn_mbuf_br);
4285 			continue;
4286 		}
4287 
4288 		if (txr->hn_agg_pktleft == 0) {
4289 			if (txr->hn_agg_txd != NULL) {
4290 				KASSERT(m_head == NULL,
4291 				    ("pending mbuf for aggregating txdesc"));
4292 				error = hn_flush_txagg(ifp, txr);
4293 				if (__predict_false(error)) {
4294 					txr->hn_oactive = 1;
4295 					break;
4296 				}
4297 			} else {
4298 				KASSERT(m_head != NULL, ("mbuf was freed"));
4299 				error = hn_txpkt(ifp, txr, txd);
4300 				if (__predict_false(error)) {
4301 					/* txd is freed, but m_head is not */
4302 					drbr_putback(ifp, txr->hn_mbuf_br,
4303 					    m_head);
4304 					txr->hn_oactive = 1;
4305 					break;
4306 				}
4307 			}
4308 		}
4309 #ifdef INVARIANTS
4310 		else {
4311 			KASSERT(txr->hn_agg_txd != NULL,
4312 			    ("no aggregating txdesc"));
4313 			KASSERT(m_head == NULL,
4314 			    ("pending mbuf for aggregating txdesc"));
4315 		}
4316 #endif
4317 
4318 		/* Sent */
4319 		drbr_advance(ifp, txr->hn_mbuf_br);
4320 	}
4321 
4322 	/* Flush pending aggregated transmission. */
4323 	if (txr->hn_agg_txd != NULL)
4324 		hn_flush_txagg(ifp, txr);
4325 	return (sched);
4326 }
4327 
4328 static int
4329 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4330 {
4331 	struct hn_softc *sc = ifp->if_softc;
4332 	struct hn_tx_ring *txr;
4333 	int error, idx = 0;
4334 
4335 #if defined(INET6) || defined(INET)
4336 	/*
4337 	 * Perform TSO packet header fixup now, since the TSO
4338 	 * packet header should be cache-hot.
4339 	 */
4340 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4341 		m = hn_tso_fixup(m);
4342 		if (__predict_false(m == NULL)) {
4343 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4344 			return EIO;
4345 		}
4346 	}
4347 #endif
4348 
4349 	/*
4350 	 * Select the TX ring based on flowid
4351 	 */
4352 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4353 #ifdef RSS
4354 		uint32_t bid;
4355 
4356 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4357 		    &bid) == 0)
4358 			idx = bid % sc->hn_tx_ring_inuse;
4359 		else
4360 #endif
4361 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4362 	}
4363 	txr = &sc->hn_tx_ring[idx];
4364 
4365 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4366 	if (error) {
4367 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4368 		return error;
4369 	}
4370 
4371 	if (txr->hn_oactive)
4372 		return 0;
4373 
4374 	if (txr->hn_sched_tx)
4375 		goto do_sched;
4376 
4377 	if (mtx_trylock(&txr->hn_tx_lock)) {
4378 		int sched;
4379 
4380 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4381 		mtx_unlock(&txr->hn_tx_lock);
4382 		if (!sched)
4383 			return 0;
4384 	}
4385 do_sched:
4386 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4387 	return 0;
4388 }
4389 
4390 static void
4391 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4392 {
4393 	struct mbuf *m;
4394 
4395 	mtx_lock(&txr->hn_tx_lock);
4396 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4397 		m_freem(m);
4398 	mtx_unlock(&txr->hn_tx_lock);
4399 }
4400 
4401 static void
4402 hn_xmit_qflush(struct ifnet *ifp)
4403 {
4404 	struct hn_softc *sc = ifp->if_softc;
4405 	int i;
4406 
4407 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4408 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4409 	if_qflush(ifp);
4410 }
4411 
4412 static void
4413 hn_xmit_txeof(struct hn_tx_ring *txr)
4414 {
4415 
4416 	if (txr->hn_sched_tx)
4417 		goto do_sched;
4418 
4419 	if (mtx_trylock(&txr->hn_tx_lock)) {
4420 		int sched;
4421 
4422 		txr->hn_oactive = 0;
4423 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4424 		mtx_unlock(&txr->hn_tx_lock);
4425 		if (sched) {
4426 			taskqueue_enqueue(txr->hn_tx_taskq,
4427 			    &txr->hn_tx_task);
4428 		}
4429 	} else {
4430 do_sched:
4431 		/*
4432 		 * Release oactive earlier, in the hope that
4433 		 * others can catch up.  The task will clear the
4434 		 * oactive again with the hn_tx_lock to avoid possible
4435 		 * races.
4436 		 */
4437 		txr->hn_oactive = 0;
4438 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4439 	}
4440 }
4441 
4442 static void
4443 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4444 {
4445 	struct hn_tx_ring *txr = xtxr;
4446 
4447 	mtx_lock(&txr->hn_tx_lock);
4448 	hn_xmit(txr, 0);
4449 	mtx_unlock(&txr->hn_tx_lock);
4450 }
4451 
4452 static void
4453 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4454 {
4455 	struct hn_tx_ring *txr = xtxr;
4456 
4457 	mtx_lock(&txr->hn_tx_lock);
4458 	txr->hn_oactive = 0;
4459 	hn_xmit(txr, 0);
4460 	mtx_unlock(&txr->hn_tx_lock);
4461 }
4462 
4463 static int
4464 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4465 {
4466 	struct vmbus_chan_br cbr;
4467 	struct hn_rx_ring *rxr;
4468 	struct hn_tx_ring *txr = NULL;
4469 	int idx, error;
4470 
4471 	idx = vmbus_chan_subidx(chan);
4472 
4473 	/*
4474 	 * Link this channel to RX/TX ring.
4475 	 */
4476 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4477 	    ("invalid channel index %d, should be >= 0 && < %d",
4478 	     idx, sc->hn_rx_ring_inuse));
4479 	rxr = &sc->hn_rx_ring[idx];
4480 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4481 	    ("RX ring %d already attached", idx));
4482 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4483 	rxr->hn_chan = chan;
4484 
4485 	if (bootverbose) {
4486 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4487 		    idx, vmbus_chan_id(chan));
4488 	}
4489 
4490 	if (idx < sc->hn_tx_ring_inuse) {
4491 		txr = &sc->hn_tx_ring[idx];
4492 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4493 		    ("TX ring %d already attached", idx));
4494 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4495 
4496 		txr->hn_chan = chan;
4497 		if (bootverbose) {
4498 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4499 			    idx, vmbus_chan_id(chan));
4500 		}
4501 	}
4502 
4503 	/* Bind this channel to a proper CPU. */
4504 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4505 
4506 	/*
4507 	 * Open this channel
4508 	 */
4509 	cbr.cbr = rxr->hn_br;
4510 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4511 	cbr.cbr_txsz = HN_TXBR_SIZE;
4512 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4513 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4514 	if (error) {
4515 		if (error == EISCONN) {
4516 			if_printf(sc->hn_ifp, "bufring is connected after "
4517 			    "chan%u open failure\n", vmbus_chan_id(chan));
4518 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4519 		} else {
4520 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4521 			    vmbus_chan_id(chan), error);
4522 		}
4523 	}
4524 	return (error);
4525 }
4526 
4527 static void
4528 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4529 {
4530 	struct hn_rx_ring *rxr;
4531 	int idx, error;
4532 
4533 	idx = vmbus_chan_subidx(chan);
4534 
4535 	/*
4536 	 * Link this channel to RX/TX ring.
4537 	 */
4538 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4539 	    ("invalid channel index %d, should be >= 0 && < %d",
4540 	     idx, sc->hn_rx_ring_inuse));
4541 	rxr = &sc->hn_rx_ring[idx];
4542 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4543 	    ("RX ring %d is not attached", idx));
4544 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4545 
4546 	if (idx < sc->hn_tx_ring_inuse) {
4547 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4548 
4549 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4550 		    ("TX ring %d is not attached", idx));
4551 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4552 	}
4553 
4554 	/*
4555 	 * Close this channel.
4556 	 *
4557 	 * NOTE:
4558 	 * Channel closing does _not_ destroy the target channel.
4559 	 */
4560 	error = vmbus_chan_close_direct(chan);
4561 	if (error == EISCONN) {
4562 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4563 		    "after being closed\n", vmbus_chan_id(chan));
4564 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4565 	} else if (error) {
4566 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4567 		    vmbus_chan_id(chan), error);
4568 	}
4569 }
4570 
4571 static int
4572 hn_attach_subchans(struct hn_softc *sc)
4573 {
4574 	struct vmbus_channel **subchans;
4575 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4576 	int i, error = 0;
4577 
4578 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4579 
4580 	/* Attach the sub-channels. */
4581 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4582 	for (i = 0; i < subchan_cnt; ++i) {
4583 		int error1;
4584 
4585 		error1 = hn_chan_attach(sc, subchans[i]);
4586 		if (error1) {
4587 			error = error1;
4588 			/* Move on; all channels will be detached later. */
4589 		}
4590 	}
4591 	vmbus_subchan_rel(subchans, subchan_cnt);
4592 
4593 	if (error) {
4594 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4595 	} else {
4596 		if (bootverbose) {
4597 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4598 			    subchan_cnt);
4599 		}
4600 	}
4601 	return (error);
4602 }
4603 
4604 static void
4605 hn_detach_allchans(struct hn_softc *sc)
4606 {
4607 	struct vmbus_channel **subchans;
4608 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4609 	int i;
4610 
4611 	if (subchan_cnt == 0)
4612 		goto back;
4613 
4614 	/* Detach the sub-channels. */
4615 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4616 	for (i = 0; i < subchan_cnt; ++i)
4617 		hn_chan_detach(sc, subchans[i]);
4618 	vmbus_subchan_rel(subchans, subchan_cnt);
4619 
4620 back:
4621 	/*
4622 	 * Detach the primary channel, _after_ all sub-channels
4623 	 * are detached.
4624 	 */
4625 	hn_chan_detach(sc, sc->hn_prichan);
4626 
4627 	/* Wait for sub-channels to be destroyed, if any. */
4628 	vmbus_subchan_drain(sc->hn_prichan);
4629 
4630 #ifdef INVARIANTS
4631 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4632 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4633 		    HN_RX_FLAG_ATTACHED) == 0,
4634 		    ("%dth RX ring is still attached", i));
4635 	}
4636 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4637 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4638 		    HN_TX_FLAG_ATTACHED) == 0,
4639 		    ("%dth TX ring is still attached", i));
4640 	}
4641 #endif
4642 }
4643 
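/*
 * Ask NVS for the sub-channels needed for multiple TX/RX rings.  On
 * input *nsubch is the desired number of sub-channels; on return it is
 * the number actually granted (0 when vRSS cannot be used).  Waits for
 * the granted sub-channels to be instantiated before returning.
 */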
4644 static int
4645 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4646 {
4647 	struct vmbus_channel **subchans;
4648 	int nchan, rxr_cnt, error;
4649 
4650 	nchan = *nsubch + 1;
4651 	if (nchan == 1) {
4652 		/*
4653 		 * Multiple RX/TX rings are not requested.
4654 		 */
4655 		*nsubch = 0;
4656 		return (0);
4657 	}
4658 
4659 	/*
4660 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4661 	 * table entries.
4662 	 */
4663 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4664 	if (error) {
4665 		/* No RSS; this is benign. */
4666 		*nsubch = 0;
4667 		return (0);
4668 	}
4669 	if (bootverbose) {
4670 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4671 		    rxr_cnt, nchan);
4672 	}
4673 
4674 	if (nchan > rxr_cnt)
4675 		nchan = rxr_cnt;
4676 	if (nchan == 1) {
4677 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4678 		*nsubch = 0;
4679 		return (0);
4680 	}
4681 
4682 	/*
4683 	 * Allocate sub-channels from NVS.
4684 	 */
4685 	*nsubch = nchan - 1;
4686 	error = hn_nvs_alloc_subchans(sc, nsubch);
4687 	if (error || *nsubch == 0) {
4688 		/* Failed to allocate sub-channels. */
4689 		*nsubch = 0;
4690 		return (0);
4691 	}
4692 
4693 	/*
4694 	 * Wait for all sub-channels to become ready before moving on.
4695 	 */
4696 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4697 	vmbus_subchan_rel(subchans, *nsubch);
4698 	return (0);
4699 }
4700 
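/*
 * The synthetic parts can only be (re)attached if no fatal error has
 * been recorded and none of the RX bufrings is still referenced by the
 * hypervisor (HN_RX_FLAG_BR_REF).
 */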
4701 static bool
4702 hn_synth_attachable(const struct hn_softc *sc)
4703 {
4704 	int i;
4705 
4706 	if (sc->hn_flags & HN_FLAG_ERRORS)
4707 		return (false);
4708 
4709 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4710 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4711 
4712 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4713 			return (false);
4714 	}
4715 	return (true);
4716 }
4717 
4718 static int
4719 hn_synth_attach(struct hn_softc *sc, int mtu)
4720 {
4721 #define ATTACHED_NVS		0x0002
4722 #define ATTACHED_RNDIS		0x0004
4723 
4724 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4725 	int error, nsubch, nchan, i;
4726 	uint32_t old_caps, attached = 0;
4727 
4728 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4729 	    ("synthetic parts were attached"));
4730 
4731 	if (!hn_synth_attachable(sc))
4732 		return (ENXIO);
4733 
4734 	/* Save capabilities for later verification. */
4735 	old_caps = sc->hn_caps;
4736 	sc->hn_caps = 0;
4737 
4738 	/* Clear RSS stuffs. */
4739 	sc->hn_rss_ind_size = 0;
4740 	sc->hn_rss_hash = 0;
4741 
4742 	/*
4743 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4744 	 */
4745 	error = hn_chan_attach(sc, sc->hn_prichan);
4746 	if (error)
4747 		goto failed;
4748 
4749 	/*
4750 	 * Attach NVS.
4751 	 */
4752 	error = hn_nvs_attach(sc, mtu);
4753 	if (error)
4754 		goto failed;
4755 	attached |= ATTACHED_NVS;
4756 
4757 	/*
4758 	 * Attach RNDIS _after_ NVS is attached.
4759 	 */
4760 	error = hn_rndis_attach(sc, mtu);
4761 	if (error)
4762 		goto failed;
4763 	attached |= ATTACHED_RNDIS;
4764 
4765 	/*
4766 	 * Make sure capabilities are not changed.
4767 	 */
4768 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4769 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4770 		    old_caps, sc->hn_caps);
4771 		error = ENXIO;
4772 		goto failed;
4773 	}
4774 
4775 	/*
4776 	 * Allocate sub-channels for multi-TX/RX rings.
4777 	 *
4778 	 * NOTE:
4779 	 * The # of RX rings that can be used is equivalent to the # of
4780 	 * channels to be requested.
4781 	 */
4782 	nsubch = sc->hn_rx_ring_cnt - 1;
4783 	error = hn_synth_alloc_subchans(sc, &nsubch);
4784 	if (error)
4785 		goto failed;
4786 	/* NOTE: _Full_ synthetic parts detach is required now. */
4787 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4788 
4789 	/*
4790 	 * Set the # of TX/RX rings that could be used according to
4791 	 * the # of channels that NVS offered.
4792 	 */
4793 	nchan = nsubch + 1;
4794 	hn_set_ring_inuse(sc, nchan);
4795 	if (nchan == 1) {
4796 		/* Only the primary channel can be used; done */
4797 		goto back;
4798 	}
4799 
4800 	/*
4801 	 * Attach the sub-channels.
4802 	 *
4803 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4804 	 */
4805 	error = hn_attach_subchans(sc);
4806 	if (error)
4807 		goto failed;
4808 
4809 	/*
4810 	 * Configure RSS key and indirect table _after_ all sub-channels
4811 	 * are attached.
4812 	 */
4813 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4814 		/*
4815 		 * RSS key is not set yet; set it to the default RSS key.
4816 		 */
4817 		if (bootverbose)
4818 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4819 #ifdef RSS
4820 		rss_getkey(rss->rss_key);
4821 #else
4822 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4823 #endif
4824 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4825 	}
4826 
4827 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4828 		/*
4829 		 * RSS indirect table is not set yet; set it up in round-
4830 		 * robin fashion.
4831 		 */
4832 		if (bootverbose) {
4833 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4834 			    "table\n");
4835 		}
4836 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4837 			uint32_t subidx;
4838 
4839 #ifdef RSS
4840 			subidx = rss_get_indirection_to_bucket(i);
4841 #else
4842 			subidx = i;
4843 #endif
4844 			rss->rss_ind[i] = subidx % nchan;
4845 		}
4846 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4847 	} else {
4848 		/*
4849 		 * # of usable channels may be changed, so we have to
4850 		 * make sure that all entries in RSS indirect table
4851 		 * are valid.
4852 		 *
4853 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4854 		 */
4855 		hn_rss_ind_fixup(sc);
4856 	}
4857 
4858 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4859 	if (error)
4860 		goto failed;
4861 back:
4862 	/*
4863 	 * Fixup transmission aggregation setup.
4864 	 */
4865 	hn_set_txagg(sc);
4866 	return (0);
4867 
4868 failed:
4869 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4870 		hn_synth_detach(sc);
4871 	} else {
4872 		if (attached & ATTACHED_RNDIS)
4873 			hn_rndis_detach(sc);
4874 		if (attached & ATTACHED_NVS)
4875 			hn_nvs_detach(sc);
4876 		hn_chan_detach(sc, sc->hn_prichan);
4877 		/* Restore old capabilities. */
4878 		sc->hn_caps = old_caps;
4879 	}
4880 	return (error);
4881 
4882 #undef ATTACHED_RNDIS
4883 #undef ATTACHED_NVS
4884 }
4885 
4886 /*
4887  * NOTE:
4888  * The interface must have been suspended through hn_suspend(), before
4889  * this function gets called.
4890  */
4891 static void
4892 hn_synth_detach(struct hn_softc *sc)
4893 {
4894 
4895 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4896 	    ("synthetic parts were not attached"));
4897 
4898 	/* Detach the RNDIS first. */
4899 	hn_rndis_detach(sc);
4900 
4901 	/* Detach NVS. */
4902 	hn_nvs_detach(sc);
4903 
4904 	/* Detach all of the channels. */
4905 	hn_detach_allchans(sc);
4906 
4907 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4908 }
4909 
4910 static void
4911 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4912 {
4913 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4914 	    ("invalid ring count %d", ring_cnt));
4915 
4916 	if (sc->hn_tx_ring_cnt > ring_cnt)
4917 		sc->hn_tx_ring_inuse = ring_cnt;
4918 	else
4919 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4920 	sc->hn_rx_ring_inuse = ring_cnt;
4921 
4922 #ifdef RSS
4923 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4924 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4925 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4926 		    rss_getnumbuckets());
4927 	}
4928 #endif
4929 
4930 	if (bootverbose) {
4931 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4932 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4933 	}
4934 }
4935 
4936 static void
4937 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4938 {
4939 
4940 	/*
4941 	 * NOTE:
4942 	 * The TX bufring will not be drained by the hypervisor,
4943 	 * if the primary channel is revoked.
4944 	 */
4945 	while (!vmbus_chan_rx_empty(chan) ||
4946 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4947 	     !vmbus_chan_tx_empty(chan)))
4948 		pause("waitch", 1);
4949 	vmbus_chan_intr_drain(chan);
4950 }
4951 
4952 static void
4953 hn_suspend_data(struct hn_softc *sc)
4954 {
4955 	struct vmbus_channel **subch = NULL;
4956 	struct hn_tx_ring *txr;
4957 	int i, nsubch;
4958 
4959 	HN_LOCK_ASSERT(sc);
4960 
4961 	/*
4962 	 * Suspend TX.
4963 	 */
4964 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4965 		txr = &sc->hn_tx_ring[i];
4966 
4967 		mtx_lock(&txr->hn_tx_lock);
4968 		txr->hn_suspended = 1;
4969 		mtx_unlock(&txr->hn_tx_lock);
4970 		/* No one is able to send more packets now. */
4971 
4972 		/*
4973 		 * Wait for all pending sends to finish.
4974 		 *
4975 		 * NOTE:
4976 		 * We will _not_ receive all pending send-done, if the
4977 		 * primary channel is revoked.
4978 		 */
4979 		while (hn_tx_ring_pending(txr) &&
4980 		    !vmbus_chan_is_revoked(sc->hn_prichan))
4981 			pause("hnwtx", 1 /* 1 tick */);
4982 	}
4983 
4984 	/*
4985 	 * Disable RX by clearing RX filter.
4986 	 */
4987 	hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
4988 
4989 	/*
4990 	 * Give RNDIS enough time to flush all pending data packets.
4991 	 */
4992 	pause("waitrx", (200 * hz) / 1000);
4993 
4994 	/*
4995 	 * Drain RX/TX bufrings and interrupts.
4996 	 */
4997 	nsubch = sc->hn_rx_ring_inuse - 1;
4998 	if (nsubch > 0)
4999 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5000 
5001 	if (subch != NULL) {
5002 		for (i = 0; i < nsubch; ++i)
5003 			hn_chan_drain(sc, subch[i]);
5004 	}
5005 	hn_chan_drain(sc, sc->hn_prichan);
5006 
5007 	if (subch != NULL)
5008 		vmbus_subchan_rel(subch, nsubch);
5009 
5010 	/*
5011 	 * Drain any pending TX tasks.
5012 	 *
5013 	 * NOTE:
5014 	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
5015 	 * tasks will have to be drained _after_ the above hn_chan_drain()
5016 	 * calls.
5017 	 */
5018 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5019 		txr = &sc->hn_tx_ring[i];
5020 
5021 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5022 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5023 	}
5024 }
5025 
5026 static void
5027 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5028 {
5029 
5030 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5031 }
5032 
5033 static void
5034 hn_suspend_mgmt(struct hn_softc *sc)
5035 {
5036 	struct task task;
5037 
5038 	HN_LOCK_ASSERT(sc);
5039 
5040 	/*
5041 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5042 	 * through hn_mgmt_taskq.
5043 	 */
5044 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5045 	vmbus_chan_run_task(sc->hn_prichan, &task);
5046 
5047 	/*
5048 	 * Make sure that all pending management tasks are completed.
5049 	 */
5050 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5051 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5052 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
5053 }
5054 
5055 static void
5056 hn_suspend(struct hn_softc *sc)
5057 {
5058 
5059 	/* Disable polling. */
5060 	hn_polling(sc, 0);
5061 
5062 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5063 	    (sc->hn_flags & HN_FLAG_VF))
5064 		hn_suspend_data(sc);
5065 	hn_suspend_mgmt(sc);
5066 }
5067 
5068 static void
5069 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5070 {
5071 	int i;
5072 
5073 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5074 	    ("invalid TX ring count %d", tx_ring_cnt));
5075 
5076 	for (i = 0; i < tx_ring_cnt; ++i) {
5077 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5078 
5079 		mtx_lock(&txr->hn_tx_lock);
5080 		txr->hn_suspended = 0;
5081 		mtx_unlock(&txr->hn_tx_lock);
5082 	}
5083 }
5084 
5085 static void
5086 hn_resume_data(struct hn_softc *sc)
5087 {
5088 	int i;
5089 
5090 	HN_LOCK_ASSERT(sc);
5091 
5092 	/*
5093 	 * Re-enable RX.
5094 	 */
5095 	hn_rxfilter_config(sc);
5096 
5097 	/*
5098 	 * Make sure to clear suspend status on "all" TX rings,
5099 	 * since hn_tx_ring_inuse can be changed after
5100 	 * hn_suspend_data().
5101 	 */
5102 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5103 
5104 #ifdef HN_IFSTART_SUPPORT
5105 	if (!hn_use_if_start)
5106 #endif
5107 	{
5108 		/*
5109 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
5110 		 * reduced.
5111 		 */
5112 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5113 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5114 	}
5115 
5116 	/*
5117 	 * Kick start TX.
5118 	 */
5119 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5120 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5121 
5122 		/*
5123 		 * Use txeof task, so that any pending oactive can be
5124 		 * cleared properly.
5125 		 */
5126 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5127 	}
5128 }
5129 
5130 static void
5131 hn_resume_mgmt(struct hn_softc *sc)
5132 {
5133 
5134 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5135 
5136 	/*
5137 	 * Kick off network change detection, if it was pending.
5138 	 * If no network change was pending, start link status
5139 	 * checks, which is more lightweight than network change
5140 	 * detection.
5141 	 */
5142 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5143 		hn_change_network(sc);
5144 	else
5145 		hn_update_link_status(sc);
5146 }
5147 
5148 static void
5149 hn_resume(struct hn_softc *sc)
5150 {
5151 
5152 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5153 	    (sc->hn_flags & HN_FLAG_VF))
5154 		hn_resume_data(sc);
5155 
5156 	/*
5157 	 * When the VF is activated, the synthetic interface is changed
5158 	 * to DOWN in hn_set_vf(). Here, if the VF is still active, we
5159 	 * don't call hn_resume_mgmt() until the VF is deactivated in
5160 	 * hn_set_vf().
5161 	 */
5162 	if (!(sc->hn_flags & HN_FLAG_VF))
5163 		hn_resume_mgmt(sc);
5164 
5165 	/*
5166 	 * Re-enable polling if this interface is running and
5167 	 * the polling is requested.
5168 	 */
5169 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5170 		hn_polling(sc, sc->hn_pollhz);
5171 }
5172 
5173 static void
5174 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5175 {
5176 	const struct rndis_status_msg *msg;
5177 	int ofs;
5178 
5179 	if (dlen < sizeof(*msg)) {
5180 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5181 		return;
5182 	}
5183 	msg = data;
5184 
5185 	switch (msg->rm_status) {
5186 	case RNDIS_STATUS_MEDIA_CONNECT:
5187 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5188 		hn_update_link_status(sc);
5189 		break;
5190 
5191 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5192 		/* Not really useful; ignore. */
5193 		break;
5194 
5195 	case RNDIS_STATUS_NETWORK_CHANGE:
5196 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5197 		if (dlen < ofs + msg->rm_stbuflen ||
5198 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5199 			if_printf(sc->hn_ifp, "network changed\n");
5200 		} else {
5201 			uint32_t change;
5202 
5203 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5204 			    sizeof(change));
5205 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5206 			    change);
5207 		}
5208 		hn_change_network(sc);
5209 		break;
5210 
5211 	default:
5212 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5213 		    msg->rm_status);
5214 		break;
5215 	}
5216 }
5217 
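/*
 * Walk the RNDIS per-packet-info elements and extract the VLAN,
 * RX checksum and hash value/info fields into `info'.  Unknown element
 * types are skipped; EINVAL is returned on a malformed element.
 */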
5218 static int
5219 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5220 {
5221 	const struct rndis_pktinfo *pi = info_data;
5222 	uint32_t mask = 0;
5223 
5224 	while (info_dlen != 0) {
5225 		const void *data;
5226 		uint32_t dlen;
5227 
5228 		if (__predict_false(info_dlen < sizeof(*pi)))
5229 			return (EINVAL);
5230 		if (__predict_false(info_dlen < pi->rm_size))
5231 			return (EINVAL);
5232 		info_dlen -= pi->rm_size;
5233 
5234 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5235 			return (EINVAL);
5236 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5237 			return (EINVAL);
5238 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5239 		data = pi->rm_data;
5240 
5241 		switch (pi->rm_type) {
5242 		case NDIS_PKTINFO_TYPE_VLAN:
5243 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5244 				return (EINVAL);
5245 			info->vlan_info = *((const uint32_t *)data);
5246 			mask |= HN_RXINFO_VLAN;
5247 			break;
5248 
5249 		case NDIS_PKTINFO_TYPE_CSUM:
5250 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5251 				return (EINVAL);
5252 			info->csum_info = *((const uint32_t *)data);
5253 			mask |= HN_RXINFO_CSUM;
5254 			break;
5255 
5256 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5257 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5258 				return (EINVAL);
5259 			info->hash_value = *((const uint32_t *)data);
5260 			mask |= HN_RXINFO_HASHVAL;
5261 			break;
5262 
5263 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5264 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5265 				return (EINVAL);
5266 			info->hash_info = *((const uint32_t *)data);
5267 			mask |= HN_RXINFO_HASHINF;
5268 			break;
5269 
5270 		default:
5271 			goto next;
5272 		}
5273 
5274 		if (mask == HN_RXINFO_ALL) {
5275 			/* All found; done */
5276 			break;
5277 		}
5278 next:
5279 		pi = (const struct rndis_pktinfo *)
5280 		    ((const uint8_t *)pi + pi->rm_size);
5281 	}
5282 
5283 	/*
5284 	 * Final fixup.
5285 	 * - If there is no hash value, invalidate the hash info.
5286 	 */
5287 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5288 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5289 	return (0);
5290 }
5291 
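/*
 * Returns true if the region [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */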
5292 static __inline bool
5293 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5294 {
5295 
5296 	if (off < check_off) {
5297 		if (__predict_true(off + len <= check_off))
5298 			return (false);
5299 	} else if (off > check_off) {
5300 		if (__predict_true(check_off + check_len <= off))
5301 			return (false);
5302 	}
5303 	return (true);
5304 }
5305 
5306 static void
5307 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5308 {
5309 	const struct rndis_packet_msg *pkt;
5310 	struct hn_rxinfo info;
5311 	int data_off, pktinfo_off, data_len, pktinfo_len;
5312 
5313 	/*
5314 	 * Check length.
5315 	 */
5316 	if (__predict_false(dlen < sizeof(*pkt))) {
5317 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5318 		return;
5319 	}
5320 	pkt = data;
5321 
5322 	if (__predict_false(dlen < pkt->rm_len)) {
5323 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5324 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5325 		return;
5326 	}
5327 	if (__predict_false(pkt->rm_len <
5328 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5329 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5330 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5331 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5332 		    pkt->rm_pktinfolen);
5333 		return;
5334 	}
5335 	if (__predict_false(pkt->rm_datalen == 0)) {
5336 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5337 		return;
5338 	}
5339 
5340 	/*
5341 	 * Check offsets.
5342 	 */
5343 #define IS_OFFSET_INVALID(ofs)			\
5344 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5345 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5346 
5347 	/* XXX Hyper-V does not meet data offset alignment requirement */
5348 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5349 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5350 		    "data offset %u\n", pkt->rm_dataoffset);
5351 		return;
5352 	}
5353 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5354 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5355 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5356 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5357 		return;
5358 	}
5359 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5360 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5361 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5362 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5363 		return;
5364 	}
5365 
5366 #undef IS_OFFSET_INVALID
5367 
5368 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5369 	data_len = pkt->rm_datalen;
5370 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5371 	pktinfo_len = pkt->rm_pktinfolen;
5372 
5373 	/*
5374 	 * Check OOB coverage.
5375 	 */
5376 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5377 		int oob_off, oob_len;
5378 
5379 		if_printf(rxr->hn_ifp, "got oobdata\n");
5380 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5381 		oob_len = pkt->rm_oobdatalen;
5382 
5383 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5384 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5385 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5386 			    pkt->rm_len, oob_off, oob_len);
5387 			return;
5388 		}
5389 
5390 		/*
5391 		 * Check against data.
5392 		 */
5393 		if (hn_rndis_check_overlap(oob_off, oob_len,
5394 		    data_off, data_len)) {
5395 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5396 			    "oob overlaps data, oob abs %d len %d, "
5397 			    "data abs %d len %d\n",
5398 			    oob_off, oob_len, data_off, data_len);
5399 			return;
5400 		}
5401 
5402 		/*
5403 		 * Check against pktinfo.
5404 		 */
5405 		if (pktinfo_len != 0 &&
5406 		    hn_rndis_check_overlap(oob_off, oob_len,
5407 		    pktinfo_off, pktinfo_len)) {
5408 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5409 			    "oob overlaps pktinfo, oob abs %d len %d, "
5410 			    "pktinfo abs %d len %d\n",
5411 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5412 			return;
5413 		}
5414 	}
5415 
5416 	/*
5417 	 * Check per-packet-info coverage and find useful per-packet-info.
5418 	 */
5419 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5420 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5421 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5422 	if (__predict_true(pktinfo_len != 0)) {
5423 		bool overlap;
5424 		int error;
5425 
5426 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5427 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5428 			    "pktinfo overflow, msglen %u, "
5429 			    "pktinfo abs %d len %d\n",
5430 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5431 			return;
5432 		}
5433 
5434 		/*
5435 		 * Check packet info coverage.
5436 		 */
5437 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5438 		    data_off, data_len);
5439 		if (__predict_false(overlap)) {
5440 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5441 			    "pktinfo overlap data, pktinfo abs %d len %d, "
5442 			    "data abs %d len %d\n",
5443 			    pktinfo_off, pktinfo_len, data_off, data_len);
5444 			return;
5445 		}
5446 
5447 		/*
5448 		 * Find useful per-packet-info.
5449 		 */
5450 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5451 		    pktinfo_len, &info);
5452 		if (__predict_false(error)) {
5453 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5454 			    "pktinfo\n");
5455 			return;
5456 		}
5457 	}
5458 
5459 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5460 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5461 		    "data overflow, msglen %u, data abs %d len %d\n",
5462 		    pkt->rm_len, data_off, data_len);
5463 		return;
5464 	}
5465 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5466 }
5467 
5468 static __inline void
5469 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5470 {
5471 	const struct rndis_msghdr *hdr;
5472 
5473 	if (__predict_false(dlen < sizeof(*hdr))) {
5474 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5475 		return;
5476 	}
5477 	hdr = data;
5478 
5479 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5480 		/* Hot data path. */
5481 		hn_rndis_rx_data(rxr, data, dlen);
5482 		/* Done! */
5483 		return;
5484 	}
5485 
5486 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5487 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5488 	else
5489 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5490 }
5491 
5492 static void
5493 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5494 {
5495 	const struct hn_nvs_hdr *hdr;
5496 
5497 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5498 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5499 		return;
5500 	}
5501 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5502 
5503 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5504 		/* Useless; ignore */
5505 		return;
5506 	}
5507 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5508 }
5509 
5510 static void
5511 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5512     const struct vmbus_chanpkt_hdr *pkt)
5513 {
5514 	struct hn_nvs_sendctx *sndc;
5515 
5516 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5517 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5518 	    VMBUS_CHANPKT_DATALEN(pkt));
5519 	/*
5520 	 * NOTE:
5521 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5522 	 * its callback.
5523 	 */
5524 }
5525 
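/*
 * An RXBUF channel packet carries one or more RNDIS messages placed in
 * the shared RX buffer; hand each one to hn_rndis_rxpkt() and then ack
 * the RXBUF so that the hypervisor can recycle it.
 */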
5526 static void
5527 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5528     const struct vmbus_chanpkt_hdr *pkthdr)
5529 {
5530 	const struct vmbus_chanpkt_rxbuf *pkt;
5531 	const struct hn_nvs_hdr *nvs_hdr;
5532 	int count, i, hlen;
5533 
5534 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5535 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5536 		return;
5537 	}
5538 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5539 
5540 	/* Make sure that this is a RNDIS message. */
5541 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5542 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5543 		    nvs_hdr->nvs_type);
5544 		return;
5545 	}
5546 
5547 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5548 	if (__predict_false(hlen < sizeof(*pkt))) {
5549 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5550 		return;
5551 	}
5552 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5553 
5554 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5555 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5556 		    pkt->cp_rxbuf_id);
5557 		return;
5558 	}
5559 
5560 	count = pkt->cp_rxbuf_cnt;
5561 	if (__predict_false(hlen <
5562 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5563 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5564 		return;
5565 	}
5566 
5567 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5568 	for (i = 0; i < count; ++i) {
5569 		int ofs, len;
5570 
5571 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5572 		len = pkt->cp_rxbuf[i].rb_len;
5573 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5574 			if_printf(rxr->hn_ifp, "RNDIS msg %d overflows rxbuf, "
5575 			    "ofs %d, len %d\n", i, ofs, len);
5576 			continue;
5577 		}
5578 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5579 	}
5580 
5581 	/*
5582 	 * Ack the consumed RXBUF associated w/ this channel packet,
5583 	 * so that this RXBUF can be recycled by the hypervisor.
5584 	 */
5585 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5586 }
5587 
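/*
 * Send an RNDIS ack completion for the given transaction, so that the
 * hypervisor can recycle the receive buffer.  EAGAIN from the channel
 * is retried a few times before the RXBUF is given up (leaked).
 */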
5588 static void
5589 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5590     uint64_t tid)
5591 {
5592 	struct hn_nvs_rndis_ack ack;
5593 	int retries, error;
5594 
5595 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5596 	ack.nvs_status = HN_NVS_STATUS_OK;
5597 
5598 	retries = 0;
5599 again:
5600 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5601 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5602 	if (__predict_false(error == EAGAIN)) {
5603 		/*
5604 		 * NOTE:
5605 		 * This should _not_ happen in the real world, since the
5606 		 * consumption of the TX bufring from the TX path is
5607 		 * controlled.
5608 		 */
5609 		if (rxr->hn_ack_failed == 0)
5610 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5611 		rxr->hn_ack_failed++;
5612 		retries++;
5613 		if (retries < 10) {
5614 			DELAY(100);
5615 			goto again;
5616 		}
5617 		/* RXBUF leaks! */
5618 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5619 	}
5620 }
5621 
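/*
 * Per-channel callback: drain all pending channel packets, enlarging
 * the packet buffer on ENOBUFS, and dispatch each packet according to
 * its type (completion, RXBUF, or inband notification).
 */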
5622 static void
5623 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5624 {
5625 	struct hn_rx_ring *rxr = xrxr;
5626 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5627 
5628 	for (;;) {
5629 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5630 		int error, pktlen;
5631 
5632 		pktlen = rxr->hn_pktbuf_len;
5633 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5634 		if (__predict_false(error == ENOBUFS)) {
5635 			void *nbuf;
5636 			int nlen;
5637 
5638 			/*
5639 			 * Expand channel packet buffer.
5640 			 *
5641 			 * XXX
5642 			 * Use M_WAITOK here, since allocation failure
5643 			 * is fatal.
5644 			 */
5645 			nlen = rxr->hn_pktbuf_len * 2;
5646 			while (nlen < pktlen)
5647 				nlen *= 2;
5648 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5649 
5650 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5651 			    rxr->hn_pktbuf_len, nlen);
5652 
5653 			free(rxr->hn_pktbuf, M_DEVBUF);
5654 			rxr->hn_pktbuf = nbuf;
5655 			rxr->hn_pktbuf_len = nlen;
5656 			/* Retry! */
5657 			continue;
5658 		} else if (__predict_false(error == EAGAIN)) {
5659 			/* No more channel packets; done! */
5660 			break;
5661 		}
5662 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5663 
5664 		switch (pkt->cph_type) {
5665 		case VMBUS_CHANPKT_TYPE_COMP:
5666 			hn_nvs_handle_comp(sc, chan, pkt);
5667 			break;
5668 
5669 		case VMBUS_CHANPKT_TYPE_RXBUF:
5670 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5671 			break;
5672 
5673 		case VMBUS_CHANPKT_TYPE_INBAND:
5674 			hn_nvs_handle_notify(sc, pkt);
5675 			break;
5676 
5677 		default:
5678 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5679 			    pkt->cph_type);
5680 			break;
5681 		}
5682 	}
5683 	hn_chan_rollup(rxr, rxr->hn_txr);
5684 }
5685 
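/*
 * SYSINIT hook: sanitize the TX taskqueue tunables and, in the global
 * taskqueue mode, create the shared TX taskqueues.
 */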
5686 static void
5687 hn_tx_taskq_create(void *arg __unused)
5688 {
5689 	int i;
5690 
5691 	/*
5692 	 * Clamp the # of TX taskqueues to a sane range.
5693 	 */
5694 	if (hn_tx_taskq_cnt <= 0)
5695 		hn_tx_taskq_cnt = 1;
5696 	else if (hn_tx_taskq_cnt > mp_ncpus)
5697 		hn_tx_taskq_cnt = mp_ncpus;
5698 
5699 	/*
5700 	 * Validate the TX taskqueue mode; fall back to independent taskqueues.
5701 	 */
5702 	switch (hn_tx_taskq_mode) {
5703 	case HN_TX_TASKQ_M_INDEP:
5704 	case HN_TX_TASKQ_M_GLOBAL:
5705 	case HN_TX_TASKQ_M_EVTTQ:
5706 		break;
5707 	default:
5708 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5709 		break;
5710 	}
5711 
5712 	if (vm_guest != VM_GUEST_HV)
5713 		return;
5714 
5715 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5716 		return;
5717 
5718 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5719 	    M_DEVBUF, M_WAITOK);
5720 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5721 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5722 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5723 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5724 		    "hn tx%d", i);
5725 	}
5726 }
5727 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5728     hn_tx_taskq_create, NULL);
5729 
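/*
 * SYSUNINIT hook: release the shared TX taskqueues, if any were created.
 */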
5730 static void
5731 hn_tx_taskq_destroy(void *arg __unused)
5732 {
5733 
5734 	if (hn_tx_taskque != NULL) {
5735 		int i;
5736 
5737 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5738 			taskqueue_free(hn_tx_taskque[i]);
5739 		free(hn_tx_taskque, M_DEVBUF);
5740 	}
5741 }
5742 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5743     hn_tx_taskq_destroy, NULL);
5744