xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 792bbaba989533a1fc93823df1720c8c4aaf0442)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 
81 #include <machine/atomic.h>
82 #include <machine/in_cksum.h>
83 
84 #include <net/bpf.h>
85 #include <net/ethernet.h>
86 #include <net/if.h>
87 #include <net/if_media.h>
88 #include <net/if_types.h>
89 #include <net/if_var.h>
90 #include <net/rndis.h>
91 #ifdef RSS
92 #include <net/rss_config.h>
93 #endif
94 
95 #include <netinet/in_systm.h>
96 #include <netinet/in.h>
97 #include <netinet/ip.h>
98 #include <netinet/ip6.h>
99 #include <netinet/tcp.h>
100 #include <netinet/tcp_lro.h>
101 #include <netinet/udp.h>
102 
103 #include <dev/hyperv/include/hyperv.h>
104 #include <dev/hyperv/include/hyperv_busdma.h>
105 #include <dev/hyperv/include/vmbus.h>
106 #include <dev/hyperv/include/vmbus_xact.h>
107 
108 #include <dev/hyperv/netvsc/ndis.h>
109 #include <dev/hyperv/netvsc/if_hnreg.h>
110 #include <dev/hyperv/netvsc/if_hnvar.h>
111 #include <dev/hyperv/netvsc/hn_nvs.h>
112 #include <dev/hyperv/netvsc/hn_rndis.h>
113 
114 #include "vmbus_if.h"
115 
116 #define HN_IFSTART_SUPPORT
117 
118 #define HN_RING_CNT_DEF_MAX		8
119 
120 /* YYY should get it from the underlying channel */
121 #define HN_TX_DESC_CNT			512
122 
123 #define HN_RNDIS_PKT_LEN					\
124 	(sizeof(struct rndis_packet_msg) +			\
125 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
126 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
127 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
129 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
130 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
131 
132 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
133 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
134 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
135 /* -1 for RNDIS packet message */
136 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
137 
138 #define HN_DIRECT_TX_SIZE_DEF		128
139 
140 #define HN_EARLY_TXEOF_THRESH		8
141 
142 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
143 
144 #define HN_LROENT_CNT_DEF		128
145 
146 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
147 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
148 /* YYY 2*MTU is a bit rough, but should be good enough. */
149 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
150 
151 #define HN_LRO_ACKCNT_DEF		1
152 
153 #define HN_LOCK_INIT(sc)		\
154 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
155 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
156 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
157 #define HN_LOCK(sc)					\
158 do {							\
159 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
160 		DELAY(1000);				\
161 } while (0)
162 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
163 
164 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
165 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
166 #define HN_CSUM_IP_HWASSIST(sc)		\
167 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
168 #define HN_CSUM_IP6_HWASSIST(sc)	\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
170 
171 #define HN_PKTSIZE_MIN(align)		\
172 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
173 	    HN_RNDIS_PKT_LEN, (align))
174 #define HN_PKTSIZE(m, align)		\
175 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
176 
177 #ifdef RSS
178 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
179 #else
180 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
181 #endif
182 
/*
 * TX descriptor.
 *
 * Carries the mbuf being sent, the NVS send context handed to the
 * hn_nvs_send*() routines, the chimney sending buffer index/size, and
 * the busdma handles for the data and the RNDIS packet message.
 */
183 struct hn_txdesc {
184 #ifndef HN_USE_TXDESC_BUFRING
	/* List linkage; only present when buf_ring is not used for txdescs. */
185 	SLIST_ENTRY(hn_txdesc)		link;
186 #endif
	/* Linkage on another txdesc's agg_list. */
187 	STAILQ_ENTRY(hn_txdesc)		agg_link;
188 
189 	/* Aggregated txdescs, in sending order. */
190 	STAILQ_HEAD(, hn_txdesc)	agg_list;
191 
192 	/* The oldest packet, if transmission aggregation happens. */
193 	struct mbuf			*m;
	/* NOTE(review): presumably the TX ring owning this txdesc -- confirm. */
194 	struct hn_tx_ring		*txr;
	/* NOTE(review): presumably a reference count -- confirm. */
195 	int				refs;
196 	uint32_t			flags;	/* HN_TXD_FLAG_ */
	/* Passed to hn_nvs_send()/hn_nvs_send_rndis_sglist(). */
197 	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index and size.  HN_NVS_CHIM_IDX_INVALID
	 * and 0 respectively when the packet is sent through the GPA list
	 * (see the KASSERTs in hn_txpkt_sglist() and hn_txpkt_chim()).
	 */
198 	uint32_t			chim_index;
199 	int				chim_size;
200 
201 	bus_dmamap_t			data_dmap;
202 
203 	bus_addr_t			rndis_pkt_paddr;
204 	struct rndis_packet_msg		*rndis_pkt;
205 	bus_dmamap_t			rndis_pkt_dmap;
206 };
207 
208 #define HN_TXD_FLAG_ONLIST		0x0001
209 #define HN_TXD_FLAG_DMAMAP		0x0002
210 #define HN_TXD_FLAG_ONAGG		0x0004
211 
/*
 * Per-packet RX information: VLAN, checksum and hash values.
 * The HN_RXINFO_* bits defined below name these four fields;
 * HN_NDIS_*_INVALID give the "not present" values for the first three.
 * NOTE(review): presumably filled in by hn_rndis_rxinfo() -- confirm.
 */
212 struct hn_rxinfo {
213 	uint32_t			vlan_info;
214 	uint32_t			csum_info;
215 	uint32_t			hash_info;
216 	uint32_t			hash_value;
217 };
218 
219 #define HN_RXINFO_VLAN			0x0001
220 #define HN_RXINFO_CSUM			0x0002
221 #define HN_RXINFO_HASHINF		0x0004
222 #define HN_RXINFO_HASHVAL		0x0008
223 #define HN_RXINFO_ALL			\
224 	(HN_RXINFO_VLAN |		\
225 	 HN_RXINFO_CSUM |		\
226 	 HN_RXINFO_HASHINF |		\
227 	 HN_RXINFO_HASHVAL)
228 
229 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
230 #define HN_NDIS_RXCSUM_INFO_INVALID	0
231 #define HN_NDIS_HASH_INFO_INVALID	0
232 
233 static int			hn_probe(device_t);
234 static int			hn_attach(device_t);
235 static int			hn_detach(device_t);
236 static int			hn_shutdown(device_t);
237 static void			hn_chan_callback(struct vmbus_channel *,
238 				    void *);
239 
240 static void			hn_init(void *);
241 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
242 #ifdef HN_IFSTART_SUPPORT
243 static void			hn_start(struct ifnet *);
244 #endif
245 static int			hn_transmit(struct ifnet *, struct mbuf *);
246 static void			hn_xmit_qflush(struct ifnet *);
247 static int			hn_ifmedia_upd(struct ifnet *);
248 static void			hn_ifmedia_sts(struct ifnet *,
249 				    struct ifmediareq *);
250 
251 static int			hn_rndis_rxinfo(const void *, int,
252 				    struct hn_rxinfo *);
253 static void			hn_rndis_rx_data(struct hn_rx_ring *,
254 				    const void *, int);
255 static void			hn_rndis_rx_status(struct hn_softc *,
256 				    const void *, int);
257 
258 static void			hn_nvs_handle_notify(struct hn_softc *,
259 				    const struct vmbus_chanpkt_hdr *);
260 static void			hn_nvs_handle_comp(struct hn_softc *,
261 				    struct vmbus_channel *,
262 				    const struct vmbus_chanpkt_hdr *);
263 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
264 				    struct vmbus_channel *,
265 				    const struct vmbus_chanpkt_hdr *);
266 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
267 				    struct vmbus_channel *, uint64_t);
268 
269 #if __FreeBSD_version >= 1100099
270 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
271 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
272 #endif
273 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
274 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
275 #if __FreeBSD_version < 1100095
276 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
277 #else
278 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
279 #endif
280 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
282 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
283 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
284 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
285 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
286 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
287 #ifndef RSS
288 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
290 #endif
291 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
294 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
295 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
296 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
297 
298 static void			hn_stop(struct hn_softc *);
299 static void			hn_init_locked(struct hn_softc *);
300 static int			hn_chan_attach(struct hn_softc *,
301 				    struct vmbus_channel *);
302 static void			hn_chan_detach(struct hn_softc *,
303 				    struct vmbus_channel *);
304 static int			hn_attach_subchans(struct hn_softc *);
305 static void			hn_detach_allchans(struct hn_softc *);
306 static void			hn_chan_rollup(struct hn_rx_ring *,
307 				    struct hn_tx_ring *);
308 static void			hn_set_ring_inuse(struct hn_softc *, int);
309 static int			hn_synth_attach(struct hn_softc *, int);
310 static void			hn_synth_detach(struct hn_softc *);
311 static int			hn_synth_alloc_subchans(struct hn_softc *,
312 				    int *);
313 static bool			hn_synth_attachable(const struct hn_softc *);
314 static void			hn_suspend(struct hn_softc *);
315 static void			hn_suspend_data(struct hn_softc *);
316 static void			hn_suspend_mgmt(struct hn_softc *);
317 static void			hn_resume(struct hn_softc *);
318 static void			hn_resume_data(struct hn_softc *);
319 static void			hn_resume_mgmt(struct hn_softc *);
320 static void			hn_suspend_mgmt_taskfunc(void *, int);
321 static void			hn_chan_drain(struct hn_softc *,
322 				    struct vmbus_channel *);
323 static void			hn_polling(struct hn_softc *, u_int);
324 static void			hn_chan_polling(struct vmbus_channel *, u_int);
325 
326 static void			hn_update_link_status(struct hn_softc *);
327 static void			hn_change_network(struct hn_softc *);
328 static void			hn_link_taskfunc(void *, int);
329 static void			hn_netchg_init_taskfunc(void *, int);
330 static void			hn_netchg_status_taskfunc(void *, int);
331 static void			hn_link_status(struct hn_softc *);
332 
333 static int			hn_create_rx_data(struct hn_softc *, int);
334 static void			hn_destroy_rx_data(struct hn_softc *);
335 static int			hn_check_iplen(const struct mbuf *, int);
336 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
337 static int			hn_rxfilter_config(struct hn_softc *);
338 #ifndef RSS
339 static int			hn_rss_reconfig(struct hn_softc *);
340 #endif
341 static void			hn_rss_ind_fixup(struct hn_softc *);
342 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
343 				    int, const struct hn_rxinfo *);
344 
345 static int			hn_tx_ring_create(struct hn_softc *, int);
346 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
347 static int			hn_create_tx_data(struct hn_softc *, int);
348 static void			hn_fixup_tx_data(struct hn_softc *);
349 static void			hn_destroy_tx_data(struct hn_softc *);
350 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
351 static void			hn_txdesc_gc(struct hn_tx_ring *,
352 				    struct hn_txdesc *);
353 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
354 				    struct hn_txdesc *, struct mbuf **);
355 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
356 				    struct hn_txdesc *);
357 static void			hn_set_chim_size(struct hn_softc *, int);
358 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
359 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
360 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
361 static void			hn_resume_tx(struct hn_softc *, int);
362 static void			hn_set_txagg(struct hn_softc *);
363 static void			*hn_try_txagg(struct ifnet *,
364 				    struct hn_tx_ring *, struct hn_txdesc *,
365 				    int);
366 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
367 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
368 				    struct hn_softc *, struct vmbus_channel *,
369 				    const void *, int);
370 static int			hn_txpkt_sglist(struct hn_tx_ring *,
371 				    struct hn_txdesc *);
372 static int			hn_txpkt_chim(struct hn_tx_ring *,
373 				    struct hn_txdesc *);
374 static int			hn_xmit(struct hn_tx_ring *, int);
375 static void			hn_xmit_taskfunc(void *, int);
376 static void			hn_xmit_txeof(struct hn_tx_ring *);
377 static void			hn_xmit_txeof_taskfunc(void *, int);
378 #ifdef HN_IFSTART_SUPPORT
379 static int			hn_start_locked(struct hn_tx_ring *, int);
380 static void			hn_start_taskfunc(void *, int);
381 static void			hn_start_txeof(struct hn_tx_ring *);
382 static void			hn_start_txeof_taskfunc(void *, int);
383 #endif
384 
385 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
386     "Hyper-V network interface");
387 
388 /* Trust tcp segments verification on host side. */
389 static int			hn_trust_hosttcp = 1;
390 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
391     &hn_trust_hosttcp, 0,
392     "Trust tcp segement verification on host side, "
393     "when csum info is missing (global setting)");
394 
395 /* Trust udp datagrams verification on host side. */
396 static int			hn_trust_hostudp = 1;
397 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
398     &hn_trust_hostudp, 0,
399     "Trust udp datagram verification on host side, "
400     "when csum info is missing (global setting)");
401 
402 /* Trust ip packets verification on host side. */
403 static int			hn_trust_hostip = 1;
404 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
405     &hn_trust_hostip, 0,
406     "Trust ip packet verification on host side, "
407     "when csum info is missing (global setting)");
408 
409 /* Limit TSO burst size */
410 static int			hn_tso_maxlen = IP_MAXPACKET;
411 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
412     &hn_tso_maxlen, 0, "TSO burst limit");
413 
414 /* Limit chimney send size */
415 static int			hn_tx_chimney_size = 0;
416 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
417     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
418 
419 /* Limit the size of packet for direct transmission */
420 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
421 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
422     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
423 
424 /* # of LRO entries per RX ring */
425 #if defined(INET) || defined(INET6)
426 #if __FreeBSD_version >= 1100095
427 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
428 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
429     &hn_lro_entry_count, 0, "LRO entry count");
430 #endif
431 #endif
432 
433 static int			hn_tx_taskq_cnt = 1;
434 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
435     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
436 
437 #define HN_TX_TASKQ_M_INDEP	0
438 #define HN_TX_TASKQ_M_GLOBAL	1
439 #define HN_TX_TASKQ_M_EVTTQ	2
440 
441 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
442 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
443     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
444     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
445 
446 #ifndef HN_USE_TXDESC_BUFRING
447 static int			hn_use_txdesc_bufring = 0;
448 #else
449 static int			hn_use_txdesc_bufring = 1;
450 #endif
451 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
452     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
453 
454 #ifdef HN_IFSTART_SUPPORT
455 /* Use ifnet.if_start instead of ifnet.if_transmit */
456 static int			hn_use_if_start = 0;
457 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
458     &hn_use_if_start, 0, "Use if_start TX method");
459 #endif
460 
461 /* # of channels to use */
462 static int			hn_chan_cnt = 0;
463 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
464     &hn_chan_cnt, 0,
465     "# of channels to use; each channel has one RX ring and one TX ring");
466 
467 /* # of transmit rings to use */
468 static int			hn_tx_ring_cnt = 0;
469 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
470     &hn_tx_ring_cnt, 0, "# of TX rings to use");
471 
472 /* Software TX ring depth */
473 static int			hn_tx_swq_depth = 0;
474 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
475     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
476 
477 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
478 #if __FreeBSD_version >= 1100095
479 static u_int			hn_lro_mbufq_depth = 0;
480 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
481     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
482 #endif
483 
484 /* Packet transmission aggregation size limit */
485 static int			hn_tx_agg_size = -1;
486 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
487     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
488 
489 /* Packet transmission aggregation count limit */
490 static int			hn_tx_agg_pkts = -1;
491 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
492     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
493 
494 static u_int			hn_cpu_index;	/* next CPU for channel */
495 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
496 
497 #ifndef RSS
498 static const uint8_t
499 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
500 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
501 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
502 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
503 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
504 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
505 };
506 #endif	/* !RSS */
507 
508 static device_method_t hn_methods[] = {
509 	/* Device interface */
510 	DEVMETHOD(device_probe,		hn_probe),
511 	DEVMETHOD(device_attach,	hn_attach),
512 	DEVMETHOD(device_detach,	hn_detach),
513 	DEVMETHOD(device_shutdown,	hn_shutdown),
514 	DEVMETHOD_END
515 };
516 
517 static driver_t hn_driver = {
518 	"hn",
519 	hn_methods,
520 	sizeof(struct hn_softc)
521 };
522 
523 static devclass_t hn_devclass;
524 
525 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
526 MODULE_VERSION(hn, 1);
527 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
528 
529 #if __FreeBSD_version >= 1100099
530 static void
531 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
532 {
533 	int i;
534 
535 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
536 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
537 }
538 #endif
539 
540 static int
541 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
542 {
543 
544 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
545 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
546 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
547 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
548 }
549 
550 static int
551 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
552 {
553 	struct hn_nvs_rndis rndis;
554 
555 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
556 	    txd->chim_size > 0, ("invalid rndis chim txd"));
557 
558 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
559 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
560 	rndis.nvs_chim_idx = txd->chim_index;
561 	rndis.nvs_chim_sz = txd->chim_size;
562 
563 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
564 	    &rndis, sizeof(rndis), &txd->send_ctx));
565 }
566 
567 static __inline uint32_t
568 hn_chim_alloc(struct hn_softc *sc)
569 {
570 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
571 	u_long *bmap = sc->hn_chim_bmap;
572 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
573 
574 	for (i = 0; i < bmap_cnt; ++i) {
575 		int idx;
576 
577 		idx = ffsl(~bmap[i]);
578 		if (idx == 0)
579 			continue;
580 
581 		--idx; /* ffsl is 1-based */
582 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
583 		    ("invalid i %d and idx %d", i, idx));
584 
585 		if (atomic_testandset_long(&bmap[i], idx))
586 			continue;
587 
588 		ret = i * LONG_BIT + idx;
589 		break;
590 	}
591 	return (ret);
592 }
593 
594 static __inline void
595 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
596 {
597 	u_long mask;
598 	uint32_t idx;
599 
600 	idx = chim_idx / LONG_BIT;
601 	KASSERT(idx < sc->hn_chim_bmap_cnt,
602 	    ("invalid chimney index 0x%x", chim_idx));
603 
604 	mask = 1UL << (chim_idx % LONG_BIT);
605 	KASSERT(sc->hn_chim_bmap[idx] & mask,
606 	    ("index bitmap 0x%lx, chimney index %u, "
607 	     "bitmap idx %d, bitmask 0x%lx",
608 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
609 
610 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
611 }
612 
613 #if defined(INET6) || defined(INET)
614 /*
615  * NOTE: If this function failed, the m_head would be freed.
616  */
617 static __inline struct mbuf *
618 hn_tso_fixup(struct mbuf *m_head)
619 {
620 	struct ether_vlan_header *evl;
621 	struct tcphdr *th;
622 	int ehlen;
623 
624 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
625 
626 #define PULLUP_HDR(m, len)				\
627 do {							\
628 	if (__predict_false((m)->m_len < (len))) {	\
629 		(m) = m_pullup((m), (len));		\
630 		if ((m) == NULL)			\
631 			return (NULL);			\
632 	}						\
633 } while (0)
634 
635 	PULLUP_HDR(m_head, sizeof(*evl));
636 	evl = mtod(m_head, struct ether_vlan_header *);
637 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
638 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
639 	else
640 		ehlen = ETHER_HDR_LEN;
641 
642 #ifdef INET
643 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
644 		struct ip *ip;
645 		int iphlen;
646 
647 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
648 		ip = mtodo(m_head, ehlen);
649 		iphlen = ip->ip_hl << 2;
650 
651 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
652 		th = mtodo(m_head, ehlen + iphlen);
653 
654 		ip->ip_len = 0;
655 		ip->ip_sum = 0;
656 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
657 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
658 	}
659 #endif
660 #if defined(INET6) && defined(INET)
661 	else
662 #endif
663 #ifdef INET6
664 	{
665 		struct ip6_hdr *ip6;
666 
667 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
668 		ip6 = mtodo(m_head, ehlen);
669 		if (ip6->ip6_nxt != IPPROTO_TCP) {
670 			m_freem(m_head);
671 			return (NULL);
672 		}
673 
674 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
675 		th = mtodo(m_head, ehlen + sizeof(*ip6));
676 
677 		ip6->ip6_plen = 0;
678 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
679 	}
680 #endif
681 	return (m_head);
682 
683 #undef PULLUP_HDR
684 }
685 #endif	/* INET6 || INET */
686 
687 static int
688 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
689 {
690 	int error = 0;
691 
692 	HN_LOCK_ASSERT(sc);
693 
694 	if (sc->hn_rx_filter != filter) {
695 		error = hn_rndis_set_rxfilter(sc, filter);
696 		if (!error)
697 			sc->hn_rx_filter = filter;
698 	}
699 	return (error);
700 }
701 
702 static int
703 hn_rxfilter_config(struct hn_softc *sc)
704 {
705 	struct ifnet *ifp = sc->hn_ifp;
706 	uint32_t filter;
707 
708 	HN_LOCK_ASSERT(sc);
709 
710 	if (ifp->if_flags & IFF_PROMISC) {
711 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
712 	} else {
713 		filter = NDIS_PACKET_TYPE_DIRECTED;
714 		if (ifp->if_flags & IFF_BROADCAST)
715 			filter |= NDIS_PACKET_TYPE_BROADCAST;
716 		/* TODO: support multicast list */
717 		if ((ifp->if_flags & IFF_ALLMULTI) ||
718 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
719 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
720 	}
721 	return (hn_set_rxfilter(sc, filter));
722 }
723 
724 static void
725 hn_set_txagg(struct hn_softc *sc)
726 {
727 	uint32_t size, pkts;
728 	int i;
729 
730 	/*
731 	 * Setup aggregation size.
732 	 */
733 	if (sc->hn_agg_size < 0)
734 		size = UINT32_MAX;
735 	else
736 		size = sc->hn_agg_size;
737 
738 	if (sc->hn_rndis_agg_size < size)
739 		size = sc->hn_rndis_agg_size;
740 
741 	/* NOTE: We only aggregate packets using chimney sending buffers. */
742 	if (size > (uint32_t)sc->hn_chim_szmax)
743 		size = sc->hn_chim_szmax;
744 
745 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
746 		/* Disable */
747 		size = 0;
748 		pkts = 0;
749 		goto done;
750 	}
751 
752 	/* NOTE: Type of the per TX ring setting is 'int'. */
753 	if (size > INT_MAX)
754 		size = INT_MAX;
755 
756 	/*
757 	 * Setup aggregation packet count.
758 	 */
759 	if (sc->hn_agg_pkts < 0)
760 		pkts = UINT32_MAX;
761 	else
762 		pkts = sc->hn_agg_pkts;
763 
764 	if (sc->hn_rndis_agg_pkts < pkts)
765 		pkts = sc->hn_rndis_agg_pkts;
766 
767 	if (pkts <= 1) {
768 		/* Disable */
769 		size = 0;
770 		pkts = 0;
771 		goto done;
772 	}
773 
774 	/* NOTE: Type of the per TX ring setting is 'short'. */
775 	if (pkts > SHRT_MAX)
776 		pkts = SHRT_MAX;
777 
778 done:
779 	/* NOTE: Type of the per TX ring setting is 'short'. */
780 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
781 		/* Disable */
782 		size = 0;
783 		pkts = 0;
784 	}
785 
786 	if (bootverbose) {
787 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
788 		    size, pkts, sc->hn_rndis_agg_align);
789 	}
790 
791 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
792 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
793 
794 		mtx_lock(&txr->hn_tx_lock);
795 		txr->hn_agg_szmax = size;
796 		txr->hn_agg_pktmax = pkts;
797 		txr->hn_agg_align = sc->hn_rndis_agg_align;
798 		mtx_unlock(&txr->hn_tx_lock);
799 	}
800 }
801 
802 static int
803 hn_get_txswq_depth(const struct hn_tx_ring *txr)
804 {
805 
806 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
807 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
808 		return txr->hn_txdesc_cnt;
809 	return hn_tx_swq_depth;
810 }
811 
812 #ifndef RSS
813 static int
814 hn_rss_reconfig(struct hn_softc *sc)
815 {
816 	int error;
817 
818 	HN_LOCK_ASSERT(sc);
819 
820 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
821 		return (ENXIO);
822 
823 	/*
824 	 * Disable RSS first.
825 	 *
826 	 * NOTE:
827 	 * Direct reconfiguration by setting the UNCHG flags does
828 	 * _not_ work properly.
829 	 */
830 	if (bootverbose)
831 		if_printf(sc->hn_ifp, "disable RSS\n");
832 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
833 	if (error) {
834 		if_printf(sc->hn_ifp, "RSS disable failed\n");
835 		return (error);
836 	}
837 
838 	/*
839 	 * Reenable the RSS w/ the updated RSS key or indirect
840 	 * table.
841 	 */
842 	if (bootverbose)
843 		if_printf(sc->hn_ifp, "reconfig RSS\n");
844 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
845 	if (error) {
846 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
847 		return (error);
848 	}
849 	return (0);
850 }
851 #endif	/* !RSS */
852 
853 static void
854 hn_rss_ind_fixup(struct hn_softc *sc)
855 {
856 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
857 	int i, nchan;
858 
859 	nchan = sc->hn_rx_ring_inuse;
860 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
861 
862 	/*
863 	 * Check indirect table to make sure that all channels in it
864 	 * can be used.
865 	 */
866 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
867 		if (rss->rss_ind[i] >= nchan) {
868 			if_printf(sc->hn_ifp,
869 			    "RSS indirect table %d fixup: %u -> %d\n",
870 			    i, rss->rss_ind[i], nchan - 1);
871 			rss->rss_ind[i] = nchan - 1;
872 		}
873 	}
874 }
875 
876 static int
877 hn_ifmedia_upd(struct ifnet *ifp __unused)
878 {
879 
880 	return EOPNOTSUPP;
881 }
882 
883 static void
884 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
885 {
886 	struct hn_softc *sc = ifp->if_softc;
887 
888 	ifmr->ifm_status = IFM_AVALID;
889 	ifmr->ifm_active = IFM_ETHER;
890 
891 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
892 		ifmr->ifm_active |= IFM_NONE;
893 		return;
894 	}
895 	ifmr->ifm_status |= IFM_ACTIVE;
896 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
897 }
898 
899 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
900 static const struct hyperv_guid g_net_vsc_device_type = {
901 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
902 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
903 };
904 
905 static int
906 hn_probe(device_t dev)
907 {
908 
909 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
910 	    &g_net_vsc_device_type) == 0) {
911 		device_set_desc(dev, "Hyper-V Network Interface");
912 		return BUS_PROBE_DEFAULT;
913 	}
914 	return ENXIO;
915 }
916 
917 static int
918 hn_attach(device_t dev)
919 {
920 	struct hn_softc *sc = device_get_softc(dev);
921 	struct sysctl_oid_list *child;
922 	struct sysctl_ctx_list *ctx;
923 	uint8_t eaddr[ETHER_ADDR_LEN];
924 	struct ifnet *ifp = NULL;
925 	int error, ring_cnt, tx_ring_cnt;
926 
927 	sc->hn_dev = dev;
928 	sc->hn_prichan = vmbus_get_channel(dev);
929 	HN_LOCK_INIT(sc);
930 
931 	/*
932 	 * Initialize these tunables once.
933 	 */
934 	sc->hn_agg_size = hn_tx_agg_size;
935 	sc->hn_agg_pkts = hn_tx_agg_pkts;
936 
937 	/*
938 	 * Setup taskqueue for transmission.
939 	 */
940 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
941 		int i;
942 
943 		sc->hn_tx_taskqs =
944 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
945 		    M_DEVBUF, M_WAITOK);
946 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
947 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
948 			    M_WAITOK, taskqueue_thread_enqueue,
949 			    &sc->hn_tx_taskqs[i]);
950 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
951 			    "%s tx%d", device_get_nameunit(dev), i);
952 		}
953 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
954 		sc->hn_tx_taskqs = hn_tx_taskque;
955 	}
956 
957 	/*
958 	 * Setup taskqueue for mangement tasks, e.g. link status.
959 	 */
960 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
961 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
962 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
963 	    device_get_nameunit(dev));
964 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
965 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
966 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
967 	    hn_netchg_status_taskfunc, sc);
968 
969 	/*
970 	 * Allocate ifnet and setup its name earlier, so that if_printf
971 	 * can be used by functions, which will be called after
972 	 * ether_ifattach().
973 	 */
974 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
975 	ifp->if_softc = sc;
976 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
977 
978 	/*
979 	 * Initialize ifmedia earlier so that it can be unconditionally
980 	 * destroyed, if error happened later on.
981 	 */
982 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
983 
984 	/*
985 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
986 	 * to use (tx_ring_cnt).
987 	 *
988 	 * NOTE:
989 	 * The # of RX rings to use is same as the # of channels to use.
990 	 */
991 	ring_cnt = hn_chan_cnt;
992 	if (ring_cnt <= 0) {
993 		/* Default */
994 		ring_cnt = mp_ncpus;
995 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
996 			ring_cnt = HN_RING_CNT_DEF_MAX;
997 	} else if (ring_cnt > mp_ncpus) {
998 		ring_cnt = mp_ncpus;
999 	}
1000 #ifdef RSS
1001 	if (ring_cnt > rss_getnumbuckets())
1002 		ring_cnt = rss_getnumbuckets();
1003 #endif
1004 
1005 	tx_ring_cnt = hn_tx_ring_cnt;
1006 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1007 		tx_ring_cnt = ring_cnt;
1008 #ifdef HN_IFSTART_SUPPORT
1009 	if (hn_use_if_start) {
1010 		/* ifnet.if_start only needs one TX ring. */
1011 		tx_ring_cnt = 1;
1012 	}
1013 #endif
1014 
1015 	/*
1016 	 * Set the leader CPU for channels.
1017 	 */
1018 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1019 
1020 	/*
1021 	 * Create enough TX/RX rings, even if only limited number of
1022 	 * channels can be allocated.
1023 	 */
1024 	error = hn_create_tx_data(sc, tx_ring_cnt);
1025 	if (error)
1026 		goto failed;
1027 	error = hn_create_rx_data(sc, ring_cnt);
1028 	if (error)
1029 		goto failed;
1030 
1031 	/*
1032 	 * Create transaction context for NVS and RNDIS transactions.
1033 	 */
1034 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1035 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1036 	if (sc->hn_xact == NULL) {
1037 		error = ENXIO;
1038 		goto failed;
1039 	}
1040 
1041 	/*
1042 	 * Install orphan handler for the revocation of this device's
1043 	 * primary channel.
1044 	 *
1045 	 * NOTE:
1046 	 * The processing order is critical here:
1047 	 * Install the orphan handler, _before_ testing whether this
1048 	 * device's primary channel has been revoked or not.
1049 	 */
1050 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1051 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1052 		error = ENXIO;
1053 		goto failed;
1054 	}
1055 
1056 	/*
1057 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1058 	 */
1059 	error = hn_synth_attach(sc, ETHERMTU);
1060 	if (error)
1061 		goto failed;
1062 
1063 	error = hn_rndis_get_eaddr(sc, eaddr);
1064 	if (error)
1065 		goto failed;
1066 
1067 #if __FreeBSD_version >= 1100099
1068 	if (sc->hn_rx_ring_inuse > 1) {
1069 		/*
1070 		 * Reduce TCP segment aggregation limit for multiple
1071 		 * RX rings to increase ACK timeliness.
1072 		 */
1073 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1074 	}
1075 #endif
1076 
1077 	/*
1078 	 * Fixup TX stuffs after synthetic parts are attached.
1079 	 */
1080 	hn_fixup_tx_data(sc);
1081 
1082 	ctx = device_get_sysctl_ctx(dev);
1083 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1084 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1085 	    &sc->hn_nvs_ver, 0, "NVS version");
1086 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1087 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1088 	    hn_ndis_version_sysctl, "A", "NDIS version");
1089 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1090 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1091 	    hn_caps_sysctl, "A", "capabilities");
1092 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1093 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1094 	    hn_hwassist_sysctl, "A", "hwassist");
1095 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1096 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1097 	    hn_rxfilter_sysctl, "A", "rxfilter");
1098 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1099 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1100 	    hn_rss_hash_sysctl, "A", "RSS hash");
1101 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1102 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1103 #ifndef RSS
1104 	/*
1105 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
1106 	 */
1107 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1108 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1109 	    hn_rss_key_sysctl, "IU", "RSS key");
1110 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1111 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1112 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1113 #endif
1114 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1115 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1116 	    "RNDIS offered packet transmission aggregation size limit");
1117 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1118 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1119 	    "RNDIS offered packet transmission aggregation count limit");
1120 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1121 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1122 	    "RNDIS packet transmission aggregation alignment");
1123 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1124 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1125 	    hn_txagg_size_sysctl, "I",
1126 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1127 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1128 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1129 	    hn_txagg_pkts_sysctl, "I",
1130 	    "Packet transmission aggregation packets, "
1131 	    "0 -- disable, -1 -- auto");
1132 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1133 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1134 	    hn_polling_sysctl, "I",
1135 	    "Polling frequency: [100,1000000], 0 disable polling");
1136 
1137 	/*
1138 	 * Setup the ifmedia, which has been initialized earlier.
1139 	 */
1140 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1141 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1142 	/* XXX ifmedia_set really should do this for us */
1143 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1144 
1145 	/*
1146 	 * Setup the ifnet for this interface.
1147 	 */
1148 
1149 	ifp->if_baudrate = IF_Gbps(10);
1150 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1151 	ifp->if_ioctl = hn_ioctl;
1152 	ifp->if_init = hn_init;
1153 #ifdef HN_IFSTART_SUPPORT
1154 	if (hn_use_if_start) {
1155 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1156 
1157 		ifp->if_start = hn_start;
1158 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1159 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1160 		IFQ_SET_READY(&ifp->if_snd);
1161 	} else
1162 #endif
1163 	{
1164 		ifp->if_transmit = hn_transmit;
1165 		ifp->if_qflush = hn_xmit_qflush;
1166 	}
1167 
1168 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1169 #ifdef foo
1170 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
1171 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1172 #endif
1173 	if (sc->hn_caps & HN_CAP_VLAN) {
1174 		/* XXX not sure about VLAN_MTU. */
1175 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1176 	}
1177 
1178 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1179 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1180 		ifp->if_capabilities |= IFCAP_TXCSUM;
1181 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1182 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1183 	if (sc->hn_caps & HN_CAP_TSO4) {
1184 		ifp->if_capabilities |= IFCAP_TSO4;
1185 		ifp->if_hwassist |= CSUM_IP_TSO;
1186 	}
1187 	if (sc->hn_caps & HN_CAP_TSO6) {
1188 		ifp->if_capabilities |= IFCAP_TSO6;
1189 		ifp->if_hwassist |= CSUM_IP6_TSO;
1190 	}
1191 
1192 	/* Enable all available capabilities by default. */
1193 	ifp->if_capenable = ifp->if_capabilities;
1194 
1195 	/*
1196 	 * Disable IPv6 TSO and TXCSUM by default, they still can
1197 	 * be enabled through SIOCSIFCAP.
1198 	 */
1199 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1200 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1201 
1202 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1203 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1204 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1205 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1206 	}
1207 
1208 	ether_ifattach(ifp, eaddr);
1209 
1210 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1211 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1212 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1213 	}
1214 
1215 	/* Inform the upper layer about the long frame support. */
1216 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1217 
1218 	/*
1219 	 * Kick off link status check.
1220 	 */
1221 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1222 	hn_update_link_status(sc);
1223 
1224 	return (0);
1225 failed:
1226 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1227 		hn_synth_detach(sc);
1228 	hn_detach(dev);
1229 	return (error);
1230 }
1231 
1232 static int
1233 hn_detach(device_t dev)
1234 {
1235 	struct hn_softc *sc = device_get_softc(dev);
1236 	struct ifnet *ifp = sc->hn_ifp;
1237 
1238 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1239 		/*
1240 		 * In case that the vmbus missed the orphan handler
1241 		 * installation.
1242 		 */
1243 		vmbus_xact_ctx_orphan(sc->hn_xact);
1244 	}
1245 
1246 	if (device_is_attached(dev)) {
1247 		HN_LOCK(sc);
1248 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1249 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1250 				hn_stop(sc);
1251 			/*
1252 			 * NOTE:
1253 			 * hn_stop() only suspends data, so managment
1254 			 * stuffs have to be suspended manually here.
1255 			 */
1256 			hn_suspend_mgmt(sc);
1257 			hn_synth_detach(sc);
1258 		}
1259 		HN_UNLOCK(sc);
1260 		ether_ifdetach(ifp);
1261 	}
1262 
1263 	ifmedia_removeall(&sc->hn_media);
1264 	hn_destroy_rx_data(sc);
1265 	hn_destroy_tx_data(sc);
1266 
1267 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1268 		int i;
1269 
1270 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1271 			taskqueue_free(sc->hn_tx_taskqs[i]);
1272 		free(sc->hn_tx_taskqs, M_DEVBUF);
1273 	}
1274 	taskqueue_free(sc->hn_mgmt_taskq0);
1275 
1276 	if (sc->hn_xact != NULL) {
1277 		/*
1278 		 * Uninstall the orphan handler _before_ the xact is
1279 		 * destructed.
1280 		 */
1281 		vmbus_chan_unset_orphan(sc->hn_prichan);
1282 		vmbus_xact_ctx_destroy(sc->hn_xact);
1283 	}
1284 
1285 	if_free(ifp);
1286 
1287 	HN_LOCK_DESTROY(sc);
1288 	return (0);
1289 }
1290 
1291 static int
1292 hn_shutdown(device_t dev)
1293 {
1294 
1295 	return (0);
1296 }
1297 
1298 static void
1299 hn_link_status(struct hn_softc *sc)
1300 {
1301 	uint32_t link_status;
1302 	int error;
1303 
1304 	error = hn_rndis_get_linkstatus(sc, &link_status);
1305 	if (error) {
1306 		/* XXX what to do? */
1307 		return;
1308 	}
1309 
1310 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1311 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1312 	else
1313 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1314 	if_link_state_change(sc->hn_ifp,
1315 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1316 	    LINK_STATE_UP : LINK_STATE_DOWN);
1317 }
1318 
1319 static void
1320 hn_link_taskfunc(void *xsc, int pending __unused)
1321 {
1322 	struct hn_softc *sc = xsc;
1323 
1324 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1325 		return;
1326 	hn_link_status(sc);
1327 }
1328 
1329 static void
1330 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1331 {
1332 	struct hn_softc *sc = xsc;
1333 
1334 	/* Prevent any link status checks from running. */
1335 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1336 
1337 	/*
1338 	 * Fake up a [link down --> link up] state change; 5 seconds
1339 	 * delay is used, which closely simulates miibus reaction
1340 	 * upon link down event.
1341 	 */
1342 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1343 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1344 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1345 	    &sc->hn_netchg_status, 5 * hz);
1346 }
1347 
1348 static void
1349 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1350 {
1351 	struct hn_softc *sc = xsc;
1352 
1353 	/* Re-allow link status checks. */
1354 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1355 	hn_link_status(sc);
1356 }
1357 
1358 static void
1359 hn_update_link_status(struct hn_softc *sc)
1360 {
1361 
1362 	if (sc->hn_mgmt_taskq != NULL)
1363 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1364 }
1365 
1366 static void
1367 hn_change_network(struct hn_softc *sc)
1368 {
1369 
1370 	if (sc->hn_mgmt_taskq != NULL)
1371 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1372 }
1373 
1374 static __inline int
1375 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1376     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1377 {
1378 	struct mbuf *m = *m_head;
1379 	int error;
1380 
1381 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1382 
1383 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1384 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1385 	if (error == EFBIG) {
1386 		struct mbuf *m_new;
1387 
1388 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1389 		if (m_new == NULL)
1390 			return ENOBUFS;
1391 		else
1392 			*m_head = m = m_new;
1393 		txr->hn_tx_collapsed++;
1394 
1395 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1396 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1397 	}
1398 	if (!error) {
1399 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1400 		    BUS_DMASYNC_PREWRITE);
1401 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1402 	}
1403 	return error;
1404 }
1405 
1406 static __inline int
1407 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1408 {
1409 
1410 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1411 	    ("put an onlist txd %#x", txd->flags));
1412 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1413 	    ("put an onagg txd %#x", txd->flags));
1414 
1415 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1416 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1417 		return 0;
1418 
1419 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1420 		struct hn_txdesc *tmp_txd;
1421 
1422 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1423 			int freed;
1424 
1425 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1426 			    ("resursive aggregation on aggregated txdesc"));
1427 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1428 			    ("not aggregated txdesc"));
1429 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1430 			    ("aggregated txdesc uses dmamap"));
1431 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1432 			    ("aggregated txdesc consumes "
1433 			     "chimney sending buffer"));
1434 			KASSERT(tmp_txd->chim_size == 0,
1435 			    ("aggregated txdesc has non-zero "
1436 			     "chimney sending size"));
1437 
1438 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1439 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1440 			freed = hn_txdesc_put(txr, tmp_txd);
1441 			KASSERT(freed, ("failed to free aggregated txdesc"));
1442 		}
1443 	}
1444 
1445 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1446 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1447 		    ("chim txd uses dmamap"));
1448 		hn_chim_free(txr->hn_sc, txd->chim_index);
1449 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1450 		txd->chim_size = 0;
1451 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1452 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1453 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1454 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1455 		    txd->data_dmap);
1456 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1457 	}
1458 
1459 	if (txd->m != NULL) {
1460 		m_freem(txd->m);
1461 		txd->m = NULL;
1462 	}
1463 
1464 	txd->flags |= HN_TXD_FLAG_ONLIST;
1465 #ifndef HN_USE_TXDESC_BUFRING
1466 	mtx_lock_spin(&txr->hn_txlist_spin);
1467 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1468 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1469 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1470 	txr->hn_txdesc_avail++;
1471 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1472 	mtx_unlock_spin(&txr->hn_txlist_spin);
1473 #else	/* HN_USE_TXDESC_BUFRING */
1474 #ifdef HN_DEBUG
1475 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1476 #endif
1477 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1478 #endif	/* !HN_USE_TXDESC_BUFRING */
1479 
1480 	return 1;
1481 }
1482 
1483 static __inline struct hn_txdesc *
1484 hn_txdesc_get(struct hn_tx_ring *txr)
1485 {
1486 	struct hn_txdesc *txd;
1487 
1488 #ifndef HN_USE_TXDESC_BUFRING
1489 	mtx_lock_spin(&txr->hn_txlist_spin);
1490 	txd = SLIST_FIRST(&txr->hn_txlist);
1491 	if (txd != NULL) {
1492 		KASSERT(txr->hn_txdesc_avail > 0,
1493 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1494 		txr->hn_txdesc_avail--;
1495 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1496 	}
1497 	mtx_unlock_spin(&txr->hn_txlist_spin);
1498 #else
1499 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1500 #endif
1501 
1502 	if (txd != NULL) {
1503 #ifdef HN_USE_TXDESC_BUFRING
1504 #ifdef HN_DEBUG
1505 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1506 #endif
1507 #endif	/* HN_USE_TXDESC_BUFRING */
1508 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1509 		    STAILQ_EMPTY(&txd->agg_list) &&
1510 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1511 		    txd->chim_size == 0 &&
1512 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1513 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1514 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1515 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1516 		txd->refs = 1;
1517 	}
1518 	return txd;
1519 }
1520 
1521 static __inline void
1522 hn_txdesc_hold(struct hn_txdesc *txd)
1523 {
1524 
1525 	/* 0->1 transition will never work */
1526 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1527 	atomic_add_int(&txd->refs, 1);
1528 }
1529 
1530 static __inline void
1531 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1532 {
1533 
1534 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1535 	    ("recursive aggregation on aggregating txdesc"));
1536 
1537 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1538 	    ("already aggregated"));
1539 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1540 	    ("recursive aggregation on to-be-aggregated txdesc"));
1541 
1542 	txd->flags |= HN_TXD_FLAG_ONAGG;
1543 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1544 }
1545 
1546 static bool
1547 hn_tx_ring_pending(struct hn_tx_ring *txr)
1548 {
1549 	bool pending = false;
1550 
1551 #ifndef HN_USE_TXDESC_BUFRING
1552 	mtx_lock_spin(&txr->hn_txlist_spin);
1553 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1554 		pending = true;
1555 	mtx_unlock_spin(&txr->hn_txlist_spin);
1556 #else
1557 	if (!buf_ring_full(txr->hn_txdesc_br))
1558 		pending = true;
1559 #endif
1560 	return (pending);
1561 }
1562 
1563 static __inline void
1564 hn_txeof(struct hn_tx_ring *txr)
1565 {
1566 	txr->hn_has_txeof = 0;
1567 	txr->hn_txeof(txr);
1568 }
1569 
1570 static void
1571 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1572     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1573 {
1574 	struct hn_txdesc *txd = sndc->hn_cbarg;
1575 	struct hn_tx_ring *txr;
1576 
1577 	txr = txd->txr;
1578 	KASSERT(txr->hn_chan == chan,
1579 	    ("channel mismatch, on chan%u, should be chan%u",
1580 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1581 
1582 	txr->hn_has_txeof = 1;
1583 	hn_txdesc_put(txr, txd);
1584 
1585 	++txr->hn_txdone_cnt;
1586 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1587 		txr->hn_txdone_cnt = 0;
1588 		if (txr->hn_oactive)
1589 			hn_txeof(txr);
1590 	}
1591 }
1592 
1593 static void
1594 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1595 {
1596 #if defined(INET) || defined(INET6)
1597 	tcp_lro_flush_all(&rxr->hn_lro);
1598 #endif
1599 
1600 	/*
1601 	 * NOTE:
1602 	 * 'txr' could be NULL, if multiple channels and
1603 	 * ifnet.if_start method are enabled.
1604 	 */
1605 	if (txr == NULL || !txr->hn_has_txeof)
1606 		return;
1607 
1608 	txr->hn_txdone_cnt = 0;
1609 	hn_txeof(txr);
1610 }
1611 
1612 static __inline uint32_t
1613 hn_rndis_pktmsg_offset(uint32_t ofs)
1614 {
1615 
1616 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1617 	    ("invalid RNDIS packet msg offset %u", ofs));
1618 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1619 }
1620 
1621 static __inline void *
1622 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1623     size_t pi_dlen, uint32_t pi_type)
1624 {
1625 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1626 	struct rndis_pktinfo *pi;
1627 
1628 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1629 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1630 
1631 	/*
1632 	 * Per-packet-info does not move; it only grows.
1633 	 *
1634 	 * NOTE:
1635 	 * rm_pktinfooffset in this phase counts from the beginning
1636 	 * of rndis_packet_msg.
1637 	 */
1638 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1639 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1640 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1641 	    pkt->rm_pktinfolen);
1642 	pkt->rm_pktinfolen += pi_size;
1643 
1644 	pi->rm_size = pi_size;
1645 	pi->rm_type = pi_type;
1646 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1647 
1648 	/* Data immediately follow per-packet-info. */
1649 	pkt->rm_dataoffset += pi_size;
1650 
1651 	/* Update RNDIS packet msg length */
1652 	pkt->rm_len += pi_size;
1653 
1654 	return (pi->rm_data);
1655 }
1656 
1657 static __inline int
1658 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1659 {
1660 	struct hn_txdesc *txd;
1661 	struct mbuf *m;
1662 	int error, pkts;
1663 
1664 	txd = txr->hn_agg_txd;
1665 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1666 
1667 	/*
1668 	 * Since hn_txpkt() will reset this temporary stat, save
1669 	 * it now, so that oerrors can be updated properly, if
1670 	 * hn_txpkt() ever fails.
1671 	 */
1672 	pkts = txr->hn_stat_pkts;
1673 
1674 	/*
1675 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1676 	 * failure, save it for later freeing, if hn_txpkt() ever
1677 	 * fails.
1678 	 */
1679 	m = txd->m;
1680 	error = hn_txpkt(ifp, txr, txd);
1681 	if (__predict_false(error)) {
1682 		/* txd is freed, but m is not. */
1683 		m_freem(m);
1684 
1685 		txr->hn_flush_failed++;
1686 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1687 	}
1688 
1689 	/* Reset all aggregation states. */
1690 	txr->hn_agg_txd = NULL;
1691 	txr->hn_agg_szleft = 0;
1692 	txr->hn_agg_pktleft = 0;
1693 	txr->hn_agg_prevpkt = NULL;
1694 
1695 	return (error);
1696 }
1697 
1698 static void *
1699 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1700     int pktsize)
1701 {
1702 	void *chim;
1703 
1704 	if (txr->hn_agg_txd != NULL) {
1705 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1706 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1707 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1708 			int olen;
1709 
1710 			/*
1711 			 * Update the previous RNDIS packet's total length,
1712 			 * it can be increased due to the mandatory alignment
1713 			 * padding for this RNDIS packet.  And update the
1714 			 * aggregating txdesc's chimney sending buffer size
1715 			 * accordingly.
1716 			 *
1717 			 * XXX
1718 			 * Zero-out the padding, as required by the RNDIS spec.
1719 			 */
1720 			olen = pkt->rm_len;
1721 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1722 			agg_txd->chim_size += pkt->rm_len - olen;
1723 
1724 			/* Link this txdesc to the parent. */
1725 			hn_txdesc_agg(agg_txd, txd);
1726 
1727 			chim = (uint8_t *)pkt + pkt->rm_len;
1728 			/* Save the current packet for later fixup. */
1729 			txr->hn_agg_prevpkt = chim;
1730 
1731 			txr->hn_agg_pktleft--;
1732 			txr->hn_agg_szleft -= pktsize;
1733 			if (txr->hn_agg_szleft <=
1734 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1735 				/*
1736 				 * Probably can't aggregate more packets,
1737 				 * flush this aggregating txdesc proactively.
1738 				 */
1739 				txr->hn_agg_pktleft = 0;
1740 			}
1741 			/* Done! */
1742 			return (chim);
1743 		}
1744 		hn_flush_txagg(ifp, txr);
1745 	}
1746 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1747 
1748 	txr->hn_tx_chimney_tried++;
1749 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1750 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1751 		return (NULL);
1752 	txr->hn_tx_chimney++;
1753 
1754 	chim = txr->hn_sc->hn_chim +
1755 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1756 
1757 	if (txr->hn_agg_pktmax > 1 &&
1758 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1759 		txr->hn_agg_txd = txd;
1760 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1761 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1762 		txr->hn_agg_prevpkt = chim;
1763 	}
1764 	return (chim);
1765 }
1766 
1767 /*
1768  * NOTE:
1769  * If this function fails, then both txd and m_head0 will be freed.
1770  */
1771 static int
1772 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1773     struct mbuf **m_head0)
1774 {
1775 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1776 	int error, nsegs, i;
1777 	struct mbuf *m_head = *m_head0;
1778 	struct rndis_packet_msg *pkt;
1779 	uint32_t *pi_data;
1780 	void *chim = NULL;
1781 	int pkt_hlen, pkt_size;
1782 
1783 	pkt = txd->rndis_pkt;
1784 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1785 	if (pkt_size < txr->hn_chim_size) {
1786 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1787 		if (chim != NULL)
1788 			pkt = chim;
1789 	} else {
1790 		if (txr->hn_agg_txd != NULL)
1791 			hn_flush_txagg(ifp, txr);
1792 	}
1793 
1794 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1795 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1796 	pkt->rm_dataoffset = sizeof(*pkt);
1797 	pkt->rm_datalen = m_head->m_pkthdr.len;
1798 	pkt->rm_oobdataoffset = 0;
1799 	pkt->rm_oobdatalen = 0;
1800 	pkt->rm_oobdataelements = 0;
1801 	pkt->rm_pktinfooffset = sizeof(*pkt);
1802 	pkt->rm_pktinfolen = 0;
1803 	pkt->rm_vchandle = 0;
1804 	pkt->rm_reserved = 0;
1805 
1806 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1807 		/*
1808 		 * Set the hash value for this packet, so that the host could
1809 		 * dispatch the TX done event for this packet back to this TX
1810 		 * ring's channel.
1811 		 */
1812 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1813 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1814 		*pi_data = txr->hn_tx_idx;
1815 	}
1816 
1817 	if (m_head->m_flags & M_VLANTAG) {
1818 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1819 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1820 		*pi_data = NDIS_VLAN_INFO_MAKE(
1821 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1822 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1823 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1824 	}
1825 
1826 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1827 #if defined(INET6) || defined(INET)
1828 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1829 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1830 #ifdef INET
1831 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1832 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1833 			    m_head->m_pkthdr.tso_segsz);
1834 		}
1835 #endif
1836 #if defined(INET6) && defined(INET)
1837 		else
1838 #endif
1839 #ifdef INET6
1840 		{
1841 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1842 			    m_head->m_pkthdr.tso_segsz);
1843 		}
1844 #endif
1845 #endif	/* INET6 || INET */
1846 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1847 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1848 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1849 		if (m_head->m_pkthdr.csum_flags &
1850 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1851 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1852 		} else {
1853 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1854 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1855 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1856 		}
1857 
1858 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1859 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1860 		else if (m_head->m_pkthdr.csum_flags &
1861 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1862 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1863 	}
1864 
1865 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1866 	/* Convert RNDIS packet message offsets */
1867 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1868 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1869 
1870 	/*
1871 	 * Fast path: Chimney sending.
1872 	 */
1873 	if (chim != NULL) {
1874 		struct hn_txdesc *tgt_txd = txd;
1875 
1876 		if (txr->hn_agg_txd != NULL) {
1877 			tgt_txd = txr->hn_agg_txd;
1878 #ifdef INVARIANTS
1879 			*m_head0 = NULL;
1880 #endif
1881 		}
1882 
1883 		KASSERT(pkt == chim,
1884 		    ("RNDIS pkt not in chimney sending buffer"));
1885 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1886 		    ("chimney sending buffer is not used"));
1887 		tgt_txd->chim_size += pkt->rm_len;
1888 
1889 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
1890 		    ((uint8_t *)chim) + pkt_hlen);
1891 
1892 		txr->hn_gpa_cnt = 0;
1893 		txr->hn_sendpkt = hn_txpkt_chim;
1894 		goto done;
1895 	}
1896 
1897 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1898 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1899 	    ("chimney buffer is used"));
1900 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1901 
1902 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1903 	if (__predict_false(error)) {
1904 		int freed;
1905 
1906 		/*
1907 		 * This mbuf is not linked w/ the txd yet, so free it now.
1908 		 */
1909 		m_freem(m_head);
1910 		*m_head0 = NULL;
1911 
1912 		freed = hn_txdesc_put(txr, txd);
1913 		KASSERT(freed != 0,
1914 		    ("fail to free txd upon txdma error"));
1915 
1916 		txr->hn_txdma_failed++;
1917 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1918 		return error;
1919 	}
1920 	*m_head0 = m_head;
1921 
1922 	/* +1 RNDIS packet message */
1923 	txr->hn_gpa_cnt = nsegs + 1;
1924 
1925 	/* send packet with page buffer */
1926 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1927 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1928 	txr->hn_gpa[0].gpa_len = pkt_hlen;
1929 
1930 	/*
1931 	 * Fill the page buffers with mbuf info after the page
1932 	 * buffer for RNDIS packet message.
1933 	 */
1934 	for (i = 0; i < nsegs; ++i) {
1935 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1936 
1937 		gpa->gpa_page = atop(segs[i].ds_addr);
1938 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1939 		gpa->gpa_len = segs[i].ds_len;
1940 	}
1941 
1942 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1943 	txd->chim_size = 0;
1944 	txr->hn_sendpkt = hn_txpkt_sglist;
1945 done:
1946 	txd->m = m_head;
1947 
1948 	/* Set the completion routine */
1949 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1950 
1951 	/* Update temporary stats for later use. */
1952 	txr->hn_stat_pkts++;
1953 	txr->hn_stat_size += m_head->m_pkthdr.len;
1954 	if (m_head->m_flags & M_MCAST)
1955 		txr->hn_stat_mcasts++;
1956 
1957 	return 0;
1958 }
1959 
1960 /*
1961  * NOTE:
1962  * If this function fails, then txd will be freed, but the mbuf
1963  * associated w/ the txd will _not_ be freed.
1964  */
1965 static int
1966 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1967 {
1968 	int error, send_failed = 0, has_bpf;
1969 
1970 again:
1971 	has_bpf = bpf_peers_present(ifp->if_bpf);
1972 	if (has_bpf) {
1973 		/*
1974 		 * Make sure that this txd and any aggregated txds are not
1975 		 * freed before ETHER_BPF_MTAP.
1976 		 */
1977 		hn_txdesc_hold(txd);
1978 	}
1979 	error = txr->hn_sendpkt(txr, txd);
1980 	if (!error) {
1981 		if (has_bpf) {
1982 			const struct hn_txdesc *tmp_txd;
1983 
1984 			ETHER_BPF_MTAP(ifp, txd->m);
1985 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1986 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
1987 		}
1988 
1989 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1990 #ifdef HN_IFSTART_SUPPORT
1991 		if (!hn_use_if_start)
1992 #endif
1993 		{
1994 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1995 			    txr->hn_stat_size);
1996 			if (txr->hn_stat_mcasts != 0) {
1997 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1998 				    txr->hn_stat_mcasts);
1999 			}
2000 		}
2001 		txr->hn_pkts += txr->hn_stat_pkts;
2002 		txr->hn_sends++;
2003 	}
2004 	if (has_bpf)
2005 		hn_txdesc_put(txr, txd);
2006 
2007 	if (__predict_false(error)) {
2008 		int freed;
2009 
2010 		/*
2011 		 * This should "really rarely" happen.
2012 		 *
2013 		 * XXX Too many RX to be acked or too many sideband
2014 		 * commands to run?  Ask netvsc_channel_rollup()
2015 		 * to kick start later.
2016 		 */
2017 		txr->hn_has_txeof = 1;
2018 		if (!send_failed) {
2019 			txr->hn_send_failed++;
2020 			send_failed = 1;
2021 			/*
2022 			 * Try sending again after set hn_has_txeof;
2023 			 * in case that we missed the last
2024 			 * netvsc_channel_rollup().
2025 			 */
2026 			goto again;
2027 		}
2028 		if_printf(ifp, "send failed\n");
2029 
2030 		/*
2031 		 * Caller will perform further processing on the
2032 		 * associated mbuf, so don't free it in hn_txdesc_put();
2033 		 * only unload it from the DMA map in hn_txdesc_put(),
2034 		 * if it was loaded.
2035 		 */
2036 		txd->m = NULL;
2037 		freed = hn_txdesc_put(txr, txd);
2038 		KASSERT(freed != 0,
2039 		    ("fail to free txd upon send error"));
2040 
2041 		txr->hn_send_failed++;
2042 	}
2043 
2044 	/* Reset temporary stats, after this sending is done. */
2045 	txr->hn_stat_size = 0;
2046 	txr->hn_stat_pkts = 0;
2047 	txr->hn_stat_mcasts = 0;
2048 
2049 	return (error);
2050 }
2051 
2052 /*
2053  * Append the specified data to the indicated mbuf chain,
2054  * Extend the mbuf chain if the new data does not fit in
2055  * existing space.
2056  *
2057  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2058  * There should be an equivalent in the kernel mbuf code,
2059  * but there does not appear to be one yet.
2060  *
2061  * Differs from m_append() in that additional mbufs are
2062  * allocated with cluster size MJUMPAGESIZE, and filled
2063  * accordingly.
2064  *
2065  * Return 1 if able to complete the job; otherwise 0.
2066  */
2067 static int
2068 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2069 {
2070 	struct mbuf *m, *n;
2071 	int remainder, space;
2072 
2073 	for (m = m0; m->m_next != NULL; m = m->m_next)
2074 		;
2075 	remainder = len;
2076 	space = M_TRAILINGSPACE(m);
2077 	if (space > 0) {
2078 		/*
2079 		 * Copy into available space.
2080 		 */
2081 		if (space > remainder)
2082 			space = remainder;
2083 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2084 		m->m_len += space;
2085 		cp += space;
2086 		remainder -= space;
2087 	}
2088 	while (remainder > 0) {
2089 		/*
2090 		 * Allocate a new mbuf; could check space
2091 		 * and allocate a cluster instead.
2092 		 */
2093 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2094 		if (n == NULL)
2095 			break;
2096 		n->m_len = min(MJUMPAGESIZE, remainder);
2097 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2098 		cp += n->m_len;
2099 		remainder -= n->m_len;
2100 		m->m_next = n;
2101 		m = n;
2102 	}
2103 	if (m0->m_flags & M_PKTHDR)
2104 		m0->m_pkthdr.len += len - remainder;
2105 
2106 	return (remainder == 0);
2107 }
2108 
2109 #if defined(INET) || defined(INET6)
/*
 * Pass a received mbuf to LRO.
 *
 * On FreeBSD versions >= 1100095, a non-zero hn_lro_mbufq_depth makes
 * the mbuf go through tcp_lro_queue_mbuf() and 0 is returned.  In every
 * other case the result of tcp_lro_rx() is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth != 0) {
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	return (tcp_lro_rx(lc, m, 0));
}
2121 #endif
2122 
2123 static int
2124 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2125     const struct hn_rxinfo *info)
2126 {
2127 	struct ifnet *ifp = rxr->hn_ifp;
2128 	struct mbuf *m_new;
2129 	int size, do_lro = 0, do_csum = 1;
2130 	int hash_type;
2131 
2132 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2133 		return (0);
2134 
2135 	/*
2136 	 * Bail out if packet contains more data than configured MTU.
2137 	 */
2138 	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2139 		return (0);
2140 	} else if (dlen <= MHLEN) {
2141 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2142 		if (m_new == NULL) {
2143 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2144 			return (0);
2145 		}
2146 		memcpy(mtod(m_new, void *), data, dlen);
2147 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2148 		rxr->hn_small_pkts++;
2149 	} else {
2150 		/*
2151 		 * Get an mbuf with a cluster.  For packets 2K or less,
2152 		 * get a standard 2K cluster.  For anything larger, get a
2153 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2154 		 * if looped around to the Hyper-V TX channel, so avoid them.
2155 		 */
2156 		size = MCLBYTES;
2157 		if (dlen > MCLBYTES) {
2158 			/* 4096 */
2159 			size = MJUMPAGESIZE;
2160 		}
2161 
2162 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2163 		if (m_new == NULL) {
2164 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2165 			return (0);
2166 		}
2167 
2168 		hv_m_append(m_new, dlen, data);
2169 	}
2170 	m_new->m_pkthdr.rcvif = ifp;
2171 
2172 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2173 		do_csum = 0;
2174 
2175 	/* receive side checksum offload */
2176 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2177 		/* IP csum offload */
2178 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2179 			m_new->m_pkthdr.csum_flags |=
2180 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2181 			rxr->hn_csum_ip++;
2182 		}
2183 
2184 		/* TCP/UDP csum offload */
2185 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2186 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2187 			m_new->m_pkthdr.csum_flags |=
2188 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2189 			m_new->m_pkthdr.csum_data = 0xffff;
2190 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2191 				rxr->hn_csum_tcp++;
2192 			else
2193 				rxr->hn_csum_udp++;
2194 		}
2195 
2196 		/*
2197 		 * XXX
2198 		 * As of this write (Oct 28th, 2016), host side will turn
2199 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2200 		 * the do_lro setting here is actually _not_ accurate.  We
2201 		 * depend on the RSS hash type check to reset do_lro.
2202 		 */
2203 		if ((info->csum_info &
2204 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2205 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2206 			do_lro = 1;
2207 	} else {
2208 		const struct ether_header *eh;
2209 		uint16_t etype;
2210 		int hoff;
2211 
2212 		hoff = sizeof(*eh);
2213 		if (m_new->m_len < hoff)
2214 			goto skip;
2215 		eh = mtod(m_new, struct ether_header *);
2216 		etype = ntohs(eh->ether_type);
2217 		if (etype == ETHERTYPE_VLAN) {
2218 			const struct ether_vlan_header *evl;
2219 
2220 			hoff = sizeof(*evl);
2221 			if (m_new->m_len < hoff)
2222 				goto skip;
2223 			evl = mtod(m_new, struct ether_vlan_header *);
2224 			etype = ntohs(evl->evl_proto);
2225 		}
2226 
2227 		if (etype == ETHERTYPE_IP) {
2228 			int pr;
2229 
2230 			pr = hn_check_iplen(m_new, hoff);
2231 			if (pr == IPPROTO_TCP) {
2232 				if (do_csum &&
2233 				    (rxr->hn_trust_hcsum &
2234 				     HN_TRUST_HCSUM_TCP)) {
2235 					rxr->hn_csum_trusted++;
2236 					m_new->m_pkthdr.csum_flags |=
2237 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2238 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2239 					m_new->m_pkthdr.csum_data = 0xffff;
2240 				}
2241 				do_lro = 1;
2242 			} else if (pr == IPPROTO_UDP) {
2243 				if (do_csum &&
2244 				    (rxr->hn_trust_hcsum &
2245 				     HN_TRUST_HCSUM_UDP)) {
2246 					rxr->hn_csum_trusted++;
2247 					m_new->m_pkthdr.csum_flags |=
2248 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2249 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2250 					m_new->m_pkthdr.csum_data = 0xffff;
2251 				}
2252 			} else if (pr != IPPROTO_DONE && do_csum &&
2253 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2254 				rxr->hn_csum_trusted++;
2255 				m_new->m_pkthdr.csum_flags |=
2256 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2257 			}
2258 		}
2259 	}
2260 skip:
2261 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2262 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2263 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2264 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2265 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2266 		m_new->m_flags |= M_VLANTAG;
2267 	}
2268 
2269 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2270 		rxr->hn_rss_pkts++;
2271 		m_new->m_pkthdr.flowid = info->hash_value;
2272 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2273 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2274 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2275 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2276 
2277 			/*
2278 			 * NOTE:
2279 			 * do_lro is resetted, if the hash types are not TCP
2280 			 * related.  See the comment in the above csum_flags
2281 			 * setup section.
2282 			 */
2283 			switch (type) {
2284 			case NDIS_HASH_IPV4:
2285 				hash_type = M_HASHTYPE_RSS_IPV4;
2286 				do_lro = 0;
2287 				break;
2288 
2289 			case NDIS_HASH_TCP_IPV4:
2290 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2291 				break;
2292 
2293 			case NDIS_HASH_IPV6:
2294 				hash_type = M_HASHTYPE_RSS_IPV6;
2295 				do_lro = 0;
2296 				break;
2297 
2298 			case NDIS_HASH_IPV6_EX:
2299 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2300 				do_lro = 0;
2301 				break;
2302 
2303 			case NDIS_HASH_TCP_IPV6:
2304 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2305 				break;
2306 
2307 			case NDIS_HASH_TCP_IPV6_EX:
2308 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2309 				break;
2310 			}
2311 		}
2312 	} else {
2313 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2314 		hash_type = M_HASHTYPE_OPAQUE;
2315 	}
2316 	M_HASHTYPE_SET(m_new, hash_type);
2317 
2318 	/*
2319 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2320 	 * messages (not just data messages) will trigger a response.
2321 	 */
2322 
2323 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2324 	rxr->hn_pkts++;
2325 
2326 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2327 #if defined(INET) || defined(INET6)
2328 		struct lro_ctrl *lro = &rxr->hn_lro;
2329 
2330 		if (lro->lro_cnt) {
2331 			rxr->hn_lro_tried++;
2332 			if (hn_lro_rx(lro, m_new) == 0) {
2333 				/* DONE! */
2334 				return 0;
2335 			}
2336 		}
2337 #endif
2338 	}
2339 
2340 	/* We're not holding the lock here, so don't release it */
2341 	(*ifp->if_input)(ifp, m_new);
2342 
2343 	return (0);
2344 }
2345 
2346 static int
2347 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2348 {
2349 	struct hn_softc *sc = ifp->if_softc;
2350 	struct ifreq *ifr = (struct ifreq *)data;
2351 	int mask, error = 0;
2352 
2353 	switch (cmd) {
2354 	case SIOCSIFMTU:
2355 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2356 			error = EINVAL;
2357 			break;
2358 		}
2359 
2360 		HN_LOCK(sc);
2361 
2362 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2363 			HN_UNLOCK(sc);
2364 			break;
2365 		}
2366 
2367 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2368 			/* Can't change MTU */
2369 			HN_UNLOCK(sc);
2370 			error = EOPNOTSUPP;
2371 			break;
2372 		}
2373 
2374 		if (ifp->if_mtu == ifr->ifr_mtu) {
2375 			HN_UNLOCK(sc);
2376 			break;
2377 		}
2378 
2379 		/*
2380 		 * Suspend this interface before the synthetic parts
2381 		 * are ripped.
2382 		 */
2383 		hn_suspend(sc);
2384 
2385 		/*
2386 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
2387 		 */
2388 		hn_synth_detach(sc);
2389 
2390 		/*
2391 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2392 		 * with the new MTU setting.
2393 		 */
2394 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2395 		if (error) {
2396 			HN_UNLOCK(sc);
2397 			break;
2398 		}
2399 
2400 		/*
2401 		 * Commit the requested MTU, after the synthetic parts
2402 		 * have been successfully attached.
2403 		 */
2404 		ifp->if_mtu = ifr->ifr_mtu;
2405 
2406 		/*
2407 		 * Make sure that various parameters based on MTU are
2408 		 * still valid, after the MTU change.
2409 		 */
2410 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2411 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2412 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2413 #if __FreeBSD_version >= 1100099
2414 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2415 		    HN_LRO_LENLIM_MIN(ifp))
2416 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2417 #endif
2418 
2419 		/*
2420 		 * All done!  Resume the interface now.
2421 		 */
2422 		hn_resume(sc);
2423 
2424 		HN_UNLOCK(sc);
2425 		break;
2426 
2427 	case SIOCSIFFLAGS:
2428 		HN_LOCK(sc);
2429 
2430 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2431 			HN_UNLOCK(sc);
2432 			break;
2433 		}
2434 
2435 		if (ifp->if_flags & IFF_UP) {
2436 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2437 				/*
2438 				 * Caller meight hold mutex, e.g.
2439 				 * bpf; use busy-wait for the RNDIS
2440 				 * reply.
2441 				 */
2442 				HN_NO_SLEEPING(sc);
2443 				hn_rxfilter_config(sc);
2444 				HN_SLEEPING_OK(sc);
2445 			} else {
2446 				hn_init_locked(sc);
2447 			}
2448 		} else {
2449 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2450 				hn_stop(sc);
2451 		}
2452 		sc->hn_if_flags = ifp->if_flags;
2453 
2454 		HN_UNLOCK(sc);
2455 		break;
2456 
2457 	case SIOCSIFCAP:
2458 		HN_LOCK(sc);
2459 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2460 
2461 		if (mask & IFCAP_TXCSUM) {
2462 			ifp->if_capenable ^= IFCAP_TXCSUM;
2463 			if (ifp->if_capenable & IFCAP_TXCSUM)
2464 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2465 			else
2466 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2467 		}
2468 		if (mask & IFCAP_TXCSUM_IPV6) {
2469 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2470 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2471 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2472 			else
2473 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2474 		}
2475 
2476 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2477 		if (mask & IFCAP_RXCSUM)
2478 			ifp->if_capenable ^= IFCAP_RXCSUM;
2479 #ifdef foo
2480 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2481 		if (mask & IFCAP_RXCSUM_IPV6)
2482 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2483 #endif
2484 
2485 		if (mask & IFCAP_LRO)
2486 			ifp->if_capenable ^= IFCAP_LRO;
2487 
2488 		if (mask & IFCAP_TSO4) {
2489 			ifp->if_capenable ^= IFCAP_TSO4;
2490 			if (ifp->if_capenable & IFCAP_TSO4)
2491 				ifp->if_hwassist |= CSUM_IP_TSO;
2492 			else
2493 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2494 		}
2495 		if (mask & IFCAP_TSO6) {
2496 			ifp->if_capenable ^= IFCAP_TSO6;
2497 			if (ifp->if_capenable & IFCAP_TSO6)
2498 				ifp->if_hwassist |= CSUM_IP6_TSO;
2499 			else
2500 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2501 		}
2502 
2503 		HN_UNLOCK(sc);
2504 		break;
2505 
2506 	case SIOCADDMULTI:
2507 	case SIOCDELMULTI:
2508 		HN_LOCK(sc);
2509 
2510 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2511 			HN_UNLOCK(sc);
2512 			break;
2513 		}
2514 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2515 			/*
2516 			 * Multicast uses mutex; use busy-wait for
2517 			 * the RNDIS reply.
2518 			 */
2519 			HN_NO_SLEEPING(sc);
2520 			hn_rxfilter_config(sc);
2521 			HN_SLEEPING_OK(sc);
2522 		}
2523 
2524 		HN_UNLOCK(sc);
2525 		break;
2526 
2527 	case SIOCSIFMEDIA:
2528 	case SIOCGIFMEDIA:
2529 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2530 		break;
2531 
2532 	default:
2533 		error = ether_ioctl(ifp, cmd, data);
2534 		break;
2535 	}
2536 	return (error);
2537 }
2538 
2539 static void
2540 hn_stop(struct hn_softc *sc)
2541 {
2542 	struct ifnet *ifp = sc->hn_ifp;
2543 	int i;
2544 
2545 	HN_LOCK_ASSERT(sc);
2546 
2547 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2548 	    ("synthetic parts were not attached"));
2549 
2550 	/* Disable polling. */
2551 	hn_polling(sc, 0);
2552 
2553 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2554 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2555 	hn_suspend_data(sc);
2556 
2557 	/* Clear OACTIVE bit. */
2558 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2559 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2560 		sc->hn_tx_ring[i].hn_oactive = 0;
2561 }
2562 
2563 static void
2564 hn_init_locked(struct hn_softc *sc)
2565 {
2566 	struct ifnet *ifp = sc->hn_ifp;
2567 	int i;
2568 
2569 	HN_LOCK_ASSERT(sc);
2570 
2571 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2572 		return;
2573 
2574 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2575 		return;
2576 
2577 	/* Configure RX filter */
2578 	hn_rxfilter_config(sc);
2579 
2580 	/* Clear OACTIVE bit. */
2581 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2582 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2583 		sc->hn_tx_ring[i].hn_oactive = 0;
2584 
2585 	/* Clear TX 'suspended' bit. */
2586 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2587 
2588 	/* Everything is ready; unleash! */
2589 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2590 
2591 	/* Re-enable polling if requested. */
2592 	if (sc->hn_pollhz > 0)
2593 		hn_polling(sc, sc->hn_pollhz);
2594 }
2595 
/*
 * Locking wrapper around hn_init_locked(); 'xsc' is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *softc;

	softc = xsc;

	HN_LOCK(softc);
	hn_init_locked(softc);
	HN_UNLOCK(softc);
}
2605 
2606 #if __FreeBSD_version >= 1100099
2607 
2608 static int
2609 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2610 {
2611 	struct hn_softc *sc = arg1;
2612 	unsigned int lenlim;
2613 	int error;
2614 
2615 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2616 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2617 	if (error || req->newptr == NULL)
2618 		return error;
2619 
2620 	HN_LOCK(sc);
2621 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2622 	    lenlim > TCP_LRO_LENGTH_MAX) {
2623 		HN_UNLOCK(sc);
2624 		return EINVAL;
2625 	}
2626 	hn_set_lro_lenlim(sc, lenlim);
2627 	HN_UNLOCK(sc);
2628 
2629 	return 0;
2630 }
2631 
2632 static int
2633 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2634 {
2635 	struct hn_softc *sc = arg1;
2636 	int ackcnt, error, i;
2637 
2638 	/*
2639 	 * lro_ackcnt_lim is append count limit,
2640 	 * +1 to turn it into aggregation limit.
2641 	 */
2642 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2643 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2644 	if (error || req->newptr == NULL)
2645 		return error;
2646 
2647 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2648 		return EINVAL;
2649 
2650 	/*
2651 	 * Convert aggregation limit back to append
2652 	 * count limit.
2653 	 */
2654 	--ackcnt;
2655 	HN_LOCK(sc);
2656 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2657 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2658 	HN_UNLOCK(sc);
2659 	return 0;
2660 }
2661 
2662 #endif
2663 
2664 static int
2665 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2666 {
2667 	struct hn_softc *sc = arg1;
2668 	int hcsum = arg2;
2669 	int on, error, i;
2670 
2671 	on = 0;
2672 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2673 		on = 1;
2674 
2675 	error = sysctl_handle_int(oidp, &on, 0, req);
2676 	if (error || req->newptr == NULL)
2677 		return error;
2678 
2679 	HN_LOCK(sc);
2680 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2681 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2682 
2683 		if (on)
2684 			rxr->hn_trust_hcsum |= hcsum;
2685 		else
2686 			rxr->hn_trust_hcsum &= ~hcsum;
2687 	}
2688 	HN_UNLOCK(sc);
2689 	return 0;
2690 }
2691 
2692 static int
2693 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2694 {
2695 	struct hn_softc *sc = arg1;
2696 	int chim_size, error;
2697 
2698 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2699 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2700 	if (error || req->newptr == NULL)
2701 		return error;
2702 
2703 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2704 		return EINVAL;
2705 
2706 	HN_LOCK(sc);
2707 	hn_set_chim_size(sc, chim_size);
2708 	HN_UNLOCK(sc);
2709 	return 0;
2710 }
2711 
2712 #if __FreeBSD_version < 1100095
2713 static int
2714 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2715 {
2716 	struct hn_softc *sc = arg1;
2717 	int ofs = arg2, i, error;
2718 	struct hn_rx_ring *rxr;
2719 	uint64_t stat;
2720 
2721 	stat = 0;
2722 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2723 		rxr = &sc->hn_rx_ring[i];
2724 		stat += *((int *)((uint8_t *)rxr + ofs));
2725 	}
2726 
2727 	error = sysctl_handle_64(oidp, &stat, 0, req);
2728 	if (error || req->newptr == NULL)
2729 		return error;
2730 
2731 	/* Zero out this stat. */
2732 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2733 		rxr = &sc->hn_rx_ring[i];
2734 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2735 	}
2736 	return 0;
2737 }
2738 #else
2739 static int
2740 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2741 {
2742 	struct hn_softc *sc = arg1;
2743 	int ofs = arg2, i, error;
2744 	struct hn_rx_ring *rxr;
2745 	uint64_t stat;
2746 
2747 	stat = 0;
2748 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2749 		rxr = &sc->hn_rx_ring[i];
2750 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2751 	}
2752 
2753 	error = sysctl_handle_64(oidp, &stat, 0, req);
2754 	if (error || req->newptr == NULL)
2755 		return error;
2756 
2757 	/* Zero out this stat. */
2758 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2759 		rxr = &sc->hn_rx_ring[i];
2760 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2761 	}
2762 	return 0;
2763 }
2764 
2765 #endif
2766 
2767 static int
2768 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2769 {
2770 	struct hn_softc *sc = arg1;
2771 	int ofs = arg2, i, error;
2772 	struct hn_rx_ring *rxr;
2773 	u_long stat;
2774 
2775 	stat = 0;
2776 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2777 		rxr = &sc->hn_rx_ring[i];
2778 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2779 	}
2780 
2781 	error = sysctl_handle_long(oidp, &stat, 0, req);
2782 	if (error || req->newptr == NULL)
2783 		return error;
2784 
2785 	/* Zero out this stat. */
2786 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2787 		rxr = &sc->hn_rx_ring[i];
2788 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2789 	}
2790 	return 0;
2791 }
2792 
2793 static int
2794 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2795 {
2796 	struct hn_softc *sc = arg1;
2797 	int ofs = arg2, i, error;
2798 	struct hn_tx_ring *txr;
2799 	u_long stat;
2800 
2801 	stat = 0;
2802 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2803 		txr = &sc->hn_tx_ring[i];
2804 		stat += *((u_long *)((uint8_t *)txr + ofs));
2805 	}
2806 
2807 	error = sysctl_handle_long(oidp, &stat, 0, req);
2808 	if (error || req->newptr == NULL)
2809 		return error;
2810 
2811 	/* Zero out this stat. */
2812 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2813 		txr = &sc->hn_tx_ring[i];
2814 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2815 	}
2816 	return 0;
2817 }
2818 
2819 static int
2820 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2821 {
2822 	struct hn_softc *sc = arg1;
2823 	int ofs = arg2, i, error, conf;
2824 	struct hn_tx_ring *txr;
2825 
2826 	txr = &sc->hn_tx_ring[0];
2827 	conf = *((int *)((uint8_t *)txr + ofs));
2828 
2829 	error = sysctl_handle_int(oidp, &conf, 0, req);
2830 	if (error || req->newptr == NULL)
2831 		return error;
2832 
2833 	HN_LOCK(sc);
2834 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2835 		txr = &sc->hn_tx_ring[i];
2836 		*((int *)((uint8_t *)txr + ofs)) = conf;
2837 	}
2838 	HN_UNLOCK(sc);
2839 
2840 	return 0;
2841 }
2842 
2843 static int
2844 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2845 {
2846 	struct hn_softc *sc = arg1;
2847 	int error, size;
2848 
2849 	size = sc->hn_agg_size;
2850 	error = sysctl_handle_int(oidp, &size, 0, req);
2851 	if (error || req->newptr == NULL)
2852 		return (error);
2853 
2854 	HN_LOCK(sc);
2855 	sc->hn_agg_size = size;
2856 	hn_set_txagg(sc);
2857 	HN_UNLOCK(sc);
2858 
2859 	return (0);
2860 }
2861 
2862 static int
2863 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2864 {
2865 	struct hn_softc *sc = arg1;
2866 	int error, pkts;
2867 
2868 	pkts = sc->hn_agg_pkts;
2869 	error = sysctl_handle_int(oidp, &pkts, 0, req);
2870 	if (error || req->newptr == NULL)
2871 		return (error);
2872 
2873 	HN_LOCK(sc);
2874 	sc->hn_agg_pkts = pkts;
2875 	hn_set_txagg(sc);
2876 	HN_UNLOCK(sc);
2877 
2878 	return (0);
2879 }
2880 
2881 static int
2882 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2883 {
2884 	struct hn_softc *sc = arg1;
2885 	int pkts;
2886 
2887 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2888 	return (sysctl_handle_int(oidp, &pkts, 0, req));
2889 }
2890 
2891 static int
2892 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2893 {
2894 	struct hn_softc *sc = arg1;
2895 	int align;
2896 
2897 	align = sc->hn_tx_ring[0].hn_agg_align;
2898 	return (sysctl_handle_int(oidp, &align, 0, req));
2899 }
2900 
2901 static void
2902 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
2903 {
2904 	if (pollhz == 0)
2905 		vmbus_chan_poll_disable(chan);
2906 	else
2907 		vmbus_chan_poll_enable(chan, pollhz);
2908 }
2909 
2910 static void
2911 hn_polling(struct hn_softc *sc, u_int pollhz)
2912 {
2913 	int nsubch = sc->hn_rx_ring_inuse - 1;
2914 
2915 	HN_LOCK_ASSERT(sc);
2916 
2917 	if (nsubch > 0) {
2918 		struct vmbus_channel **subch;
2919 		int i;
2920 
2921 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
2922 		for (i = 0; i < nsubch; ++i)
2923 			hn_chan_polling(subch[i], pollhz);
2924 		vmbus_subchan_rel(subch, nsubch);
2925 	}
2926 	hn_chan_polling(sc->hn_prichan, pollhz);
2927 }
2928 
2929 static int
2930 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
2931 {
2932 	struct hn_softc *sc = arg1;
2933 	int pollhz, error;
2934 
2935 	pollhz = sc->hn_pollhz;
2936 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
2937 	if (error || req->newptr == NULL)
2938 		return (error);
2939 
2940 	if (pollhz != 0 &&
2941 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
2942 		return (EINVAL);
2943 
2944 	HN_LOCK(sc);
2945 	if (sc->hn_pollhz != pollhz) {
2946 		sc->hn_pollhz = pollhz;
2947 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
2948 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
2949 			hn_polling(sc, sc->hn_pollhz);
2950 	}
2951 	HN_UNLOCK(sc);
2952 
2953 	return (0);
2954 }
2955 
2956 static int
2957 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2958 {
2959 	struct hn_softc *sc = arg1;
2960 	char verstr[16];
2961 
2962 	snprintf(verstr, sizeof(verstr), "%u.%u",
2963 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2964 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2965 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2966 }
2967 
2968 static int
2969 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2970 {
2971 	struct hn_softc *sc = arg1;
2972 	char caps_str[128];
2973 	uint32_t caps;
2974 
2975 	HN_LOCK(sc);
2976 	caps = sc->hn_caps;
2977 	HN_UNLOCK(sc);
2978 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2979 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2980 }
2981 
2982 static int
2983 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2984 {
2985 	struct hn_softc *sc = arg1;
2986 	char assist_str[128];
2987 	uint32_t hwassist;
2988 
2989 	HN_LOCK(sc);
2990 	hwassist = sc->hn_ifp->if_hwassist;
2991 	HN_UNLOCK(sc);
2992 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2993 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2994 }
2995 
2996 static int
2997 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2998 {
2999 	struct hn_softc *sc = arg1;
3000 	char filter_str[128];
3001 	uint32_t filter;
3002 
3003 	HN_LOCK(sc);
3004 	filter = sc->hn_rx_filter;
3005 	HN_UNLOCK(sc);
3006 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3007 	    NDIS_PACKET_TYPES);
3008 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3009 }
3010 
3011 #ifndef RSS
3012 
3013 static int
3014 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3015 {
3016 	struct hn_softc *sc = arg1;
3017 	int error;
3018 
3019 	HN_LOCK(sc);
3020 
3021 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3022 	if (error || req->newptr == NULL)
3023 		goto back;
3024 
3025 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3026 	if (error)
3027 		goto back;
3028 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3029 
3030 	if (sc->hn_rx_ring_inuse > 1) {
3031 		error = hn_rss_reconfig(sc);
3032 	} else {
3033 		/* Not RSS capable, at least for now; just save the RSS key. */
3034 		error = 0;
3035 	}
3036 back:
3037 	HN_UNLOCK(sc);
3038 	return (error);
3039 }
3040 
3041 static int
3042 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3043 {
3044 	struct hn_softc *sc = arg1;
3045 	int error;
3046 
3047 	HN_LOCK(sc);
3048 
3049 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3050 	if (error || req->newptr == NULL)
3051 		goto back;
3052 
3053 	/*
3054 	 * Don't allow RSS indirect table change, if this interface is not
3055 	 * RSS capable currently.
3056 	 */
3057 	if (sc->hn_rx_ring_inuse == 1) {
3058 		error = EOPNOTSUPP;
3059 		goto back;
3060 	}
3061 
3062 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3063 	if (error)
3064 		goto back;
3065 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3066 
3067 	hn_rss_ind_fixup(sc);
3068 	error = hn_rss_reconfig(sc);
3069 back:
3070 	HN_UNLOCK(sc);
3071 	return (error);
3072 }
3073 
3074 #endif	/* !RSS */
3075 
3076 static int
3077 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3078 {
3079 	struct hn_softc *sc = arg1;
3080 	char hash_str[128];
3081 	uint32_t hash;
3082 
3083 	HN_LOCK(sc);
3084 	hash = sc->hn_rss_hash;
3085 	HN_UNLOCK(sc);
3086 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3087 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3088 }
3089 
3090 static int
3091 hn_check_iplen(const struct mbuf *m, int hoff)
3092 {
3093 	const struct ip *ip;
3094 	int len, iphlen, iplen;
3095 	const struct tcphdr *th;
3096 	int thoff;				/* TCP data offset */
3097 
3098 	len = hoff + sizeof(struct ip);
3099 
3100 	/* The packet must be at least the size of an IP header. */
3101 	if (m->m_pkthdr.len < len)
3102 		return IPPROTO_DONE;
3103 
3104 	/* The fixed IP header must reside completely in the first mbuf. */
3105 	if (m->m_len < len)
3106 		return IPPROTO_DONE;
3107 
3108 	ip = mtodo(m, hoff);
3109 
3110 	/* Bound check the packet's stated IP header length. */
3111 	iphlen = ip->ip_hl << 2;
3112 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3113 		return IPPROTO_DONE;
3114 
3115 	/* The full IP header must reside completely in the one mbuf. */
3116 	if (m->m_len < hoff + iphlen)
3117 		return IPPROTO_DONE;
3118 
3119 	iplen = ntohs(ip->ip_len);
3120 
3121 	/*
3122 	 * Check that the amount of data in the buffers is as
3123 	 * at least much as the IP header would have us expect.
3124 	 */
3125 	if (m->m_pkthdr.len < hoff + iplen)
3126 		return IPPROTO_DONE;
3127 
3128 	/*
3129 	 * Ignore IP fragments.
3130 	 */
3131 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3132 		return IPPROTO_DONE;
3133 
3134 	/*
3135 	 * The TCP/IP or UDP/IP header must be entirely contained within
3136 	 * the first fragment of a packet.
3137 	 */
3138 	switch (ip->ip_p) {
3139 	case IPPROTO_TCP:
3140 		if (iplen < iphlen + sizeof(struct tcphdr))
3141 			return IPPROTO_DONE;
3142 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3143 			return IPPROTO_DONE;
3144 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3145 		thoff = th->th_off << 2;
3146 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3147 			return IPPROTO_DONE;
3148 		if (m->m_len < hoff + iphlen + thoff)
3149 			return IPPROTO_DONE;
3150 		break;
3151 	case IPPROTO_UDP:
3152 		if (iplen < iphlen + sizeof(struct udphdr))
3153 			return IPPROTO_DONE;
3154 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3155 			return IPPROTO_DONE;
3156 		break;
3157 	default:
3158 		if (iplen < iphlen)
3159 			return IPPROTO_DONE;
3160 		break;
3161 	}
3162 	return ip->ip_p;
3163 }
3164 
/*
 * Allocate and initialize the RX side of the device:
 * - the RXBUF shared by all channels,
 * - an array of 'ring_cnt' RX rings, each with its own bufring memory,
 *   temporary packet buffer and LRO state,
 * - the dev.hn.UNIT.rx sysctl tree, per-ring subtrees, and the
 *   device-level RX statistics/configuration sysctls.
 *
 * Returns 0 on success, or ENOMEM if a DMA memory allocation fails.
 *
 * NOTE(review): the ENOMEM paths return without releasing what was
 * allocated earlier in this function; presumably the caller cleans up
 * through hn_destroy_rx_data() -- confirm against the caller.
 */
static int
hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	device_t dev = sc->hn_dev;
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
	int lroent_cnt;
#endif
#endif
	int i;

	/*
	 * Create RXBUF for reception.
	 *
	 * NOTE:
	 * - It is shared by all channels.
	 * - A large enough buffer is allocated, certain version of NVSes
	 *   may further limit the usable space.
	 */
	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_rxbuf == NULL) {
		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
		return (ENOMEM);
	}

	/* All created rings start out as in use. */
	sc->hn_rx_ring_cnt = ring_cnt;
	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;

	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
	/* Use the tunable LRO entry count, but never below TCP_LRO_ENTRIES. */
	lroent_cnt = hn_lro_entry_count;
	if (lroent_cnt < TCP_LRO_ENTRIES)
		lroent_cnt = TCP_LRO_ENTRIES;
	if (bootverbose)
		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
#endif
#endif	/* INET || INET6 */

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));

	/* Create dev.hn.UNIT.rx sysctl tree */
	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		/* One allocation sized for both the TX and RX bufrings. */
		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
		if (rxr->hn_br == NULL) {
			device_printf(dev, "allocate bufring failed\n");
			return (ENOMEM);
		}

		/* Seed the per-ring trust flags from the global settings. */
		if (hn_trust_hosttcp)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
		if (hn_trust_hostudp)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
		if (hn_trust_hostip)
			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
		rxr->hn_ifp = sc->hn_ifp;
		/* Pair this RX ring with the TX ring of the same index, if any. */
		if (i < sc->hn_tx_ring_cnt)
			rxr->hn_txr = &sc->hn_tx_ring[i];
		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
		rxr->hn_rx_idx = i;
		rxr->hn_rxbuf = sc->hn_rxbuf;

		/*
		 * Initialize LRO.
		 */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
		    hn_lro_mbufq_depth);
#else
		tcp_lro_init(&rxr->hn_lro);
		rxr->hn_lro.ifp = sc->hn_ifp;
#endif
#if __FreeBSD_version >= 1100099
		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
#endif
#endif	/* INET || INET6 */

		/* Per-ring sysctls are skipped if the "rx" node was not created. */
		if (sc->hn_rx_sysctl_tree != NULL) {
			char name[16];

			/*
			 * Create per RX ring sysctl tree:
			 * dev.hn.UNIT.rx.RINGID
			 */
			snprintf(name, sizeof(name), "%d", i);
			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

			if (rxr->hn_rx_sysctl_tree != NULL) {
				SYSCTL_ADD_ULONG(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "packets", CTLFLAG_RW,
				    &rxr->hn_pkts, "# of packets received");
				SYSCTL_ADD_ULONG(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
				    &rxr->hn_rss_pkts,
				    "# of packets w/ RSS info received");
				SYSCTL_ADD_INT(ctx,
				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
				    &rxr->hn_pktbuf_len, 0,
				    "Temporary channel packet buffer length");
			}
		}
	}

	/*
	 * Device-level RX sysctls (directly under dev.hn.UNIT).  The
	 * handlers receive the softc plus either the offset of a field
	 * within struct hn_rx_ring or a flag value as their argument.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
#if __FreeBSD_version < 1100095
	    hn_rx_stat_int_sysctl,
#else
	    hn_rx_stat_u64_sysctl,
#endif
	    "LU", "LRO queued");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
#if __FreeBSD_version < 1100095
	    hn_rx_stat_int_sysctl,
#else
	    hn_rx_stat_u64_sysctl,
#endif
	    "LU", "LRO flushed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_lro_tried),
	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
#if __FreeBSD_version >= 1100099
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_lro_lenlim_sysctl, "IU",
	    "Max # of data bytes to be aggregated by LRO");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_lro_ackcnt_sysctl, "I",
	    "Max # of ACKs to be aggregated by LRO");
#endif
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust tcp segement verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust udp datagram verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
	    hn_trust_hcsum_sysctl, "I",
	    "Trust ip packet verification on host side, "
	    "when csum info is missing");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_ip),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_udp),
	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
	    hn_rx_stat_ulong_sysctl, "LU",
	    "# of packets that we trust host's csum verification");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_small_pkts),
	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_ack_failed),
	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");

	return (0);
}
3369 
3370 static void
3371 hn_destroy_rx_data(struct hn_softc *sc)
3372 {
3373 	int i;
3374 
3375 	if (sc->hn_rxbuf != NULL) {
3376 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3377 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3378 		else
3379 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3380 		sc->hn_rxbuf = NULL;
3381 	}
3382 
3383 	if (sc->hn_rx_ring_cnt == 0)
3384 		return;
3385 
3386 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3387 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3388 
3389 		if (rxr->hn_br == NULL)
3390 			continue;
3391 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3392 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3393 		} else {
3394 			device_printf(sc->hn_dev,
3395 			    "%dth channel bufring is referenced", i);
3396 		}
3397 		rxr->hn_br = NULL;
3398 
3399 #if defined(INET) || defined(INET6)
3400 		tcp_lro_free(&rxr->hn_lro);
3401 #endif
3402 		free(rxr->hn_pktbuf, M_DEVBUF);
3403 	}
3404 	free(sc->hn_rx_ring, M_DEVBUF);
3405 	sc->hn_rx_ring = NULL;
3406 
3407 	sc->hn_rx_ring_cnt = 0;
3408 	sc->hn_rx_ring_inuse = 0;
3409 }
3410 
/*
 * Set up TX ring 'id' of the softc: locks, the TX descriptor array and
 * its free list (or buf_ring), the TX taskqueue and tasks, the busdma
 * tags, per-descriptor RNDIS packet message memory and data DMA maps,
 * and the dev.hn.UNIT.tx.RINGID sysctl subtree.
 *
 * Returns 0 on success, or the error reported by the failing busdma
 * call.
 *
 * NOTE(review): the error paths undo only the failing descriptor's
 * own allocations and then return; resources set up earlier in this
 * function are left in place.  Presumably the caller tears the ring
 * down through hn_tx_ring_destroy() -- confirm that path copes with a
 * partially initialized ring.
 */
static int
hn_tx_ring_create(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	/*
	 * Pick the taskqueue for this ring: either the vmbus event
	 * taskqueue or one of the driver's TX taskqueues, selected by
	 * ring id modulo hn_tx_taskq_cnt.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
	} else {
		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
	}

#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else
#endif
	{
		int br_depth;

		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		/* The mbuf buf_ring is only allocated in this (xmit) mode. */
		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;

	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}

	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		STAILQ_INIT(&txd->agg_list);

		/*
		 * Allocate and load RNDIS packet message.
		 */
		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			/* Undo this descriptor's allocation before bailing. */
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			/* Undo this descriptor's load and allocation. */
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	/* Every descriptor is now available. */
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

	/* Per-ring sysctls are skipped if the "tx" node does not exist. */
	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

#ifdef HN_DEBUG
			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
#endif
#ifdef HN_IFSTART_SUPPORT
			if (!hn_use_if_start)
#endif
			{
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW, &txr->hn_pkts,
			    "# of packets transmitted");
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
		}
	}

	return 0;
}
3609 
3610 static void
3611 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3612 {
3613 	struct hn_tx_ring *txr = txd->txr;
3614 
3615 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3616 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3617 
3618 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3619 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3620 	    txd->rndis_pkt_dmap);
3621 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3622 }
3623 
3624 static void
3625 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3626 {
3627 
3628 	KASSERT(txd->refs == 0 || txd->refs == 1,
3629 	    ("invalid txd refs %d", txd->refs));
3630 
3631 	/* Aggregated txds will be freed by their aggregating txd. */
3632 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3633 		int freed;
3634 
3635 		freed = hn_txdesc_put(txr, txd);
3636 		KASSERT(freed, ("can't free txdesc"));
3637 	}
3638 }
3639 
3640 static void
3641 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3642 {
3643 	int i;
3644 
3645 	if (txr->hn_txdesc == NULL)
3646 		return;
3647 
3648 	/*
3649 	 * NOTE:
3650 	 * Because the freeing of aggregated txds will be deferred
3651 	 * to the aggregating txd, two passes are used here:
3652 	 * - The first pass GCes any pending txds.  This GC is necessary,
3653 	 *   since if the channels are revoked, hypervisor will not
3654 	 *   deliver send-done for all pending txds.
3655 	 * - The second pass frees the busdma stuffs, i.e. after all txds
3656 	 *   were freed.
3657 	 */
3658 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3659 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3660 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3661 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3662 
3663 	if (txr->hn_tx_data_dtag != NULL)
3664 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3665 	if (txr->hn_tx_rndis_dtag != NULL)
3666 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3667 
3668 #ifdef HN_USE_TXDESC_BUFRING
3669 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3670 #endif
3671 
3672 	free(txr->hn_txdesc, M_DEVBUF);
3673 	txr->hn_txdesc = NULL;
3674 
3675 	if (txr->hn_mbuf_br != NULL)
3676 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3677 
3678 #ifndef HN_USE_TXDESC_BUFRING
3679 	mtx_destroy(&txr->hn_txlist_spin);
3680 #endif
3681 	mtx_destroy(&txr->hn_tx_lock);
3682 }
3683 
/*
 * Allocate and initialize the TX side of the device:
 * - the chimney sending buffer (TXBUF) shared by all channels,
 * - an array of 'ring_cnt' TX rings, each set up through
 *   hn_tx_ring_create(),
 * - the dev.hn.UNIT.tx sysctl tree and the device-level TX
 *   statistics/configuration sysctls.
 *
 * Returns 0 on success, ENOMEM if the chimney buffer cannot be
 * allocated, or the error returned by hn_tx_ring_create().
 *
 * NOTE(review): failures return without releasing what was set up
 * earlier in this function; presumably the caller cleans up through
 * hn_destroy_tx_data() -- confirm against the caller.
 */
static int
hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	int i;

	/*
	 * Create TXBUF for chimney sending.
	 *
	 * NOTE: It is shared by all channels.
	 */
	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_chim == NULL) {
		device_printf(sc->hn_dev, "allocate txbuf failed\n");
		return (ENOMEM);
	}

	/* All created rings start out as in use. */
	sc->hn_tx_ring_cnt = ring_cnt;
	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;

	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);

	ctx = device_get_sysctl_ctx(sc->hn_dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

	/* Create dev.hn.UNIT.tx sysctl tree */
	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		int error;

		error = hn_tx_ring_create(sc, i);
		if (error)
			return error;
	}

	/*
	 * Device-level TX sysctls (directly under dev.hn.UNIT).  The
	 * PROC handlers receive the softc plus, where non-zero, the
	 * offset of a field within struct hn_tx_ring.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_send_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_flush_failed),
	    hn_tx_stat_ulong_sysctl, "LU",
	    "# of packet transmission aggregation flush failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
	/* Exported from ring 0's field. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
	    "# of total TX descs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
	    "Chimney send packet size upper boundary");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
	    hn_tx_conf_int_sysctl, "I",
	    "Size of the packet for direct transmission");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_sched_tx),
	    hn_tx_conf_int_sysctl, "I",
	    "Always schedule transmission "
	    "instead of doing direct transmission");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
	/* Exported from ring 0's field. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
	    "Applied packet transmission aggregation size");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pktmax_sysctl, "I",
	    "Applied packet transmission aggregation packets");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_align_sysctl, "I",
	    "Applied packet transmission aggregation alignment");

	return 0;
}
3792 
3793 static void
3794 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3795 {
3796 	int i;
3797 
3798 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3799 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3800 }
3801 
3802 static void
3803 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3804 {
3805 	struct ifnet *ifp = sc->hn_ifp;
3806 	int tso_minlen;
3807 
3808 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3809 		return;
3810 
3811 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3812 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3813 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3814 
3815 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3816 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3817 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3818 
3819 	if (tso_maxlen < tso_minlen)
3820 		tso_maxlen = tso_minlen;
3821 	else if (tso_maxlen > IP_MAXPACKET)
3822 		tso_maxlen = IP_MAXPACKET;
3823 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3824 		tso_maxlen = sc->hn_ndis_tso_szmax;
3825 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3826 	if (bootverbose)
3827 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3828 }
3829 
3830 static void
3831 hn_fixup_tx_data(struct hn_softc *sc)
3832 {
3833 	uint64_t csum_assist;
3834 	int i;
3835 
3836 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3837 	if (hn_tx_chimney_size > 0 &&
3838 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3839 		hn_set_chim_size(sc, hn_tx_chimney_size);
3840 
3841 	csum_assist = 0;
3842 	if (sc->hn_caps & HN_CAP_IPCS)
3843 		csum_assist |= CSUM_IP;
3844 	if (sc->hn_caps & HN_CAP_TCP4CS)
3845 		csum_assist |= CSUM_IP_TCP;
3846 	if (sc->hn_caps & HN_CAP_UDP4CS)
3847 		csum_assist |= CSUM_IP_UDP;
3848 	if (sc->hn_caps & HN_CAP_TCP6CS)
3849 		csum_assist |= CSUM_IP6_TCP;
3850 	if (sc->hn_caps & HN_CAP_UDP6CS)
3851 		csum_assist |= CSUM_IP6_UDP;
3852 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3853 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3854 
3855 	if (sc->hn_caps & HN_CAP_HASHVAL) {
3856 		/*
3857 		 * Support HASHVAL pktinfo on TX path.
3858 		 */
3859 		if (bootverbose)
3860 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3861 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3862 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3863 	}
3864 }
3865 
3866 static void
3867 hn_destroy_tx_data(struct hn_softc *sc)
3868 {
3869 	int i;
3870 
3871 	if (sc->hn_chim != NULL) {
3872 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3873 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3874 		} else {
3875 			device_printf(sc->hn_dev,
3876 			    "chimney sending buffer is referenced");
3877 		}
3878 		sc->hn_chim = NULL;
3879 	}
3880 
3881 	if (sc->hn_tx_ring_cnt == 0)
3882 		return;
3883 
3884 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3885 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3886 
3887 	free(sc->hn_tx_ring, M_DEVBUF);
3888 	sc->hn_tx_ring = NULL;
3889 
3890 	sc->hn_tx_ring_cnt = 0;
3891 	sc->hn_tx_ring_inuse = 0;
3892 }
3893 
3894 #ifdef HN_IFSTART_SUPPORT
3895 
3896 static void
3897 hn_start_taskfunc(void *xtxr, int pending __unused)
3898 {
3899 	struct hn_tx_ring *txr = xtxr;
3900 
3901 	mtx_lock(&txr->hn_tx_lock);
3902 	hn_start_locked(txr, 0);
3903 	mtx_unlock(&txr->hn_tx_lock);
3904 }
3905 
/*
 * Transmit packets queued on ifp->if_snd through the first TX ring.
 * The caller must hold txr->hn_tx_lock.
 *
 * 'len' limits direct transmission: when it is greater than 0 and the
 * packet at the head of the queue is longer than 'len', that packet is
 * put back and the loop stops.
 *
 * Returns 1 if such an oversized packet was left on the queue (the
 * caller is expected to dispatch transmission to the TX taskqueue),
 * 0 otherwise.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Proceed only when the interface is RUNNING and not OACTIVE. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				/* Count the failed fixup as an output error. */
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/*
			 * Out of TX descriptors: requeue the packet and
			 * mark the interface OACTIVE.
			 */
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		/*
		 * No aggregation room left: either flush the pending
		 * aggregating txdesc, or send this packet on its own.
		 */
		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}
4009 
4010 static void
4011 hn_start(struct ifnet *ifp)
4012 {
4013 	struct hn_softc *sc = ifp->if_softc;
4014 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4015 
4016 	if (txr->hn_sched_tx)
4017 		goto do_sched;
4018 
4019 	if (mtx_trylock(&txr->hn_tx_lock)) {
4020 		int sched;
4021 
4022 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4023 		mtx_unlock(&txr->hn_tx_lock);
4024 		if (!sched)
4025 			return;
4026 	}
4027 do_sched:
4028 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4029 }
4030 
4031 static void
4032 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4033 {
4034 	struct hn_tx_ring *txr = xtxr;
4035 
4036 	mtx_lock(&txr->hn_tx_lock);
4037 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4038 	hn_start_locked(txr, 0);
4039 	mtx_unlock(&txr->hn_tx_lock);
4040 }
4041 
4042 static void
4043 hn_start_txeof(struct hn_tx_ring *txr)
4044 {
4045 	struct hn_softc *sc = txr->hn_sc;
4046 	struct ifnet *ifp = sc->hn_ifp;
4047 
4048 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4049 
4050 	if (txr->hn_sched_tx)
4051 		goto do_sched;
4052 
4053 	if (mtx_trylock(&txr->hn_tx_lock)) {
4054 		int sched;
4055 
4056 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4057 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4058 		mtx_unlock(&txr->hn_tx_lock);
4059 		if (sched) {
4060 			taskqueue_enqueue(txr->hn_tx_taskq,
4061 			    &txr->hn_tx_task);
4062 		}
4063 	} else {
4064 do_sched:
4065 		/*
4066 		 * Release the OACTIVE earlier, with the hope, that
4067 		 * others could catch up.  The task will clear the
4068 		 * flag again with the hn_tx_lock to avoid possible
4069 		 * races.
4070 		 */
4071 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4072 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4073 	}
4074 }
4075 
4076 #endif	/* HN_IFSTART_SUPPORT */
4077 
4078 static int
4079 hn_xmit(struct hn_tx_ring *txr, int len)
4080 {
4081 	struct hn_softc *sc = txr->hn_sc;
4082 	struct ifnet *ifp = sc->hn_ifp;
4083 	struct mbuf *m_head;
4084 	int sched = 0;
4085 
4086 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4087 #ifdef HN_IFSTART_SUPPORT
4088 	KASSERT(hn_use_if_start == 0,
4089 	    ("hn_xmit is called, when if_start is enabled"));
4090 #endif
4091 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4092 
4093 	if (__predict_false(txr->hn_suspended))
4094 		return (0);
4095 
4096 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4097 		return (0);
4098 
4099 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4100 		struct hn_txdesc *txd;
4101 		int error;
4102 
4103 		if (len > 0 && m_head->m_pkthdr.len > len) {
4104 			/*
4105 			 * This sending could be time consuming; let callers
4106 			 * dispatch this packet sending (and sending of any
4107 			 * following up packets) to tx taskqueue.
4108 			 */
4109 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4110 			sched = 1;
4111 			break;
4112 		}
4113 
4114 		txd = hn_txdesc_get(txr);
4115 		if (txd == NULL) {
4116 			txr->hn_no_txdescs++;
4117 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4118 			txr->hn_oactive = 1;
4119 			break;
4120 		}
4121 
4122 		error = hn_encap(ifp, txr, txd, &m_head);
4123 		if (error) {
4124 			/* Both txd and m_head are freed; discard */
4125 			KASSERT(txr->hn_agg_txd == NULL,
4126 			    ("encap failed w/ pending aggregating txdesc"));
4127 			drbr_advance(ifp, txr->hn_mbuf_br);
4128 			continue;
4129 		}
4130 
4131 		if (txr->hn_agg_pktleft == 0) {
4132 			if (txr->hn_agg_txd != NULL) {
4133 				KASSERT(m_head == NULL,
4134 				    ("pending mbuf for aggregating txdesc"));
4135 				error = hn_flush_txagg(ifp, txr);
4136 				if (__predict_false(error)) {
4137 					txr->hn_oactive = 1;
4138 					break;
4139 				}
4140 			} else {
4141 				KASSERT(m_head != NULL, ("mbuf was freed"));
4142 				error = hn_txpkt(ifp, txr, txd);
4143 				if (__predict_false(error)) {
4144 					/* txd is freed, but m_head is not */
4145 					drbr_putback(ifp, txr->hn_mbuf_br,
4146 					    m_head);
4147 					txr->hn_oactive = 1;
4148 					break;
4149 				}
4150 			}
4151 		}
4152 #ifdef INVARIANTS
4153 		else {
4154 			KASSERT(txr->hn_agg_txd != NULL,
4155 			    ("no aggregating txdesc"));
4156 			KASSERT(m_head == NULL,
4157 			    ("pending mbuf for aggregating txdesc"));
4158 		}
4159 #endif
4160 
4161 		/* Sent */
4162 		drbr_advance(ifp, txr->hn_mbuf_br);
4163 	}
4164 
4165 	/* Flush pending aggerated transmission. */
4166 	if (txr->hn_agg_txd != NULL)
4167 		hn_flush_txagg(ifp, txr);
4168 	return (sched);
4169 }
4170 
4171 static int
4172 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4173 {
4174 	struct hn_softc *sc = ifp->if_softc;
4175 	struct hn_tx_ring *txr;
4176 	int error, idx = 0;
4177 
4178 #if defined(INET6) || defined(INET)
4179 	/*
4180 	 * Perform TSO packet header fixup now, since the TSO
4181 	 * packet header should be cache-hot.
4182 	 */
4183 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4184 		m = hn_tso_fixup(m);
4185 		if (__predict_false(m == NULL)) {
4186 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4187 			return EIO;
4188 		}
4189 	}
4190 #endif
4191 
4192 	/*
4193 	 * Select the TX ring based on flowid
4194 	 */
4195 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4196 #ifdef RSS
4197 		uint32_t bid;
4198 
4199 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4200 		    &bid) == 0)
4201 			idx = bid % sc->hn_tx_ring_inuse;
4202 		else
4203 #endif
4204 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4205 	}
4206 	txr = &sc->hn_tx_ring[idx];
4207 
4208 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4209 	if (error) {
4210 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4211 		return error;
4212 	}
4213 
4214 	if (txr->hn_oactive)
4215 		return 0;
4216 
4217 	if (txr->hn_sched_tx)
4218 		goto do_sched;
4219 
4220 	if (mtx_trylock(&txr->hn_tx_lock)) {
4221 		int sched;
4222 
4223 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4224 		mtx_unlock(&txr->hn_tx_lock);
4225 		if (!sched)
4226 			return 0;
4227 	}
4228 do_sched:
4229 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4230 	return 0;
4231 }
4232 
4233 static void
4234 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4235 {
4236 	struct mbuf *m;
4237 
4238 	mtx_lock(&txr->hn_tx_lock);
4239 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4240 		m_freem(m);
4241 	mtx_unlock(&txr->hn_tx_lock);
4242 }
4243 
4244 static void
4245 hn_xmit_qflush(struct ifnet *ifp)
4246 {
4247 	struct hn_softc *sc = ifp->if_softc;
4248 	int i;
4249 
4250 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4251 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4252 	if_qflush(ifp);
4253 }
4254 
4255 static void
4256 hn_xmit_txeof(struct hn_tx_ring *txr)
4257 {
4258 
4259 	if (txr->hn_sched_tx)
4260 		goto do_sched;
4261 
4262 	if (mtx_trylock(&txr->hn_tx_lock)) {
4263 		int sched;
4264 
4265 		txr->hn_oactive = 0;
4266 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4267 		mtx_unlock(&txr->hn_tx_lock);
4268 		if (sched) {
4269 			taskqueue_enqueue(txr->hn_tx_taskq,
4270 			    &txr->hn_tx_task);
4271 		}
4272 	} else {
4273 do_sched:
4274 		/*
4275 		 * Release the oactive earlier, with the hope, that
4276 		 * others could catch up.  The task will clear the
4277 		 * oactive again with the hn_tx_lock to avoid possible
4278 		 * races.
4279 		 */
4280 		txr->hn_oactive = 0;
4281 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4282 	}
4283 }
4284 
4285 static void
4286 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4287 {
4288 	struct hn_tx_ring *txr = xtxr;
4289 
4290 	mtx_lock(&txr->hn_tx_lock);
4291 	hn_xmit(txr, 0);
4292 	mtx_unlock(&txr->hn_tx_lock);
4293 }
4294 
4295 static void
4296 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4297 {
4298 	struct hn_tx_ring *txr = xtxr;
4299 
4300 	mtx_lock(&txr->hn_tx_lock);
4301 	txr->hn_oactive = 0;
4302 	hn_xmit(txr, 0);
4303 	mtx_unlock(&txr->hn_tx_lock);
4304 }
4305 
4306 static int
4307 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4308 {
4309 	struct vmbus_chan_br cbr;
4310 	struct hn_rx_ring *rxr;
4311 	struct hn_tx_ring *txr = NULL;
4312 	int idx, error;
4313 
4314 	idx = vmbus_chan_subidx(chan);
4315 
4316 	/*
4317 	 * Link this channel to RX/TX ring.
4318 	 */
4319 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4320 	    ("invalid channel index %d, should > 0 && < %d",
4321 	     idx, sc->hn_rx_ring_inuse));
4322 	rxr = &sc->hn_rx_ring[idx];
4323 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4324 	    ("RX ring %d already attached", idx));
4325 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4326 
4327 	if (bootverbose) {
4328 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4329 		    idx, vmbus_chan_id(chan));
4330 	}
4331 
4332 	if (idx < sc->hn_tx_ring_inuse) {
4333 		txr = &sc->hn_tx_ring[idx];
4334 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4335 		    ("TX ring %d already attached", idx));
4336 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4337 
4338 		txr->hn_chan = chan;
4339 		if (bootverbose) {
4340 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4341 			    idx, vmbus_chan_id(chan));
4342 		}
4343 	}
4344 
4345 	/* Bind this channel to a proper CPU. */
4346 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4347 
4348 	/*
4349 	 * Open this channel
4350 	 */
4351 	cbr.cbr = rxr->hn_br;
4352 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4353 	cbr.cbr_txsz = HN_TXBR_SIZE;
4354 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4355 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4356 	if (error) {
4357 		if (error == EISCONN) {
4358 			if_printf(sc->hn_ifp, "bufring is connected after "
4359 			    "chan%u open failure\n", vmbus_chan_id(chan));
4360 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4361 		} else {
4362 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4363 			    vmbus_chan_id(chan), error);
4364 		}
4365 	}
4366 	return (error);
4367 }
4368 
4369 static void
4370 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4371 {
4372 	struct hn_rx_ring *rxr;
4373 	int idx, error;
4374 
4375 	idx = vmbus_chan_subidx(chan);
4376 
4377 	/*
4378 	 * Link this channel to RX/TX ring.
4379 	 */
4380 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4381 	    ("invalid channel index %d, should > 0 && < %d",
4382 	     idx, sc->hn_rx_ring_inuse));
4383 	rxr = &sc->hn_rx_ring[idx];
4384 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4385 	    ("RX ring %d is not attached", idx));
4386 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4387 
4388 	if (idx < sc->hn_tx_ring_inuse) {
4389 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4390 
4391 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4392 		    ("TX ring %d is not attached attached", idx));
4393 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4394 	}
4395 
4396 	/*
4397 	 * Close this channel.
4398 	 *
4399 	 * NOTE:
4400 	 * Channel closing does _not_ destroy the target channel.
4401 	 */
4402 	error = vmbus_chan_close_direct(chan);
4403 	if (error == EISCONN) {
4404 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4405 		    "after being closed\n", vmbus_chan_id(chan));
4406 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4407 	} else if (error) {
4408 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4409 		    vmbus_chan_id(chan), error);
4410 	}
4411 }
4412 
4413 static int
4414 hn_attach_subchans(struct hn_softc *sc)
4415 {
4416 	struct vmbus_channel **subchans;
4417 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4418 	int i, error = 0;
4419 
4420 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4421 
4422 	/* Attach the sub-channels. */
4423 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4424 	for (i = 0; i < subchan_cnt; ++i) {
4425 		int error1;
4426 
4427 		error1 = hn_chan_attach(sc, subchans[i]);
4428 		if (error1) {
4429 			error = error1;
4430 			/* Move on; all channels will be detached later. */
4431 		}
4432 	}
4433 	vmbus_subchan_rel(subchans, subchan_cnt);
4434 
4435 	if (error) {
4436 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4437 	} else {
4438 		if (bootverbose) {
4439 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4440 			    subchan_cnt);
4441 		}
4442 	}
4443 	return (error);
4444 }
4445 
4446 static void
4447 hn_detach_allchans(struct hn_softc *sc)
4448 {
4449 	struct vmbus_channel **subchans;
4450 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4451 	int i;
4452 
4453 	if (subchan_cnt == 0)
4454 		goto back;
4455 
4456 	/* Detach the sub-channels. */
4457 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4458 	for (i = 0; i < subchan_cnt; ++i)
4459 		hn_chan_detach(sc, subchans[i]);
4460 	vmbus_subchan_rel(subchans, subchan_cnt);
4461 
4462 back:
4463 	/*
4464 	 * Detach the primary channel, _after_ all sub-channels
4465 	 * are detached.
4466 	 */
4467 	hn_chan_detach(sc, sc->hn_prichan);
4468 
4469 	/* Wait for sub-channels to be destroyed, if any. */
4470 	vmbus_subchan_drain(sc->hn_prichan);
4471 
4472 #ifdef INVARIANTS
4473 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4474 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4475 		    HN_RX_FLAG_ATTACHED) == 0,
4476 		    ("%dth RX ring is still attached", i));
4477 	}
4478 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4479 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4480 		    HN_TX_FLAG_ATTACHED) == 0,
4481 		    ("%dth TX ring is still attached", i));
4482 	}
4483 #endif
4484 }
4485 
4486 static int
4487 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4488 {
4489 	struct vmbus_channel **subchans;
4490 	int nchan, rxr_cnt, error;
4491 
4492 	nchan = *nsubch + 1;
4493 	if (nchan == 1) {
4494 		/*
4495 		 * Multiple RX/TX rings are not requested.
4496 		 */
4497 		*nsubch = 0;
4498 		return (0);
4499 	}
4500 
4501 	/*
4502 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4503 	 * table entries.
4504 	 */
4505 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4506 	if (error) {
4507 		/* No RSS; this is benign. */
4508 		*nsubch = 0;
4509 		return (0);
4510 	}
4511 	if (bootverbose) {
4512 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4513 		    rxr_cnt, nchan);
4514 	}
4515 
4516 	if (nchan > rxr_cnt)
4517 		nchan = rxr_cnt;
4518 	if (nchan == 1) {
4519 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4520 		*nsubch = 0;
4521 		return (0);
4522 	}
4523 
4524 	/*
4525 	 * Allocate sub-channels from NVS.
4526 	 */
4527 	*nsubch = nchan - 1;
4528 	error = hn_nvs_alloc_subchans(sc, nsubch);
4529 	if (error || *nsubch == 0) {
4530 		/* Failed to allocate sub-channels. */
4531 		*nsubch = 0;
4532 		return (0);
4533 	}
4534 
4535 	/*
4536 	 * Wait for all sub-channels to become ready before moving on.
4537 	 */
4538 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4539 	vmbus_subchan_rel(subchans, *nsubch);
4540 	return (0);
4541 }
4542 
4543 static bool
4544 hn_synth_attachable(const struct hn_softc *sc)
4545 {
4546 	int i;
4547 
4548 	if (sc->hn_flags & HN_FLAG_ERRORS)
4549 		return (false);
4550 
4551 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4552 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4553 
4554 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4555 			return (false);
4556 	}
4557 	return (true);
4558 }
4559 
4560 static int
4561 hn_synth_attach(struct hn_softc *sc, int mtu)
4562 {
4563 #define ATTACHED_NVS		0x0002
4564 #define ATTACHED_RNDIS		0x0004
4565 
4566 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4567 	int error, nsubch, nchan, i;
4568 	uint32_t old_caps, attached = 0;
4569 
4570 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4571 	    ("synthetic parts were attached"));
4572 
4573 	if (!hn_synth_attachable(sc))
4574 		return (ENXIO);
4575 
4576 	/* Save capabilities for later verification. */
4577 	old_caps = sc->hn_caps;
4578 	sc->hn_caps = 0;
4579 
4580 	/* Clear RSS stuffs. */
4581 	sc->hn_rss_ind_size = 0;
4582 	sc->hn_rss_hash = 0;
4583 
4584 	/*
4585 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4586 	 */
4587 	error = hn_chan_attach(sc, sc->hn_prichan);
4588 	if (error)
4589 		goto failed;
4590 
4591 	/*
4592 	 * Attach NVS.
4593 	 */
4594 	error = hn_nvs_attach(sc, mtu);
4595 	if (error)
4596 		goto failed;
4597 	attached |= ATTACHED_NVS;
4598 
4599 	/*
4600 	 * Attach RNDIS _after_ NVS is attached.
4601 	 */
4602 	error = hn_rndis_attach(sc, mtu);
4603 	if (error)
4604 		goto failed;
4605 	attached |= ATTACHED_RNDIS;
4606 
4607 	/*
4608 	 * Make sure capabilities are not changed.
4609 	 */
4610 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4611 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4612 		    old_caps, sc->hn_caps);
4613 		error = ENXIO;
4614 		goto failed;
4615 	}
4616 
4617 	/*
4618 	 * Allocate sub-channels for multi-TX/RX rings.
4619 	 *
4620 	 * NOTE:
4621 	 * The # of RX rings that can be used is equivalent to the # of
4622 	 * channels to be requested.
4623 	 */
4624 	nsubch = sc->hn_rx_ring_cnt - 1;
4625 	error = hn_synth_alloc_subchans(sc, &nsubch);
4626 	if (error)
4627 		goto failed;
4628 	/* NOTE: _Full_ synthetic parts detach is required now. */
4629 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4630 
4631 	/*
4632 	 * Set the # of TX/RX rings that could be used according to
4633 	 * the # of channels that NVS offered.
4634 	 */
4635 	nchan = nsubch + 1;
4636 	hn_set_ring_inuse(sc, nchan);
4637 	if (nchan == 1) {
4638 		/* Only the primary channel can be used; done */
4639 		goto back;
4640 	}
4641 
4642 	/*
4643 	 * Attach the sub-channels.
4644 	 *
4645 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4646 	 */
4647 	error = hn_attach_subchans(sc);
4648 	if (error)
4649 		goto failed;
4650 
4651 	/*
4652 	 * Configure RSS key and indirect table _after_ all sub-channels
4653 	 * are attached.
4654 	 */
4655 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4656 		/*
4657 		 * RSS key is not set yet; set it to the default RSS key.
4658 		 */
4659 		if (bootverbose)
4660 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4661 #ifdef RSS
4662 		rss_getkey(rss->rss_key);
4663 #else
4664 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4665 #endif
4666 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4667 	}
4668 
4669 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4670 		/*
4671 		 * RSS indirect table is not set yet; set it up in round-
4672 		 * robin fashion.
4673 		 */
4674 		if (bootverbose) {
4675 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4676 			    "table\n");
4677 		}
4678 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4679 			uint32_t subidx;
4680 
4681 #ifdef RSS
4682 			subidx = rss_get_indirection_to_bucket(i);
4683 #else
4684 			subidx = i;
4685 #endif
4686 			rss->rss_ind[i] = subidx % nchan;
4687 		}
4688 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4689 	} else {
4690 		/*
4691 		 * # of usable channels may be changed, so we have to
4692 		 * make sure that all entries in RSS indirect table
4693 		 * are valid.
4694 		 *
4695 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4696 		 */
4697 		hn_rss_ind_fixup(sc);
4698 	}
4699 
4700 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4701 	if (error)
4702 		goto failed;
4703 back:
4704 	/*
4705 	 * Fixup transmission aggregation setup.
4706 	 */
4707 	hn_set_txagg(sc);
4708 	return (0);
4709 
4710 failed:
4711 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4712 		hn_synth_detach(sc);
4713 	} else {
4714 		if (attached & ATTACHED_RNDIS)
4715 			hn_rndis_detach(sc);
4716 		if (attached & ATTACHED_NVS)
4717 			hn_nvs_detach(sc);
4718 		hn_chan_detach(sc, sc->hn_prichan);
4719 		/* Restore old capabilities. */
4720 		sc->hn_caps = old_caps;
4721 	}
4722 	return (error);
4723 
4724 #undef ATTACHED_RNDIS
4725 #undef ATTACHED_NVS
4726 }
4727 
4728 /*
4729  * NOTE:
4730  * The interface must have been suspended though hn_suspend(), before
4731  * this function get called.
4732  */
4733 static void
4734 hn_synth_detach(struct hn_softc *sc)
4735 {
4736 
4737 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4738 	    ("synthetic parts were not attached"));
4739 
4740 	/* Detach the RNDIS first. */
4741 	hn_rndis_detach(sc);
4742 
4743 	/* Detach NVS. */
4744 	hn_nvs_detach(sc);
4745 
4746 	/* Detach all of the channels. */
4747 	hn_detach_allchans(sc);
4748 
4749 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4750 }
4751 
4752 static void
4753 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4754 {
4755 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4756 	    ("invalid ring count %d", ring_cnt));
4757 
4758 	if (sc->hn_tx_ring_cnt > ring_cnt)
4759 		sc->hn_tx_ring_inuse = ring_cnt;
4760 	else
4761 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4762 	sc->hn_rx_ring_inuse = ring_cnt;
4763 
4764 #ifdef RSS
4765 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4766 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4767 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4768 		    rss_getnumbuckets());
4769 	}
4770 #endif
4771 
4772 	if (bootverbose) {
4773 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4774 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4775 	}
4776 }
4777 
4778 static void
4779 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4780 {
4781 
4782 	/*
4783 	 * NOTE:
4784 	 * The TX bufring will not be drained by the hypervisor,
4785 	 * if the primary channel is revoked.
4786 	 */
4787 	while (!vmbus_chan_rx_empty(chan) ||
4788 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4789 	     !vmbus_chan_tx_empty(chan)))
4790 		pause("waitch", 1);
4791 	vmbus_chan_intr_drain(chan);
4792 }
4793 
4794 static void
4795 hn_suspend_data(struct hn_softc *sc)
4796 {
4797 	struct vmbus_channel **subch = NULL;
4798 	struct hn_tx_ring *txr;
4799 	int i, nsubch;
4800 
4801 	HN_LOCK_ASSERT(sc);
4802 
4803 	/*
4804 	 * Suspend TX.
4805 	 */
4806 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4807 		txr = &sc->hn_tx_ring[i];
4808 
4809 		mtx_lock(&txr->hn_tx_lock);
4810 		txr->hn_suspended = 1;
4811 		mtx_unlock(&txr->hn_tx_lock);
4812 		/* No one is able send more packets now. */
4813 
4814 		/*
4815 		 * Wait for all pending sends to finish.
4816 		 *
4817 		 * NOTE:
4818 		 * We will _not_ receive all pending send-done, if the
4819 		 * primary channel is revoked.
4820 		 */
4821 		while (hn_tx_ring_pending(txr) &&
4822 		    !vmbus_chan_is_revoked(sc->hn_prichan))
4823 			pause("hnwtx", 1 /* 1 tick */);
4824 	}
4825 
4826 	/*
4827 	 * Disable RX by clearing RX filter.
4828 	 */
4829 	hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
4830 
4831 	/*
4832 	 * Give RNDIS enough time to flush all pending data packets.
4833 	 */
4834 	pause("waitrx", (200 * hz) / 1000);
4835 
4836 	/*
4837 	 * Drain RX/TX bufrings and interrupts.
4838 	 */
4839 	nsubch = sc->hn_rx_ring_inuse - 1;
4840 	if (nsubch > 0)
4841 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4842 
4843 	if (subch != NULL) {
4844 		for (i = 0; i < nsubch; ++i)
4845 			hn_chan_drain(sc, subch[i]);
4846 	}
4847 	hn_chan_drain(sc, sc->hn_prichan);
4848 
4849 	if (subch != NULL)
4850 		vmbus_subchan_rel(subch, nsubch);
4851 
4852 	/*
4853 	 * Drain any pending TX tasks.
4854 	 *
4855 	 * NOTE:
4856 	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4857 	 * tasks will have to be drained _after_ the above hn_chan_drain()
4858 	 * calls.
4859 	 */
4860 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4861 		txr = &sc->hn_tx_ring[i];
4862 
4863 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4864 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4865 	}
4866 }
4867 
4868 static void
4869 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4870 {
4871 
4872 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4873 }
4874 
4875 static void
4876 hn_suspend_mgmt(struct hn_softc *sc)
4877 {
4878 	struct task task;
4879 
4880 	HN_LOCK_ASSERT(sc);
4881 
4882 	/*
4883 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
4884 	 * through hn_mgmt_taskq.
4885 	 */
4886 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4887 	vmbus_chan_run_task(sc->hn_prichan, &task);
4888 
4889 	/*
4890 	 * Make sure that all pending management tasks are completed.
4891 	 */
4892 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4893 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4894 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4895 }
4896 
4897 static void
4898 hn_suspend(struct hn_softc *sc)
4899 {
4900 
4901 	/* Disable polling. */
4902 	hn_polling(sc, 0);
4903 
4904 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4905 		hn_suspend_data(sc);
4906 	hn_suspend_mgmt(sc);
4907 }
4908 
4909 static void
4910 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4911 {
4912 	int i;
4913 
4914 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4915 	    ("invalid TX ring count %d", tx_ring_cnt));
4916 
4917 	for (i = 0; i < tx_ring_cnt; ++i) {
4918 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4919 
4920 		mtx_lock(&txr->hn_tx_lock);
4921 		txr->hn_suspended = 0;
4922 		mtx_unlock(&txr->hn_tx_lock);
4923 	}
4924 }
4925 
4926 static void
4927 hn_resume_data(struct hn_softc *sc)
4928 {
4929 	int i;
4930 
4931 	HN_LOCK_ASSERT(sc);
4932 
4933 	/*
4934 	 * Re-enable RX.
4935 	 */
4936 	hn_rxfilter_config(sc);
4937 
4938 	/*
4939 	 * Make sure to clear suspend status on "all" TX rings,
4940 	 * since hn_tx_ring_inuse can be changed after
4941 	 * hn_suspend_data().
4942 	 */
4943 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4944 
4945 #ifdef HN_IFSTART_SUPPORT
4946 	if (!hn_use_if_start)
4947 #endif
4948 	{
4949 		/*
4950 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4951 		 * reduced.
4952 		 */
4953 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4954 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4955 	}
4956 
4957 	/*
4958 	 * Kick start TX.
4959 	 */
4960 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4961 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4962 
4963 		/*
4964 		 * Use txeof task, so that any pending oactive can be
4965 		 * cleared properly.
4966 		 */
4967 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4968 	}
4969 }
4970 
4971 static void
4972 hn_resume_mgmt(struct hn_softc *sc)
4973 {
4974 
4975 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4976 
4977 	/*
4978 	 * Kick off network change detection, if it was pending.
4979 	 * If no network change was pending, start link status
4980 	 * checks, which is more lightweight than network change
4981 	 * detection.
4982 	 */
4983 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4984 		hn_change_network(sc);
4985 	else
4986 		hn_update_link_status(sc);
4987 }
4988 
4989 static void
4990 hn_resume(struct hn_softc *sc)
4991 {
4992 
4993 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4994 		hn_resume_data(sc);
4995 	hn_resume_mgmt(sc);
4996 
4997 	/*
4998 	 * Re-enable polling if this interface is running and
4999 	 * the polling is requested.
5000 	 */
5001 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5002 		hn_polling(sc, sc->hn_pollhz);
5003 }
5004 
5005 static void
5006 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5007 {
5008 	const struct rndis_status_msg *msg;
5009 	int ofs;
5010 
5011 	if (dlen < sizeof(*msg)) {
5012 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5013 		return;
5014 	}
5015 	msg = data;
5016 
5017 	switch (msg->rm_status) {
5018 	case RNDIS_STATUS_MEDIA_CONNECT:
5019 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5020 		hn_update_link_status(sc);
5021 		break;
5022 
5023 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5024 		/* Not really useful; ignore. */
5025 		break;
5026 
5027 	case RNDIS_STATUS_NETWORK_CHANGE:
5028 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5029 		if (dlen < ofs + msg->rm_stbuflen ||
5030 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5031 			if_printf(sc->hn_ifp, "network changed\n");
5032 		} else {
5033 			uint32_t change;
5034 
5035 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5036 			    sizeof(change));
5037 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5038 			    change);
5039 		}
5040 		hn_change_network(sc);
5041 		break;
5042 
5043 	default:
5044 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5045 		    msg->rm_status);
5046 		break;
5047 	}
5048 }
5049 
5050 static int
5051 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5052 {
5053 	const struct rndis_pktinfo *pi = info_data;
5054 	uint32_t mask = 0;
5055 
5056 	while (info_dlen != 0) {
5057 		const void *data;
5058 		uint32_t dlen;
5059 
5060 		if (__predict_false(info_dlen < sizeof(*pi)))
5061 			return (EINVAL);
5062 		if (__predict_false(info_dlen < pi->rm_size))
5063 			return (EINVAL);
5064 		info_dlen -= pi->rm_size;
5065 
5066 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5067 			return (EINVAL);
5068 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5069 			return (EINVAL);
5070 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5071 		data = pi->rm_data;
5072 
5073 		switch (pi->rm_type) {
5074 		case NDIS_PKTINFO_TYPE_VLAN:
5075 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5076 				return (EINVAL);
5077 			info->vlan_info = *((const uint32_t *)data);
5078 			mask |= HN_RXINFO_VLAN;
5079 			break;
5080 
5081 		case NDIS_PKTINFO_TYPE_CSUM:
5082 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5083 				return (EINVAL);
5084 			info->csum_info = *((const uint32_t *)data);
5085 			mask |= HN_RXINFO_CSUM;
5086 			break;
5087 
5088 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5089 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5090 				return (EINVAL);
5091 			info->hash_value = *((const uint32_t *)data);
5092 			mask |= HN_RXINFO_HASHVAL;
5093 			break;
5094 
5095 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5096 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5097 				return (EINVAL);
5098 			info->hash_info = *((const uint32_t *)data);
5099 			mask |= HN_RXINFO_HASHINF;
5100 			break;
5101 
5102 		default:
5103 			goto next;
5104 		}
5105 
5106 		if (mask == HN_RXINFO_ALL) {
5107 			/* All found; done */
5108 			break;
5109 		}
5110 next:
5111 		pi = (const struct rndis_pktinfo *)
5112 		    ((const uint8_t *)pi + pi->rm_size);
5113 	}
5114 
5115 	/*
5116 	 * Final fixup.
5117 	 * - If there is no hash value, invalidate the hash info.
5118 	 */
5119 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5120 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5121 	return (0);
5122 }
5123 
/*
 * Tell whether the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len).
 *
 * Two regions that start at the same offset are always reported as
 * overlapping, whatever their lengths.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{
	bool disjoint;

	if (off < check_off) {
		/* Region starts first; it must end at or before the other. */
		disjoint = (off + len <= check_off);
	} else if (off > check_off) {
		/* Other starts first; it must end at or before the region. */
		disjoint = (check_off + check_len <= off);
	} else {
		disjoint = false;
	}
	return (!__predict_true(disjoint));
}
5137 
5138 static void
5139 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5140 {
5141 	const struct rndis_packet_msg *pkt;
5142 	struct hn_rxinfo info;
5143 	int data_off, pktinfo_off, data_len, pktinfo_len;
5144 
5145 	/*
5146 	 * Check length.
5147 	 */
5148 	if (__predict_false(dlen < sizeof(*pkt))) {
5149 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5150 		return;
5151 	}
5152 	pkt = data;
5153 
5154 	if (__predict_false(dlen < pkt->rm_len)) {
5155 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5156 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5157 		return;
5158 	}
5159 	if (__predict_false(pkt->rm_len <
5160 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5161 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5162 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5163 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5164 		    pkt->rm_pktinfolen);
5165 		return;
5166 	}
5167 	if (__predict_false(pkt->rm_datalen == 0)) {
5168 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5169 		return;
5170 	}
5171 
5172 	/*
5173 	 * Check offests.
5174 	 */
5175 #define IS_OFFSET_INVALID(ofs)			\
5176 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5177 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5178 
5179 	/* XXX Hyper-V does not meet data offset alignment requirement */
5180 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5181 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5182 		    "data offset %u\n", pkt->rm_dataoffset);
5183 		return;
5184 	}
5185 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5186 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5187 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5188 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5189 		return;
5190 	}
5191 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5192 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5193 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5194 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5195 		return;
5196 	}
5197 
5198 #undef IS_OFFSET_INVALID
5199 
5200 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5201 	data_len = pkt->rm_datalen;
5202 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5203 	pktinfo_len = pkt->rm_pktinfolen;
5204 
5205 	/*
5206 	 * Check OOB coverage.
5207 	 */
5208 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5209 		int oob_off, oob_len;
5210 
5211 		if_printf(rxr->hn_ifp, "got oobdata\n");
5212 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5213 		oob_len = pkt->rm_oobdatalen;
5214 
5215 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5216 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5217 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5218 			    pkt->rm_len, oob_off, oob_len);
5219 			return;
5220 		}
5221 
5222 		/*
5223 		 * Check against data.
5224 		 */
5225 		if (hn_rndis_check_overlap(oob_off, oob_len,
5226 		    data_off, data_len)) {
5227 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5228 			    "oob overlaps data, oob abs %d len %d, "
5229 			    "data abs %d len %d\n",
5230 			    oob_off, oob_len, data_off, data_len);
5231 			return;
5232 		}
5233 
5234 		/*
5235 		 * Check against pktinfo.
5236 		 */
5237 		if (pktinfo_len != 0 &&
5238 		    hn_rndis_check_overlap(oob_off, oob_len,
5239 		    pktinfo_off, pktinfo_len)) {
5240 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5241 			    "oob overlaps pktinfo, oob abs %d len %d, "
5242 			    "pktinfo abs %d len %d\n",
5243 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5244 			return;
5245 		}
5246 	}
5247 
5248 	/*
5249 	 * Check per-packet-info coverage and find useful per-packet-info.
5250 	 */
5251 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5252 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5253 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5254 	if (__predict_true(pktinfo_len != 0)) {
5255 		bool overlap;
5256 		int error;
5257 
5258 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5259 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5260 			    "pktinfo overflow, msglen %u, "
5261 			    "pktinfo abs %d len %d\n",
5262 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5263 			return;
5264 		}
5265 
5266 		/*
5267 		 * Check packet info coverage.
5268 		 */
5269 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5270 		    data_off, data_len);
5271 		if (__predict_false(overlap)) {
5272 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5273 			    "pktinfo overlap data, pktinfo abs %d len %d, "
5274 			    "data abs %d len %d\n",
5275 			    pktinfo_off, pktinfo_len, data_off, data_len);
5276 			return;
5277 		}
5278 
5279 		/*
5280 		 * Find useful per-packet-info.
5281 		 */
5282 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5283 		    pktinfo_len, &info);
5284 		if (__predict_false(error)) {
5285 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5286 			    "pktinfo\n");
5287 			return;
5288 		}
5289 	}
5290 
5291 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5292 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5293 		    "data overflow, msglen %u, data abs %d len %d\n",
5294 		    pkt->rm_len, data_off, data_len);
5295 		return;
5296 	}
5297 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5298 }
5299 
5300 static __inline void
5301 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5302 {
5303 	const struct rndis_msghdr *hdr;
5304 
5305 	if (__predict_false(dlen < sizeof(*hdr))) {
5306 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5307 		return;
5308 	}
5309 	hdr = data;
5310 
5311 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5312 		/* Hot data path. */
5313 		hn_rndis_rx_data(rxr, data, dlen);
5314 		/* Done! */
5315 		return;
5316 	}
5317 
5318 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5319 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5320 	else
5321 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5322 }
5323 
5324 static void
5325 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5326 {
5327 	const struct hn_nvs_hdr *hdr;
5328 
5329 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5330 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5331 		return;
5332 	}
5333 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5334 
5335 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5336 		/* Useless; ignore */
5337 		return;
5338 	}
5339 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5340 }
5341 
5342 static void
5343 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5344     const struct vmbus_chanpkt_hdr *pkt)
5345 {
5346 	struct hn_nvs_sendctx *sndc;
5347 
5348 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5349 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5350 	    VMBUS_CHANPKT_DATALEN(pkt));
5351 	/*
5352 	 * NOTE:
5353 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5354 	 * its callback.
5355 	 */
5356 }
5357 
5358 static void
5359 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5360     const struct vmbus_chanpkt_hdr *pkthdr)
5361 {
5362 	const struct vmbus_chanpkt_rxbuf *pkt;
5363 	const struct hn_nvs_hdr *nvs_hdr;
5364 	int count, i, hlen;
5365 
5366 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5367 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5368 		return;
5369 	}
5370 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5371 
5372 	/* Make sure that this is a RNDIS message. */
5373 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5374 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5375 		    nvs_hdr->nvs_type);
5376 		return;
5377 	}
5378 
5379 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5380 	if (__predict_false(hlen < sizeof(*pkt))) {
5381 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5382 		return;
5383 	}
5384 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5385 
5386 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5387 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5388 		    pkt->cp_rxbuf_id);
5389 		return;
5390 	}
5391 
5392 	count = pkt->cp_rxbuf_cnt;
5393 	if (__predict_false(hlen <
5394 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5395 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5396 		return;
5397 	}
5398 
5399 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5400 	for (i = 0; i < count; ++i) {
5401 		int ofs, len;
5402 
5403 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5404 		len = pkt->cp_rxbuf[i].rb_len;
5405 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5406 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5407 			    "ofs %d, len %d\n", i, ofs, len);
5408 			continue;
5409 		}
5410 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5411 	}
5412 
5413 	/*
5414 	 * Ack the consumed RXBUF associated w/ this channel packet,
5415 	 * so that this RXBUF can be recycled by the hypervisor.
5416 	 */
5417 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5418 }
5419 
5420 static void
5421 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5422     uint64_t tid)
5423 {
5424 	struct hn_nvs_rndis_ack ack;
5425 	int retries, error;
5426 
5427 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5428 	ack.nvs_status = HN_NVS_STATUS_OK;
5429 
5430 	retries = 0;
5431 again:
5432 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5433 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5434 	if (__predict_false(error == EAGAIN)) {
5435 		/*
5436 		 * NOTE:
5437 		 * This should _not_ happen in real world, since the
5438 		 * consumption of the TX bufring from the TX path is
5439 		 * controlled.
5440 		 */
5441 		if (rxr->hn_ack_failed == 0)
5442 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5443 		rxr->hn_ack_failed++;
5444 		retries++;
5445 		if (retries < 10) {
5446 			DELAY(100);
5447 			goto again;
5448 		}
5449 		/* RXBUF leaks! */
5450 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5451 	}
5452 }
5453 
5454 static void
5455 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5456 {
5457 	struct hn_rx_ring *rxr = xrxr;
5458 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5459 
5460 	for (;;) {
5461 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5462 		int error, pktlen;
5463 
5464 		pktlen = rxr->hn_pktbuf_len;
5465 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5466 		if (__predict_false(error == ENOBUFS)) {
5467 			void *nbuf;
5468 			int nlen;
5469 
5470 			/*
5471 			 * Expand channel packet buffer.
5472 			 *
5473 			 * XXX
5474 			 * Use M_WAITOK here, since allocation failure
5475 			 * is fatal.
5476 			 */
5477 			nlen = rxr->hn_pktbuf_len * 2;
5478 			while (nlen < pktlen)
5479 				nlen *= 2;
5480 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5481 
5482 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5483 			    rxr->hn_pktbuf_len, nlen);
5484 
5485 			free(rxr->hn_pktbuf, M_DEVBUF);
5486 			rxr->hn_pktbuf = nbuf;
5487 			rxr->hn_pktbuf_len = nlen;
5488 			/* Retry! */
5489 			continue;
5490 		} else if (__predict_false(error == EAGAIN)) {
5491 			/* No more channel packets; done! */
5492 			break;
5493 		}
5494 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5495 
5496 		switch (pkt->cph_type) {
5497 		case VMBUS_CHANPKT_TYPE_COMP:
5498 			hn_nvs_handle_comp(sc, chan, pkt);
5499 			break;
5500 
5501 		case VMBUS_CHANPKT_TYPE_RXBUF:
5502 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5503 			break;
5504 
5505 		case VMBUS_CHANPKT_TYPE_INBAND:
5506 			hn_nvs_handle_notify(sc, pkt);
5507 			break;
5508 
5509 		default:
5510 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5511 			    pkt->cph_type);
5512 			break;
5513 		}
5514 	}
5515 	hn_chan_rollup(rxr, rxr->hn_txr);
5516 }
5517 
5518 static void
5519 hn_tx_taskq_create(void *arg __unused)
5520 {
5521 	int i;
5522 
5523 	/*
5524 	 * Fix the # of TX taskqueues.
5525 	 */
5526 	if (hn_tx_taskq_cnt <= 0)
5527 		hn_tx_taskq_cnt = 1;
5528 	else if (hn_tx_taskq_cnt > mp_ncpus)
5529 		hn_tx_taskq_cnt = mp_ncpus;
5530 
5531 	/*
5532 	 * Fix the TX taskqueue mode.
5533 	 */
5534 	switch (hn_tx_taskq_mode) {
5535 	case HN_TX_TASKQ_M_INDEP:
5536 	case HN_TX_TASKQ_M_GLOBAL:
5537 	case HN_TX_TASKQ_M_EVTTQ:
5538 		break;
5539 	default:
5540 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5541 		break;
5542 	}
5543 
5544 	if (vm_guest != VM_GUEST_HV)
5545 		return;
5546 
5547 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5548 		return;
5549 
5550 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5551 	    M_DEVBUF, M_WAITOK);
5552 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5553 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5554 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5555 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5556 		    "hn tx%d", i);
5557 	}
5558 }
5559 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5560     hn_tx_taskq_create, NULL);
5561 
5562 static void
5563 hn_tx_taskq_destroy(void *arg __unused)
5564 {
5565 
5566 	if (hn_tx_taskque != NULL) {
5567 		int i;
5568 
5569 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5570 			taskqueue_free(hn_tx_taskque[i]);
5571 		free(hn_tx_taskque, M_DEVBUF);
5572 	}
5573 }
5574 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5575     hn_tx_taskq_destroy, NULL);
5576