xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 2ae6227ddfb85965d9d2a3719583d8fddad02ba1)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 #include "opt_hn.h"
57 #include "opt_inet6.h"
58 #include "opt_inet.h"
59 #include "opt_rss.h"
60 
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/bus.h>
64 #include <sys/counter.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/proc.h>
73 #include <sys/rmlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 #include <sys/epoch.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_extern.h>
88 #include <vm/pmap.h>
89 
90 #include <machine/atomic.h>
91 #include <machine/in_cksum.h>
92 
93 #include <net/bpf.h>
94 #include <net/ethernet.h>
95 #include <net/if.h>
96 #include <net/if_dl.h>
97 #include <net/if_media.h>
98 #include <net/if_types.h>
99 #include <net/if_var.h>
100 #include <net/rndis.h>
101 #include <net/rss_config.h>
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
124 #define HN_IFSTART_SUPPORT
125 
126 #define HN_RING_CNT_DEF_MAX		8
127 
128 #define HN_VFMAP_SIZE_DEF		8
129 
130 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
131 
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT			512
134 
135 #define HN_RNDIS_PKT_LEN					\
136 	(sizeof(struct rndis_packet_msg) +			\
137 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
143 
144 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
149 
150 #define HN_DIRECT_TX_SIZE_DEF		128
151 
152 #define HN_EARLY_TXEOF_THRESH		8
153 
154 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
155 
156 #define HN_LROENT_CNT_DEF		128
157 
158 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
162 
163 #define HN_LRO_ACKCNT_DEF		1
164 
165 #define HN_LOCK_INIT(sc)		\
166 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc)					\
170 do {							\
171 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
172 		/* Relinquish cpu to avoid deadlock */	\
173 		sched_relinquish(curthread);		\
174 		DELAY(1000);				\
175 	}						\
176 } while (0)
177 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
179 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc)		\
182 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc)	\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185 
186 #define HN_PKTSIZE_MIN(align)		\
187 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 	    HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align)		\
190 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
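/*
 * Illustrative example (the numbers are hypothetical): roundup2(x, align)
 * rounds x up to the next multiple of align, which must be a power of 2.
 * If a packet's m_pkthdr.len plus HN_RNDIS_PKT_LEN came to 1714 bytes and
 * the alignment were 32, HN_PKTSIZE() would yield roundup2(1714, 32) == 1728.
 */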
191 
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
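/*
 * Illustrative example (the values are hypothetical): without RSS, with
 * sc->hn_cpu == 2 and mp_ncpus == 8, ring 0 is bound to CPU 2, ring 1 to
 * CPU 3, and so on, wrapping around to CPU 0 after CPU 7.  With RSS the
 * ring index is instead folded onto the RSS bucket-to-CPU mapping.
 */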
197 
198 struct hn_txdesc {
199 #ifndef HN_USE_TXDESC_BUFRING
200 	SLIST_ENTRY(hn_txdesc)		link;
201 #endif
202 	STAILQ_ENTRY(hn_txdesc)		agg_link;
203 
204 	/* Aggregated txdescs, in sending order. */
205 	STAILQ_HEAD(, hn_txdesc)	agg_list;
206 
207 	/* The oldest packet, if transmission aggregation happens. */
208 	struct mbuf			*m;
209 	struct hn_tx_ring		*txr;
210 	int				refs;
211 	uint32_t			flags;	/* HN_TXD_FLAG_ */
212 	struct hn_nvs_sendctx		send_ctx;
213 	uint32_t			chim_index;
214 	int				chim_size;
215 
216 	bus_dmamap_t			data_dmap;
217 
218 	bus_addr_t			rndis_pkt_paddr;
219 	struct rndis_packet_msg		*rndis_pkt;
220 	bus_dmamap_t			rndis_pkt_dmap;
221 };
222 
223 #define HN_TXD_FLAG_ONLIST		0x0001
224 #define HN_TXD_FLAG_DMAMAP		0x0002
225 #define HN_TXD_FLAG_ONAGG		0x0004
226 
227 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
228 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
229 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
231 struct packet_info_id {
232 	uint8_t				ver;
233 	uint8_t				flag;
234 	uint16_t			pkt_id;
235 };
236 
237 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
240 struct hn_rxinfo {
241 	const uint32_t			*vlan_info;
242 	const uint32_t			*csum_info;
243 	const uint32_t			*hash_info;
244 	const uint32_t			*hash_value;
245 	const struct packet_info_id	*pktinfo_id;
246 };
247 
248 struct hn_rxvf_setarg {
249 	struct hn_rx_ring	*rxr;
250 	if_t			vf_ifp;
251 };
252 
253 #define HN_RXINFO_VLAN			0x0001
254 #define HN_RXINFO_CSUM			0x0002
255 #define HN_RXINFO_HASHINF		0x0004
256 #define HN_RXINFO_HASHVAL		0x0008
257 #define HN_RXINFO_PKTINFO_ID		0x0010
258 #define HN_RXINFO_ALL			\
259 	(HN_RXINFO_VLAN |		\
260 	 HN_RXINFO_CSUM |		\
261 	 HN_RXINFO_HASHINF |		\
262 	 HN_RXINFO_HASHVAL |		\
263 	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(if_t, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(if_t);
276 #endif
277 static int			hn_transmit(if_t, struct mbuf *);
278 static void			hn_xmit_qflush(if_t);
279 static int			hn_ifmedia_upd(if_t);
280 static void			hn_ifmedia_sts(if_t,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, if_t, int);
284 static void			hn_ifaddr_event(void *, if_t);
285 static void			hn_ifnet_attevent(void *, if_t);
286 static void			hn_ifnet_detevent(void *, if_t);
287 static void			hn_ifnet_lnkevent(void *, if_t, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const if_t);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    if_t, bool);
293 static void			hn_rxvf_set(struct hn_softc *, if_t);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
340 #ifndef RSS
341 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
343 #endif
344 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
346 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
358 
359 static void			hn_stop(struct hn_softc *, bool);
360 static void			hn_init_locked(struct hn_softc *);
361 static int			hn_chan_attach(struct hn_softc *,
362 				    struct vmbus_channel *);
363 static void			hn_chan_detach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static int			hn_attach_subchans(struct hn_softc *);
366 static void			hn_detach_allchans(struct hn_softc *);
367 static void			hn_chan_rollup(struct hn_rx_ring *,
368 				    struct hn_tx_ring *);
369 static void			hn_set_ring_inuse(struct hn_softc *, int);
370 static int			hn_synth_attach(struct hn_softc *, int);
371 static void			hn_synth_detach(struct hn_softc *);
372 static int			hn_synth_alloc_subchans(struct hn_softc *,
373 				    int *);
374 static bool			hn_synth_attachable(const struct hn_softc *);
375 static void			hn_suspend(struct hn_softc *);
376 static void			hn_suspend_data(struct hn_softc *);
377 static void			hn_suspend_mgmt(struct hn_softc *);
378 static void			hn_resume(struct hn_softc *);
379 static void			hn_resume_data(struct hn_softc *);
380 static void			hn_resume_mgmt(struct hn_softc *);
381 static void			hn_suspend_mgmt_taskfunc(void *, int);
382 static void			hn_chan_drain(struct hn_softc *,
383 				    struct vmbus_channel *);
384 static void			hn_disable_rx(struct hn_softc *);
385 static void			hn_drain_rxtx(struct hn_softc *, int);
386 static void			hn_polling(struct hn_softc *, u_int);
387 static void			hn_chan_polling(struct vmbus_channel *, u_int);
388 static void			hn_mtu_change_fixup(struct hn_softc *);
389 
390 static void			hn_update_link_status(struct hn_softc *);
391 static void			hn_change_network(struct hn_softc *);
392 static void			hn_link_taskfunc(void *, int);
393 static void			hn_netchg_init_taskfunc(void *, int);
394 static void			hn_netchg_status_taskfunc(void *, int);
395 static void			hn_link_status(struct hn_softc *);
396 
397 static int			hn_create_rx_data(struct hn_softc *, int);
398 static void			hn_destroy_rx_data(struct hn_softc *);
399 static int			hn_check_iplen(const struct mbuf *, int);
400 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
401 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
402 static int			hn_rxfilter_config(struct hn_softc *);
403 static int			hn_rss_reconfig(struct hn_softc *);
404 static void			hn_rss_ind_fixup(struct hn_softc *);
405 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
406 static int			hn_rxpkt(struct hn_rx_ring *);
407 static uint32_t			hn_rss_type_fromndis(uint32_t);
408 static uint32_t			hn_rss_type_tondis(uint32_t);
409 
410 static int			hn_tx_ring_create(struct hn_softc *, int);
411 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
412 static int			hn_create_tx_data(struct hn_softc *, int);
413 static void			hn_fixup_tx_data(struct hn_softc *);
414 static void			hn_fixup_rx_data(struct hn_softc *);
415 static void			hn_destroy_tx_data(struct hn_softc *);
416 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
417 static void			hn_txdesc_gc(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_encap(if_t, struct hn_tx_ring *,
420 				    struct hn_txdesc *, struct mbuf **);
421 static int			hn_txpkt(if_t, struct hn_tx_ring *,
422 				    struct hn_txdesc *);
423 static void			hn_set_chim_size(struct hn_softc *, int);
424 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
425 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
426 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
427 static void			hn_resume_tx(struct hn_softc *, int);
428 static void			hn_set_txagg(struct hn_softc *);
429 static void			*hn_try_txagg(if_t,
430 				    struct hn_tx_ring *, struct hn_txdesc *,
431 				    int);
432 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
433 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
434 				    struct hn_softc *, struct vmbus_channel *,
435 				    const void *, int);
436 static int			hn_txpkt_sglist(struct hn_tx_ring *,
437 				    struct hn_txdesc *);
438 static int			hn_txpkt_chim(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_xmit(struct hn_tx_ring *, int);
441 static void			hn_xmit_taskfunc(void *, int);
442 static void			hn_xmit_txeof(struct hn_tx_ring *);
443 static void			hn_xmit_txeof_taskfunc(void *, int);
444 #ifdef HN_IFSTART_SUPPORT
445 static int			hn_start_locked(struct hn_tx_ring *, int);
446 static void			hn_start_taskfunc(void *, int);
447 static void			hn_start_txeof(struct hn_tx_ring *);
448 static void			hn_start_txeof_taskfunc(void *, int);
449 #endif
450 
451 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
452 
453 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
454     "Hyper-V network interface");
455 
456 /* Trust tcp segment verification on host side. */
457 static int			hn_trust_hosttcp = 1;
458 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
459     &hn_trust_hosttcp, 0,
460     "Trust tcp segment verification on host side, "
461     "when csum info is missing (global setting)");
462 
463 /* Trust udp datagrams verification on host side. */
464 static int			hn_trust_hostudp = 1;
465 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
466     &hn_trust_hostudp, 0,
467     "Trust udp datagram verification on host side, "
468     "when csum info is missing (global setting)");
469 
470 /* Trust ip packets verification on host side. */
471 static int			hn_trust_hostip = 1;
472 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
473     &hn_trust_hostip, 0,
474     "Trust ip packet verification on host side, "
475     "when csum info is missing (global setting)");
476 
477 /*
478  * Offload UDP/IPv4 checksum.
479  */
480 static int			hn_enable_udp4cs = 1;
481 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
482     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
483 
484 /*
485  * Offload UDP/IPv6 checksum.
486  */
487 static int			hn_enable_udp6cs = 1;
488 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
489     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
490 
491 /* Stats. */
492 static counter_u64_t		hn_udpcs_fixup;
493 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
494     &hn_udpcs_fixup, "# of UDP checksum fixup");
495 
496 /*
497  * See hn_set_hlen().
498  *
499  * This value is for Azure.  For Hyper-V, set this above
500  * 65536 to disable UDP datagram checksum fixup.
501  */
502 static int			hn_udpcs_fixup_mtu = 1420;
503 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
504     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
505 
506 /* Limit TSO burst size */
507 static int			hn_tso_maxlen = IP_MAXPACKET;
508 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
509     &hn_tso_maxlen, 0, "TSO burst limit");
510 
511 /* Limit chimney send size */
512 static int			hn_tx_chimney_size = 0;
513 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
514     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
515 
516 /* Limit the size of packet for direct transmission */
517 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
518 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
519     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
520 
521 /* # of LRO entries per RX ring */
522 #if defined(INET) || defined(INET6)
523 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
524 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
525     &hn_lro_entry_count, 0, "LRO entry count");
526 #endif
527 
528 static int			hn_tx_taskq_cnt = 1;
529 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
530     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
531 
532 #define HN_TX_TASKQ_M_INDEP	0
533 #define HN_TX_TASKQ_M_GLOBAL	1
534 #define HN_TX_TASKQ_M_EVTTQ	2
535 
536 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
538     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
539     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
540 
541 #ifndef HN_USE_TXDESC_BUFRING
542 static int			hn_use_txdesc_bufring = 0;
543 #else
544 static int			hn_use_txdesc_bufring = 1;
545 #endif
546 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
547     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
548 
549 #ifdef HN_IFSTART_SUPPORT
550 /* Use ifnet.if_start instead of ifnet.if_transmit */
551 static int			hn_use_if_start = 0;
552 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
553     &hn_use_if_start, 0, "Use if_start TX method");
554 #endif
555 
556 /* # of channels to use */
557 static int			hn_chan_cnt = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
559     &hn_chan_cnt, 0,
560     "# of channels to use; each channel has one RX ring and one TX ring");
561 
562 /* # of transmit rings to use */
563 static int			hn_tx_ring_cnt = 0;
564 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
565     &hn_tx_ring_cnt, 0, "# of TX rings to use");
566 
567 /* Software TX ring depth */
568 static int			hn_tx_swq_depth = 0;
569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
570     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
571 
572 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
573 static u_int			hn_lro_mbufq_depth = 0;
574 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
575     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
576 
577 /* Packet transmission aggregation size limit */
578 static int			hn_tx_agg_size = -1;
579 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
580     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
581 
582 /* Packet transmission aggregation count limit */
583 static int			hn_tx_agg_pkts = -1;
584 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
585     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
586 
587 /* VF list */
588 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
589     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
590     hn_vflist_sysctl, "A",
591     "VF list");
592 
593 /* VF mapping */
594 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
595     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
596     hn_vfmap_sysctl, "A",
597     "VF mapping");
598 
599 /* Transparent VF */
600 static int			hn_xpnt_vf = 1;
601 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
602     &hn_xpnt_vf, 0, "Transparent VF mode");
603 
604 /* Accurate BPF support for Transparent VF */
605 static int			hn_xpnt_vf_accbpf = 0;
606 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
607     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
608 
609 /* Extra wait for the transparent VF attach routine; unit: seconds. */
610 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
612     &hn_xpnt_vf_attwait, 0,
613     "Extra wait for transparent VF attach routine; unit: seconds");
614 
615 static u_int			hn_cpu_index;	/* next CPU for channel */
616 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
617 
618 static struct rmlock		hn_vfmap_lock;
619 static int			hn_vfmap_size;
620 static if_t			*hn_vfmap;
621 
622 static const struct hyperv_guid	hn_guid = {
623 	.hv_guid = {
624 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
625 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
626 };
627 
628 static device_method_t hn_methods[] = {
629 	/* Device interface */
630 	DEVMETHOD(device_probe,		hn_probe),
631 	DEVMETHOD(device_attach,	hn_attach),
632 	DEVMETHOD(device_detach,	hn_detach),
633 	DEVMETHOD(device_shutdown,	hn_shutdown),
634 	DEVMETHOD_END
635 };
636 
637 static driver_t hn_driver = {
638 	"hn",
639 	hn_methods,
640 	sizeof(struct hn_softc)
641 };
642 
643 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
647 static void
648 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
649 {
650 	int i;
651 
652 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
653 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
654 }
655 
656 static int
657 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
658 {
659 
660 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
661 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
662 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
663 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
664 }
665 
666 static int
667 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
668 {
669 	struct hn_nvs_rndis rndis;
670 
671 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size > 0, ("invalid rndis chim txd"));
673 
674 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
675 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
676 	rndis.nvs_chim_idx = txd->chim_index;
677 	rndis.nvs_chim_sz = txd->chim_size;
678 
679 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
680 	    &rndis, sizeof(rndis), &txd->send_ctx));
681 }
682 
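/*
 * Illustrative example (the bitmap contents are hypothetical): with
 * LONG_BIT == 64 and sc->hn_chim_bmap[0] == 0xf, ffsl(~bmap[0]) returns 5,
 * so idx becomes 4 after the 1-based adjustment; atomic_testandset_long()
 * then claims bit 4 and hn_chim_alloc() returns chimney index 4
 * (0 * LONG_BIT + 4).
 */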
683 static __inline uint32_t
684 hn_chim_alloc(struct hn_softc *sc)
685 {
686 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
687 	u_long *bmap = sc->hn_chim_bmap;
688 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
689 
690 	for (i = 0; i < bmap_cnt; ++i) {
691 		int idx;
692 
693 		idx = ffsl(~bmap[i]);
694 		if (idx == 0)
695 			continue;
696 
697 		--idx; /* ffsl is 1-based */
698 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
699 		    ("invalid i %d and idx %d", i, idx));
700 
701 		if (atomic_testandset_long(&bmap[i], idx))
702 			continue;
703 
704 		ret = i * LONG_BIT + idx;
705 		break;
706 	}
707 	return (ret);
708 }
709 
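/*
 * Illustrative example (the index is hypothetical): freeing chimney index
 * 70 with LONG_BIT == 64 clears bit 6 (mask 1UL << 6) of
 * sc->hn_chim_bmap[1], undoing the mapping established by hn_chim_alloc()
 * above.
 */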
710 static __inline void
711 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
712 {
713 	u_long mask;
714 	uint32_t idx;
715 
716 	idx = chim_idx / LONG_BIT;
717 	KASSERT(idx < sc->hn_chim_bmap_cnt,
718 	    ("invalid chimney index 0x%x", chim_idx));
719 
720 	mask = 1UL << (chim_idx % LONG_BIT);
721 	KASSERT(sc->hn_chim_bmap[idx] & mask,
722 	    ("index bitmap 0x%lx, chimney index %u, "
723 	     "bitmap idx %d, bitmask 0x%lx",
724 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
725 
726 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
727 }
728 
729 #if defined(INET6) || defined(INET)
730 
731 #define PULLUP_HDR(m, len)				\
732 do {							\
733 	if (__predict_false((m)->m_len < (len))) {	\
734 		(m) = m_pullup((m), (len));		\
735 		if ((m) == NULL)			\
736 			return (NULL);			\
737 	}						\
738 } while (0)
739 
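/*
 * Typical use, as in hn_tso_fixup() and hn_set_hlen() below:
 * PULLUP_HDR(m_head, ehlen + sizeof(*ip)) makes sure the Ethernet and IP
 * headers are contiguous in the first mbuf before mtodo() pointer
 * arithmetic is applied; if m_pullup() fails, the mbuf is already freed
 * and the caller returns NULL.
 */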
740 /*
741  * NOTE: If this function failed, the m_head would be freed.
742  */
743 static __inline struct mbuf *
744 hn_tso_fixup(struct mbuf *m_head)
745 {
746 	struct ether_vlan_header *evl;
747 	struct tcphdr *th;
748 	int ehlen;
749 
750 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
751 
752 	PULLUP_HDR(m_head, sizeof(*evl));
753 	evl = mtod(m_head, struct ether_vlan_header *);
754 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
755 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
756 	else
757 		ehlen = ETHER_HDR_LEN;
758 	m_head->m_pkthdr.l2hlen = ehlen;
759 
760 #ifdef INET
761 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
762 		struct ip *ip;
763 		int iphlen;
764 
765 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
766 		ip = mtodo(m_head, ehlen);
767 		iphlen = ip->ip_hl << 2;
768 		m_head->m_pkthdr.l3hlen = iphlen;
769 
770 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
771 		th = mtodo(m_head, ehlen + iphlen);
772 
773 		ip->ip_len = 0;
774 		ip->ip_sum = 0;
775 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
776 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
777 	}
778 #endif
779 #if defined(INET6) && defined(INET)
780 	else
781 #endif
782 #ifdef INET6
783 	{
784 		struct ip6_hdr *ip6;
785 
786 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
787 		ip6 = mtodo(m_head, ehlen);
788 		if (ip6->ip6_nxt != IPPROTO_TCP) {
789 			m_freem(m_head);
790 			return (NULL);
791 		}
792 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
793 
794 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
795 		th = mtodo(m_head, ehlen + sizeof(*ip6));
796 
797 		ip6->ip6_plen = 0;
798 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
799 	}
800 #endif
801 	return (m_head);
802 }
803 
804 /*
805  * NOTE: If this function failed, the m_head would be freed.
806  */
807 static __inline struct mbuf *
808 hn_set_hlen(struct mbuf *m_head)
809 {
810 	const struct ether_vlan_header *evl;
811 	int ehlen;
812 
813 	PULLUP_HDR(m_head, sizeof(*evl));
814 	evl = mtod(m_head, const struct ether_vlan_header *);
815 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
816 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
817 	else
818 		ehlen = ETHER_HDR_LEN;
819 	m_head->m_pkthdr.l2hlen = ehlen;
820 
821 #ifdef INET
822 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
823 		const struct ip *ip;
824 		int iphlen;
825 
826 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
827 		ip = mtodo(m_head, ehlen);
828 		iphlen = ip->ip_hl << 2;
829 		m_head->m_pkthdr.l3hlen = iphlen;
830 
831 		/*
832 		 * UDP checksum offload does not work in Azure if the
833 		 * following conditions are met:
834 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
835 		 * - IP_DF is not set in the IP hdr.
836 		 *
837 		 * Fallback to software checksum for these UDP datagrams.
838 		 */
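		/*
		 * Illustrative example (the header length is hypothetical):
		 * with the default hn_udpcs_fixup_mtu of 1420 and a plain
		 * Ethernet header (ehlen == 14), a UDP datagram whose
		 * m_pkthdr.len exceeds 1434 bytes and whose IP header has
		 * IP_DF clear falls back to the software checksum below.
		 */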
839 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
840 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
841 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
842 			uint16_t off = ehlen + iphlen;
843 
844 			counter_u64_add(hn_udpcs_fixup, 1);
845 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
846 			*(uint16_t *)(m_head->m_data + off +
847                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
848 			    m_head, m_head->m_pkthdr.len, off);
849 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
850 		}
851 	}
852 #endif
853 #if defined(INET6) && defined(INET)
854 	else
855 #endif
856 #ifdef INET6
857 	{
858 		const struct ip6_hdr *ip6;
859 
860 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
861 		ip6 = mtodo(m_head, ehlen);
862 		if (ip6->ip6_nxt != IPPROTO_TCP &&
863 		    ip6->ip6_nxt != IPPROTO_UDP) {
864 			m_freem(m_head);
865 			return (NULL);
866 		}
867 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
868 	}
869 #endif
870 	return (m_head);
871 }
872 
873 /*
874  * NOTE: If this function failed, the m_head would be freed.
875  */
876 static __inline struct mbuf *
877 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
878 {
879 	const struct tcphdr *th;
880 	int ehlen, iphlen;
881 
882 	*tcpsyn = 0;
883 	ehlen = m_head->m_pkthdr.l2hlen;
884 	iphlen = m_head->m_pkthdr.l3hlen;
885 
886 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
887 	th = mtodo(m_head, ehlen + iphlen);
888 	if (tcp_get_flags(th) & TH_SYN)
889 		*tcpsyn = 1;
890 	return (m_head);
891 }
892 
893 #undef PULLUP_HDR
894 
895 #endif	/* INET6 || INET */
896 
897 static int
898 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
899 {
900 	int error = 0;
901 
902 	HN_LOCK_ASSERT(sc);
903 
904 	if (sc->hn_rx_filter != filter) {
905 		error = hn_rndis_set_rxfilter(sc, filter);
906 		if (!error)
907 			sc->hn_rx_filter = filter;
908 	}
909 	return (error);
910 }
911 
912 static int
913 hn_rxfilter_config(struct hn_softc *sc)
914 {
915 	if_t ifp = sc->hn_ifp;
916 	uint32_t filter;
917 
918 	HN_LOCK_ASSERT(sc);
919 
920 	/*
921 	 * If the non-transparent mode VF is activated, we don't know how
922 	 * its RX filter is configured, so put the synthetic device in
923 	 * promiscuous mode.
924 	 */
925 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
926 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
927 	} else {
928 		filter = NDIS_PACKET_TYPE_DIRECTED;
929 		if (if_getflags(ifp) & IFF_BROADCAST)
930 			filter |= NDIS_PACKET_TYPE_BROADCAST;
931 		/* TODO: support multicast list */
932 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
933 		    !if_maddr_empty(ifp))
934 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
935 	}
936 	return (hn_set_rxfilter(sc, filter));
937 }
938 
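/*
 * Summary of the policy below: aggregation is disabled (size and pkts are
 * both forced to 0) when the usable chimney space cannot hold at least two
 * minimum-sized RNDIS packets, when the negotiated packet limit is not
 * more than 1, or when the required alignment does not fit the signed
 * 16-bit per-ring fields.
 */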
939 static void
940 hn_set_txagg(struct hn_softc *sc)
941 {
942 	uint32_t size, pkts;
943 	int i;
944 
945 	/*
946 	 * Setup aggregation size.
947 	 */
948 	if (sc->hn_agg_size < 0)
949 		size = UINT32_MAX;
950 	else
951 		size = sc->hn_agg_size;
952 
953 	if (sc->hn_rndis_agg_size < size)
954 		size = sc->hn_rndis_agg_size;
955 
956 	/* NOTE: We only aggregate packets using chimney sending buffers. */
957 	if (size > (uint32_t)sc->hn_chim_szmax)
958 		size = sc->hn_chim_szmax;
959 
960 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
961 		/* Disable */
962 		size = 0;
963 		pkts = 0;
964 		goto done;
965 	}
966 
967 	/* NOTE: Type of the per TX ring setting is 'int'. */
968 	if (size > INT_MAX)
969 		size = INT_MAX;
970 
971 	/*
972 	 * Setup aggregation packet count.
973 	 */
974 	if (sc->hn_agg_pkts < 0)
975 		pkts = UINT32_MAX;
976 	else
977 		pkts = sc->hn_agg_pkts;
978 
979 	if (sc->hn_rndis_agg_pkts < pkts)
980 		pkts = sc->hn_rndis_agg_pkts;
981 
982 	if (pkts <= 1) {
983 		/* Disable */
984 		size = 0;
985 		pkts = 0;
986 		goto done;
987 	}
988 
989 	/* NOTE: Type of the per TX ring setting is 'short'. */
990 	if (pkts > SHRT_MAX)
991 		pkts = SHRT_MAX;
992 
993 done:
994 	/* NOTE: Type of the per TX ring setting is 'short'. */
995 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
996 		/* Disable */
997 		size = 0;
998 		pkts = 0;
999 	}
1000 
1001 	if (bootverbose) {
1002 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1003 		    size, pkts, sc->hn_rndis_agg_align);
1004 	}
1005 
1006 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1007 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1008 
1009 		mtx_lock(&txr->hn_tx_lock);
1010 		txr->hn_agg_szmax = size;
1011 		txr->hn_agg_pktmax = pkts;
1012 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1013 		mtx_unlock(&txr->hn_tx_lock);
1014 	}
1015 }
1016 
1017 static int
1018 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1019 {
1020 
1021 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1022 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1023 		return txr->hn_txdesc_cnt;
1024 	return hn_tx_swq_depth;
1025 }
1026 
1027 static int
1028 hn_rss_reconfig(struct hn_softc *sc)
1029 {
1030 	int error;
1031 
1032 	HN_LOCK_ASSERT(sc);
1033 
1034 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1035 		return (ENXIO);
1036 
1037 	/*
1038 	 * Disable RSS first.
1039 	 *
1040 	 * NOTE:
1041 	 * Direct reconfiguration by setting the UNCHG flags does
1042 	 * _not_ work properly.
1043 	 */
1044 	if (bootverbose)
1045 		if_printf(sc->hn_ifp, "disable RSS\n");
1046 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1047 	if (error) {
1048 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1049 		return (error);
1050 	}
1051 
1052 	/*
1053 	 * Reenable the RSS w/ the updated RSS key or indirect
1054 	 * table.
1055 	 */
1056 	if (bootverbose)
1057 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1058 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1059 	if (error) {
1060 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1061 		return (error);
1062 	}
1063 	return (0);
1064 }
1065 
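/*
 * Illustrative example (the channel counts are hypothetical): if the
 * indirect table was built for 4 channels but only 2 are now in use, every
 * entry holding 2 or 3 is rewritten to 1 (nchan - 1), so no packet is
 * steered to a channel that is no longer in use.
 */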
1066 static void
1067 hn_rss_ind_fixup(struct hn_softc *sc)
1068 {
1069 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1070 	int i, nchan;
1071 
1072 	nchan = sc->hn_rx_ring_inuse;
1073 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1074 
1075 	/*
1076 	 * Check indirect table to make sure that all channels in it
1077 	 * can be used.
1078 	 */
1079 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1080 		if (rss->rss_ind[i] >= nchan) {
1081 			if_printf(sc->hn_ifp,
1082 			    "RSS indirect table %d fixup: %u -> %d\n",
1083 			    i, rss->rss_ind[i], nchan - 1);
1084 			rss->rss_ind[i] = nchan - 1;
1085 		}
1086 	}
1087 }
1088 
1089 static int
1090 hn_ifmedia_upd(if_t ifp __unused)
1091 {
1092 
1093 	/* Ignore since autoselect is the only defined and valid media */
1094 	return (0);
1095 }
1096 
1097 static void
1098 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1099 {
1100 	struct hn_softc *sc = if_getsoftc(ifp);
1101 
1102 	ifmr->ifm_status = IFM_AVALID;
1103 	ifmr->ifm_active = IFM_ETHER;
1104 
1105 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1106 		ifmr->ifm_active |= IFM_NONE;
1107 		return;
1108 	}
1109 	ifmr->ifm_status |= IFM_ACTIVE;
1110 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1111 }
1112 
1113 static void
1114 hn_rxvf_set_task(void *xarg, int pending __unused)
1115 {
1116 	struct hn_rxvf_setarg *arg = xarg;
1117 
1118 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1119 }
1120 
1121 static void
1122 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1123 {
1124 	struct hn_rx_ring *rxr;
1125 	struct hn_rxvf_setarg arg;
1126 	struct task task;
1127 	int i;
1128 
1129 	HN_LOCK_ASSERT(sc);
1130 
1131 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1132 
1133 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1134 		rxr = &sc->hn_rx_ring[i];
1135 
1136 		if (i < sc->hn_rx_ring_inuse) {
1137 			arg.rxr = rxr;
1138 			arg.vf_ifp = vf_ifp;
1139 			vmbus_chan_run_task(rxr->hn_chan, &task);
1140 		} else {
1141 			rxr->hn_rxvf_ifp = vf_ifp;
1142 		}
1143 	}
1144 }
1145 
1146 static bool
1147 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1148 {
1149 	if_t hn_ifp;
1150 
1151 	hn_ifp = sc->hn_ifp;
1152 
1153 	if (ifp == hn_ifp)
1154 		return (false);
1155 
1156 	if (if_getalloctype(ifp) != IFT_ETHER)
1157 		return (false);
1158 
1159 	/* Ignore lagg/vlan interfaces */
1160 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1161 	    strcmp(if_getdname(ifp), "vlan") == 0)
1162 		return (false);
1163 
1164 	/*
1165 	 * During detach events if_getifaddr(ifp) might be NULL.
1166 	 * Make sure the bcmp() below doesn't panic on that:
1167 	 */
1168 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1169 		return (false);
1170 
1171 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1172 		return (false);
1173 
1174 	return (true);
1175 }
1176 
1177 static void
1178 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1179 {
1180 	if_t hn_ifp;
1181 
1182 	HN_LOCK(sc);
1183 
1184 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1185 		goto out;
1186 
1187 	if (!hn_ismyvf(sc, ifp))
1188 		goto out;
1189 	hn_ifp = sc->hn_ifp;
1190 
1191 	if (rxvf) {
1192 		if (sc->hn_flags & HN_FLAG_RXVF)
1193 			goto out;
1194 
1195 		sc->hn_flags |= HN_FLAG_RXVF;
1196 		hn_rxfilter_config(sc);
1197 	} else {
1198 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1199 			goto out;
1200 
1201 		sc->hn_flags &= ~HN_FLAG_RXVF;
1202 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1203 			hn_rxfilter_config(sc);
1204 		else
1205 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1206 	}
1207 
1208 	hn_nvs_set_datapath(sc,
1209 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1210 
1211 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1212 
1213 	if (rxvf) {
1214 		hn_vf_rss_fixup(sc, true);
1215 		hn_suspend_mgmt(sc);
1216 		sc->hn_link_flags &=
1217 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1218 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1219 	} else {
1220 		hn_vf_rss_restore(sc);
1221 		hn_resume_mgmt(sc);
1222 	}
1223 
1224 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1225 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1226 
1227 	if (bootverbose) {
1228 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1229 		    rxvf ? "to" : "from", if_name(ifp));
1230 	}
1231 out:
1232 	HN_UNLOCK(sc);
1233 }
1234 
1235 static void
1236 hn_ifnet_event(void *arg, if_t ifp, int event)
1237 {
1238 
1239 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1240 		return;
1241 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1242 }
1243 
1244 static void
1245 hn_ifaddr_event(void *arg, if_t ifp)
1246 {
1247 
1248 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1249 }
1250 
1251 static int
1252 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused)
1253 {
1254 	if_t ifp, vf_ifp;
1255 
1256 	HN_LOCK_ASSERT(sc);
1257 	ifp = sc->hn_ifp;
1258 	vf_ifp = sc->hn_vf_ifp;
1259 
1260 	/*
1261 	 * Just sync up with VF's enabled capabilities.
1262 	 */
1263 	if_setcapenable(ifp, if_getcapenable(vf_ifp));
1264 	if_sethwassist(ifp, if_gethwassist(vf_ifp));
1265 
1266 	return (0);
1267 }
1268 
1269 static int
1270 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1271 {
1272 	if_t vf_ifp;
1273 	struct ifreq ifr;
1274 
1275 	HN_LOCK_ASSERT(sc);
1276 	vf_ifp = sc->hn_vf_ifp;
1277 
1278 	memset(&ifr, 0, sizeof(ifr));
1279 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1280 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1281 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1282 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1283 }
1284 
1285 static void
1286 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1287 {
1288 	if_t ifp = sc->hn_ifp;
1289 	int allmulti = 0;
1290 
1291 	HN_LOCK_ASSERT(sc);
1292 
1293 	/* XXX vlan(4) style mcast addr maintenance */
1294 	if (!if_maddr_empty(ifp))
1295 		allmulti = IFF_ALLMULTI;
1296 
1297 	/* Always set the VF's if_flags */
1298 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1299 }
1300 
1301 static void
1302 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1303 {
1304 	struct rm_priotracker pt;
1305 	if_t hn_ifp = NULL;
1306 	struct mbuf *mn;
1307 
1308 	/*
1309 	 * XXX racy, if hn(4) ever detached.
1310 	 */
1311 	rm_rlock(&hn_vfmap_lock, &pt);
1312 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1313 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1314 	rm_runlock(&hn_vfmap_lock, &pt);
1315 
1316 	if (hn_ifp != NULL) {
1317 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1318 			/*
1319 			 * Allow tapping on the VF.
1320 			 */
1321 			ETHER_BPF_MTAP(vf_ifp, mn);
1322 
1323 			/*
1324 			 * Update VF stats.
1325 			 */
1326 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1327 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1328 				    mn->m_pkthdr.len);
1329 			}
1330 			/*
1331 			 * XXX IFCOUNTER_IMCAST
1332 			 * This stat updating is kinda invasive, since it
1333 			 * requires two checks on the mbuf: the length check
1334 			 * and the ethernet header check.  As of this writing,
1335 			 * all multicast packets go directly to hn(4), which
1336 			 * makes imcast stat updating in the VF a try in vain.
1337 			 */
1338 
1339 			/*
1340 			 * Fix up rcvif and increase hn(4)'s ipackets.
1341 			 */
1342 			mn->m_pkthdr.rcvif = hn_ifp;
1343 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1344 		}
1345 		/*
1346 		 * Go through hn(4)'s if_input.
1347 		 */
1348 		if_input(hn_ifp, m);
1349 	} else {
1350 		/*
1351 		 * In the middle of the transition; free this
1352 		 * mbuf chain.
1353 		 */
1354 		while (m != NULL) {
1355 			mn = m->m_nextpkt;
1356 			m->m_nextpkt = NULL;
1357 			m_freem(m);
1358 			m = mn;
1359 		}
1360 	}
1361 }
1362 
1363 static void
1364 hn_mtu_change_fixup(struct hn_softc *sc)
1365 {
1366 	if_t ifp;
1367 
1368 	HN_LOCK_ASSERT(sc);
1369 	ifp = sc->hn_ifp;
1370 
1371 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1372 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1373 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1374 }
1375 
1376 static uint32_t
1377 hn_rss_type_fromndis(uint32_t rss_hash)
1378 {
1379 	uint32_t types = 0;
1380 
1381 	if (rss_hash & NDIS_HASH_IPV4)
1382 		types |= RSS_TYPE_IPV4;
1383 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1384 		types |= RSS_TYPE_TCP_IPV4;
1385 	if (rss_hash & NDIS_HASH_IPV6)
1386 		types |= RSS_TYPE_IPV6;
1387 	if (rss_hash & NDIS_HASH_IPV6_EX)
1388 		types |= RSS_TYPE_IPV6_EX;
1389 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1390 		types |= RSS_TYPE_TCP_IPV6;
1391 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1392 		types |= RSS_TYPE_TCP_IPV6_EX;
1393 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1394 		types |= RSS_TYPE_UDP_IPV4;
1395 	return (types);
1396 }
1397 
1398 static uint32_t
1399 hn_rss_type_tondis(uint32_t types)
1400 {
1401 	uint32_t rss_hash = 0;
1402 
1403 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1404 	    ("UDP6 and UDP6EX are not supported"));
1405 
1406 	if (types & RSS_TYPE_IPV4)
1407 		rss_hash |= NDIS_HASH_IPV4;
1408 	if (types & RSS_TYPE_TCP_IPV4)
1409 		rss_hash |= NDIS_HASH_TCP_IPV4;
1410 	if (types & RSS_TYPE_IPV6)
1411 		rss_hash |= NDIS_HASH_IPV6;
1412 	if (types & RSS_TYPE_IPV6_EX)
1413 		rss_hash |= NDIS_HASH_IPV6_EX;
1414 	if (types & RSS_TYPE_TCP_IPV6)
1415 		rss_hash |= NDIS_HASH_TCP_IPV6;
1416 	if (types & RSS_TYPE_TCP_IPV6_EX)
1417 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1418 	if (types & RSS_TYPE_UDP_IPV4)
1419 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1420 	return (rss_hash);
1421 }
1422 
1423 static void
1424 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1425 {
1426 	int i;
1427 
1428 	HN_LOCK_ASSERT(sc);
1429 
1430 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1431 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1432 }
1433 
1434 static void
1435 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1436 {
1437 	if_t ifp, vf_ifp;
1438 	struct ifrsshash ifrh;
1439 	struct ifrsskey ifrk;
1440 	int error;
1441 	uint32_t my_types, diff_types, mbuf_types = 0;
1442 
1443 	HN_LOCK_ASSERT(sc);
1444 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1445 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1446 
1447 	if (sc->hn_rx_ring_inuse == 1) {
1448 		/* No RSS on synthetic parts; done. */
1449 		return;
1450 	}
1451 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1452 		/* Synthetic parts do not support Toeplitz; done. */
1453 		return;
1454 	}
1455 
1456 	ifp = sc->hn_ifp;
1457 	vf_ifp = sc->hn_vf_ifp;
1458 
1459 	/*
1460 	 * Extract the VF's RSS key.  Only the 40-byte Toeplitz key is
1461 	 * supported.
1462 	 */
1463 	memset(&ifrk, 0, sizeof(ifrk));
1464 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1465 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1466 	if (error) {
1467 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1468 		    if_name(vf_ifp), error);
1469 		goto done;
1470 	}
1471 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1472 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1473 		    if_name(vf_ifp), ifrk.ifrk_func);
1474 		goto done;
1475 	}
1476 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1477 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1478 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1479 		goto done;
1480 	}
1481 
1482 	/*
1483 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1484 	 */
1485 	memset(&ifrh, 0, sizeof(ifrh));
1486 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1487 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1488 	if (error) {
1489 		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
1490 		    if_name(vf_ifp), error);
1491 		goto done;
1492 	}
1493 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1494 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1495 		    if_name(vf_ifp), ifrh.ifrh_func);
1496 		goto done;
1497 	}
1498 
1499 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1500 	if ((ifrh.ifrh_types & my_types) == 0) {
1501 		/* An empty intersection would disable RSS; ignore the VF's types. */
1502 		if_printf(ifp, "%s intersection of RSS types failed.  "
1503 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1504 		    ifrh.ifrh_types, my_types);
1505 		goto done;
1506 	}
1507 
1508 	diff_types = my_types ^ ifrh.ifrh_types;
1509 	my_types &= ifrh.ifrh_types;
1510 	mbuf_types = my_types;
1511 
1512 	/*
1513 	 * Detect RSS hash value/type conflicts.
1514 	 *
1515 	 * NOTE:
1516 	 * We don't disable the hash type, but stop delivering the hash
1517 	 * value/type through mbufs on the RX path.
1518 	 *
1519 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1520 	 * hash is delivered with type of TCP_IPV4.  This means if
1521 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1522 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1523 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1524 	 * here.
1525 	 */
1526 	if ((my_types & RSS_TYPE_IPV4) &&
1527 	    (diff_types & ifrh.ifrh_types &
1528 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1529 		/* Conflict; disable IPV4 hash type/value delivery. */
1530 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1531 		mbuf_types &= ~RSS_TYPE_IPV4;
1532 	}
1533 	if ((my_types & RSS_TYPE_IPV6) &&
1534 	    (diff_types & ifrh.ifrh_types &
1535 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1536 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1537 	      RSS_TYPE_IPV6_EX))) {
1538 		/* Conflict; disable IPV6 hash type/value delivery. */
1539 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1540 		mbuf_types &= ~RSS_TYPE_IPV6;
1541 	}
1542 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1543 	    (diff_types & ifrh.ifrh_types &
1544 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1545 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1546 	      RSS_TYPE_IPV6))) {
1547 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1548 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1549 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1550 	}
1551 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1552 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1553 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1554 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1555 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1556 	}
1557 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1558 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1559 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1560 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1561 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1562 	}
1563 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1564 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1565 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1566 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1567 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1568 	}
1569 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1570 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1571 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1572 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1573 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1574 	}
1575 
1576 	/*
1577 	 * Indirect table does not matter.
1578 	 */
1579 
1580 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1581 	    hn_rss_type_tondis(my_types);
1582 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1583 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1584 
1585 	if (reconf) {
1586 		error = hn_rss_reconfig(sc);
1587 		if (error) {
1588 			/* XXX roll-back? */
1589 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1590 			/* XXX keep going. */
1591 		}
1592 	}
1593 done:
1594 	/* Hash deliverability for mbufs. */
1595 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1596 }
1597 
1598 static void
1599 hn_vf_rss_restore(struct hn_softc *sc)
1600 {
1601 
1602 	HN_LOCK_ASSERT(sc);
1603 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1604 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1605 
1606 	if (sc->hn_rx_ring_inuse == 1)
1607 		goto done;
1608 
1609 	/*
1610 	 * Restore hash types.  Key does _not_ matter.
1611 	 */
1612 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1613 		int error;
1614 
1615 		sc->hn_rss_hash = sc->hn_rss_hcap;
1616 		error = hn_rss_reconfig(sc);
1617 		if (error) {
1618 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1619 			    error);
1620 			/* XXX keep going. */
1621 		}
1622 	}
1623 done:
1624 	/* Hash deliverability for mbufs. */
1625 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1626 }
1627 
1628 static void
1629 hn_xpnt_vf_setready(struct hn_softc *sc)
1630 {
1631 	if_t ifp, vf_ifp;
1632 	struct ifreq ifr;
1633 
1634 	HN_LOCK_ASSERT(sc);
1635 	ifp = sc->hn_ifp;
1636 	vf_ifp = sc->hn_vf_ifp;
1637 
1638 	/*
1639 	 * Mark the VF ready.
1640 	 */
1641 	sc->hn_vf_rdytick = 0;
1642 
1643 	/*
1644 	 * Save information for restoration.
1645 	 */
1646 	sc->hn_saved_caps = if_getcapabilities(ifp);
1647 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1648 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1649 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1650 	sc->hn_saved_capenable = if_getcapenable(ifp);
1651 	sc->hn_saved_hwassist = if_gethwassist(ifp);
1652 
1653 	/*
1654 	 * Intersect supported/enabled capabilities.
1655 	 *
1656 	 * NOTE:
1657 	 * if_hwassist is not changed here.
1658 	 */
1659 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1660 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1661 
1662 	/*
1663 	 * Fix TSO settings.
1664 	 */
1665 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1666 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1667 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1668 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1669 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1670 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1671 
1672 	/*
1673 	 * Change VF's enabled capabilities.
1674 	 */
1675 	memset(&ifr, 0, sizeof(ifr));
1676 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1677 	ifr.ifr_reqcap = if_getcapenable(ifp);
1678 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1679 
1680 	if (if_getmtu(ifp) != ETHERMTU) {
1681 		int error;
1682 
1683 		/*
1684 		 * Change VF's MTU.
1685 		 */
1686 		memset(&ifr, 0, sizeof(ifr));
1687 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1688 		ifr.ifr_mtu = if_getmtu(ifp);
1689 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1690 		if (error) {
1691 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1692 			    if_name(vf_ifp), if_getmtu(ifp));
1693 			if (if_getmtu(ifp) > ETHERMTU) {
1694 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1695 
1696 				/*
1697 				 * XXX
1698 				 * No need to adjust the synthetic parts' MTU;
1699 				 * a failed adjustment would cause us endless
1700 				 * headaches.
1701 				 */
1702 				if_setmtu(ifp, ETHERMTU);
1703 				hn_mtu_change_fixup(sc);
1704 			}
1705 		}
1706 	}
1707 }
1708 
1709 static bool
1710 hn_xpnt_vf_isready(struct hn_softc *sc)
1711 {
1712 
1713 	HN_LOCK_ASSERT(sc);
1714 
1715 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1716 		return (false);
1717 
1718 	if (sc->hn_vf_rdytick == 0)
1719 		return (true);
1720 
1721 	if (sc->hn_vf_rdytick > ticks)
1722 		return (false);
1723 
1724 	/* Mark VF as ready. */
1725 	hn_xpnt_vf_setready(sc);
1726 	return (true);
1727 }
1728 
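/*
 * Annotation: switch the data path to the transparent VF -- set the
 * enabled flag under hn_vf_lock, so hn_transmit()/hn_qflush() start
 * using the VF, and mark every RX ring so received packets are treated
 * as coming from the VF.
 */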
1729 static void
1730 hn_xpnt_vf_setenable(struct hn_softc *sc)
1731 {
1732 	int i;
1733 
1734 	HN_LOCK_ASSERT(sc);
1735 
1736 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1737 	rm_wlock(&sc->hn_vf_lock);
1738 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1739 	rm_wunlock(&sc->hn_vf_lock);
1740 
1741 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1742 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1743 }
1744 
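/*
 * Annotation: undo hn_xpnt_vf_setenable() -- clear the enabled flag
 * (and optionally the VF ifnet pointer) under hn_vf_lock, and strip
 * the transparent VF marker from all RX rings.
 */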
1745 static void
1746 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1747 {
1748 	int i;
1749 
1750 	HN_LOCK_ASSERT(sc);
1751 
1752 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1753 	rm_wlock(&sc->hn_vf_lock);
1754 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1755 	if (clear_vf)
1756 		sc->hn_vf_ifp = NULL;
1757 	rm_wunlock(&sc->hn_vf_lock);
1758 
1759 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1760 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1761 }
1762 
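/*
 * Annotation: bring the transparent VF online -- propagate IFF_UP to
 * the VF, switch the NVS data path to the VF, fix up RSS afterwards
 * (many VFs generate their RSS key during initialization), and finally
 * mark transparent VF mode as enabled.
 */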
1763 static void
1764 hn_xpnt_vf_init(struct hn_softc *sc)
1765 {
1766 	int error;
1767 
1768 	HN_LOCK_ASSERT(sc);
1769 
1770 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1771 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1772 
1773 	if (bootverbose) {
1774 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1775 		    if_name(sc->hn_vf_ifp));
1776 	}
1777 
1778 	/*
1779 	 * Bring the VF up.
1780 	 */
1781 	hn_xpnt_vf_saveifflags(sc);
1782 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1783 	error = hn_xpnt_vf_iocsetflags(sc);
1784 	if (error) {
1785 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1786 		    if_name(sc->hn_vf_ifp), error);
1787 		return;
1788 	}
1789 
1790 	/*
1791 	 * NOTE:
1792 	 * Datapath setting must happen _after_ bringing the VF up.
1793 	 */
1794 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1795 
1796 	/*
1797 	 * NOTE:
1798 	 * Fix up RSS-related bits _after_ the VF is brought up, since
1799 	 * many VFs generate their RSS key during initialization.
1800 	 */
1801 	hn_vf_rss_fixup(sc, true);
1802 
1803 	/* Mark transparent mode VF as enabled. */
1804 	hn_xpnt_vf_setenable(sc);
1805 }
1806 
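/*
 * Annotation: timeout task scheduled by hn_ifnet_attevent() -- once the
 * VF's attach-wait period expires, mark the VF as ready and, if the
 * synthetic interface is already running, perform the delayed
 * transparent VF initialization.
 */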
1807 static void
1808 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1809 {
1810 	struct hn_softc *sc = xsc;
1811 
1812 	HN_LOCK(sc);
1813 
1814 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1815 		goto done;
1816 	if (sc->hn_vf_ifp == NULL)
1817 		goto done;
1818 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1819 		goto done;
1820 
1821 	if (sc->hn_vf_rdytick != 0) {
1822 		/* Mark VF as ready. */
1823 		hn_xpnt_vf_setready(sc);
1824 	}
1825 
1826 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1827 		/*
1828 		 * Delayed VF initialization.
1829 		 */
1830 		if (bootverbose) {
1831 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1832 			    if_name(sc->hn_vf_ifp));
1833 		}
1834 		hn_xpnt_vf_init(sc);
1835 	}
1836 done:
1837 	HN_UNLOCK(sc);
1838 }
1839 
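/*
 * Annotation: ether_ifattach event handler -- detect the arrival of
 * this device's VF, record it in the global VF map and, in transparent
 * VF mode, hook the VF's if_input, suspend the synthetic link status
 * management and schedule the delayed VF initialization.
 */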
1840 static void
1841 hn_ifnet_attevent(void *xsc, if_t ifp)
1842 {
1843 	struct hn_softc *sc = xsc;
1844 
1845 	HN_LOCK(sc);
1846 
1847 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1848 		goto done;
1849 
1850 	if (!hn_ismyvf(sc, ifp))
1851 		goto done;
1852 
1853 	if (sc->hn_vf_ifp != NULL) {
1854 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1855 		    if_name(sc->hn_vf_ifp));
1856 		goto done;
1857 	}
1858 
1859 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1860 		/*
1861 		 * ifnet.if_start is _not_ supported by transparent
1862 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1863 		 */
1864 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1865 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1866 
1867 		goto done;
1868 	}
1869 
1870 	rm_wlock(&hn_vfmap_lock);
1871 
1872 	if (if_getindex(ifp) >= hn_vfmap_size) {
1873 		if_t *newmap;
1874 		int newsize;
1875 
1876 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1877 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1878 		    M_WAITOK | M_ZERO);
1879 
1880 		memcpy(newmap, hn_vfmap,
1881 		    sizeof(if_t) * hn_vfmap_size);
1882 		free(hn_vfmap, M_DEVBUF);
1883 		hn_vfmap = newmap;
1884 		hn_vfmap_size = newsize;
1885 	}
1886 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1887 	    ("%s: ifindex %d was mapped to %s",
1888 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1889 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1890 
1891 	rm_wunlock(&hn_vfmap_lock);
1892 
1893 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1894 	rm_wlock(&sc->hn_vf_lock);
1895 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1896 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1897 	sc->hn_vf_ifp = ifp;
1898 	rm_wunlock(&sc->hn_vf_lock);
1899 
1900 	if (hn_xpnt_vf) {
1901 		int wait_ticks;
1902 
1903 		/*
1904 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1905 		 * Save vf_ifp's current if_input for later restoration.
1906 		 */
1907 		sc->hn_vf_input = if_getinputfn(ifp);
1908 		if_setinputfn(ifp, hn_xpnt_vf_input);
1909 
1910 		/*
1911 		 * Stop link status management; use the VF's.
1912 		 */
1913 		hn_suspend_mgmt(sc);
1914 
1915 		/*
1916 		 * Give the VF some time to complete its attach routine.
1917 		 */
1918 		wait_ticks = hn_xpnt_vf_attwait * hz;
1919 		sc->hn_vf_rdytick = ticks + wait_ticks;
1920 
1921 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1922 		    wait_ticks);
1923 	}
1924 done:
1925 	HN_UNLOCK(sc);
1926 }
1927 
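/*
 * Annotation: ifnet departure event handler -- undo hn_ifnet_attevent()
 * when the VF detaches: drain the delayed init task, restore if_input
 * and the saved ifnet settings, switch the data path back to the
 * synthetic NIC, restore RSS, resume link status management and drop
 * the VF from the VF map.
 */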
1928 static void
1929 hn_ifnet_detevent(void *xsc, if_t ifp)
1930 {
1931 	struct hn_softc *sc = xsc;
1932 
1933 	HN_LOCK(sc);
1934 
1935 	if (sc->hn_vf_ifp == NULL)
1936 		goto done;
1937 
1938 	if (!hn_ismyvf(sc, ifp))
1939 		goto done;
1940 
1941 	if (hn_xpnt_vf) {
1942 		/*
1943 		 * Make sure that the delayed initialization is not running.
1944 		 *
1945 		 * NOTE:
1946 		 * - This lock _must_ be released, since the hn_vf_init task
1947 		 *   will try holding this lock.
1948 		 * - It is safe to release this lock here, since the
1949 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1950 		 *
1951 		 * XXX racy, if hn(4) ever detached.
1952 		 */
1953 		HN_UNLOCK(sc);
1954 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1955 		HN_LOCK(sc);
1956 
1957 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1958 		    if_name(sc->hn_ifp)));
1959 		if_setinputfn(ifp, sc->hn_vf_input);
1960 		sc->hn_vf_input = NULL;
1961 
1962 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1963 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1964 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1965 
1966 		if (sc->hn_vf_rdytick == 0) {
1967 			/*
1968 			 * The VF was ready; restore some settings.
1969 			 */
1970 			if_setcapabilities(ifp, sc->hn_saved_caps);
1971 
1972 			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
1973 			if_sethwtsomaxsegcount(sc->hn_ifp,
1974 			    sc->hn_saved_tsosegcnt);
1975 			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
1976 
1977 			if_setcapenable(ifp, sc->hn_saved_capenable);
1978 			if_sethwassist(ifp, sc->hn_saved_hwassist);
1979 		}
1980 
1981 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1982 			/*
1983 			 * Restore RSS settings.
1984 			 */
1985 			hn_vf_rss_restore(sc);
1986 
1987 			/*
1988 			 * Resume link status management, which was suspended
1989 			 * by hn_ifnet_attevent().
1990 			 */
1991 			hn_resume_mgmt(sc);
1992 		}
1993 	}
1994 
1995 	/* Mark transparent mode VF as disabled. */
1996 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
1997 
1998 	rm_wlock(&hn_vfmap_lock);
1999 
2000 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2001 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2002 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2003 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2004 		    ("%s: ifindex %d was mapped to %s",
2005 		     if_name(ifp), if_getindex(ifp),
2006 		     if_name(hn_vfmap[if_getindex(ifp)])));
2007 		hn_vfmap[if_getindex(ifp)] = NULL;
2008 	}
2009 
2010 	rm_wunlock(&hn_vfmap_lock);
2011 done:
2012 	HN_UNLOCK(sc);
2013 }
2014 
2015 static void
2016 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2017 {
2018 	struct hn_softc *sc = xsc;
2019 
2020 	if (sc->hn_vf_ifp == ifp)
2021 		if_link_state_change(sc->hn_ifp, link_state);
2022 }
2023 
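/*
 * Annotation: read-only sysctl handlers reporting the ifnet's current
 * TSO limits (max payload size, max segment count, max segment size).
 * They are exported under the device's sysctl tree by hn_attach(),
 * e.g. (assuming unit 0) "sysctl dev.hn.0.tso_max".
 */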
2024 static int
2025 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2026 {
2027 	struct hn_softc *sc = arg1;
2028 	unsigned int tsomax;
2029 	int error;
2030 
2031 	tsomax = if_gethwtsomax(sc->hn_ifp);
2032 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2033 	return error;
2034 }
2035 
2036 static int
2037 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2038 {
2039 	struct hn_softc *sc = arg1;
2040 	unsigned int tsomaxsegcnt;
2041 	int error;
2042 
2043 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2044 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2045 	return error;
2046 }
2047 
2048 static int
2049 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2050 {
2051 	struct hn_softc *sc = arg1;
2052 	unsigned int tsomaxsegsz;
2053 	int error;
2054 
2055 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2056 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2057 	return error;
2058 }
2059 
2060 static int
2061 hn_probe(device_t dev)
2062 {
2063 
2064 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2065 		device_set_desc(dev, "Hyper-V Network Interface");
2066 		return BUS_PROBE_DEFAULT;
2067 	}
2068 	return ENXIO;
2069 }
2070 
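/*
 * Annotation: device attach -- set up the TX/management/VF taskqueues,
 * allocate the ifnet and the TX/RX rings, create the vmbus transaction
 * context, attach the synthetic parts (NVS/RNDIS), register the
 * per-device sysctl nodes, configure ifnet capabilities and finally
 * register the ifnet/VF event handlers.  On failure the partially
 * attached state is torn down through hn_detach().
 */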
2071 static int
2072 hn_attach(device_t dev)
2073 {
2074 	struct hn_softc *sc = device_get_softc(dev);
2075 	struct sysctl_oid_list *child;
2076 	struct sysctl_ctx_list *ctx;
2077 	uint8_t eaddr[ETHER_ADDR_LEN];
2078 	if_t ifp = NULL;
2079 	int error, ring_cnt, tx_ring_cnt;
2080 	uint32_t mtu;
2081 
2082 	sc->hn_dev = dev;
2083 	sc->hn_prichan = vmbus_get_channel(dev);
2084 	HN_LOCK_INIT(sc);
2085 	rm_init(&sc->hn_vf_lock, "hnvf");
2086 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2087 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2088 
2089 	/*
2090 	 * Initialize these tunables once.
2091 	 */
2092 	sc->hn_agg_size = hn_tx_agg_size;
2093 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2094 
2095 	/*
2096 	 * Setup taskqueue for transmission.
2097 	 */
2098 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2099 		int i;
2100 
2101 		sc->hn_tx_taskqs =
2102 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2103 		    M_DEVBUF, M_WAITOK);
2104 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2105 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2106 			    M_WAITOK, taskqueue_thread_enqueue,
2107 			    &sc->hn_tx_taskqs[i]);
2108 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2109 			    "%s tx%d", device_get_nameunit(dev), i);
2110 		}
2111 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2112 		sc->hn_tx_taskqs = hn_tx_taskque;
2113 	}
2114 
2115 	/*
2116 	 * Setup taskqueue for management tasks, e.g. link status.
2117 	 */
2118 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2119 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2120 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2121 	    device_get_nameunit(dev));
2122 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2123 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2124 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2125 	    hn_netchg_status_taskfunc, sc);
2126 
2127 	if (hn_xpnt_vf) {
2128 		/*
2129 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2130 		 */
2131 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2132 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2133 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2134 		    device_get_nameunit(dev));
2135 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2136 		    hn_xpnt_vf_init_taskfunc, sc);
2137 	}
2138 
2139 	/*
2140 	 * Allocate ifnet and setup its name earlier, so that if_printf
2141 	 * can be used by functions, which will be called after
2142 	 * ether_ifattach().
2143 	 */
2144 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2145 	if_setsoftc(ifp, sc);
2146 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2147 
2148 	/*
2149 	 * Initialize ifmedia earlier so that it can be unconditionally
2150 	 * destroyed, if an error happens later on.
2151 	 */
2152 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2153 
2154 	/*
2155 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2156 	 * to use (tx_ring_cnt).
2157 	 *
2158 	 * NOTE:
2159 	 * The # of RX rings to use is the same as the # of channels to use.
2160 	 */
2161 	ring_cnt = hn_chan_cnt;
2162 	if (ring_cnt <= 0) {
2163 		/* Default */
2164 		ring_cnt = mp_ncpus;
2165 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2166 			ring_cnt = HN_RING_CNT_DEF_MAX;
2167 	} else if (ring_cnt > mp_ncpus) {
2168 		ring_cnt = mp_ncpus;
2169 	}
2170 #ifdef RSS
2171 	if (ring_cnt > rss_getnumbuckets())
2172 		ring_cnt = rss_getnumbuckets();
2173 #endif
2174 
2175 	tx_ring_cnt = hn_tx_ring_cnt;
2176 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2177 		tx_ring_cnt = ring_cnt;
2178 #ifdef HN_IFSTART_SUPPORT
2179 	if (hn_use_if_start) {
2180 		/* ifnet.if_start only needs one TX ring. */
2181 		tx_ring_cnt = 1;
2182 	}
2183 #endif
2184 
2185 	/*
2186 	 * Set the leader CPU for channels.
2187 	 */
2188 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2189 
2190 	/*
2191 	 * Create enough TX/RX rings, even if only limited number of
2192 	 * channels can be allocated.
2193 	 */
2194 	error = hn_create_tx_data(sc, tx_ring_cnt);
2195 	if (error)
2196 		goto failed;
2197 	error = hn_create_rx_data(sc, ring_cnt);
2198 	if (error)
2199 		goto failed;
2200 
2201 	/*
2202 	 * Create transaction context for NVS and RNDIS transactions.
2203 	 */
2204 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2205 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2206 	if (sc->hn_xact == NULL) {
2207 		error = ENXIO;
2208 		goto failed;
2209 	}
2210 
2211 	/*
2212 	 * Install orphan handler for the revocation of this device's
2213 	 * primary channel.
2214 	 *
2215 	 * NOTE:
2216 	 * The processing order is critical here:
2217 	 * Install the orphan handler, _before_ testing whether this
2218 	 * device's primary channel has been revoked or not.
2219 	 */
2220 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2221 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2222 		error = ENXIO;
2223 		goto failed;
2224 	}
2225 
2226 	/*
2227 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2228 	 */
2229 	error = hn_synth_attach(sc, ETHERMTU);
2230 	if (error)
2231 		goto failed;
2232 
2233 	error = hn_rndis_get_eaddr(sc, eaddr);
2234 	if (error)
2235 		goto failed;
2236 
2237 	error = hn_rndis_get_mtu(sc, &mtu);
2238 	if (error)
2239 		mtu = ETHERMTU;
2240 	else if (bootverbose)
2241 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2242 
2243 	if (sc->hn_rx_ring_inuse > 1) {
2244 		/*
2245 		 * Reduce TCP segment aggregation limit for multiple
2246 		 * RX rings to increase ACK timeliness.
2247 		 */
2248 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2249 	}
2250 
2251 	/*
2252 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2253 	 */
2254 	hn_fixup_tx_data(sc);
2255 	hn_fixup_rx_data(sc);
2256 
2257 	ctx = device_get_sysctl_ctx(dev);
2258 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2259 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2260 	    &sc->hn_nvs_ver, 0, "NVS version");
2261 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2262 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2263 	    hn_ndis_version_sysctl, "A", "NDIS version");
2264 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2265 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2266 	    hn_caps_sysctl, "A", "capabilities");
2267 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2268 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2269 	    hn_hwassist_sysctl, "A", "hwassist");
2270 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2271 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2272 	    "IU", "max TSO size");
2273 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2274 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2275 	    "IU", "max # of TSO segments");
2276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2277 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2278 	    "IU", "max size of TSO segment");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_rxfilter_sysctl, "A", "rxfilter");
2282 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2283 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284 	    hn_rss_hash_sysctl, "A", "RSS hash");
2285 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2286 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2288 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2289 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2290 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2291 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2292 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2293 #ifndef RSS
2294 	/*
2295 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2296 	 */
2297 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2298 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2299 	    hn_rss_key_sysctl, "IU", "RSS key");
2300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2301 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2302 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2303 #endif
2304 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2305 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2306 	    "RNDIS offered packet transmission aggregation size limit");
2307 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2308 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2309 	    "RNDIS offered packet transmission aggregation count limit");
2310 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2311 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2312 	    "RNDIS packet transmission aggregation alignment");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2314 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_txagg_size_sysctl, "I",
2316 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2318 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_txagg_pkts_sysctl, "I",
2320 	    "Packet transmission aggregation packets, "
2321 	    "0 -- disable, -1 -- auto");
2322 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2323 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2324 	    hn_polling_sysctl, "I",
2325 	    "Polling frequency: [100,1000000], 0 disable polling");
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2327 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2328 	    hn_vf_sysctl, "A", "Virtual Function's name");
2329 	if (!hn_xpnt_vf) {
2330 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2331 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2332 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2333 	} else {
2334 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2335 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2336 		    hn_xpnt_vf_enabled_sysctl, "I",
2337 		    "Transparent VF enabled");
2338 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2339 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2340 		    hn_xpnt_vf_accbpf_sysctl, "I",
2341 		    "Accurate BPF for transparent VF");
2342 	}
2343 
2344 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2345 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "I",
2346 	    "switch to rsc");
2347 
2348 	/*
2349 	 * Setup the ifmedia, which has been initialized earlier.
2350 	 */
2351 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2352 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2353 	/* XXX ifmedia_set really should do this for us */
2354 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2355 
2356 	/*
2357 	 * Setup the ifnet for this interface.
2358 	 */
2359 
2360 	if_setbaudrate(ifp, IF_Gbps(10));
2361 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2362 	if_setioctlfn(ifp, hn_ioctl);
2363 	if_setinitfn(ifp, hn_init);
2364 #ifdef HN_IFSTART_SUPPORT
2365 	if (hn_use_if_start) {
2366 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2367 
2368 		if_setstartfn(ifp, hn_start);
2369 		if_setsendqlen(ifp, qdepth);
2370 		if_setsendqready(ifp);
2371 	} else
2372 #endif
2373 	{
2374 		if_settransmitfn(ifp, hn_transmit);
2375 		if_setqflushfn(ifp, hn_xmit_qflush);
2376 	}
2377 
2378 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2379 #ifdef foo
2380 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2381 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2382 #endif
2383 	if (sc->hn_caps & HN_CAP_VLAN) {
2384 		/* XXX not sure about VLAN_MTU. */
2385 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2386 	}
2387 
2388 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2389 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2390 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2391 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2392 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2393 	if (sc->hn_caps & HN_CAP_TSO4) {
2394 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2395 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2396 	}
2397 	if (sc->hn_caps & HN_CAP_TSO6) {
2398 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2399 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2400 	}
2401 
2402 	/* Enable all available capabilities by default. */
2403 	if_setcapenable(ifp, if_getcapabilities(ifp));
2404 
2405 	/*
2406 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2407 	 * be enabled through SIOCSIFCAP.
2408 	 */
2409 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2410 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2411 
2412 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2413 		/*
2414 		 * Lock hn_set_tso_maxsize() to simplify its
2415 		 * internal logic.
2416 		 */
2417 		HN_LOCK(sc);
2418 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2419 		HN_UNLOCK(sc);
2420 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2421 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2422 	}
2423 
2424 	ether_ifattach(ifp, eaddr);
2425 
2426 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2427 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2428 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2429 	}
2430 	if (mtu < ETHERMTU) {
2431 
2432 		if_setmtu(ifp, mtu);
2433 	}
2434 
2435 	/* Inform the upper layer about the long frame support. */
2436 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2437 
2438 	/*
2439 	 * Kick off link status check.
2440 	 */
2441 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2442 	hn_update_link_status(sc);
2443 
2444 	if (!hn_xpnt_vf) {
2445 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2446 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2447 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2448 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2449 	} else {
2450 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2451 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2452 	}
2453 
2454 	/*
2455 	 * NOTE:
2456 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2457 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2458 	 * yet available when the ifnet_arrival event is triggered.
2459 	 */
2460 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2461 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2462 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2463 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2464 
2465 	return (0);
2466 failed:
2467 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2468 		hn_synth_detach(sc);
2469 	hn_detach(dev);
2470 	return (error);
2471 }
2472 
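/*
 * Annotation: device detach -- deregister the event handlers, detach a
 * lingering VF, stop and detach the synthetic parts, free the rings,
 * the taskqueues and the vmbus transaction context, then release the
 * ifnet and the locks.  Also used by hn_attach() for error cleanup.
 */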
2473 static int
2474 hn_detach(device_t dev)
2475 {
2476 	struct hn_softc *sc = device_get_softc(dev);
2477 	if_t ifp = sc->hn_ifp, vf_ifp;
2478 
2479 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2480 		/*
2481 		 * In case the vmbus missed the orphan handler
2482 		 * installation.
2483 		 */
2484 		vmbus_xact_ctx_orphan(sc->hn_xact);
2485 	}
2486 
2487 	if (sc->hn_ifaddr_evthand != NULL)
2488 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2489 	if (sc->hn_ifnet_evthand != NULL)
2490 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2491 	if (sc->hn_ifnet_atthand != NULL) {
2492 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2493 		    sc->hn_ifnet_atthand);
2494 	}
2495 	if (sc->hn_ifnet_dethand != NULL) {
2496 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2497 		    sc->hn_ifnet_dethand);
2498 	}
2499 	if (sc->hn_ifnet_lnkhand != NULL)
2500 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2501 
2502 	vf_ifp = sc->hn_vf_ifp;
2503 	__compiler_membar();
2504 	if (vf_ifp != NULL)
2505 		hn_ifnet_detevent(sc, vf_ifp);
2506 
2507 	if (device_is_attached(dev)) {
2508 		HN_LOCK(sc);
2509 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2510 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2511 				hn_stop(sc, true);
2512 			/*
2513 			 * NOTE:
2514 			 * hn_stop() only suspends data, so management
2515 			 * stuffs have to be suspended manually here.
2516 			 */
2517 			hn_suspend_mgmt(sc);
2518 			hn_synth_detach(sc);
2519 		}
2520 		HN_UNLOCK(sc);
2521 		ether_ifdetach(ifp);
2522 	}
2523 
2524 	ifmedia_removeall(&sc->hn_media);
2525 	hn_destroy_rx_data(sc);
2526 	hn_destroy_tx_data(sc);
2527 
2528 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2529 		int i;
2530 
2531 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2532 			taskqueue_free(sc->hn_tx_taskqs[i]);
2533 		free(sc->hn_tx_taskqs, M_DEVBUF);
2534 	}
2535 	taskqueue_free(sc->hn_mgmt_taskq0);
2536 	if (sc->hn_vf_taskq != NULL)
2537 		taskqueue_free(sc->hn_vf_taskq);
2538 
2539 	if (sc->hn_xact != NULL) {
2540 		/*
2541 		 * Uninstall the orphan handler _before_ the xact is
2542 		 * destructed.
2543 		 */
2544 		vmbus_chan_unset_orphan(sc->hn_prichan);
2545 		vmbus_xact_ctx_destroy(sc->hn_xact);
2546 	}
2547 
2548 	if_free(ifp);
2549 
2550 	HN_LOCK_DESTROY(sc);
2551 	rm_destroy(&sc->hn_vf_lock);
2552 	return (0);
2553 }
2554 
2555 static int
2556 hn_shutdown(device_t dev)
2557 {
2558 
2559 	return (0);
2560 }
2561 
2562 static void
2563 hn_link_status(struct hn_softc *sc)
2564 {
2565 	uint32_t link_status;
2566 	int error;
2567 
2568 	error = hn_rndis_get_linkstatus(sc, &link_status);
2569 	if (error) {
2570 		/* XXX what to do? */
2571 		return;
2572 	}
2573 
2574 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2575 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2576 	else
2577 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2578 	if_link_state_change(sc->hn_ifp,
2579 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2580 	    LINK_STATE_UP : LINK_STATE_DOWN);
2581 }
2582 
2583 static void
2584 hn_link_taskfunc(void *xsc, int pending __unused)
2585 {
2586 	struct hn_softc *sc = xsc;
2587 
2588 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2589 		return;
2590 	hn_link_status(sc);
2591 }
2592 
2593 static void
2594 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2595 {
2596 	struct hn_softc *sc = xsc;
2597 
2598 	/* Prevent any link status checks from running. */
2599 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2600 
2601 	/*
2602 	 * Fake up a [link down --> link up] state change; a 5 second
2603 	 * delay is used, which closely simulates the miibus reaction
2604 	 * to a link down event.
2605 	 */
2606 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2607 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2608 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2609 	    &sc->hn_netchg_status, 5 * hz);
2610 }
2611 
2612 static void
2613 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2614 {
2615 	struct hn_softc *sc = xsc;
2616 
2617 	/* Re-allow link status checks. */
2618 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2619 	hn_link_status(sc);
2620 }
2621 
2622 static void
2623 hn_update_link_status(struct hn_softc *sc)
2624 {
2625 
2626 	if (sc->hn_mgmt_taskq != NULL)
2627 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2628 }
2629 
2630 static void
2631 hn_change_network(struct hn_softc *sc)
2632 {
2633 
2634 	if (sc->hn_mgmt_taskq != NULL)
2635 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2636 }
2637 
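/*
 * Annotation: load an outgoing mbuf chain into the txdesc's DMA map.
 * If the chain has too many segments (EFBIG), collapse it down to
 * HN_TX_DATA_SEGCNT_MAX segments and retry once; on success the map
 * is synced for PREWRITE and the txdesc is flagged as DMA mapped.
 */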
2638 static __inline int
2639 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2640     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2641 {
2642 	struct mbuf *m = *m_head;
2643 	int error;
2644 
2645 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2646 
2647 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2648 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2649 	if (error == EFBIG) {
2650 		struct mbuf *m_new;
2651 
2652 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2653 		if (m_new == NULL)
2654 			return ENOBUFS;
2655 		else
2656 			*m_head = m = m_new;
2657 		txr->hn_tx_collapsed++;
2658 
2659 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2660 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2661 	}
2662 	if (!error) {
2663 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2664 		    BUS_DMASYNC_PREWRITE);
2665 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2666 	}
2667 	return error;
2668 }
2669 
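/*
 * Annotation: release one reference on a txdesc.  When the last
 * reference is dropped, free any aggregated txdescs, return the
 * chimney buffer or unload the DMA map, free the mbuf and put the
 * descriptor back onto the free list (or buf_ring).  Returns 1 if the
 * txdesc was actually freed, 0 if references remain.
 */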
2670 static __inline int
2671 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2672 {
2673 
2674 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2675 	    ("put an onlist txd %#x", txd->flags));
2676 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2677 	    ("put an onagg txd %#x", txd->flags));
2678 
2679 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2680 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2681 		return 0;
2682 
2683 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2684 		struct hn_txdesc *tmp_txd;
2685 
2686 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2687 			int freed __diagused;
2688 
2689 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2690 			    ("resursive aggregation on aggregated txdesc"));
2691 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2692 			    ("not aggregated txdesc"));
2693 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2694 			    ("aggregated txdesc uses dmamap"));
2695 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2696 			    ("aggregated txdesc consumes "
2697 			     "chimney sending buffer"));
2698 			KASSERT(tmp_txd->chim_size == 0,
2699 			    ("aggregated txdesc has non-zero "
2700 			     "chimney sending size"));
2701 
2702 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2703 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2704 			freed = hn_txdesc_put(txr, tmp_txd);
2705 			KASSERT(freed, ("failed to free aggregated txdesc"));
2706 		}
2707 	}
2708 
2709 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2710 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2711 		    ("chim txd uses dmamap"));
2712 		hn_chim_free(txr->hn_sc, txd->chim_index);
2713 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2714 		txd->chim_size = 0;
2715 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2716 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2717 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2718 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2719 		    txd->data_dmap);
2720 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2721 	}
2722 
2723 	if (txd->m != NULL) {
2724 		m_freem(txd->m);
2725 		txd->m = NULL;
2726 	}
2727 
2728 	txd->flags |= HN_TXD_FLAG_ONLIST;
2729 #ifndef HN_USE_TXDESC_BUFRING
2730 	mtx_lock_spin(&txr->hn_txlist_spin);
2731 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2732 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2733 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2734 	txr->hn_txdesc_avail++;
2735 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2736 	mtx_unlock_spin(&txr->hn_txlist_spin);
2737 #else	/* HN_USE_TXDESC_BUFRING */
2738 #ifdef HN_DEBUG
2739 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2740 #endif
2741 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2742 #endif	/* !HN_USE_TXDESC_BUFRING */
2743 
2744 	return 1;
2745 }
2746 
2747 static __inline struct hn_txdesc *
2748 hn_txdesc_get(struct hn_tx_ring *txr)
2749 {
2750 	struct hn_txdesc *txd;
2751 
2752 #ifndef HN_USE_TXDESC_BUFRING
2753 	mtx_lock_spin(&txr->hn_txlist_spin);
2754 	txd = SLIST_FIRST(&txr->hn_txlist);
2755 	if (txd != NULL) {
2756 		KASSERT(txr->hn_txdesc_avail > 0,
2757 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2758 		txr->hn_txdesc_avail--;
2759 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2760 	}
2761 	mtx_unlock_spin(&txr->hn_txlist_spin);
2762 #else
2763 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2764 #endif
2765 
2766 	if (txd != NULL) {
2767 #ifdef HN_USE_TXDESC_BUFRING
2768 #ifdef HN_DEBUG
2769 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2770 #endif
2771 #endif	/* HN_USE_TXDESC_BUFRING */
2772 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2773 		    STAILQ_EMPTY(&txd->agg_list) &&
2774 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2775 		    txd->chim_size == 0 &&
2776 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2777 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2778 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2779 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2780 		txd->refs = 1;
2781 	}
2782 	return txd;
2783 }
2784 
2785 static __inline void
2786 hn_txdesc_hold(struct hn_txdesc *txd)
2787 {
2788 
2789 	/* 0->1 transition will never work */
2790 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2791 	atomic_add_int(&txd->refs, 1);
2792 }
2793 
2794 static __inline void
2795 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2796 {
2797 
2798 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2799 	    ("recursive aggregation on aggregating txdesc"));
2800 
2801 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2802 	    ("already aggregated"));
2803 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2804 	    ("recursive aggregation on to-be-aggregated txdesc"));
2805 
2806 	txd->flags |= HN_TXD_FLAG_ONAGG;
2807 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2808 }
2809 
2810 static bool
2811 hn_tx_ring_pending(struct hn_tx_ring *txr)
2812 {
2813 	bool pending = false;
2814 
2815 #ifndef HN_USE_TXDESC_BUFRING
2816 	mtx_lock_spin(&txr->hn_txlist_spin);
2817 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2818 		pending = true;
2819 	mtx_unlock_spin(&txr->hn_txlist_spin);
2820 #else
2821 	if (!buf_ring_full(txr->hn_txdesc_br))
2822 		pending = true;
2823 #endif
2824 	return (pending);
2825 }
2826 
2827 static __inline void
2828 hn_txeof(struct hn_tx_ring *txr)
2829 {
2830 	txr->hn_has_txeof = 0;
2831 	txr->hn_txeof(txr);
2832 }
2833 
2834 static void
2835 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2836     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2837 {
2838 	struct hn_txdesc *txd = sndc->hn_cbarg;
2839 	struct hn_tx_ring *txr;
2840 
2841 	txr = txd->txr;
2842 	KASSERT(txr->hn_chan == chan,
2843 	    ("channel mismatch, on chan%u, should be chan%u",
2844 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2845 
2846 	txr->hn_has_txeof = 1;
2847 	hn_txdesc_put(txr, txd);
2848 
2849 	++txr->hn_txdone_cnt;
2850 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2851 		txr->hn_txdone_cnt = 0;
2852 		if (txr->hn_oactive)
2853 			hn_txeof(txr);
2854 	}
2855 }
2856 
2857 static void
2858 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2859 {
2860 #if defined(INET) || defined(INET6)
2861 	struct epoch_tracker et;
2862 
2863 	NET_EPOCH_ENTER(et);
2864 	tcp_lro_flush_all(&rxr->hn_lro);
2865 	NET_EPOCH_EXIT(et);
2866 #endif
2867 
2868 	/*
2869 	 * NOTE:
2870 	 * 'txr' could be NULL, if multiple channels and
2871 	 * ifnet.if_start method are enabled.
2872 	 */
2873 	if (txr == NULL || !txr->hn_has_txeof)
2874 		return;
2875 
2876 	txr->hn_txdone_cnt = 0;
2877 	hn_txeof(txr);
2878 }
2879 
2880 static __inline uint32_t
2881 hn_rndis_pktmsg_offset(uint32_t ofs)
2882 {
2883 
2884 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2885 	    ("invalid RNDIS packet msg offset %u", ofs));
2886 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2887 }
2888 
2889 static __inline void *
2890 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2891     size_t pi_dlen, uint32_t pi_type)
2892 {
2893 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2894 	struct rndis_pktinfo *pi;
2895 
2896 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2897 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2898 
2899 	/*
2900 	 * Per-packet-info does not move; it only grows.
2901 	 *
2902 	 * NOTE:
2903 	 * rm_pktinfooffset in this phase counts from the beginning
2904 	 * of rndis_packet_msg.
2905 	 */
2906 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2907 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2908 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2909 	    pkt->rm_pktinfolen);
2910 	pkt->rm_pktinfolen += pi_size;
2911 
2912 	pi->rm_size = pi_size;
2913 	pi->rm_type = pi_type;
2914 	pi->rm_internal = 0;
2915 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2916 
2917 	return (pi->rm_data);
2918 }
2919 
2920 static __inline int
2921 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2922 {
2923 	struct hn_txdesc *txd;
2924 	struct mbuf *m;
2925 	int error, pkts;
2926 
2927 	txd = txr->hn_agg_txd;
2928 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2929 
2930 	/*
2931 	 * Since hn_txpkt() will reset this temporary stat, save
2932 	 * it now, so that oerrors can be updated properly, if
2933 	 * hn_txpkt() ever fails.
2934 	 */
2935 	pkts = txr->hn_stat_pkts;
2936 
2937 	/*
2938 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2939 	 * failure, save it for later freeing, if hn_txpkt() ever
2940 	 * fails.
2941 	 */
2942 	m = txd->m;
2943 	error = hn_txpkt(ifp, txr, txd);
2944 	if (__predict_false(error)) {
2945 		/* txd is freed, but m is not. */
2946 		m_freem(m);
2947 
2948 		txr->hn_flush_failed++;
2949 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2950 	}
2951 
2952 	/* Reset all aggregation states. */
2953 	txr->hn_agg_txd = NULL;
2954 	txr->hn_agg_szleft = 0;
2955 	txr->hn_agg_pktleft = 0;
2956 	txr->hn_agg_prevpkt = NULL;
2957 
2958 	return (error);
2959 }
2960 
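/*
 * Annotation: try to place this packet into the chimney (copy) sending
 * buffer.  If an aggregating txdesc is pending and has room, append to
 * it; otherwise flush the pending aggregation, allocate a fresh chimney
 * buffer and, if the aggregation limits allow, start a new aggregating
 * txdesc.  Returns a pointer into the chimney buffer, or NULL if no
 * chimney buffer could be allocated.
 */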
2961 static void *
2962 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2963     int pktsize)
2964 {
2965 	void *chim;
2966 
2967 	if (txr->hn_agg_txd != NULL) {
2968 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2969 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2970 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2971 			int olen;
2972 
2973 			/*
2974 			 * Update the previous RNDIS packet's total length,
2975 			 * it can be increased due to the mandatory alignment
2976 			 * padding for this RNDIS packet.  And update the
2977 			 * aggregating txdesc's chimney sending buffer size
2978 			 * accordingly.
2979 			 *
2980 			 * XXX
2981 			 * Zero-out the padding, as required by the RNDIS spec.
2982 			 */
2983 			olen = pkt->rm_len;
2984 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2985 			agg_txd->chim_size += pkt->rm_len - olen;
2986 
2987 			/* Link this txdesc to the parent. */
2988 			hn_txdesc_agg(agg_txd, txd);
2989 
2990 			chim = (uint8_t *)pkt + pkt->rm_len;
2991 			/* Save the current packet for later fixup. */
2992 			txr->hn_agg_prevpkt = chim;
2993 
2994 			txr->hn_agg_pktleft--;
2995 			txr->hn_agg_szleft -= pktsize;
2996 			if (txr->hn_agg_szleft <=
2997 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2998 				/*
2999 				 * Probably can't aggregate more packets,
3000 				 * flush this aggregating txdesc proactively.
3001 				 */
3002 				txr->hn_agg_pktleft = 0;
3003 			}
3004 			/* Done! */
3005 			return (chim);
3006 		}
3007 		hn_flush_txagg(ifp, txr);
3008 	}
3009 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3010 
3011 	txr->hn_tx_chimney_tried++;
3012 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3013 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3014 		return (NULL);
3015 	txr->hn_tx_chimney++;
3016 
3017 	chim = txr->hn_sc->hn_chim +
3018 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3019 
3020 	if (txr->hn_agg_pktmax > 1 &&
3021 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3022 		txr->hn_agg_txd = txd;
3023 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3024 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3025 		txr->hn_agg_prevpkt = chim;
3026 	}
3027 	return (chim);
3028 }
3029 
3030 /*
3031  * NOTE:
3032  * If this function fails, then both txd and m_head0 will be freed.
3033  */
3034 static int
3035 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3036     struct mbuf **m_head0)
3037 {
3038 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3039 	int error, nsegs, i;
3040 	struct mbuf *m_head = *m_head0;
3041 	struct rndis_packet_msg *pkt;
3042 	uint32_t *pi_data;
3043 	void *chim = NULL;
3044 	int pkt_hlen, pkt_size;
3045 
3046 	pkt = txd->rndis_pkt;
3047 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3048 	if (pkt_size < txr->hn_chim_size) {
3049 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3050 		if (chim != NULL)
3051 			pkt = chim;
3052 	} else {
3053 		if (txr->hn_agg_txd != NULL)
3054 			hn_flush_txagg(ifp, txr);
3055 	}
3056 
3057 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3058 	pkt->rm_len = m_head->m_pkthdr.len;
3059 	pkt->rm_dataoffset = 0;
3060 	pkt->rm_datalen = m_head->m_pkthdr.len;
3061 	pkt->rm_oobdataoffset = 0;
3062 	pkt->rm_oobdatalen = 0;
3063 	pkt->rm_oobdataelements = 0;
3064 	pkt->rm_pktinfooffset = sizeof(*pkt);
3065 	pkt->rm_pktinfolen = 0;
3066 	pkt->rm_vchandle = 0;
3067 	pkt->rm_reserved = 0;
3068 
3069 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3070 		/*
3071 		 * Set the hash value for this packet.
3072 		 */
3073 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3074 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3075 
3076 		if (M_HASHTYPE_ISHASH(m_head))
3077 			/*
3078 			 * The flowid field contains the hash value the host
3079 			 * set in the rx queue, if this is an IP forwarding pkt.
3080 			 * Set the same hash value so the host can send on the
3081 			 * cpu on which it was received.
3082 			 */
3083 			*pi_data = m_head->m_pkthdr.flowid;
3084 		else
3085 			/*
3086 			 * Otherwise just put the tx queue index.
3087 			 */
3088 			*pi_data = txr->hn_tx_idx;
3089 	}
3090 
3091 	if (m_head->m_flags & M_VLANTAG) {
3092 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3093 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3094 		*pi_data = NDIS_VLAN_INFO_MAKE(
3095 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3096 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3097 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3098 	}
3099 
3100 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3101 #if defined(INET6) || defined(INET)
3102 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3103 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3104 #ifdef INET
3105 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3106 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3107 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3108 			    m_head->m_pkthdr.tso_segsz);
3109 		}
3110 #endif
3111 #if defined(INET6) && defined(INET)
3112 		else
3113 #endif
3114 #ifdef INET6
3115 		{
3116 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3117 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3118 			    m_head->m_pkthdr.tso_segsz);
3119 		}
3120 #endif
3121 #endif	/* INET6 || INET */
3122 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3123 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3124 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3125 		if (m_head->m_pkthdr.csum_flags &
3126 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3127 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3128 		} else {
3129 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3130 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3131 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3132 		}
3133 
3134 		if (m_head->m_pkthdr.csum_flags &
3135 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3136 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3137 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3138 		} else if (m_head->m_pkthdr.csum_flags &
3139 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3140 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3141 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3142 		}
3143 	}
3144 
3145 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3146 	/* Fixup RNDIS packet message total length */
3147 	pkt->rm_len += pkt_hlen;
3148 	/* Convert RNDIS packet message offsets */
3149 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3150 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3151 
3152 	/*
3153 	 * Fast path: Chimney sending.
3154 	 */
3155 	if (chim != NULL) {
3156 		struct hn_txdesc *tgt_txd = txd;
3157 
3158 		if (txr->hn_agg_txd != NULL) {
3159 			tgt_txd = txr->hn_agg_txd;
3160 #ifdef INVARIANTS
3161 			*m_head0 = NULL;
3162 #endif
3163 		}
3164 
3165 		KASSERT(pkt == chim,
3166 		    ("RNDIS pkt not in chimney sending buffer"));
3167 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3168 		    ("chimney sending buffer is not used"));
3169 		tgt_txd->chim_size += pkt->rm_len;
3170 
3171 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3172 		    ((uint8_t *)chim) + pkt_hlen);
3173 
3174 		txr->hn_gpa_cnt = 0;
3175 		txr->hn_sendpkt = hn_txpkt_chim;
3176 		goto done;
3177 	}
3178 
3179 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3180 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3181 	    ("chimney buffer is used"));
3182 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3183 
3184 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3185 	if (__predict_false(error)) {
3186 		int freed __diagused;
3187 
3188 		/*
3189 		 * This mbuf is not linked w/ the txd yet, so free it now.
3190 		 */
3191 		m_freem(m_head);
3192 		*m_head0 = NULL;
3193 
3194 		freed = hn_txdesc_put(txr, txd);
3195 		KASSERT(freed != 0,
3196 		    ("fail to free txd upon txdma error"));
3197 
3198 		txr->hn_txdma_failed++;
3199 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3200 		return error;
3201 	}
3202 	*m_head0 = m_head;
3203 
3204 	/* +1 RNDIS packet message */
3205 	txr->hn_gpa_cnt = nsegs + 1;
3206 
3207 	/* send packet with page buffer */
3208 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3209 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3210 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3211 
3212 	/*
3213 	 * Fill the page buffers with mbuf info after the page
3214 	 * buffer for RNDIS packet message.
3215 	 */
3216 	for (i = 0; i < nsegs; ++i) {
3217 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3218 
3219 		gpa->gpa_page = atop(segs[i].ds_addr);
3220 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3221 		gpa->gpa_len = segs[i].ds_len;
3222 	}
3223 
3224 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3225 	txd->chim_size = 0;
3226 	txr->hn_sendpkt = hn_txpkt_sglist;
3227 done:
3228 	txd->m = m_head;
3229 
3230 	/* Set the completion routine */
3231 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3232 
3233 	/* Update temporary stats for later use. */
3234 	txr->hn_stat_pkts++;
3235 	txr->hn_stat_size += m_head->m_pkthdr.len;
3236 	if (m_head->m_flags & M_MCAST)
3237 		txr->hn_stat_mcasts++;
3238 
3239 	return 0;
3240 }
3241 
3242 /*
3243  * NOTE:
3244  * If this function fails, then txd will be freed, but the mbuf
3245  * associated w/ the txd will _not_ be freed.
3246  */
3247 static int
3248 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3249 {
3250 	int error, send_failed = 0, has_bpf;
3251 
3252 again:
3253 	has_bpf = bpf_peers_present_if(ifp);
3254 	if (has_bpf) {
3255 		/*
3256 		 * Make sure that this txd and any aggregated txds are not
3257 		 * freed before ETHER_BPF_MTAP.
3258 		 */
3259 		hn_txdesc_hold(txd);
3260 	}
3261 	error = txr->hn_sendpkt(txr, txd);
3262 	if (!error) {
3263 		if (has_bpf) {
3264 			const struct hn_txdesc *tmp_txd;
3265 
3266 			ETHER_BPF_MTAP(ifp, txd->m);
3267 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3268 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3269 		}
3270 
3271 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3272 #ifdef HN_IFSTART_SUPPORT
3273 		if (!hn_use_if_start)
3274 #endif
3275 		{
3276 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3277 			    txr->hn_stat_size);
3278 			if (txr->hn_stat_mcasts != 0) {
3279 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3280 				    txr->hn_stat_mcasts);
3281 			}
3282 		}
3283 		txr->hn_pkts += txr->hn_stat_pkts;
3284 		txr->hn_sends++;
3285 	}
3286 	if (has_bpf)
3287 		hn_txdesc_put(txr, txd);
3288 
3289 	if (__predict_false(error)) {
3290 		int freed __diagused;
3291 
3292 		/*
3293 		 * This should "really rarely" happen.
3294 		 *
3295 		 * XXX Too many RX to be acked or too many sideband
3296 		 * commands to run?  Ask netvsc_channel_rollup()
3297 		 * to kick start later.
3298 		 */
3299 		txr->hn_has_txeof = 1;
3300 		if (!send_failed) {
3301 			txr->hn_send_failed++;
3302 			send_failed = 1;
3303 			/*
3304 			 * Try sending again after setting hn_has_txeof,
3305 			 * in case we missed the last
3306 			 * netvsc_channel_rollup().
3307 			 */
3308 			goto again;
3309 		}
3310 		if_printf(ifp, "send failed\n");
3311 
3312 		/*
3313 		 * Caller will perform further processing on the
3314 		 * associated mbuf, so don't free it in hn_txdesc_put();
3315 		 * only unload it from the DMA map in hn_txdesc_put(),
3316 		 * if it was loaded.
3317 		 */
3318 		txd->m = NULL;
3319 		freed = hn_txdesc_put(txr, txd);
3320 		KASSERT(freed != 0,
3321 		    ("fail to free txd upon send error"));
3322 
3323 		txr->hn_send_failed++;
3324 	}
3325 
3326 	/* Reset temporary stats, after this sending is done. */
3327 	txr->hn_stat_size = 0;
3328 	txr->hn_stat_pkts = 0;
3329 	txr->hn_stat_mcasts = 0;
3330 
3331 	return (error);
3332 }
3333 
3334 /*
3335  * Append the specified data to the indicated mbuf chain.
3336  * Extend the mbuf chain if the new data does not fit in
3337  * existing space.
3338  *
3339  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3340  * There should be an equivalent in the kernel mbuf code,
3341  * but there does not appear to be one yet.
3342  *
3343  * Differs from m_append() in that additional mbufs are
3344  * allocated with cluster size MJUMPAGESIZE, and filled
3345  * accordingly.
3346  *
3347  * Return the last mbuf in the chain, or NULL if a new mbuf
3348  * could not be allocated.
3349  */
3350 static struct mbuf *
3351 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3352 {
3353 	struct mbuf *m, *n;
3354 	int remainder, space;
3355 
3356 	for (m = m0; m->m_next != NULL; m = m->m_next)
3357 		;
3358 	remainder = len;
3359 	space = M_TRAILINGSPACE(m);
3360 	if (space > 0) {
3361 		/*
3362 		 * Copy into available space.
3363 		 */
3364 		if (space > remainder)
3365 			space = remainder;
3366 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3367 		m->m_len += space;
3368 		cp += space;
3369 		remainder -= space;
3370 	}
3371 	while (remainder > 0) {
3372 		/*
3373 		 * Allocate a new mbuf; could check space
3374 		 * and allocate a cluster instead.
3375 		 */
3376 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3377 		if (n == NULL)
3378 			return NULL;
3379 		n->m_len = min(MJUMPAGESIZE, remainder);
3380 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3381 		cp += n->m_len;
3382 		remainder -= n->m_len;
3383 		m->m_next = n;
3384 		m = n;
3385 	}
3386 
3387 	return m;
3388 }
3389 
3390 #if defined(INET) || defined(INET6)
3391 static __inline int
3392 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3393 {
3394 	if (hn_lro_mbufq_depth) {
3395 		tcp_lro_queue_mbuf(lc, m);
3396 		return 0;
3397 	}
3398 	return tcp_lro_rx(lc, m, 0);
3399 }
3400 #endif
3401 
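/*
 * Annotation: receive path -- assemble an mbuf (chain) from the RSC
 * fragments and apply host-provided or heuristically trusted checksum
 * offload results, VLAN tag and RSS hash information before the packet
 * is eventually handed up the stack (via LRO where applicable).  With
 * a non-transparent VF the packet is attributed to the VF ifnet
 * instead of the synthetic one.
 */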
3402 static int
3403 hn_rxpkt(struct hn_rx_ring *rxr)
3404 {
3405 	if_t ifp, hn_ifp = rxr->hn_ifp;
3406 	struct mbuf *m_new, *n;
3407 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3408 	int hash_type = M_HASHTYPE_NONE;
3409 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3410 	int i;
3411 
3412 	ifp = hn_ifp;
3413 	if (rxr->hn_rxvf_ifp != NULL) {
3414 		/*
3415 		 * Non-transparent mode VF; pretend this packet is from
3416 		 * the VF.
3417 		 */
3418 		ifp = rxr->hn_rxvf_ifp;
3419 		is_vf = 1;
3420 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3421 		/* Transparent mode VF. */
3422 		is_vf = 1;
3423 	}
3424 
3425 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3426 		/*
3427 		 * NOTE:
3428 		 * See the NOTE of hn_rndis_init_fixat().  This
3429 		 * function can be reached immediately after the
3430 		 * RNDIS is initialized but before the ifnet is
3431 		 * setup on the hn_attach() path; drop the unexpected
3432 		 * packets.
3433 		 */
3434 		return (0);
3435 	}
3436 
3437 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3438 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3439 		return (0);
3440 	}
3441 
3442 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3443 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3444 		if (m_new == NULL) {
3445 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3446 			return (0);
3447 		}
3448 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3449 		    rxr->rsc.frag_len[0]);
3450 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3451 	} else {
3452 		/*
3453 		 * Get an mbuf with a cluster.  For packets 2K or less,
3454 		 * get a standard 2K cluster.  For anything larger, get a
3455 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3456 		 * if looped around to the Hyper-V TX channel, so avoid them.
3457 		 */
3458 		size = MCLBYTES;
3459 		if (rxr->rsc.pktlen > MCLBYTES) {
3460 			/* 4096 */
3461 			size = MJUMPAGESIZE;
3462 		}
3463 
3464 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3465 		if (m_new == NULL) {
3466 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3467 			return (0);
3468 		}
3469 
3470 		n = m_new;
3471 		for (i = 0; i < rxr->rsc.cnt; i++) {
3472 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3473 			    rxr->rsc.frag_data[i]);
3474 			if (n == NULL) {
3475 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3476 				return (0);
3477 			} else {
3478 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3479 			}
3480 		}
3481 	}
3482 	if (rxr->rsc.pktlen <= MHLEN)
3483 		rxr->hn_small_pkts++;
3484 
3485 	m_new->m_pkthdr.rcvif = ifp;
3486 
3487 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3488 		do_csum = 0;
3489 
3490 	/* receive side checksum offload */
3491 	if (rxr->rsc.csum_info != NULL) {
3492 		/* IP csum offload */
3493 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3494 			m_new->m_pkthdr.csum_flags |=
3495 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3496 			rxr->hn_csum_ip++;
3497 		}
3498 
3499 		/* TCP/UDP csum offload */
3500 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3501 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3502 			m_new->m_pkthdr.csum_flags |=
3503 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3504 			m_new->m_pkthdr.csum_data = 0xffff;
3505 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3506 				rxr->hn_csum_tcp++;
3507 			else
3508 				rxr->hn_csum_udp++;
3509 		}
3510 
3511 		/*
3512 		 * XXX
3513 		 * As of this writing (Oct 28th, 2016), the host side will turn
3514 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3515 		 * the do_lro setting here is actually _not_ accurate.  We
3516 		 * depend on the RSS hash type check to reset do_lro.
3517 		 */
3518 		if ((*(rxr->rsc.csum_info) &
3519 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3520 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3521 			do_lro = 1;
3522 	} else {
3523 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3524 		if (l3proto == ETHERTYPE_IP) {
3525 			if (l4proto == IPPROTO_TCP) {
3526 				if (do_csum &&
3527 				    (rxr->hn_trust_hcsum &
3528 				     HN_TRUST_HCSUM_TCP)) {
3529 					rxr->hn_csum_trusted++;
3530 					m_new->m_pkthdr.csum_flags |=
3531 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3532 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3533 					m_new->m_pkthdr.csum_data = 0xffff;
3534 				}
3535 				do_lro = 1;
3536 			} else if (l4proto == IPPROTO_UDP) {
3537 				if (do_csum &&
3538 				    (rxr->hn_trust_hcsum &
3539 				     HN_TRUST_HCSUM_UDP)) {
3540 					rxr->hn_csum_trusted++;
3541 					m_new->m_pkthdr.csum_flags |=
3542 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3543 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3544 					m_new->m_pkthdr.csum_data = 0xffff;
3545 				}
3546 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3547 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3548 				rxr->hn_csum_trusted++;
3549 				m_new->m_pkthdr.csum_flags |=
3550 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3551 			}
3552 		}
3553 	}
3554 
3555 	if (rxr->rsc.vlan_info != NULL) {
3556 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3557 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3558 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3559 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3560 		m_new->m_flags |= M_VLANTAG;
3561 	}
3562 
3563 	/*
3564 	 * If VF is activated (transparent/non-transparent mode does not
3565 	 * matter here).
3566 	 *
3567 	 * - Disable LRO
3568 	 *
3569 	 *   hn(4) will only receive broadcast packets, multicast packets,
3570 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3571 	 *   packet types.
3572 	 *
3573 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3574 	 *   all, since the LRO flush will use hn(4) as the receiving
3575 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3576 	 */
3577 	if (is_vf)
3578 		do_lro = 0;
3579 
3580 	/*
3581 	 * If VF is activated (transparent/non-transparent mode does not
3582 	 * matter here), do _not_ mess with unsupported hash types or
3583 	 * functions.
3584 	 */
3585 	if (rxr->rsc.hash_info != NULL) {
3586 		rxr->hn_rss_pkts++;
3587 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3588 		if (!is_vf)
3589 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3590 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3591 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3592 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3593 			    rxr->hn_mbuf_hash);
3594 
3595 			/*
3596 			 * NOTE:
3597 			 * do_lro is reset if the hash types are not TCP
3598 			 * related.  See the comment in the above csum_flags
3599 			 * setup section.
3600 			 */
3601 			switch (type) {
3602 			case NDIS_HASH_IPV4:
3603 				hash_type = M_HASHTYPE_RSS_IPV4;
3604 				do_lro = 0;
3605 				break;
3606 
3607 			case NDIS_HASH_TCP_IPV4:
3608 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3609 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3610 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3611 
3612 					if (is_vf)
3613 						def_htype = M_HASHTYPE_NONE;
3614 
3615 					/*
3616 					 * UDP 4-tuple hash is delivered as
3617 					 * TCP 4-tuple hash.
3618 					 */
3619 					if (l3proto == ETHERTYPE_MAX) {
3620 						hn_rxpkt_proto(m_new,
3621 						    &l3proto, &l4proto);
3622 					}
3623 					if (l3proto == ETHERTYPE_IP) {
3624 						if (l4proto == IPPROTO_UDP &&
3625 						    (rxr->hn_mbuf_hash &
3626 						     NDIS_HASH_UDP_IPV4_X)) {
3627 							hash_type =
3628 							M_HASHTYPE_RSS_UDP_IPV4;
3629 							do_lro = 0;
3630 						} else if (l4proto !=
3631 						    IPPROTO_TCP) {
3632 							hash_type = def_htype;
3633 							do_lro = 0;
3634 						}
3635 					} else {
3636 						hash_type = def_htype;
3637 						do_lro = 0;
3638 					}
3639 				}
3640 				break;
3641 
3642 			case NDIS_HASH_IPV6:
3643 				hash_type = M_HASHTYPE_RSS_IPV6;
3644 				do_lro = 0;
3645 				break;
3646 
3647 			case NDIS_HASH_IPV6_EX:
3648 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3649 				do_lro = 0;
3650 				break;
3651 
3652 			case NDIS_HASH_TCP_IPV6:
3653 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3654 				break;
3655 
3656 			case NDIS_HASH_TCP_IPV6_EX:
3657 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3658 				break;
3659 			}
3660 		}
3661 	} else if (!is_vf) {
3662 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3663 		hash_type = M_HASHTYPE_OPAQUE;
3664 	}
3665 	M_HASHTYPE_SET(m_new, hash_type);
3666 
3667 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3668 	if (hn_ifp != ifp) {
3669 		const struct ether_header *eh;
3670 
3671 		/*
3672 		 * Non-transparent mode VF is activated.
3673 		 */
3674 
3675 		/*
3676 		 * Allow tapping on hn(4).
3677 		 */
3678 		ETHER_BPF_MTAP(hn_ifp, m_new);
3679 
3680 		/*
3681 		 * Update hn(4)'s stats.
3682 		 */
3683 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3684 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3685 		/* Checked at the beginning of this function. */
3686 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3687 		eh = mtod(m_new, struct ether_header *);
3688 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3689 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3690 	}
3691 	rxr->hn_pkts++;
3692 
3693 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3694 #if defined(INET) || defined(INET6)
3695 		struct lro_ctrl *lro = &rxr->hn_lro;
3696 
3697 		if (lro->lro_cnt) {
3698 			rxr->hn_lro_tried++;
3699 			if (hn_lro_rx(lro, m_new) == 0) {
3700 				/* DONE! */
3701 				return 0;
3702 			}
3703 		}
3704 #endif
3705 	}
3706 	if_input(ifp, m_new);
3707 
3708 	return (0);
3709 }
3710 
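/*
 * Interface ioctl handler.  MTU changes require detaching and reattaching
 * the synthetic parts (NVS and RNDIS); flag, capability, multicast and
 * media requests are also propagated to the transparent mode VF when one
 * is ready.  RSS key/hash queries report the Toeplitz parameters in use.
 */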
3711 static int
3712 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3713 {
3714 	struct hn_softc *sc = if_getsoftc(ifp);
3715 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3716 	if_t vf_ifp;
3717 	int mask, error = 0;
3718 	struct ifrsskey *ifrk;
3719 	struct ifrsshash *ifrh;
3720 	uint32_t mtu;
3721 
3722 	switch (cmd) {
3723 	case SIOCSIFMTU:
3724 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3725 			error = EINVAL;
3726 			break;
3727 		}
3728 
3729 		HN_LOCK(sc);
3730 
3731 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3732 			HN_UNLOCK(sc);
3733 			break;
3734 		}
3735 
3736 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3737 			/* Can't change MTU */
3738 			HN_UNLOCK(sc);
3739 			error = EOPNOTSUPP;
3740 			break;
3741 		}
3742 
3743 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3744 			HN_UNLOCK(sc);
3745 			break;
3746 		}
3747 
3748 		if (hn_xpnt_vf_isready(sc)) {
3749 			vf_ifp = sc->hn_vf_ifp;
3750 			ifr_vf = *ifr;
3751 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3752 			    sizeof(ifr_vf.ifr_name));
3753 			error = ifhwioctl(SIOCSIFMTU,vf_ifp,
3754 			    (caddr_t)&ifr_vf, curthread);
3755 			if (error) {
3756 				HN_UNLOCK(sc);
3757 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3758 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3759 				break;
3760 			}
3761 		}
3762 
3763 		/*
3764 		 * Suspend this interface before the synthetic parts
3765 		 * are ripped.
3766 		 */
3767 		hn_suspend(sc);
3768 
3769 		/*
3770 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3771 		 */
3772 		hn_synth_detach(sc);
3773 
3774 		/*
3775 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3776 		 * with the new MTU setting.
3777 		 */
3778 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3779 		if (error) {
3780 			HN_UNLOCK(sc);
3781 			break;
3782 		}
3783 
3784 		error = hn_rndis_get_mtu(sc, &mtu);
3785 		if (error)
3786 			mtu = ifr->ifr_mtu;
3787 		else if (bootverbose)
3788 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3789 
3790 		/*
3791 		 * Commit the requested MTU, after the synthetic parts
3792 		 * have been successfully attached.
3793 		 */
3794 		if (mtu >= ifr->ifr_mtu) {
3795 			mtu = ifr->ifr_mtu;
3796 		} else {
3797 			if_printf(ifp, "fixup mtu %d -> %u\n",
3798 			    ifr->ifr_mtu, mtu);
3799 		}
3800 		if_setmtu(ifp, mtu);
3801 
3802 		/*
3803 		 * Synthetic parts' reattach may change the chimney
3804 		 * sending size; update it.
3805 		 */
3806 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3807 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3808 
3809 		/*
3810 		 * Make sure that various parameters based on MTU are
3811 		 * still valid, after the MTU change.
3812 		 */
3813 		hn_mtu_change_fixup(sc);
3814 
3815 		/*
3816 		 * All done!  Resume the interface now.
3817 		 */
3818 		hn_resume(sc);
3819 
3820 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3821 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3822 			/*
3823 			 * Since we have reattached the NVS part,
3824 			 * change the datapath to VF again, in case
3825 			 * it was lost after the NVS was detached.
3826 			 */
3827 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3828 		}
3829 
3830 		HN_UNLOCK(sc);
3831 		break;
3832 
3833 	case SIOCSIFFLAGS:
3834 		HN_LOCK(sc);
3835 
3836 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3837 			HN_UNLOCK(sc);
3838 			break;
3839 		}
3840 
3841 		if (hn_xpnt_vf_isready(sc))
3842 			hn_xpnt_vf_saveifflags(sc);
3843 
3844 		if (if_getflags(ifp) & IFF_UP) {
3845 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3846 				/*
3847 				 * Caller might hold a mutex, e.g.
3848 				 * bpf; use busy-wait for the RNDIS
3849 				 * reply.
3850 				 */
3851 				HN_NO_SLEEPING(sc);
3852 				hn_rxfilter_config(sc);
3853 				HN_SLEEPING_OK(sc);
3854 
3855 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3856 					error = hn_xpnt_vf_iocsetflags(sc);
3857 			} else {
3858 				hn_init_locked(sc);
3859 			}
3860 		} else {
3861 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3862 				hn_stop(sc, false);
3863 		}
3864 		sc->hn_if_flags = if_getflags(ifp);
3865 
3866 		HN_UNLOCK(sc);
3867 		break;
3868 
3869 	case SIOCSIFCAP:
3870 		HN_LOCK(sc);
3871 
3872 		if (hn_xpnt_vf_isready(sc)) {
3873 			ifr_vf = *ifr;
3874 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3875 			    sizeof(ifr_vf.ifr_name));
3876 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3877 			HN_UNLOCK(sc);
3878 			break;
3879 		}
3880 
3881 		/*
3882 		 * Fix up requested capabilities w/ supported capabilities,
3883 		 * since the supported capabilities could have been changed.
3884 		 */
3885 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3886 		    if_getcapenable(ifp);
3887 
3888 		if (mask & IFCAP_TXCSUM) {
3889 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3890 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3891 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3892 			else
3893 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3894 		}
3895 		if (mask & IFCAP_TXCSUM_IPV6) {
3896 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3897 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3898 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3899 			else
3900 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3901 		}
3902 
3903 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3904 		if (mask & IFCAP_RXCSUM)
3905 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3906 #ifdef foo
3907 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3908 		if (mask & IFCAP_RXCSUM_IPV6)
3909 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3910 #endif
3911 
3912 		if (mask & IFCAP_LRO)
3913 			if_togglecapenable(ifp, IFCAP_LRO);
3914 
3915 		if (mask & IFCAP_TSO4) {
3916 			if_togglecapenable(ifp, IFCAP_TSO4);
3917 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3918 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3919 			else
3920 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3921 		}
3922 		if (mask & IFCAP_TSO6) {
3923 			if_togglecapenable(ifp, IFCAP_TSO6);
3924 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3925 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3926 			else
3927 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3928 		}
3929 
3930 		HN_UNLOCK(sc);
3931 		break;
3932 
3933 	case SIOCADDMULTI:
3934 	case SIOCDELMULTI:
3935 		HN_LOCK(sc);
3936 
3937 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3938 			HN_UNLOCK(sc);
3939 			break;
3940 		}
3941 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3942 			/*
3943 			 * Multicast handling uses a mutex; use busy-wait for
3944 			 * the RNDIS reply.
3945 			 */
3946 			HN_NO_SLEEPING(sc);
3947 			hn_rxfilter_config(sc);
3948 			HN_SLEEPING_OK(sc);
3949 		}
3950 
3951 		/* XXX vlan(4) style mcast addr maintenance */
3952 		if (hn_xpnt_vf_isready(sc)) {
3953 			int old_if_flags;
3954 
3955 			old_if_flags = if_getflags(sc->hn_vf_ifp);
3956 			hn_xpnt_vf_saveifflags(sc);
3957 
3958 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3959 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
3960 			     IFF_ALLMULTI))
3961 				error = hn_xpnt_vf_iocsetflags(sc);
3962 		}
3963 
3964 		HN_UNLOCK(sc);
3965 		break;
3966 
3967 	case SIOCSIFMEDIA:
3968 	case SIOCGIFMEDIA:
3969 		HN_LOCK(sc);
3970 		if (hn_xpnt_vf_isready(sc)) {
3971 			/*
3972 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3973 			 * create and pass ifr_vf to the VF here; just
3974 			 * replace the ifr_name.
3975 			 */
3976 			vf_ifp = sc->hn_vf_ifp;
3977 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
3978 			    sizeof(ifr->ifr_name));
3979 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
3980 			/* Restore the ifr_name. */
3981 			strlcpy(ifr->ifr_name, if_name(ifp),
3982 			    sizeof(ifr->ifr_name));
3983 			HN_UNLOCK(sc);
3984 			break;
3985 		}
3986 		HN_UNLOCK(sc);
3987 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3988 		break;
3989 
3990 	case SIOCGIFRSSHASH:
3991 		ifrh = (struct ifrsshash *)data;
3992 		HN_LOCK(sc);
3993 		if (sc->hn_rx_ring_inuse == 1) {
3994 			HN_UNLOCK(sc);
3995 			ifrh->ifrh_func = RSS_FUNC_NONE;
3996 			ifrh->ifrh_types = 0;
3997 			break;
3998 		}
3999 
4000 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4001 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4002 		else
4003 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4004 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4005 		HN_UNLOCK(sc);
4006 		break;
4007 
4008 	case SIOCGIFRSSKEY:
4009 		ifrk = (struct ifrsskey *)data;
4010 		HN_LOCK(sc);
4011 		if (sc->hn_rx_ring_inuse == 1) {
4012 			HN_UNLOCK(sc);
4013 			ifrk->ifrk_func = RSS_FUNC_NONE;
4014 			ifrk->ifrk_keylen = 0;
4015 			break;
4016 		}
4017 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4018 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4019 		else
4020 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4021 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4022 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4023 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4024 		HN_UNLOCK(sc);
4025 		break;
4026 
4027 	default:
4028 		error = ether_ioctl(ifp, cmd, data);
4029 		break;
4030 	}
4031 	return (error);
4032 }
4033 
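/*
 * Stop the interface: clear IFF_DRV_RUNNING, disable polling, switch the
 * datapath back to synthetic and bring down the transparent mode VF if one
 * is enabled, then suspend data transfers.
 */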
4034 static void
4035 hn_stop(struct hn_softc *sc, bool detaching)
4036 {
4037 	if_t ifp = sc->hn_ifp;
4038 	int i;
4039 
4040 	HN_LOCK_ASSERT(sc);
4041 
4042 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4043 	    ("synthetic parts were not attached"));
4044 
4045 	/* Clear RUNNING bit ASAP. */
4046 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4047 
4048 	/* Disable polling. */
4049 	hn_polling(sc, 0);
4050 
4051 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4052 		KASSERT(sc->hn_vf_ifp != NULL,
4053 		    ("%s: VF is not attached", if_name(ifp)));
4054 
4055 		/* Mark transparent mode VF as disabled. */
4056 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4057 
4058 		/*
4059 		 * NOTE:
4060 		 * Datapath setting must happen _before_ bringing
4061 		 * the VF down.
4062 		 */
4063 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4064 
4065 		/*
4066 		 * Bring the VF down.
4067 		 */
4068 		hn_xpnt_vf_saveifflags(sc);
4069 		if_setflagbits(ifp, 0, IFF_UP);
4070 		hn_xpnt_vf_iocsetflags(sc);
4071 	}
4072 
4073 	/* Suspend data transfers. */
4074 	hn_suspend_data(sc);
4075 
4076 	/* Clear OACTIVE bit. */
4077 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4078 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4079 		sc->hn_tx_ring[i].hn_oactive = 0;
4080 
4081 	/*
4082 	 * If the non-transparent mode VF is active, make sure
4083 	 * that the RX filter still allows packet reception.
4084 	 */
4085 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4086 		hn_rxfilter_config(sc);
4087 }
4088 
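/*
 * Bring the interface up: program the RX filter, clear OACTIVE and the
 * TX suspended state, initialize the transparent mode VF if it is ready,
 * and re-enable polling if it was requested.
 */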
4089 static void
4090 hn_init_locked(struct hn_softc *sc)
4091 {
4092 	if_t ifp = sc->hn_ifp;
4093 	int i;
4094 
4095 	HN_LOCK_ASSERT(sc);
4096 
4097 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4098 		return;
4099 
4100 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4101 		return;
4102 
4103 	/* Configure RX filter */
4104 	hn_rxfilter_config(sc);
4105 
4106 	/* Clear OACTIVE bit. */
4107 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4108 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4109 		sc->hn_tx_ring[i].hn_oactive = 0;
4110 
4111 	/* Clear TX 'suspended' bit. */
4112 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4113 
4114 	if (hn_xpnt_vf_isready(sc)) {
4115 		/* Initialize transparent VF. */
4116 		hn_xpnt_vf_init(sc);
4117 	}
4118 
4119 	/* Everything is ready; unleash! */
4120 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4121 
4122 	/* Re-enable polling if requested. */
4123 	if (sc->hn_pollhz > 0)
4124 		hn_polling(sc, sc->hn_pollhz);
4125 }
4126 
4127 static void
4128 hn_init(void *xsc)
4129 {
4130 	struct hn_softc *sc = xsc;
4131 
4132 	HN_LOCK(sc);
4133 	hn_init_locked(sc);
4134 	HN_UNLOCK(sc);
4135 }
4136 
4137 static int
4138 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4139 {
4140 	struct hn_softc *sc = arg1;
4141 	unsigned int lenlim;
4142 	int error;
4143 
4144 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4145 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4146 	if (error || req->newptr == NULL)
4147 		return error;
4148 
4149 	HN_LOCK(sc);
4150 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4151 	    lenlim > TCP_LRO_LENGTH_MAX) {
4152 		HN_UNLOCK(sc);
4153 		return EINVAL;
4154 	}
4155 	hn_set_lro_lenlim(sc, lenlim);
4156 	HN_UNLOCK(sc);
4157 
4158 	return 0;
4159 }
4160 
4161 static int
4162 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4163 {
4164 	struct hn_softc *sc = arg1;
4165 	int ackcnt, error, i;
4166 
4167 	/*
4168 	 * lro_ackcnt_lim is the append count limit;
4169 	 * +1 turns it into the aggregation limit.
4170 	 */
4171 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4172 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4173 	if (error || req->newptr == NULL)
4174 		return error;
4175 
4176 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4177 		return EINVAL;
4178 
4179 	/*
4180 	 * Convert aggregation limit back to append
4181 	 * count limit.
4182 	 */
4183 	--ackcnt;
4184 	HN_LOCK(sc);
4185 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4186 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4187 	HN_UNLOCK(sc);
4188 	return 0;
4189 }
4190 
4191 static int
4192 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4193 {
4194 	struct hn_softc *sc = arg1;
4195 	int hcsum = arg2;
4196 	int on, error, i;
4197 
4198 	on = 0;
4199 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4200 		on = 1;
4201 
4202 	error = sysctl_handle_int(oidp, &on, 0, req);
4203 	if (error || req->newptr == NULL)
4204 		return error;
4205 
4206 	HN_LOCK(sc);
4207 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4208 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4209 
4210 		if (on)
4211 			rxr->hn_trust_hcsum |= hcsum;
4212 		else
4213 			rxr->hn_trust_hcsum &= ~hcsum;
4214 	}
4215 	HN_UNLOCK(sc);
4216 	return 0;
4217 }
4218 
4219 static int
4220 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4221 {
4222 	struct hn_softc *sc = arg1;
4223 	int chim_size, error;
4224 
4225 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4226 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4227 	if (error || req->newptr == NULL)
4228 		return error;
4229 
4230 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4231 		return EINVAL;
4232 
4233 	HN_LOCK(sc);
4234 	hn_set_chim_size(sc, chim_size);
4235 	HN_UNLOCK(sc);
4236 	return 0;
4237 }
4238 
4239 static int
4240 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4241 {
4242 	struct hn_softc *sc = arg1;
4243 	int ofs = arg2, i, error;
4244 	struct hn_rx_ring *rxr;
4245 	uint64_t stat;
4246 
4247 	stat = 0;
4248 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4249 		rxr = &sc->hn_rx_ring[i];
4250 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4251 	}
4252 
4253 	error = sysctl_handle_64(oidp, &stat, 0, req);
4254 	if (error || req->newptr == NULL)
4255 		return error;
4256 
4257 	/* Zero out this stat. */
4258 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4259 		rxr = &sc->hn_rx_ring[i];
4260 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4261 	}
4262 	return 0;
4263 }
4264 
4265 static int
4266 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4267 {
4268 	struct hn_softc *sc = arg1;
4269 	int ofs = arg2, i, error;
4270 	struct hn_rx_ring *rxr;
4271 	u_long stat;
4272 
4273 	stat = 0;
4274 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4275 		rxr = &sc->hn_rx_ring[i];
4276 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4277 	}
4278 
4279 	error = sysctl_handle_long(oidp, &stat, 0, req);
4280 	if (error || req->newptr == NULL)
4281 		return error;
4282 
4283 	/* Zero out this stat. */
4284 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4285 		rxr = &sc->hn_rx_ring[i];
4286 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4287 	}
4288 	return 0;
4289 }
4290 
4291 static int
4292 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4293 {
4294 	struct hn_softc *sc = arg1;
4295 	int ofs = arg2, i, error;
4296 	struct hn_tx_ring *txr;
4297 	u_long stat;
4298 
4299 	stat = 0;
4300 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4301 		txr = &sc->hn_tx_ring[i];
4302 		stat += *((u_long *)((uint8_t *)txr + ofs));
4303 	}
4304 
4305 	error = sysctl_handle_long(oidp, &stat, 0, req);
4306 	if (error || req->newptr == NULL)
4307 		return error;
4308 
4309 	/* Zero out this stat. */
4310 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4311 		txr = &sc->hn_tx_ring[i];
4312 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4313 	}
4314 	return 0;
4315 }
4316 
4317 static int
4318 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4319 {
4320 	struct hn_softc *sc = arg1;
4321 	int ofs = arg2, i, error, conf;
4322 	struct hn_tx_ring *txr;
4323 
4324 	txr = &sc->hn_tx_ring[0];
4325 	conf = *((int *)((uint8_t *)txr + ofs));
4326 
4327 	error = sysctl_handle_int(oidp, &conf, 0, req);
4328 	if (error || req->newptr == NULL)
4329 		return error;
4330 
4331 	HN_LOCK(sc);
4332 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4333 		txr = &sc->hn_tx_ring[i];
4334 		*((int *)((uint8_t *)txr + ofs)) = conf;
4335 	}
4336 	HN_UNLOCK(sc);
4337 
4338 	return 0;
4339 }
4340 
4341 static int
4342 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4343 {
4344 	struct hn_softc *sc = arg1;
4345 	int error, size;
4346 
4347 	size = sc->hn_agg_size;
4348 	error = sysctl_handle_int(oidp, &size, 0, req);
4349 	if (error || req->newptr == NULL)
4350 		return (error);
4351 
4352 	HN_LOCK(sc);
4353 	sc->hn_agg_size = size;
4354 	hn_set_txagg(sc);
4355 	HN_UNLOCK(sc);
4356 
4357 	return (0);
4358 }
4359 
4360 static int
4361 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4362 {
4363 	struct hn_softc *sc = arg1;
4364 	int error, pkts;
4365 
4366 	pkts = sc->hn_agg_pkts;
4367 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4368 	if (error || req->newptr == NULL)
4369 		return (error);
4370 
4371 	HN_LOCK(sc);
4372 	sc->hn_agg_pkts = pkts;
4373 	hn_set_txagg(sc);
4374 	HN_UNLOCK(sc);
4375 
4376 	return (0);
4377 }
4378 
4379 static int
4380 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4381 {
4382 	struct hn_softc *sc = arg1;
4383 	int pkts;
4384 
4385 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4386 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4387 }
4388 
4389 static int
4390 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4391 {
4392 	struct hn_softc *sc = arg1;
4393 	int align;
4394 
4395 	align = sc->hn_tx_ring[0].hn_agg_align;
4396 	return (sysctl_handle_int(oidp, &align, 0, req));
4397 }
4398 
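/*
 * Enable or disable polling mode on a channel; pollhz == 0 means
 * interrupt-driven operation.  hn_polling() below applies the setting
 * to the primary channel and all sub-channels.
 */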
4399 static void
4400 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4401 {
4402 	if (pollhz == 0)
4403 		vmbus_chan_poll_disable(chan);
4404 	else
4405 		vmbus_chan_poll_enable(chan, pollhz);
4406 }
4407 
4408 static void
4409 hn_polling(struct hn_softc *sc, u_int pollhz)
4410 {
4411 	int nsubch = sc->hn_rx_ring_inuse - 1;
4412 
4413 	HN_LOCK_ASSERT(sc);
4414 
4415 	if (nsubch > 0) {
4416 		struct vmbus_channel **subch;
4417 		int i;
4418 
4419 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4420 		for (i = 0; i < nsubch; ++i)
4421 			hn_chan_polling(subch[i], pollhz);
4422 		vmbus_subchan_rel(subch, nsubch);
4423 	}
4424 	hn_chan_polling(sc->hn_prichan, pollhz);
4425 }
4426 
4427 static int
4428 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4429 {
4430 	struct hn_softc *sc = arg1;
4431 	int pollhz, error;
4432 
4433 	pollhz = sc->hn_pollhz;
4434 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4435 	if (error || req->newptr == NULL)
4436 		return (error);
4437 
4438 	if (pollhz != 0 &&
4439 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4440 		return (EINVAL);
4441 
4442 	HN_LOCK(sc);
4443 	if (sc->hn_pollhz != pollhz) {
4444 		sc->hn_pollhz = pollhz;
4445 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4446 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4447 			hn_polling(sc, sc->hn_pollhz);
4448 	}
4449 	HN_UNLOCK(sc);
4450 
4451 	return (0);
4452 }
4453 
4454 static int
4455 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4456 {
4457 	struct hn_softc *sc = arg1;
4458 	char verstr[16];
4459 
4460 	snprintf(verstr, sizeof(verstr), "%u.%u",
4461 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4462 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4463 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4464 }
4465 
4466 static int
4467 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4468 {
4469 	struct hn_softc *sc = arg1;
4470 	char caps_str[128];
4471 	uint32_t caps;
4472 
4473 	HN_LOCK(sc);
4474 	caps = sc->hn_caps;
4475 	HN_UNLOCK(sc);
4476 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4477 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4478 }
4479 
4480 static int
4481 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4482 {
4483 	struct hn_softc *sc = arg1;
4484 	char assist_str[128];
4485 	uint32_t hwassist;
4486 
4487 	HN_LOCK(sc);
4488 	hwassist = if_gethwassist(sc->hn_ifp);
4489 	HN_UNLOCK(sc);
4490 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4491 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4492 }
4493 
4494 static int
4495 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4496 {
4497 	struct hn_softc *sc = arg1;
4498 	char filter_str[128];
4499 	uint32_t filter;
4500 
4501 	HN_LOCK(sc);
4502 	filter = sc->hn_rx_filter;
4503 	HN_UNLOCK(sc);
4504 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4505 	    NDIS_PACKET_TYPES);
4506 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4507 }
4508 
4509 static int
4510 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4511 {
4512 	struct hn_softc *sc = arg1;
4513 	int rsc_ctrl, mtu;
4514 	int error;
4515 
4516 	rsc_ctrl = sc->hn_rsc_ctrl;
4517 	error = sysctl_handle_int(oidp, &rsc_ctrl, 0, req);
4518 	if (error || req->newptr == NULL)
4519 		return (error);
4520 
4521 	if (sc->hn_rsc_ctrl != rsc_ctrl) {
4522 		HN_LOCK(sc);
4523 		sc->hn_rsc_ctrl = rsc_ctrl;
4524 		mtu = if_getmtu(sc->hn_ifp);
4525 		error = hn_rndis_reconf_offload(sc, mtu);
4526 		HN_UNLOCK(sc);
4527 	}
4528 
4529 	return (error);
4530 }
4531 #ifndef RSS
4532 
4533 static int
4534 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4535 {
4536 	struct hn_softc *sc = arg1;
4537 	int error;
4538 
4539 	HN_LOCK(sc);
4540 
4541 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4542 	if (error || req->newptr == NULL)
4543 		goto back;
4544 
4545 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4546 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4547 		/*
4548 		 * RSS key is synchronized w/ the VF's; don't allow users
4549 		 * to change it.
4550 		 */
4551 		error = EBUSY;
4552 		goto back;
4553 	}
4554 
4555 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4556 	if (error)
4557 		goto back;
4558 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4559 
4560 	if (sc->hn_rx_ring_inuse > 1) {
4561 		error = hn_rss_reconfig(sc);
4562 	} else {
4563 		/* Not RSS capable, at least for now; just save the RSS key. */
4564 		error = 0;
4565 	}
4566 back:
4567 	HN_UNLOCK(sc);
4568 	return (error);
4569 }
4570 
4571 static int
4572 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4573 {
4574 	struct hn_softc *sc = arg1;
4575 	int error;
4576 
4577 	HN_LOCK(sc);
4578 
4579 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4580 	if (error || req->newptr == NULL)
4581 		goto back;
4582 
4583 	/*
4584 	 * Don't allow the RSS indirect table to be changed if this
4585 	 * interface is not currently RSS capable.
4586 	 */
4587 	if (sc->hn_rx_ring_inuse == 1) {
4588 		error = EOPNOTSUPP;
4589 		goto back;
4590 	}
4591 
4592 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4593 	if (error)
4594 		goto back;
4595 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4596 
4597 	hn_rss_ind_fixup(sc);
4598 	error = hn_rss_reconfig(sc);
4599 back:
4600 	HN_UNLOCK(sc);
4601 	return (error);
4602 }
4603 
4604 #endif	/* !RSS */
4605 
4606 static int
4607 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4608 {
4609 	struct hn_softc *sc = arg1;
4610 	char hash_str[128];
4611 	uint32_t hash;
4612 
4613 	HN_LOCK(sc);
4614 	hash = sc->hn_rss_hash;
4615 	HN_UNLOCK(sc);
4616 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4617 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4618 }
4619 
4620 static int
4621 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4622 {
4623 	struct hn_softc *sc = arg1;
4624 	char hash_str[128];
4625 	uint32_t hash;
4626 
4627 	HN_LOCK(sc);
4628 	hash = sc->hn_rss_hcap;
4629 	HN_UNLOCK(sc);
4630 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4631 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4632 }
4633 
4634 static int
4635 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4636 {
4637 	struct hn_softc *sc = arg1;
4638 	char hash_str[128];
4639 	uint32_t hash;
4640 
4641 	HN_LOCK(sc);
4642 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4643 	HN_UNLOCK(sc);
4644 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4645 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4646 }
4647 
4648 static int
4649 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4650 {
4651 	struct hn_softc *sc = arg1;
4652 	char vf_name[IFNAMSIZ + 1];
4653 	if_t vf_ifp;
4654 
4655 	HN_LOCK(sc);
4656 	vf_name[0] = '\0';
4657 	vf_ifp = sc->hn_vf_ifp;
4658 	if (vf_ifp != NULL)
4659 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4660 	HN_UNLOCK(sc);
4661 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4662 }
4663 
4664 static int
4665 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4666 {
4667 	struct hn_softc *sc = arg1;
4668 	char vf_name[IFNAMSIZ + 1];
4669 	if_t vf_ifp;
4670 
4671 	HN_LOCK(sc);
4672 	vf_name[0] = '\0';
4673 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4674 	if (vf_ifp != NULL)
4675 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4676 	HN_UNLOCK(sc);
4677 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4678 }
4679 
4680 static int
4681 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4682 {
4683 	struct rm_priotracker pt;
4684 	struct sbuf *sb;
4685 	int error, i;
4686 	bool first;
4687 
4688 	error = sysctl_wire_old_buffer(req, 0);
4689 	if (error != 0)
4690 		return (error);
4691 
4692 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4693 	if (sb == NULL)
4694 		return (ENOMEM);
4695 
4696 	rm_rlock(&hn_vfmap_lock, &pt);
4697 
4698 	first = true;
4699 	for (i = 0; i < hn_vfmap_size; ++i) {
4700 		struct epoch_tracker et;
4701 		if_t ifp;
4702 
4703 		if (hn_vfmap[i] == NULL)
4704 			continue;
4705 
4706 		NET_EPOCH_ENTER(et);
4707 		ifp = ifnet_byindex(i);
4708 		if (ifp != NULL) {
4709 			if (first)
4710 				sbuf_printf(sb, "%s", if_name(ifp));
4711 			else
4712 				sbuf_printf(sb, " %s", if_name(ifp));
4713 			first = false;
4714 		}
4715 		NET_EPOCH_EXIT(et);
4716 	}
4717 
4718 	rm_runlock(&hn_vfmap_lock, &pt);
4719 
4720 	error = sbuf_finish(sb);
4721 	sbuf_delete(sb);
4722 	return (error);
4723 }
4724 
4725 static int
4726 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4727 {
4728 	struct rm_priotracker pt;
4729 	struct sbuf *sb;
4730 	int error, i;
4731 	bool first;
4732 
4733 	error = sysctl_wire_old_buffer(req, 0);
4734 	if (error != 0)
4735 		return (error);
4736 
4737 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4738 	if (sb == NULL)
4739 		return (ENOMEM);
4740 
4741 	rm_rlock(&hn_vfmap_lock, &pt);
4742 
4743 	first = true;
4744 	for (i = 0; i < hn_vfmap_size; ++i) {
4745 		struct epoch_tracker et;
4746 		if_t ifp, hn_ifp;
4747 
4748 		hn_ifp = hn_vfmap[i];
4749 		if (hn_ifp == NULL)
4750 			continue;
4751 
4752 		NET_EPOCH_ENTER(et);
4753 		ifp = ifnet_byindex(i);
4754 		if (ifp != NULL) {
4755 			if (first) {
4756 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4757 				    if_name(hn_ifp));
4758 			} else {
4759 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4760 				    if_name(hn_ifp));
4761 			}
4762 			first = false;
4763 		}
4764 		NET_EPOCH_EXIT(et);
4765 	}
4766 
4767 	rm_runlock(&hn_vfmap_lock, &pt);
4768 
4769 	error = sbuf_finish(sb);
4770 	sbuf_delete(sb);
4771 	return (error);
4772 }
4773 
4774 static int
hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)4775 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4776 {
4777 	struct hn_softc *sc = arg1;
4778 	int error, onoff = 0;
4779 
4780 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4781 		onoff = 1;
4782 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4783 	if (error || req->newptr == NULL)
4784 		return (error);
4785 
4786 	HN_LOCK(sc);
4787 	/* NOTE: hn_vf_lock for hn_transmit() */
4788 	rm_wlock(&sc->hn_vf_lock);
4789 	if (onoff)
4790 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4791 	else
4792 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4793 	rm_wunlock(&sc->hn_vf_lock);
4794 	HN_UNLOCK(sc);
4795 
4796 	return (0);
4797 }
4798 
4799 static int
4800 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4801 {
4802 	struct hn_softc *sc = arg1;
4803 	int enabled = 0;
4804 
4805 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4806 		enabled = 1;
4807 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4808 }
4809 
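/*
 * Sanity check an IPv4 packet: verify that the IP header, and the TCP
 * or UDP header if present, are complete and entirely contained in the
 * first mbuf, and that the packet is not a fragment.  Returns the IP
 * protocol on success, or IPPROTO_DONE if the packet should not be
 * trusted.
 */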
4810 static int
4811 hn_check_iplen(const struct mbuf *m, int hoff)
4812 {
4813 	const struct ip *ip;
4814 	int len, iphlen, iplen;
4815 	const struct tcphdr *th;
4816 	int thoff;				/* TCP data offset */
4817 
4818 	len = hoff + sizeof(struct ip);
4819 
4820 	/* The packet must be at least the size of an IP header. */
4821 	if (m->m_pkthdr.len < len)
4822 		return IPPROTO_DONE;
4823 
4824 	/* The fixed IP header must reside completely in the first mbuf. */
4825 	if (m->m_len < len)
4826 		return IPPROTO_DONE;
4827 
4828 	ip = mtodo(m, hoff);
4829 
4830 	/* Bound check the packet's stated IP header length. */
4831 	iphlen = ip->ip_hl << 2;
4832 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4833 		return IPPROTO_DONE;
4834 
4835 	/* The full IP header must reside completely in the one mbuf. */
4836 	if (m->m_len < hoff + iphlen)
4837 		return IPPROTO_DONE;
4838 
4839 	iplen = ntohs(ip->ip_len);
4840 
4841 	/*
4842 	 * Check that the amount of data in the buffers is at least
4843 	 * as much as the IP header would have us expect.
4844 	 */
4845 	if (m->m_pkthdr.len < hoff + iplen)
4846 		return IPPROTO_DONE;
4847 
4848 	/*
4849 	 * Ignore IP fragments.
4850 	 */
4851 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4852 		return IPPROTO_DONE;
4853 
4854 	/*
4855 	 * The TCP/IP or UDP/IP header must be entirely contained within
4856 	 * the first fragment of a packet.
4857 	 */
4858 	switch (ip->ip_p) {
4859 	case IPPROTO_TCP:
4860 		if (iplen < iphlen + sizeof(struct tcphdr))
4861 			return IPPROTO_DONE;
4862 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4863 			return IPPROTO_DONE;
4864 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4865 		thoff = th->th_off << 2;
4866 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4867 			return IPPROTO_DONE;
4868 		if (m->m_len < hoff + iphlen + thoff)
4869 			return IPPROTO_DONE;
4870 		break;
4871 	case IPPROTO_UDP:
4872 		if (iplen < iphlen + sizeof(struct udphdr))
4873 			return IPPROTO_DONE;
4874 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4875 			return IPPROTO_DONE;
4876 		break;
4877 	default:
4878 		if (iplen < iphlen)
4879 			return IPPROTO_DONE;
4880 		break;
4881 	}
4882 	return ip->ip_p;
4883 }
4884 
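/*
 * Extract the L3 (ethertype) and L4 (IP protocol) of a received frame,
 * taking an optional VLAN header into account.  *l4proto is set to
 * IPPROTO_DONE for non-IPv4 frames and for malformed IPv4 packets.
 */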
4885 static void
4886 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4887 {
4888 	const struct ether_header *eh;
4889 	uint16_t etype;
4890 	int hoff;
4891 
4892 	hoff = sizeof(*eh);
4893 	/* Checked at the beginning of the caller, hn_rxpkt(). */
4894 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4895 
4896 	eh = mtod(m_new, const struct ether_header *);
4897 	etype = ntohs(eh->ether_type);
4898 	if (etype == ETHERTYPE_VLAN) {
4899 		const struct ether_vlan_header *evl;
4900 
4901 		hoff = sizeof(*evl);
4902 		if (m_new->m_len < hoff)
4903 			return;
4904 		evl = mtod(m_new, const struct ether_vlan_header *);
4905 		etype = ntohs(evl->evl_proto);
4906 	}
4907 	*l3proto = etype;
4908 
4909 	if (etype == ETHERTYPE_IP)
4910 		*l4proto = hn_check_iplen(m_new, hoff);
4911 	else
4912 		*l4proto = IPPROTO_DONE;
4913 }
4914 
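/*
 * Allocate the shared RXBUF and the per-channel RX rings/bufrings, seed
 * the host checksum trust flags, initialize LRO, and create the
 * dev.hn.UNIT.rx sysctl tree along with the per-ring and aggregate RX
 * statistics nodes.
 */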
4915 static int
4916 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4917 {
4918 	struct sysctl_oid_list *child;
4919 	struct sysctl_ctx_list *ctx;
4920 	device_t dev = sc->hn_dev;
4921 #if defined(INET) || defined(INET6)
4922 	int lroent_cnt;
4923 #endif
4924 	int i;
4925 
4926 	/*
4927 	 * Create RXBUF for reception.
4928 	 *
4929 	 * NOTE:
4930 	 * - It is shared by all channels.
4931 	 * - A large enough buffer is allocated; certain versions of the NVS
4932 	 *   may further limit the usable space.
4933 	 */
4934 	sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4935 	    0ul, ~0ul, PAGE_SIZE, 0);
4936 	if (sc->hn_rxbuf == NULL) {
4937 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4938 		return (ENOMEM);
4939 	}
4940 
4941 	sc->hn_rx_ring_cnt = ring_cnt;
4942 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4943 
4944 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4945 	    M_DEVBUF, M_WAITOK | M_ZERO);
4946 
4947 #if defined(INET) || defined(INET6)
4948 	lroent_cnt = hn_lro_entry_count;
4949 	if (lroent_cnt < TCP_LRO_ENTRIES)
4950 		lroent_cnt = TCP_LRO_ENTRIES;
4951 	if (bootverbose)
4952 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4953 #endif	/* INET || INET6 */
4954 
4955 	ctx = device_get_sysctl_ctx(dev);
4956 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4957 
4958 	/* Create dev.hn.UNIT.rx sysctl tree */
4959 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4960 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4961 
4962 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4963 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4964 
4965 		rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
4966 		    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
4967 		if (rxr->hn_br == NULL) {
4968 			device_printf(dev, "allocate bufring failed\n");
4969 			return (ENOMEM);
4970 		}
4971 
4972 		if (hn_trust_hosttcp)
4973 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4974 		if (hn_trust_hostudp)
4975 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4976 		if (hn_trust_hostip)
4977 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4978 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4979 		rxr->hn_ifp = sc->hn_ifp;
4980 		if (i < sc->hn_tx_ring_cnt)
4981 			rxr->hn_txr = &sc->hn_tx_ring[i];
4982 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4983 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4984 		rxr->hn_rx_idx = i;
4985 		rxr->hn_rxbuf = sc->hn_rxbuf;
4986 
4987 		/*
4988 		 * Initialize LRO.
4989 		 */
4990 #if defined(INET) || defined(INET6)
4991 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4992 		    hn_lro_mbufq_depth);
4993 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4994 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4995 #endif	/* INET || INET6 */
4996 
4997 		if (sc->hn_rx_sysctl_tree != NULL) {
4998 			char name[16];
4999 
5000 			/*
5001 			 * Create per RX ring sysctl tree:
5002 			 * dev.hn.UNIT.rx.RINGID
5003 			 */
5004 			snprintf(name, sizeof(name), "%d", i);
5005 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5006 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5007 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5008 
5009 			if (rxr->hn_rx_sysctl_tree != NULL) {
5010 				SYSCTL_ADD_ULONG(ctx,
5011 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5012 				    OID_AUTO, "packets",
5013 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5014 				    "# of packets received");
5015 				SYSCTL_ADD_ULONG(ctx,
5016 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5017 				    OID_AUTO, "rss_pkts",
5018 				    CTLFLAG_RW | CTLFLAG_STATS,
5019 				    &rxr->hn_rss_pkts,
5020 				    "# of packets w/ RSS info received");
5021 				SYSCTL_ADD_ULONG(ctx,
5022 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5023 				    OID_AUTO, "rsc_pkts",
5024 				    CTLFLAG_RW | CTLFLAG_STATS,
5025 				    &rxr->hn_rsc_pkts,
5026 				    "# of RSC packets received");
5027 				SYSCTL_ADD_ULONG(ctx,
5028 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5029 				    OID_AUTO, "rsc_drop",
5030 				    CTLFLAG_RW | CTLFLAG_STATS,
5031 				    &rxr->hn_rsc_drop,
5032 				    "# of RSC fragments dropped");
5033 				SYSCTL_ADD_INT(ctx,
5034 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5035 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5036 				    &rxr->hn_pktbuf_len, 0,
5037 				    "Temporary channel packet buffer length");
5038 			}
5039 		}
5040 	}
5041 
5042 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5043 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5044 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5045 	    hn_rx_stat_u64_sysctl,
5046 	    "LU", "LRO queued");
5047 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5048 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5049 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5050 	    hn_rx_stat_u64_sysctl,
5051 	    "LU", "LRO flushed");
5052 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5053 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5054 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5055 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5056 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5057 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5058 	    hn_lro_lenlim_sysctl, "IU",
5059 	    "Max # of data bytes to be aggregated by LRO");
5060 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5061 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5062 	    hn_lro_ackcnt_sysctl, "I",
5063 	    "Max # of ACKs to be aggregated by LRO");
5064 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5065 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5066 	    hn_trust_hcsum_sysctl, "I",
5067 	    "Trust tcp segment verification on host side, "
5068 	    "when csum info is missing");
5069 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5070 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5071 	    hn_trust_hcsum_sysctl, "I",
5072 	    "Trust udp datagram verification on host side, "
5073 	    "when csum info is missing");
5074 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5075 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5076 	    hn_trust_hcsum_sysctl, "I",
5077 	    "Trust ip packet verification on host side, "
5078 	    "when csum info is missing");
5079 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5080 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5081 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5082 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5083 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5084 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5085 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5086 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5087 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5088 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5089 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5090 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5091 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5092 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5093 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5094 	    hn_rx_stat_ulong_sysctl, "LU",
5095 	    "# of packets that we trust host's csum verification");
5096 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5097 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5098 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5099 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5100 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5101 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5102 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5103 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5104 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5105 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5106 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5107 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5108 
5109 	return (0);
5110 }
5111 
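/*
 * Free the RXBUF, the per-ring bufrings, LRO state and packet buffers
 * created by hn_create_rx_data(), unless they are still referenced by
 * the hypervisor.
 */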
5112 static void
5113 hn_destroy_rx_data(struct hn_softc *sc)
5114 {
5115 	int i;
5116 
5117 	if (sc->hn_rxbuf != NULL) {
5118 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5119 			free(sc->hn_rxbuf, M_DEVBUF);
5120 		else
5121 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5122 		sc->hn_rxbuf = NULL;
5123 	}
5124 
5125 	if (sc->hn_rx_ring_cnt == 0)
5126 		return;
5127 
5128 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5129 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5130 
5131 		if (rxr->hn_br == NULL)
5132 			continue;
5133 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5134 			free(rxr->hn_br, M_DEVBUF);
5135 		} else {
5136 			device_printf(sc->hn_dev,
5137 			    "%dth channel bufring is referenced", i);
5138 		}
5139 		rxr->hn_br = NULL;
5140 
5141 #if defined(INET) || defined(INET6)
5142 		tcp_lro_free(&rxr->hn_lro);
5143 #endif
5144 		free(rxr->hn_pktbuf, M_DEVBUF);
5145 	}
5146 	free(sc->hn_rx_ring, M_DEVBUF);
5147 	sc->hn_rx_ring = NULL;
5148 
5149 	sc->hn_rx_ring_cnt = 0;
5150 	sc->hn_rx_ring_inuse = 0;
5151 }
5152 
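/*
 * Set up one TX ring: allocate the txdesc array and bufrings, pick the
 * TX taskqueue, create the busdma tags and per-descriptor RNDIS/data DMA
 * maps, and create the dev.hn.UNIT.tx.RINGID sysctl tree.
 */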
5153 static int
5154 hn_tx_ring_create(struct hn_softc *sc, int id)
5155 {
5156 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5157 	device_t dev = sc->hn_dev;
5158 	bus_dma_tag_t parent_dtag;
5159 	int error, i;
5160 
5161 	txr->hn_sc = sc;
5162 	txr->hn_tx_idx = id;
5163 
5164 #ifndef HN_USE_TXDESC_BUFRING
5165 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5166 #endif
5167 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5168 
5169 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5170 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5171 	    M_DEVBUF, M_WAITOK | M_ZERO);
5172 #ifndef HN_USE_TXDESC_BUFRING
5173 	SLIST_INIT(&txr->hn_txlist);
5174 #else
5175 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5176 	    M_WAITOK, &txr->hn_tx_lock);
5177 #endif
5178 
5179 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5180 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5181 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5182 	} else {
5183 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5184 	}
5185 
5186 #ifdef HN_IFSTART_SUPPORT
5187 	if (hn_use_if_start) {
5188 		txr->hn_txeof = hn_start_txeof;
5189 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5190 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5191 	} else
5192 #endif
5193 	{
5194 		int br_depth;
5195 
5196 		txr->hn_txeof = hn_xmit_txeof;
5197 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5198 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5199 
5200 		br_depth = hn_get_txswq_depth(txr);
5201 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5202 		    M_WAITOK, &txr->hn_tx_lock);
5203 	}
5204 
5205 	txr->hn_direct_tx_size = hn_direct_tx_size;
5206 
5207 	/*
5208 	 * Always schedule transmission instead of trying to do direct
5209 	 * transmission.  This one gives the best performance so far.
5210 	 */
5211 	txr->hn_sched_tx = 1;
5212 
5213 	parent_dtag = bus_get_dma_tag(dev);
5214 
5215 	/* DMA tag for RNDIS packet messages. */
5216 	error = bus_dma_tag_create(parent_dtag, /* parent */
5217 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5218 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5219 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5220 	    BUS_SPACE_MAXADDR,		/* highaddr */
5221 	    NULL, NULL,			/* filter, filterarg */
5222 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5223 	    1,				/* nsegments */
5224 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5225 	    0,				/* flags */
5226 	    NULL,			/* lockfunc */
5227 	    NULL,			/* lockfuncarg */
5228 	    &txr->hn_tx_rndis_dtag);
5229 	if (error) {
5230 		device_printf(dev, "failed to create rndis dmatag\n");
5231 		return error;
5232 	}
5233 
5234 	/* DMA tag for data. */
5235 	error = bus_dma_tag_create(parent_dtag, /* parent */
5236 	    1,				/* alignment */
5237 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5238 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5239 	    BUS_SPACE_MAXADDR,		/* highaddr */
5240 	    NULL, NULL,			/* filter, filterarg */
5241 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5242 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5243 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5244 	    0,				/* flags */
5245 	    NULL,			/* lockfunc */
5246 	    NULL,			/* lockfuncarg */
5247 	    &txr->hn_tx_data_dtag);
5248 	if (error) {
5249 		device_printf(dev, "failed to create data dmatag\n");
5250 		return error;
5251 	}
5252 
5253 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5254 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5255 
5256 		txd->txr = txr;
5257 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5258 		STAILQ_INIT(&txd->agg_list);
5259 
5260 		/*
5261 		 * Allocate and load RNDIS packet message.
5262 		 */
5263         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5264 		    (void **)&txd->rndis_pkt,
5265 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5266 		    &txd->rndis_pkt_dmap);
5267 		if (error) {
5268 			device_printf(dev,
5269 			    "failed to allocate rndis_packet_msg, %d\n", i);
5270 			return error;
5271 		}
5272 
5273 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5274 		    txd->rndis_pkt_dmap,
5275 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5276 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5277 		    BUS_DMA_NOWAIT);
5278 		if (error) {
5279 			device_printf(dev,
5280 			    "failed to load rndis_packet_msg, %d\n", i);
5281 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5282 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5283 			return error;
5284 		}
5285 
5286 		/* DMA map for TX data. */
5287 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5288 		    &txd->data_dmap);
5289 		if (error) {
5290 			device_printf(dev,
5291 			    "failed to allocate tx data dmamap\n");
5292 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5293 			    txd->rndis_pkt_dmap);
5294 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5295 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5296 			return error;
5297 		}
5298 
5299 		/* All set, put it on the list. */
5300 		txd->flags |= HN_TXD_FLAG_ONLIST;
5301 #ifndef HN_USE_TXDESC_BUFRING
5302 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5303 #else
5304 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5305 #endif
5306 	}
5307 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5308 
5309 	if (sc->hn_tx_sysctl_tree != NULL) {
5310 		struct sysctl_oid_list *child;
5311 		struct sysctl_ctx_list *ctx;
5312 		char name[16];
5313 
5314 		/*
5315 		 * Create per TX ring sysctl tree:
5316 		 * dev.hn.UNIT.tx.RINGID
5317 		 */
5318 		ctx = device_get_sysctl_ctx(dev);
5319 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5320 
5321 		snprintf(name, sizeof(name), "%d", id);
5322 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5323 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5324 
5325 		if (txr->hn_tx_sysctl_tree != NULL) {
5326 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5327 
5328 #ifdef HN_DEBUG
5329 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5330 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5331 			    "# of available TX descs");
5332 #endif
5333 #ifdef HN_IFSTART_SUPPORT
5334 			if (!hn_use_if_start)
5335 #endif
5336 			{
5337 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5338 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5339 				    "over active");
5340 			}
5341 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5342 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5343 			    "# of packets transmitted");
5344 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5345 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5346 			    "# of sends");
5347 		}
5348 	}
5349 
5350 	return 0;
5351 }
5352 
5353 static void
5354 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5355 {
5356 	struct hn_tx_ring *txr = txd->txr;
5357 
5358 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5359 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5360 
5361 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5362 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5363 	    txd->rndis_pkt_dmap);
5364 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5365 }
5366 
5367 static void
5368 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5369 {
5370 
5371 	KASSERT(txd->refs == 0 || txd->refs == 1,
5372 	    ("invalid txd refs %d", txd->refs));
5373 
5374 	/* Aggregated txds will be freed by their aggregating txd. */
5375 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5376 		int freed __diagused;
5377 
5378 		freed = hn_txdesc_put(txr, txd);
5379 		KASSERT(freed, ("can't free txdesc"));
5380 	}
5381 }
5382 
5383 static void
5384 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5385 {
5386 	int i;
5387 
5388 	if (txr->hn_txdesc == NULL)
5389 		return;
5390 
5391 	/*
5392 	 * NOTE:
5393 	 * Because the freeing of aggregated txds will be deferred
5394 	 * to the aggregating txd, two passes are used here:
5395 	 * - The first pass GCes any pending txds.  This GC is necessary,
5396 	 *   since if the channels are revoked, the hypervisor will not
5397 	 *   deliver send-done for all pending txds.
5398 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5399 	 *   were freed.
5400 	 */
5401 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5402 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5403 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5404 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5405 
5406 	if (txr->hn_tx_data_dtag != NULL)
5407 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5408 	if (txr->hn_tx_rndis_dtag != NULL)
5409 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5410 
5411 #ifdef HN_USE_TXDESC_BUFRING
5412 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5413 #endif
5414 
5415 	free(txr->hn_txdesc, M_DEVBUF);
5416 	txr->hn_txdesc = NULL;
5417 
5418 	if (txr->hn_mbuf_br != NULL)
5419 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5420 
5421 #ifndef HN_USE_TXDESC_BUFRING
5422 	mtx_destroy(&txr->hn_txlist_spin);
5423 #endif
5424 	mtx_destroy(&txr->hn_tx_lock);
5425 }
5426 
5427 static int
5428 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5429 {
5430 	struct sysctl_oid_list *child;
5431 	struct sysctl_ctx_list *ctx;
5432 	int i;
5433 
5434 	/*
5435 	 * Create TXBUF for chimney sending.
5436 	 *
5437 	 * NOTE: It is shared by all channels.
5438 	 */
5439 	sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5440 	    0ul, ~0ul, PAGE_SIZE, 0);
5441 	if (sc->hn_chim == NULL) {
5442 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5443 		return (ENOMEM);
5444 	}
5445 
5446 	sc->hn_tx_ring_cnt = ring_cnt;
5447 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5448 
5449 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5450 	    M_DEVBUF, M_WAITOK | M_ZERO);
5451 
5452 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5453 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5454 
5455 	/* Create dev.hn.UNIT.tx sysctl tree */
5456 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5457 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5458 
5459 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5460 		int error;
5461 
5462 		error = hn_tx_ring_create(sc, i);
5463 		if (error)
5464 			return error;
5465 	}
5466 
5467 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5468 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5469 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5470 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5471 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5472 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5473 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5474 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5475 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5476 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5477 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5478 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5479 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5480 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5481 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5482 	    hn_tx_stat_ulong_sysctl, "LU",
5483 	    "# of packet transmission aggregation flush failure");
5484 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5485 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5486 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5487 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5488 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5489 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5490 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5491 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5492 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5493 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5494 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5495 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5496 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5497 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5498 	    "# of total TX descs");
5499 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5500 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5501 	    "Chimney send packet size upper boundary");
5502 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5503 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5504 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5505 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5506 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5507 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5508 	    hn_tx_conf_int_sysctl, "I",
5509 	    "Size of the packet for direct transmission");
5510 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5511 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5512 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5513 	    hn_tx_conf_int_sysctl, "I",
5514 	    "Always schedule transmission "
5515 	    "instead of doing direct transmission");
5516 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5517 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5518 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5519 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5520 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5521 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5522 	    "Applied packet transmission aggregation size");
5523 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5524 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5525 	    hn_txagg_pktmax_sysctl, "I",
5526 	    "Applied packet transmission aggregation packets");
5527 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5528 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5529 	    hn_txagg_align_sysctl, "I",
5530 	    "Applied packet transmission aggregation alignment");
5531 
5532 	return 0;
5533 }
5534 
5535 static void
5536 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5537 {
5538 	int i;
5539 
5540 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5541 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5542 }
5543 
5544 static void
5545 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5546 {
5547 	if_t ifp = sc->hn_ifp;
5548 	u_int hw_tsomax;
5549 	int tso_minlen;
5550 
5551 	HN_LOCK_ASSERT(sc);
5552 
5553 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5554 		return;
5555 
5556 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5557 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5558 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5559 
5560 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5561 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5562 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5563 
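	/*
	 * Illustrative example (hypothetical numbers, not taken from the
	 * code): with hn_ndis_tso_sgmin = 2 and mtu = 1500, tso_minlen is
	 * 3000.  The requested tso_maxlen is clamped into the range
	 * [tso_minlen, min(IP_MAXPACKET, hn_ndis_tso_szmax)], and the
	 * Ethernet + VLAN header lengths are then subtracted to form the
	 * limit advertised through if_sethwtsomax().
	 */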
5564 	if (tso_maxlen < tso_minlen)
5565 		tso_maxlen = tso_minlen;
5566 	else if (tso_maxlen > IP_MAXPACKET)
5567 		tso_maxlen = IP_MAXPACKET;
5568 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5569 		tso_maxlen = sc->hn_ndis_tso_szmax;
5570 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5571 
5572 	if (hn_xpnt_vf_isready(sc)) {
5573 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5574 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5575 	}
5576 	if_sethwtsomax(ifp, hw_tsomax);
5577 	if (bootverbose)
5578 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5579 }
5580 
5581 static void
5582 hn_fixup_tx_data(struct hn_softc *sc)
5583 {
5584 	uint64_t csum_assist;
5585 	int i;
5586 
5587 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5588 	if (hn_tx_chimney_size > 0 &&
5589 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5590 		hn_set_chim_size(sc, hn_tx_chimney_size);
5591 
5592 	csum_assist = 0;
5593 	if (sc->hn_caps & HN_CAP_IPCS)
5594 		csum_assist |= CSUM_IP;
5595 	if (sc->hn_caps & HN_CAP_TCP4CS)
5596 		csum_assist |= CSUM_IP_TCP;
5597 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5598 		csum_assist |= CSUM_IP_UDP;
5599 	if (sc->hn_caps & HN_CAP_TCP6CS)
5600 		csum_assist |= CSUM_IP6_TCP;
5601 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5602 		csum_assist |= CSUM_IP6_UDP;
5603 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5604 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5605 
5606 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5607 		/*
5608 		 * Support HASHVAL pktinfo on TX path.
5609 		 */
5610 		if (bootverbose)
5611 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5612 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5613 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5614 	}
5615 }
5616 
5617 static void
5618 hn_fixup_rx_data(struct hn_softc *sc)
5619 {
5620 
5621 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5622 		int i;
5623 
5624 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5625 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5626 	}
5627 }
5628 
5629 static void
5630 hn_destroy_tx_data(struct hn_softc *sc)
5631 {
5632 	int i;
5633 
5634 	if (sc->hn_chim != NULL) {
5635 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5636 			free(sc->hn_chim, M_DEVBUF);
5637 		} else {
5638 			device_printf(sc->hn_dev,
5639 			    "chimney sending buffer is referenced");
5640 		}
5641 		sc->hn_chim = NULL;
5642 	}
5643 
5644 	if (sc->hn_tx_ring_cnt == 0)
5645 		return;
5646 
5647 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5648 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5649 
5650 	free(sc->hn_tx_ring, M_DEVBUF);
5651 	sc->hn_tx_ring = NULL;
5652 
5653 	sc->hn_tx_ring_cnt = 0;
5654 	sc->hn_tx_ring_inuse = 0;
5655 }
5656 
5657 #ifdef HN_IFSTART_SUPPORT
5658 
5659 static void
5660 hn_start_taskfunc(void *xtxr, int pending __unused)
5661 {
5662 	struct hn_tx_ring *txr = xtxr;
5663 
5664 	mtx_lock(&txr->hn_tx_lock);
5665 	hn_start_locked(txr, 0);
5666 	mtx_unlock(&txr->hn_tx_lock);
5667 }
5668 
5669 static int
5670 hn_start_locked(struct hn_tx_ring *txr, int len)
5671 {
5672 	struct hn_softc *sc = txr->hn_sc;
5673 	if_t ifp = sc->hn_ifp;
5674 	int sched = 0;
5675 
5676 	KASSERT(hn_use_if_start,
5677 	    ("hn_start_locked is called when if_start is disabled"));
5678 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5679 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5680 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5681 
5682 	if (__predict_false(txr->hn_suspended))
5683 		return (0);
5684 
5685 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5686 	    IFF_DRV_RUNNING)
5687 		return (0);
5688 
5689 	while (!if_sendq_empty(ifp)) {
5690 		struct hn_txdesc *txd;
5691 		struct mbuf *m_head;
5692 		int error;
5693 
5694 		m_head = if_dequeue(ifp);
5695 		if (m_head == NULL)
5696 			break;
5697 
5698 		if (len > 0 && m_head->m_pkthdr.len > len) {
5699 			/*
5700 			 * This sending could be time consuming; let callers
5701 			 * dispatch this packet sending (and sending of any
5702 			 * follow-up packets) to tx taskqueue.
5703 			 */
5704 			if_sendq_prepend(ifp, m_head);
5705 			sched = 1;
5706 			break;
5707 		}
5708 
5709 #if defined(INET6) || defined(INET)
5710 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5711 			m_head = hn_tso_fixup(m_head);
5712 			if (__predict_false(m_head == NULL)) {
5713 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5714 				continue;
5715 			}
5716 		} else if (m_head->m_pkthdr.csum_flags &
5717 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5718 			m_head = hn_set_hlen(m_head);
5719 			if (__predict_false(m_head == NULL)) {
5720 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5721 				continue;
5722 			}
5723 		}
5724 #endif
5725 
5726 		txd = hn_txdesc_get(txr);
5727 		if (txd == NULL) {
5728 			txr->hn_no_txdescs++;
5729 			if_sendq_prepend(ifp, m_head);
5730 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5731 			break;
5732 		}
5733 
5734 		error = hn_encap(ifp, txr, txd, &m_head);
5735 		if (error) {
5736 			/* Both txd and m_head are freed */
5737 			KASSERT(txr->hn_agg_txd == NULL,
5738 			    ("encap failed w/ pending aggregating txdesc"));
5739 			continue;
5740 		}
5741 
5742 		if (txr->hn_agg_pktleft == 0) {
5743 			if (txr->hn_agg_txd != NULL) {
5744 				KASSERT(m_head == NULL,
5745 				    ("pending mbuf for aggregating txdesc"));
5746 				error = hn_flush_txagg(ifp, txr);
5747 				if (__predict_false(error)) {
5748 					if_setdrvflagbits(ifp,
5749 					    IFF_DRV_OACTIVE, 0);
5750 					break;
5751 				}
5752 			} else {
5753 				KASSERT(m_head != NULL, ("mbuf was freed"));
5754 				error = hn_txpkt(ifp, txr, txd);
5755 				if (__predict_false(error)) {
5756 					/* txd is freed, but m_head is not */
5757 					if_sendq_prepend(ifp, m_head);
5758 					if_setdrvflagbits(ifp,
5759 					    IFF_DRV_OACTIVE, 0);
5760 					break;
5761 				}
5762 			}
5763 		}
5764 #ifdef INVARIANTS
5765 		else {
5766 			KASSERT(txr->hn_agg_txd != NULL,
5767 			    ("no aggregating txdesc"));
5768 			KASSERT(m_head == NULL,
5769 			    ("pending mbuf for aggregating txdesc"));
5770 		}
5771 #endif
5772 	}
5773 
5774 	/* Flush pending aggregated transmission. */
5775 	if (txr->hn_agg_txd != NULL)
5776 		hn_flush_txagg(ifp, txr);
5777 	return (sched);
5778 }
5779 
5780 static void
5781 hn_start(if_t ifp)
5782 {
5783 	struct hn_softc *sc = if_getsoftc(ifp);
5784 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5785 
5786 	if (txr->hn_sched_tx)
5787 		goto do_sched;
5788 
5789 	if (mtx_trylock(&txr->hn_tx_lock)) {
5790 		int sched;
5791 
5792 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5793 		mtx_unlock(&txr->hn_tx_lock);
5794 		if (!sched)
5795 			return;
5796 	}
5797 do_sched:
5798 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5799 }
5800 
5801 static void
5802 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5803 {
5804 	struct hn_tx_ring *txr = xtxr;
5805 
5806 	mtx_lock(&txr->hn_tx_lock);
5807 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5808 	hn_start_locked(txr, 0);
5809 	mtx_unlock(&txr->hn_tx_lock);
5810 }
5811 
5812 static void
5813 hn_start_txeof(struct hn_tx_ring *txr)
5814 {
5815 	struct hn_softc *sc = txr->hn_sc;
5816 	if_t ifp = sc->hn_ifp;
5817 
5818 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5819 
5820 	if (txr->hn_sched_tx)
5821 		goto do_sched;
5822 
5823 	if (mtx_trylock(&txr->hn_tx_lock)) {
5824 		int sched;
5825 
5826 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5827 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5828 		mtx_unlock(&txr->hn_tx_lock);
5829 		if (sched) {
5830 			taskqueue_enqueue(txr->hn_tx_taskq,
5831 			    &txr->hn_tx_task);
5832 		}
5833 	} else {
5834 do_sched:
5835 		/*
5836 		 * Release the OACTIVE earlier, with the hope, that
5837 		 * others could catch up.  The task will clear the
5838 		 * flag again with the hn_tx_lock to avoid possible
5839 		 * races.
5840 		 */
5841 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5842 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5843 	}
5844 }
5845 
5846 #endif	/* HN_IFSTART_SUPPORT */
5847 
5848 static int
5849 hn_xmit(struct hn_tx_ring *txr, int len)
5850 {
5851 	struct hn_softc *sc = txr->hn_sc;
5852 	if_t ifp = sc->hn_ifp;
5853 	struct mbuf *m_head;
5854 	int sched = 0;
5855 
5856 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5857 #ifdef HN_IFSTART_SUPPORT
5858 	KASSERT(hn_use_if_start == 0,
5859 	    ("hn_xmit is called when if_start is enabled"));
5860 #endif
5861 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5862 
5863 	if (__predict_false(txr->hn_suspended))
5864 		return (0);
5865 
5866 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5867 		return (0);
5868 
5869 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5870 		struct hn_txdesc *txd;
5871 		int error;
5872 
5873 		if (len > 0 && m_head->m_pkthdr.len > len) {
5874 			/*
5875 			 * This sending could be time consuming; let callers
5876 			 * dispatch this packet sending (and sending of any
5877 			 * follow-up packets) to tx taskqueue.
5878 			 */
5879 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5880 			sched = 1;
5881 			break;
5882 		}
5883 
5884 		txd = hn_txdesc_get(txr);
5885 		if (txd == NULL) {
5886 			txr->hn_no_txdescs++;
5887 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5888 			txr->hn_oactive = 1;
5889 			break;
5890 		}
5891 
5892 		error = hn_encap(ifp, txr, txd, &m_head);
5893 		if (error) {
5894 			/* Both txd and m_head are freed; discard */
5895 			KASSERT(txr->hn_agg_txd == NULL,
5896 			    ("encap failed w/ pending aggregating txdesc"));
5897 			drbr_advance(ifp, txr->hn_mbuf_br);
5898 			continue;
5899 		}
5900 
5901 		if (txr->hn_agg_pktleft == 0) {
5902 			if (txr->hn_agg_txd != NULL) {
5903 				KASSERT(m_head == NULL,
5904 				    ("pending mbuf for aggregating txdesc"));
5905 				error = hn_flush_txagg(ifp, txr);
5906 				if (__predict_false(error)) {
5907 					txr->hn_oactive = 1;
5908 					break;
5909 				}
5910 			} else {
5911 				KASSERT(m_head != NULL, ("mbuf was freed"));
5912 				error = hn_txpkt(ifp, txr, txd);
5913 				if (__predict_false(error)) {
5914 					/* txd is freed, but m_head is not */
5915 					drbr_putback(ifp, txr->hn_mbuf_br,
5916 					    m_head);
5917 					txr->hn_oactive = 1;
5918 					break;
5919 				}
5920 			}
5921 		}
5922 #ifdef INVARIANTS
5923 		else {
5924 			KASSERT(txr->hn_agg_txd != NULL,
5925 			    ("no aggregating txdesc"));
5926 			KASSERT(m_head == NULL,
5927 			    ("pending mbuf for aggregating txdesc"));
5928 		}
5929 #endif
5930 
5931 		/* Sent */
5932 		drbr_advance(ifp, txr->hn_mbuf_br);
5933 	}
5934 
5935 	/* Flush pending aggregated transmission. */
5936 	if (txr->hn_agg_txd != NULL)
5937 		hn_flush_txagg(ifp, txr);
5938 	return (sched);
5939 }
5940 
5941 static int
5942 hn_transmit(if_t ifp, struct mbuf *m)
5943 {
5944 	struct hn_softc *sc = if_getsoftc(ifp);
5945 	struct hn_tx_ring *txr;
5946 	int error, idx = 0;
5947 
5948 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5949 		struct rm_priotracker pt;
5950 
5951 		rm_rlock(&sc->hn_vf_lock, &pt);
5952 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5953 			struct mbuf *m_bpf = NULL;
5954 			int obytes, omcast;
5955 
5956 			obytes = m->m_pkthdr.len;
5957 			omcast = (m->m_flags & M_MCAST) != 0;
5958 
5959 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5960 				if (bpf_peers_present_if(ifp)) {
5961 					m_bpf = m_copypacket(m, M_NOWAIT);
5962 					if (m_bpf == NULL) {
5963 						/*
5964 						 * Failed to grab a shallow
5965 						 * copy; tap now.
5966 						 */
5967 						ETHER_BPF_MTAP(ifp, m);
5968 					}
5969 				}
5970 			} else {
5971 				ETHER_BPF_MTAP(ifp, m);
5972 			}
5973 
5974 			error = if_transmit(sc->hn_vf_ifp, m);
5975 			rm_runlock(&sc->hn_vf_lock, &pt);
5976 
5977 			if (m_bpf != NULL) {
5978 				if (!error)
5979 					ETHER_BPF_MTAP(ifp, m_bpf);
5980 				m_freem(m_bpf);
5981 			}
5982 
5983 			if (error == ENOBUFS) {
5984 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5985 			} else if (error) {
5986 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5987 			} else {
5988 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5989 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5990 				if (omcast) {
5991 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5992 					    omcast);
5993 				}
5994 			}
5995 			return (error);
5996 		}
5997 		rm_runlock(&sc->hn_vf_lock, &pt);
5998 	}
5999 
6000 #if defined(INET6) || defined(INET)
6001 	/*
6002 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6003 	 * since packet headers should be cache-hot.
6004 	 */
6005 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6006 		m = hn_tso_fixup(m);
6007 		if (__predict_false(m == NULL)) {
6008 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6009 			return EIO;
6010 		}
6011 	} else if (m->m_pkthdr.csum_flags &
6012 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6013 		m = hn_set_hlen(m);
6014 		if (__predict_false(m == NULL)) {
6015 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6016 			return EIO;
6017 		}
6018 	}
6019 #endif
6020 
6021 	/*
6022 	 * Select the TX ring based on flowid
6023 	 */
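	/*
	 * Packets carrying a flow hash are spread across the rings in use
	 * (flowid % hn_tx_ring_inuse, or the RSS bucket when RSS is
	 * compiled in), except that small TCP segments which may be SYNs
	 * are steered to ring 0; packets without a flow hash keep the
	 * default idx of 0.
	 */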
6024 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6025 #ifdef RSS
6026 		uint32_t bid;
6027 
6028 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6029 		    &bid) == 0)
6030 			idx = bid % sc->hn_tx_ring_inuse;
6031 		else
6032 #endif
6033 		{
6034 #if defined(INET6) || defined(INET)
6035 			int tcpsyn = 0;
6036 
6037 			if (m->m_pkthdr.len < 128 &&
6038 			    (m->m_pkthdr.csum_flags &
6039 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6040 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6041 				m = hn_check_tcpsyn(m, &tcpsyn);
6042 				if (__predict_false(m == NULL)) {
6043 					if_inc_counter(ifp,
6044 					    IFCOUNTER_OERRORS, 1);
6045 					return (EIO);
6046 				}
6047 			}
6048 #else
6049 			const int tcpsyn = 0;
6050 #endif
6051 			if (tcpsyn)
6052 				idx = 0;
6053 			else
6054 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6055 		}
6056 	}
6057 	txr = &sc->hn_tx_ring[idx];
6058 
6059 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6060 	if (error) {
6061 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6062 		return error;
6063 	}
6064 
6065 	if (txr->hn_oactive)
6066 		return 0;
6067 
6068 	if (txr->hn_sched_tx)
6069 		goto do_sched;
6070 
6071 	if (mtx_trylock(&txr->hn_tx_lock)) {
6072 		int sched;
6073 
6074 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6075 		mtx_unlock(&txr->hn_tx_lock);
6076 		if (!sched)
6077 			return 0;
6078 	}
6079 do_sched:
6080 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6081 	return 0;
6082 }
6083 
6084 static void
6085 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6086 {
6087 	struct mbuf *m;
6088 
6089 	mtx_lock(&txr->hn_tx_lock);
6090 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6091 		m_freem(m);
6092 	mtx_unlock(&txr->hn_tx_lock);
6093 }
6094 
6095 static void
6096 hn_xmit_qflush(if_t ifp)
6097 {
6098 	struct hn_softc *sc = if_getsoftc(ifp);
6099 	struct rm_priotracker pt;
6100 	int i;
6101 
6102 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6103 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6104 	if_qflush(ifp);
6105 
6106 	rm_rlock(&sc->hn_vf_lock, &pt);
6107 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6108 		if_qflush(sc->hn_vf_ifp);
6109 	rm_runlock(&sc->hn_vf_lock, &pt);
6110 }
6111 
6112 static void
6113 hn_xmit_txeof(struct hn_tx_ring *txr)
6114 {
6115 
6116 	if (txr->hn_sched_tx)
6117 		goto do_sched;
6118 
6119 	if (mtx_trylock(&txr->hn_tx_lock)) {
6120 		int sched;
6121 
6122 		txr->hn_oactive = 0;
6123 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6124 		mtx_unlock(&txr->hn_tx_lock);
6125 		if (sched) {
6126 			taskqueue_enqueue(txr->hn_tx_taskq,
6127 			    &txr->hn_tx_task);
6128 		}
6129 	} else {
6130 do_sched:
6131 		/*
6132 		 * Release the oactive earlier, with the hope, that
6133 		 * others could catch up.  The task will clear the
6134 		 * oactive again with the hn_tx_lock to avoid possible
6135 		 * races.
6136 		 */
6137 		txr->hn_oactive = 0;
6138 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6139 	}
6140 }
6141 
6142 static void
6143 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6144 {
6145 	struct hn_tx_ring *txr = xtxr;
6146 
6147 	mtx_lock(&txr->hn_tx_lock);
6148 	hn_xmit(txr, 0);
6149 	mtx_unlock(&txr->hn_tx_lock);
6150 }
6151 
6152 static void
6153 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6154 {
6155 	struct hn_tx_ring *txr = xtxr;
6156 
6157 	mtx_lock(&txr->hn_tx_lock);
6158 	txr->hn_oactive = 0;
6159 	hn_xmit(txr, 0);
6160 	mtx_unlock(&txr->hn_tx_lock);
6161 }
6162 
6163 static int
6164 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6165 {
6166 	struct vmbus_chan_br cbr;
6167 	struct hn_rx_ring *rxr;
6168 	struct hn_tx_ring *txr = NULL;
6169 	int idx, error;
6170 
6171 	idx = vmbus_chan_subidx(chan);
6172 
6173 	/*
6174 	 * Link this channel to RX/TX ring.
6175 	 */
6176 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6177 	    ("invalid channel index %d, should be >= 0 && < %d",
6178 	     idx, sc->hn_rx_ring_inuse));
6179 	rxr = &sc->hn_rx_ring[idx];
6180 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6181 	    ("RX ring %d already attached", idx));
6182 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6183 	rxr->hn_chan = chan;
6184 
6185 	if (bootverbose) {
6186 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6187 		    idx, vmbus_chan_id(chan));
6188 	}
6189 
6190 	if (idx < sc->hn_tx_ring_inuse) {
6191 		txr = &sc->hn_tx_ring[idx];
6192 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6193 		    ("TX ring %d already attached", idx));
6194 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6195 
6196 		txr->hn_chan = chan;
6197 		if (bootverbose) {
6198 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6199 			    idx, vmbus_chan_id(chan));
6200 		}
6201 	}
6202 
6203 	/* Bind this channel to a proper CPU. */
6204 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6205 
6206 	/*
6207 	 * Open this channel
6208 	 */
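	/*
	 * NOTE: rxr->hn_br is assumed to be one contiguous buffer, with
	 * the TX bufring (HN_TXBR_SIZE bytes) followed by the RX bufring
	 * (HN_RXBR_SIZE bytes); vmbus_chan_open_br() splits it according
	 * to cbr_txsz/cbr_rxsz below.
	 */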
6209 	cbr.cbr = rxr->hn_br;
6210 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6211 	cbr.cbr_txsz = HN_TXBR_SIZE;
6212 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6213 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6214 	if (error) {
6215 		if (error == EISCONN) {
6216 			if_printf(sc->hn_ifp, "bufring is connected after "
6217 			    "chan%u open failure\n", vmbus_chan_id(chan));
6218 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6219 		} else {
6220 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6221 			    vmbus_chan_id(chan), error);
6222 		}
6223 	}
6224 	return (error);
6225 }
6226 
6227 static void
6228 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6229 {
6230 	struct hn_rx_ring *rxr;
6231 	int idx, error;
6232 
6233 	idx = vmbus_chan_subidx(chan);
6234 
6235 	/*
6236 	 * Link this channel to RX/TX ring.
6237 	 */
6238 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6239 	    ("invalid channel index %d, should be >= 0 && < %d",
6240 	     idx, sc->hn_rx_ring_inuse));
6241 	rxr = &sc->hn_rx_ring[idx];
6242 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6243 	    ("RX ring %d is not attached", idx));
6244 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6245 
6246 	if (idx < sc->hn_tx_ring_inuse) {
6247 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6248 
6249 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6250 		    ("TX ring %d is not attached", idx));
6251 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6252 	}
6253 
6254 	/*
6255 	 * Close this channel.
6256 	 *
6257 	 * NOTE:
6258 	 * Channel closing does _not_ destroy the target channel.
6259 	 */
6260 	error = vmbus_chan_close_direct(chan);
6261 	if (error == EISCONN) {
6262 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6263 		    "after being closed\n", vmbus_chan_id(chan));
6264 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6265 	} else if (error) {
6266 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6267 		    vmbus_chan_id(chan), error);
6268 	}
6269 }
6270 
6271 static int
6272 hn_attach_subchans(struct hn_softc *sc)
6273 {
6274 	struct vmbus_channel **subchans;
6275 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6276 	int i, error = 0;
6277 
6278 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6279 
6280 	/* Attach the sub-channels. */
6281 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6282 	for (i = 0; i < subchan_cnt; ++i) {
6283 		int error1;
6284 
6285 		error1 = hn_chan_attach(sc, subchans[i]);
6286 		if (error1) {
6287 			error = error1;
6288 			/* Move on; all channels will be detached later. */
6289 		}
6290 	}
6291 	vmbus_subchan_rel(subchans, subchan_cnt);
6292 
6293 	if (error) {
6294 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6295 	} else {
6296 		if (bootverbose) {
6297 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6298 			    subchan_cnt);
6299 		}
6300 	}
6301 	return (error);
6302 }
6303 
6304 static void
6305 hn_detach_allchans(struct hn_softc *sc)
6306 {
6307 	struct vmbus_channel **subchans;
6308 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6309 	int i;
6310 
6311 	if (subchan_cnt == 0)
6312 		goto back;
6313 
6314 	/* Detach the sub-channels. */
6315 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6316 	for (i = 0; i < subchan_cnt; ++i)
6317 		hn_chan_detach(sc, subchans[i]);
6318 	vmbus_subchan_rel(subchans, subchan_cnt);
6319 
6320 back:
6321 	/*
6322 	 * Detach the primary channel, _after_ all sub-channels
6323 	 * are detached.
6324 	 */
6325 	hn_chan_detach(sc, sc->hn_prichan);
6326 
6327 	/* Wait for sub-channels to be destroyed, if any. */
6328 	vmbus_subchan_drain(sc->hn_prichan);
6329 
6330 #ifdef INVARIANTS
6331 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6332 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6333 		    HN_RX_FLAG_ATTACHED) == 0,
6334 		    ("%dth RX ring is still attached", i));
6335 	}
6336 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6337 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6338 		    HN_TX_FLAG_ATTACHED) == 0,
6339 		    ("%dth TX ring is still attached", i));
6340 	}
6341 #endif
6342 }
6343 
6344 static int
6345 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6346 {
6347 	struct vmbus_channel **subchans;
6348 	int nchan, rxr_cnt, error;
6349 
6350 	nchan = *nsubch + 1;
6351 	if (nchan == 1) {
6352 		/*
6353 		 * Multiple RX/TX rings are not requested.
6354 		 */
6355 		*nsubch = 0;
6356 		return (0);
6357 	}
6358 
6359 	/*
6360 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6361 	 * table entries.
6362 	 */
6363 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6364 	if (error) {
6365 		/* No RSS; this is benign. */
6366 		*nsubch = 0;
6367 		return (0);
6368 	}
6369 	if (bootverbose) {
6370 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6371 		    rxr_cnt, nchan);
6372 	}
6373 
6374 	if (nchan > rxr_cnt)
6375 		nchan = rxr_cnt;
6376 	if (nchan == 1) {
6377 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6378 		*nsubch = 0;
6379 		return (0);
6380 	}
6381 
6382 	/*
6383 	 * Allocate sub-channels from NVS.
6384 	 */
6385 	*nsubch = nchan - 1;
6386 	error = hn_nvs_alloc_subchans(sc, nsubch);
6387 	if (error || *nsubch == 0) {
6388 		/* Failed to allocate sub-channels. */
6389 		*nsubch = 0;
6390 		return (0);
6391 	}
6392 
6393 	/*
6394 	 * Wait for all sub-channels to become ready before moving on.
6395 	 */
6396 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6397 	vmbus_subchan_rel(subchans, *nsubch);
6398 	return (0);
6399 }
6400 
6401 static bool
6402 hn_synth_attachable(const struct hn_softc *sc)
6403 {
6404 	int i;
6405 
6406 	if (sc->hn_flags & HN_FLAG_ERRORS)
6407 		return (false);
6408 
6409 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6410 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6411 
6412 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6413 			return (false);
6414 	}
6415 	return (true);
6416 }
6417 
6418 /*
6419  * Make sure that the RX filter is zero after the successful
6420  * RNDIS initialization.
6421  *
6422  * NOTE:
6423  * Under certain conditions on certain versions of Hyper-V,
6424  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6425  * after the successful RNDIS initialization, which breaks
6426  * the assumption of any following code (well, it breaks the
6427  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6428  * explicitly, drain packets sneaking through, and drain the
6429  * interrupt taskqueues scheduled due to the stealth packets.
6430  */
6431 static void
6432 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6433 {
6434 
6435 	hn_disable_rx(sc);
6436 	hn_drain_rxtx(sc, nchan);
6437 }
6438 
6439 static int
6440 hn_synth_attach(struct hn_softc *sc, int mtu)
6441 {
6442 #define ATTACHED_NVS		0x0002
6443 #define ATTACHED_RNDIS		0x0004
6444 
6445 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6446 	int error, nsubch, nchan = 1, i, rndis_inited;
6447 	uint32_t old_caps, attached = 0;
6448 
6449 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6450 	    ("synthetic parts were attached"));
6451 
6452 	if (!hn_synth_attachable(sc))
6453 		return (ENXIO);
6454 
6455 	/* Save capabilities for later verification. */
6456 	old_caps = sc->hn_caps;
6457 	sc->hn_caps = 0;
6458 
6459 	/* Clear RSS stuffs. */
6460 	sc->hn_rss_ind_size = 0;
6461 	sc->hn_rss_hash = 0;
6462 	sc->hn_rss_hcap = 0;
6463 
6464 	/*
6465 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6466 	 */
6467 	error = hn_chan_attach(sc, sc->hn_prichan);
6468 	if (error)
6469 		goto failed;
6470 
6471 	/*
6472 	 * Attach NVS.
6473 	 */
6474 	error = hn_nvs_attach(sc, mtu);
6475 	if (error)
6476 		goto failed;
6477 	attached |= ATTACHED_NVS;
6478 
6479 	/*
6480 	 * Attach RNDIS _after_ NVS is attached.
6481 	 */
6482 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6483 	if (rndis_inited)
6484 		attached |= ATTACHED_RNDIS;
6485 	if (error)
6486 		goto failed;
6487 
6488 	/*
6489 	 * Make sure capabilities are not changed.
6490 	 */
6491 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6492 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6493 		    old_caps, sc->hn_caps);
6494 		error = ENXIO;
6495 		goto failed;
6496 	}
6497 
6498 	/*
6499 	 * Allocate sub-channels for multi-TX/RX rings.
6500 	 *
6501 	 * NOTE:
6502 	 * The # of RX rings that can be used is equivalent to the # of
6503 	 * channels to be requested.
6504 	 */
6505 	nsubch = sc->hn_rx_ring_cnt - 1;
6506 	error = hn_synth_alloc_subchans(sc, &nsubch);
6507 	if (error)
6508 		goto failed;
6509 	/* NOTE: _Full_ synthetic parts detach is required now. */
6510 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6511 
6512 	/*
6513 	 * Set the # of TX/RX rings that could be used according to
6514 	 * the # of channels that NVS offered.
6515 	 */
6516 	nchan = nsubch + 1;
6517 	hn_set_ring_inuse(sc, nchan);
6518 	if (nchan == 1) {
6519 		/* Only the primary channel can be used; done */
6520 		goto back;
6521 	}
6522 
6523 	/*
6524 	 * Attach the sub-channels.
6525 	 *
6526 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6527 	 */
6528 	error = hn_attach_subchans(sc);
6529 	if (error)
6530 		goto failed;
6531 
6532 	/*
6533 	 * Configure RSS key and indirect table _after_ all sub-channels
6534 	 * are attached.
6535 	 */
6536 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6537 		/*
6538 		 * RSS key is not set yet; set it to the default RSS key.
6539 		 */
6540 		if (bootverbose)
6541 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6542 		rss_getkey(rss->rss_key);
6543 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6544 	}
6545 
6546 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6547 		/*
6548 		 * RSS indirect table is not set yet; set it up in round-
6549 		 * robin fashion.
6550 		 */
6551 		if (bootverbose) {
6552 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6553 			    "table\n");
6554 		}
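		/*
		 * For example (illustrative only): with nchan == 2 and
		 * the RSS option not compiled in, the loop below yields
		 * rss_ind[] = { 0, 1, 0, 1, ... }, spreading the
		 * NDIS_HASH_INDCNT entries evenly across the channels.
		 */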
6555 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6556 			uint32_t subidx;
6557 
6558 #ifdef RSS
6559 			subidx = rss_get_indirection_to_bucket(i);
6560 #else
6561 			subidx = i;
6562 #endif
6563 			rss->rss_ind[i] = subidx % nchan;
6564 		}
6565 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6566 	} else {
6567 		/*
6568 		 * # of usable channels may be changed, so we have to
6569 		 * make sure that all entries in RSS indirect table
6570 		 * are valid.
6571 		 *
6572 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6573 		 */
6574 		hn_rss_ind_fixup(sc);
6575 	}
6576 
6577 	sc->hn_rss_hash = sc->hn_rss_hcap;
6578 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6579 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6580 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6581 		hn_vf_rss_fixup(sc, false);
6582 	}
6583 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6584 	if (error)
6585 		goto failed;
6586 back:
6587 	/*
6588 	 * Fixup transmission aggregation setup.
6589 	 */
6590 	hn_set_txagg(sc);
6591 	hn_rndis_init_fixat(sc, nchan);
6592 	return (0);
6593 
6594 failed:
6595 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6596 		hn_rndis_init_fixat(sc, nchan);
6597 		hn_synth_detach(sc);
6598 	} else {
6599 		if (attached & ATTACHED_RNDIS) {
6600 			hn_rndis_init_fixat(sc, nchan);
6601 			hn_rndis_detach(sc);
6602 		}
6603 		if (attached & ATTACHED_NVS)
6604 			hn_nvs_detach(sc);
6605 		hn_chan_detach(sc, sc->hn_prichan);
6606 		/* Restore old capabilities. */
6607 		sc->hn_caps = old_caps;
6608 	}
6609 	return (error);
6610 
6611 #undef ATTACHED_RNDIS
6612 #undef ATTACHED_NVS
6613 }
6614 
6615 /*
6616  * NOTE:
6617  * The interface must have been suspended through hn_suspend(), before
6618  * this function gets called.
6619  */
6620 static void
6621 hn_synth_detach(struct hn_softc *sc)
6622 {
6623 
6624 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6625 	    ("synthetic parts were not attached"));
6626 
6627 	/* Detach the RNDIS first. */
6628 	hn_rndis_detach(sc);
6629 
6630 	/* Detach NVS. */
6631 	hn_nvs_detach(sc);
6632 
6633 	/* Detach all of the channels. */
6634 	hn_detach_allchans(sc);
6635 
6636 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6637 		/*
6638 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6639 		 */
6640 		int error;
6641 
6642 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6643 		    sc->hn_rxbuf_gpadl);
6644 		if (error) {
6645 			if_printf(sc->hn_ifp,
6646 			    "rxbuf gpadl disconn failed: %d\n", error);
6647 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6648 		}
6649 		sc->hn_rxbuf_gpadl = 0;
6650 	}
6651 
6652 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6653 		/*
6654 		 * Host is post-Win2016, disconnect chimney sending buffer from
6655 		 * primary channel here.
6656 		 */
6657 		int error;
6658 
6659 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6660 		    sc->hn_chim_gpadl);
6661 		if (error) {
6662 			if_printf(sc->hn_ifp,
6663 			    "chim gpadl disconn failed: %d\n", error);
6664 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6665 		}
6666 		sc->hn_chim_gpadl = 0;
6667 	}
6668 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6669 }
6670 
6671 static void
6672 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6673 {
6674 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6675 	    ("invalid ring count %d", ring_cnt));
6676 
6677 	if (sc->hn_tx_ring_cnt > ring_cnt)
6678 		sc->hn_tx_ring_inuse = ring_cnt;
6679 	else
6680 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6681 	sc->hn_rx_ring_inuse = ring_cnt;
6682 
6683 #ifdef RSS
6684 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6685 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6686 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6687 		    rss_getnumbuckets());
6688 	}
6689 #endif
6690 
6691 	if (bootverbose) {
6692 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6693 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6694 	}
6695 }
6696 
6697 static void
6698 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6699 {
6700 
6701 	/*
6702 	 * NOTE:
6703 	 * The TX bufring will not be drained by the hypervisor,
6704 	 * if the primary channel is revoked.
6705 	 */
6706 	while (!vmbus_chan_rx_empty(chan) ||
6707 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6708 	     !vmbus_chan_tx_empty(chan)))
6709 		pause("waitch", 1);
6710 	vmbus_chan_intr_drain(chan);
6711 }
6712 
6713 static void
6714 hn_disable_rx(struct hn_softc *sc)
6715 {
6716 
6717 	/*
6718 	 * Disable RX by clearing RX filter forcefully.
6719 	 */
6720 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6721 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6722 
6723 	/*
6724 	 * Give RNDIS enough time to flush all pending data packets.
6725 	 */
6726 	pause("waitrx", (200 * hz) / 1000);
6727 }
6728 
6729 /*
6730  * NOTE:
6731  * RX/TX _must_ have been suspended/disabled, before this function
6732  * is called.
6733  */
6734 static void
6735 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6736 {
6737 	struct vmbus_channel **subch = NULL;
6738 	int nsubch;
6739 
6740 	/*
6741 	 * Drain RX/TX bufrings and interrupts.
6742 	 */
6743 	nsubch = nchan - 1;
6744 	if (nsubch > 0)
6745 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6746 
6747 	if (subch != NULL) {
6748 		int i;
6749 
6750 		for (i = 0; i < nsubch; ++i)
6751 			hn_chan_drain(sc, subch[i]);
6752 	}
6753 	hn_chan_drain(sc, sc->hn_prichan);
6754 
6755 	if (subch != NULL)
6756 		vmbus_subchan_rel(subch, nsubch);
6757 }
6758 
6759 static void
6760 hn_suspend_data(struct hn_softc *sc)
6761 {
6762 	struct hn_tx_ring *txr;
6763 	int i;
6764 
6765 	HN_LOCK_ASSERT(sc);
6766 
6767 	/*
6768 	 * Suspend TX.
6769 	 */
6770 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6771 		txr = &sc->hn_tx_ring[i];
6772 
6773 		mtx_lock(&txr->hn_tx_lock);
6774 		txr->hn_suspended = 1;
6775 		mtx_unlock(&txr->hn_tx_lock);
6776 		/* No one is able to send more packets now. */
6777 
6778 		/*
6779 		 * Wait for all pending sends to finish.
6780 		 *
6781 		 * NOTE:
6782 		 * We will _not_ receive all pending send-done, if the
6783 		 * primary channel is revoked.
6784 		 */
6785 		while (hn_tx_ring_pending(txr) &&
6786 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6787 			pause("hnwtx", 1 /* 1 tick */);
6788 	}
6789 
6790 	/*
6791 	 * Disable RX.
6792 	 */
6793 	hn_disable_rx(sc);
6794 
6795 	/*
6796 	 * Drain RX/TX.
6797 	 */
6798 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6799 
6800 	/*
6801 	 * Drain any pending TX tasks.
6802 	 *
6803 	 * NOTE:
6804 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6805 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6806 	 */
6807 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6808 		txr = &sc->hn_tx_ring[i];
6809 
6810 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6811 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6812 	}
6813 }
6814 
6815 static void
6816 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6817 {
6818 
6819 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6820 }
6821 
6822 static void
6823 hn_suspend_mgmt(struct hn_softc *sc)
6824 {
6825 	struct task task;
6826 
6827 	HN_LOCK_ASSERT(sc);
6828 
6829 	/*
6830 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6831 	 * through hn_mgmt_taskq.
6832 	 */
6833 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6834 	vmbus_chan_run_task(sc->hn_prichan, &task);
6835 
6836 	/*
6837 	 * Make sure that all pending management tasks are completed.
6838 	 */
6839 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6840 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6841 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6842 }
6843 
6844 static void
6845 hn_suspend(struct hn_softc *sc)
6846 {
6847 
6848 	/* Disable polling. */
6849 	hn_polling(sc, 0);
6850 
6851 	/*
6852 	 * If the non-transparent mode VF is activated, the synthetic
6853 	 * device is receiving packets, so the data path of the
6854 	 * synthetic device must be suspended.
6855 	 */
6856 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6857 	    (sc->hn_flags & HN_FLAG_RXVF))
6858 		hn_suspend_data(sc);
6859 	hn_suspend_mgmt(sc);
6860 }
6861 
6862 static void
6863 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6864 {
6865 	int i;
6866 
6867 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6868 	    ("invalid TX ring count %d", tx_ring_cnt));
6869 
6870 	for (i = 0; i < tx_ring_cnt; ++i) {
6871 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6872 
6873 		mtx_lock(&txr->hn_tx_lock);
6874 		txr->hn_suspended = 0;
6875 		mtx_unlock(&txr->hn_tx_lock);
6876 	}
6877 }
6878 
6879 static void
6880 hn_resume_data(struct hn_softc *sc)
6881 {
6882 	int i;
6883 
6884 	HN_LOCK_ASSERT(sc);
6885 
6886 	/*
6887 	 * Re-enable RX.
6888 	 */
6889 	hn_rxfilter_config(sc);
6890 
6891 	/*
6892 	 * Make sure to clear suspend status on "all" TX rings,
6893 	 * since hn_tx_ring_inuse can be changed after
6894 	 * hn_suspend_data().
6895 	 */
6896 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6897 
6898 #ifdef HN_IFSTART_SUPPORT
6899 	if (!hn_use_if_start)
6900 #endif
6901 	{
6902 		/*
6903 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6904 		 * reduced.
6905 		 */
6906 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6907 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6908 	}
6909 
6910 	/*
6911 	 * Kick start TX.
6912 	 */
6913 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6914 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6915 
6916 		/*
6917 		 * Use txeof task, so that any pending oactive can be
6918 		 * cleared properly.
6919 		 */
6920 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6921 	}
6922 }
6923 
6924 static void
6925 hn_resume_mgmt(struct hn_softc *sc)
6926 {
6927 
6928 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6929 
6930 	/*
6931 	 * Kick off network change detection, if it was pending.
6932 	 * If no network change was pending, start link status
6933 	 * checks, which is more lightweight than network change
6934 	 * detection.
6935 	 */
6936 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6937 		hn_change_network(sc);
6938 	else
6939 		hn_update_link_status(sc);
6940 }
6941 
6942 static void
6943 hn_resume(struct hn_softc *sc)
6944 {
6945 
6946 	/*
6947 	 * If the non-transparent mode VF is activated, the synthetic
6948 	 * device has to receive packets, so the data path of the
6949 	 * synthetic device must be resumed.
6950 	 */
6951 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6952 	    (sc->hn_flags & HN_FLAG_RXVF))
6953 		hn_resume_data(sc);
6954 
6955 	/*
6956 	 * Don't resume link status change if VF is attached/activated.
6957 	 * - In the non-transparent VF mode, the synthetic device marks
6958 	 *   link down until the VF is deactivated; i.e. VF is down.
6959 	 * - In transparent VF mode, VF's media status is used until
6960 	 *   the VF is detached.
6961 	 */
6962 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6963 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6964 		hn_resume_mgmt(sc);
6965 
6966 	/*
6967 	 * Re-enable polling if this interface is running and
6968 	 * the polling is requested.
6969 	 */
6970 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6971 		hn_polling(sc, sc->hn_pollhz);
6972 }
6973 
6974 static void
6975 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6976 {
6977 	const struct rndis_status_msg *msg;
6978 	int ofs;
6979 
6980 	if (dlen < sizeof(*msg)) {
6981 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6982 		return;
6983 	}
6984 	msg = data;
6985 
6986 	switch (msg->rm_status) {
6987 	case RNDIS_STATUS_MEDIA_CONNECT:
6988 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6989 		hn_update_link_status(sc);
6990 		break;
6991 
6992 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6993 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6994 		/* Not really useful; ignore. */
6995 		break;
6996 
6997 	case RNDIS_STATUS_NETWORK_CHANGE:
6998 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6999 		if (dlen < ofs + msg->rm_stbuflen ||
7000 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7001 			if_printf(sc->hn_ifp, "network changed\n");
7002 		} else {
7003 			uint32_t change;
7004 
7005 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7006 			    sizeof(change));
7007 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7008 			    change);
7009 		}
7010 		hn_change_network(sc);
7011 		break;
7012 
7013 	default:
7014 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7015 		    msg->rm_status);
7016 		break;
7017 	}
7018 }
7019 
7020 static int
7021 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7022 {
7023 	const struct rndis_pktinfo *pi = info_data;
7024 	uint32_t mask = 0;
7025 
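	/*
	 * Walk the RNDIS per-packet-info records back to back; each
	 * record is rm_size bytes long and carries its payload at
	 * rm_pktinfooffset.  Stop early once all interesting records
	 * (HN_RXINFO_ALL) have been collected.
	 */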
7026 	while (info_dlen != 0) {
7027 		const void *data;
7028 		uint32_t dlen;
7029 
7030 		if (__predict_false(info_dlen < sizeof(*pi)))
7031 			return (EINVAL);
7032 		if (__predict_false(info_dlen < pi->rm_size))
7033 			return (EINVAL);
7034 		info_dlen -= pi->rm_size;
7035 
7036 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7037 			return (EINVAL);
7038 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7039 			return (EINVAL);
7040 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7041 		data = pi->rm_data;
7042 
7043 		if (pi->rm_internal == 1) {
7044 			switch (pi->rm_type) {
7045 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7046 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7047 					return (EINVAL);
7048 				info->pktinfo_id =
7049 				    (const struct packet_info_id *)data;
7050 				mask |= HN_RXINFO_PKTINFO_ID;
7051 				break;
7052 
7053 			default:
7054 				goto next;
7055 			}
7056 		} else {
7057 			switch (pi->rm_type) {
7058 			case NDIS_PKTINFO_TYPE_VLAN:
7059 				if (__predict_false(dlen
7060 				    < NDIS_VLAN_INFO_SIZE))
7061 					return (EINVAL);
7062 				info->vlan_info = (const uint32_t *)data;
7063 				mask |= HN_RXINFO_VLAN;
7064 				break;
7065 
7066 			case NDIS_PKTINFO_TYPE_CSUM:
7067 				if (__predict_false(dlen
7068 				    < NDIS_RXCSUM_INFO_SIZE))
7069 					return (EINVAL);
7070 				info->csum_info = (const uint32_t *)data;
7071 				mask |= HN_RXINFO_CSUM;
7072 				break;
7073 
7074 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7075 				if (__predict_false(dlen
7076 				    < HN_NDIS_HASH_VALUE_SIZE))
7077 					return (EINVAL);
7078 				info->hash_value = (const uint32_t *)data;
7079 				mask |= HN_RXINFO_HASHVAL;
7080 				break;
7081 
7082 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7083 				if (__predict_false(dlen
7084 				    < HN_NDIS_HASH_INFO_SIZE))
7085 					return (EINVAL);
7086 				info->hash_info = (const uint32_t *)data;
7087 				mask |= HN_RXINFO_HASHINF;
7088 				break;
7089 
7090 			default:
7091 				goto next;
7092 			}
7093 		}
7094 
7095 		if (mask == HN_RXINFO_ALL) {
7096 			/* All found; done */
7097 			break;
7098 		}
7099 next:
7100 		pi = (const struct rndis_pktinfo *)
7101 		    ((const uint8_t *)pi + pi->rm_size);
7102 	}
7103 
7104 	/*
7105 	 * Final fixup.
7106 	 * - If there is no hash value, invalidate the hash info.
7107 	 */
7108 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7109 		info->hash_info = NULL;
7110 	return (0);
7111 }
7112 
7113 static __inline bool
7114 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7115 {
7116 
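	/*
	 * Return true if the ranges [off, off + len) and
	 * [check_off, check_off + check_len) intersect.  E.g. off=0,
	 * len=8, check_off=4, check_len=8 overlap; off=0, len=4,
	 * check_off=4, check_len=8 do not.
	 */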
7117 	if (off < check_off) {
7118 		if (__predict_true(off + len <= check_off))
7119 			return (false);
7120 	} else if (off > check_off) {
7121 		if (__predict_true(check_off + check_len <= off))
7122 			return (false);
7123 	}
7124 	return (true);
7125 }
7126 
7127 static __inline void
7128 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7129 		uint32_t len, struct hn_rxinfo *info)
7130 {
7131 	uint32_t cnt = rxr->rsc.cnt;
7132 
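	/*
	 * The first fragment of an RSC aggregation captures the
	 * per-packet metadata (VLAN, checksum and hash info); later
	 * fragments only grow the total packet length.
	 */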
7133 	if (cnt) {
7134 		rxr->rsc.pktlen += len;
7135 	} else {
7136 		rxr->rsc.vlan_info = info->vlan_info;
7137 		rxr->rsc.csum_info = info->csum_info;
7138 		rxr->rsc.hash_info = info->hash_info;
7139 		rxr->rsc.hash_value = info->hash_value;
7140 		rxr->rsc.pktlen = len;
7141 	}
7142 
7143 	rxr->rsc.frag_data[cnt] = data;
7144 	rxr->rsc.frag_len[cnt] = len;
7145 	rxr->rsc.cnt++;
7146 }
7147 
7148 static void
7149 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7150 {
7151 	const struct rndis_packet_msg *pkt;
7152 	struct hn_rxinfo info;
7153 	int data_off, pktinfo_off, data_len, pktinfo_len;
7154 	bool rsc_more = false;
7155 
7156 	/*
7157 	 * Check length.
7158 	 */
7159 	if (__predict_false(dlen < sizeof(*pkt))) {
7160 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7161 		return;
7162 	}
7163 	pkt = data;
7164 
7165 	if (__predict_false(dlen < pkt->rm_len)) {
7166 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7167 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7168 		return;
7169 	}
7170 	if (__predict_false(pkt->rm_len <
7171 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7172 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7173 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7174 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7175 		    pkt->rm_pktinfolen);
7176 		return;
7177 	}
7178 	if (__predict_false(pkt->rm_datalen == 0)) {
7179 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7180 		return;
7181 	}
7182 
7183 	/*
7184 	 * Check offsets.
7185 	 */
7186 #define IS_OFFSET_INVALID(ofs)			\
7187 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7188 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7189 
7190 	/* XXX Hyper-V does not meet data offset alignment requirement */
7191 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7192 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7193 		    "data offset %u\n", pkt->rm_dataoffset);
7194 		return;
7195 	}
7196 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7197 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7198 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7199 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7200 		return;
7201 	}
7202 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7203 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7204 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7205 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7206 		return;
7207 	}
7208 
7209 #undef IS_OFFSET_INVALID
7210 
7211 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7212 	data_len = pkt->rm_datalen;
7213 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7214 	pktinfo_len = pkt->rm_pktinfolen;
7215 
7216 	/*
7217 	 * Check OOB coverage.
7218 	 */
7219 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7220 		int oob_off, oob_len;
7221 
7222 		if_printf(rxr->hn_ifp, "got oobdata\n");
7223 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7224 		oob_len = pkt->rm_oobdatalen;
7225 
7226 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7227 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7228 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7229 			    pkt->rm_len, oob_off, oob_len);
7230 			return;
7231 		}
7232 
7233 		/*
7234 		 * Check against data.
7235 		 */
7236 		if (hn_rndis_check_overlap(oob_off, oob_len,
7237 		    data_off, data_len)) {
7238 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7239 			    "oob overlaps data, oob abs %d len %d, "
7240 			    "data abs %d len %d\n",
7241 			    oob_off, oob_len, data_off, data_len);
7242 			return;
7243 		}
7244 
7245 		/*
7246 		 * Check against pktinfo.
7247 		 */
7248 		if (pktinfo_len != 0 &&
7249 		    hn_rndis_check_overlap(oob_off, oob_len,
7250 		    pktinfo_off, pktinfo_len)) {
7251 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7252 			    "oob overlaps pktinfo, oob abs %d len %d, "
7253 			    "pktinfo abs %d len %d\n",
7254 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7255 			return;
7256 		}
7257 	}
7258 
7259 	/*
7260 	 * Check per-packet-info coverage and find useful per-packet-info.
7261 	 */
7262 	info.vlan_info = NULL;
7263 	info.csum_info = NULL;
7264 	info.hash_info = NULL;
7265 	info.pktinfo_id = NULL;
7266 
7267 	if (__predict_true(pktinfo_len != 0)) {
7268 		bool overlap;
7269 		int error;
7270 
7271 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7272 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7273 			    "pktinfo overflow, msglen %u, "
7274 			    "pktinfo abs %d len %d\n",
7275 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7276 			return;
7277 		}
7278 
7279 		/*
7280 		 * Check packet info coverage.
7281 		 */
7282 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7283 		    data_off, data_len);
7284 		if (__predict_false(overlap)) {
7285 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7286 			    "pktinfo overlaps data, pktinfo abs %d len %d, "
7287 			    "data abs %d len %d\n",
7288 			    pktinfo_off, pktinfo_len, data_off, data_len);
7289 			return;
7290 		}
7291 
7292 		/*
7293 		 * Find useful per-packet-info.
7294 		 */
7295 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7296 		    pktinfo_len, &info);
7297 		if (__predict_false(error)) {
7298 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7299 			    "pktinfo\n");
7300 			return;
7301 		}
7302 	}
7303 
7304 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7305 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7306 		    "data overflow, msglen %u, data abs %d len %d\n",
7307 		    pkt->rm_len, data_off, data_len);
7308 		return;
7309 	}
7310 
7311 	/* Identify RSC fragments, drop invalid packets */
7312 	if ((info.pktinfo_id != NULL) &&
7313 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7314 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7315 			rxr->rsc.cnt = 0;
7316 			rxr->hn_rsc_pkts++;
7317 		} else if (rxr->rsc.cnt == 0)
7318 			goto drop;
7319 
7320 		rsc_more = true;
7321 
7322 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7323 			rsc_more = false;
7324 
7325 		if (rsc_more && rxr->rsc.is_last)
7326 			goto drop;
7327 	} else {
7328 		rxr->rsc.cnt = 0;
7329 	}
7330 
7331 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7332 		goto drop;
7333 
7334 	/* Store data in per rx ring structure */
7335 	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7336 	    data_len, &info);
7337 
7338 	if (rsc_more)
7339 		return;
7340 
7341 	hn_rxpkt(rxr);
7342 	rxr->rsc.cnt = 0;
7343 	return;
7344 drop:
7345 	rxr->hn_rsc_drop++;
7346 	return;
7347 }
7348 
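/*
 * Dispatch an incoming RNDIS message by type: data messages go to
 * hn_rndis_rx_data() (the hot path), status indications to
 * hn_rndis_rx_status(), and everything else to hn_rndis_rx_ctrl().
 */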
7349 static __inline void
7350 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7351 {
7352 	const struct rndis_msghdr *hdr;
7353 
7354 	if (__predict_false(dlen < sizeof(*hdr))) {
7355 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7356 		return;
7357 	}
7358 	hdr = data;
7359 
7360 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7361 		/* Hot data path. */
7362 		hn_rndis_rx_data(rxr, data, dlen);
7363 		/* Done! */
7364 		return;
7365 	}
7366 
7367 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7368 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7369 	else
7370 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7371 }
7372 
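/*
 * Handle an inband NVS notification.  Send-table notifications are
 * ignored; any other type is merely logged.
 */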
7373 static void
7374 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7375 {
7376 	const struct hn_nvs_hdr *hdr;
7377 
7378 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7379 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7380 		return;
7381 	}
7382 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7383 
7384 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7385 		/* Useless; ignore */
7386 		return;
7387 	}
7388 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7389 }
7390 
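/*
 * Handle an NVS completion packet.  The transaction id carries the
 * originating send context; invoke its completion callback.
 */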
7391 static void
7392 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7393     const struct vmbus_chanpkt_hdr *pkt)
7394 {
7395 	struct hn_nvs_sendctx *sndc;
7396 
7397 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7398 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7399 	    VMBUS_CHANPKT_DATALEN(pkt));
7400 	/*
7401 	 * NOTE:
7402 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7403 	 * its callback.
7404 	 */
7405 }
7406 
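/*
 * Handle an RXBUF channel packet: validate the NVS/RNDIS headers and
 * the receive ranges, feed each range (one RNDIS message per range)
 * to hn_rndis_rxpkt() under the network epoch, then ack the RXBUF so
 * the hypervisor can recycle it.
 */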
7407 static void
7408 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7409     const struct vmbus_chanpkt_hdr *pkthdr)
7410 {
7411 	struct epoch_tracker et;
7412 	const struct vmbus_chanpkt_rxbuf *pkt;
7413 	const struct hn_nvs_hdr *nvs_hdr;
7414 	int count, i, hlen;
7415 
7416 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7417 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7418 		return;
7419 	}
7420 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7421 
7422 	/* Make sure that this is a RNDIS message. */
7423 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7424 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7425 		    nvs_hdr->nvs_type);
7426 		return;
7427 	}
7428 
7429 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7430 	if (__predict_false(hlen < sizeof(*pkt))) {
7431 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7432 		return;
7433 	}
7434 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7435 
7436 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7437 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7438 		    pkt->cp_rxbuf_id);
7439 		return;
7440 	}
7441 
7442 	count = pkt->cp_rxbuf_cnt;
7443 	if (__predict_false(hlen <
7444 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7445 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7446 		return;
7447 	}
7448 
7449 	NET_EPOCH_ENTER(et);
7450 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7451 	for (i = 0; i < count; ++i) {
7452 		int ofs, len;
7453 
7454 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7455 		len = pkt->cp_rxbuf[i].rb_len;
7456 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7457 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
7458 			    "ofs %d, len %d\n", i, ofs, len);
7459 			continue;
7460 		}
7461 
7462 		rxr->rsc.is_last = (i == (count - 1));
7463 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7464 	}
7465 	NET_EPOCH_EXIT(et);
7466 
7467 	/*
7468 	 * Ack the consumed RXBUF associated w/ this channel packet,
7469 	 * so that this RXBUF can be recycled by the hypervisor.
7470 	 */
7471 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7472 }
7473 
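/*
 * Ack a consumed RXBUF back to the host.  Retry a few times on
 * EAGAIN; if the ack still cannot be sent, the RXBUF is leaked.
 */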
7474 static void
7475 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7476     uint64_t tid)
7477 {
7478 	struct hn_nvs_rndis_ack ack;
7479 	int retries, error;
7480 
7481 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7482 	ack.nvs_status = HN_NVS_STATUS_OK;
7483 
7484 	retries = 0;
7485 again:
7486 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7487 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7488 	if (__predict_false(error == EAGAIN)) {
7489 		/*
7490 		 * NOTE:
7491 		 * This should _not_ happen in real world, since the
7492 		 * consumption of the TX bufring from the TX path is
7493 		 * controlled.
7494 		 */
7495 		if (rxr->hn_ack_failed == 0)
7496 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7497 		rxr->hn_ack_failed++;
7498 		retries++;
7499 		if (retries < 10) {
7500 			DELAY(100);
7501 			goto again;
7502 		}
7503 		/* RXBUF leaks! */
7504 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7505 	}
7506 }
7507 
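/*
 * Per-channel receive callback.  Drain all pending channel packets,
 * growing the packet buffer on ENOBUFS, and dispatch each packet by
 * type (completion, RXBUF, or inband notification).
 */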
7508 static void
7509 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7510 {
7511 	struct hn_rx_ring *rxr = xrxr;
7512 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7513 
7514 	for (;;) {
7515 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7516 		int error, pktlen;
7517 
7518 		pktlen = rxr->hn_pktbuf_len;
7519 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7520 		if (__predict_false(error == ENOBUFS)) {
7521 			void *nbuf;
7522 			int nlen;
7523 
7524 			/*
7525 			 * Expand channel packet buffer.
7526 			 *
7527 			 * XXX
7528 			 * Use M_WAITOK here, since allocation failure
7529 			 * is fatal.
7530 			 */
7531 			nlen = rxr->hn_pktbuf_len * 2;
7532 			while (nlen < pktlen)
7533 				nlen *= 2;
7534 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7535 
7536 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7537 			    rxr->hn_pktbuf_len, nlen);
7538 
7539 			free(rxr->hn_pktbuf, M_DEVBUF);
7540 			rxr->hn_pktbuf = nbuf;
7541 			rxr->hn_pktbuf_len = nlen;
7542 			/* Retry! */
7543 			continue;
7544 		} else if (__predict_false(error == EAGAIN)) {
7545 			/* No more channel packets; done! */
7546 			break;
7547 		}
7548 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7549 
7550 		switch (pkt->cph_type) {
7551 		case VMBUS_CHANPKT_TYPE_COMP:
7552 			hn_nvs_handle_comp(sc, chan, pkt);
7553 			break;
7554 
7555 		case VMBUS_CHANPKT_TYPE_RXBUF:
7556 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7557 			break;
7558 
7559 		case VMBUS_CHANPKT_TYPE_INBAND:
7560 			hn_nvs_handle_notify(sc, pkt);
7561 			break;
7562 
7563 		default:
7564 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7565 			    pkt->cph_type);
7566 			break;
7567 		}
7568 	}
7569 	hn_chan_rollup(rxr, rxr->hn_txr);
7570 }
7571 
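/*
 * Module-wide initialization: allocate the UDP checksum fixup counter,
 * sanity check the tunables, set up the VF map, and create the global
 * TX taskqueues when the global taskqueue mode is selected on a
 * Hyper-V guest.
 */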
7572 static void
7573 hn_sysinit(void *arg __unused)
7574 {
7575 	int i;
7576 
7577 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7578 
7579 #ifdef HN_IFSTART_SUPPORT
7580 	/*
7581 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7582 	 * mainly due to the IFF_DRV_OACTIVE flag.
7583 	 */
7584 	if (hn_xpnt_vf && hn_use_if_start) {
7585 		hn_use_if_start = 0;
7586 		printf("hn: transparent VF mode, if_transmit will be used, "
7587 		    "instead of if_start\n");
7588 	}
7589 #endif
7590 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7591 		printf("hn: invalid transparent VF attach routine "
7592 		    "wait timeout %d, reset to %d\n",
7593 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7594 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7595 	}
7596 
7597 	/*
7598 	 * Initialize VF map.
7599 	 */
7600 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7601 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7602 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7603 	    M_WAITOK | M_ZERO);
7604 
7605 	/*
7606 	 * Fix the # of TX taskqueues.
7607 	 */
7608 	if (hn_tx_taskq_cnt <= 0)
7609 		hn_tx_taskq_cnt = 1;
7610 	else if (hn_tx_taskq_cnt > mp_ncpus)
7611 		hn_tx_taskq_cnt = mp_ncpus;
7612 
7613 	/*
7614 	 * Fix the TX taskqueue mode.
7615 	 */
7616 	switch (hn_tx_taskq_mode) {
7617 	case HN_TX_TASKQ_M_INDEP:
7618 	case HN_TX_TASKQ_M_GLOBAL:
7619 	case HN_TX_TASKQ_M_EVTTQ:
7620 		break;
7621 	default:
7622 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7623 		break;
7624 	}
7625 
7626 	if (vm_guest != VM_GUEST_HV)
7627 		return;
7628 
7629 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7630 		return;
7631 
7632 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7633 	    M_DEVBUF, M_WAITOK);
7634 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7635 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7636 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7637 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7638 		    "hn tx%d", i);
7639 	}
7640 }
7641 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7642 
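/*
 * Module-wide teardown: release the resources set up by hn_sysinit().
 */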
7643 static void
7644 hn_sysuninit(void *arg __unused)
7645 {
7646 
7647 	if (hn_tx_taskque != NULL) {
7648 		int i;
7649 
7650 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7651 			taskqueue_free(hn_tx_taskque[i]);
7652 		free(hn_tx_taskque, M_DEVBUF);
7653 	}
7654 
7655 	if (hn_vfmap != NULL)
7656 		free(hn_vfmap, M_DEVBUF);
7657 	rm_destroy(&hn_vfmap_lock);
7658 
7659 	counter_u64_free(hn_udpcs_fixup);
7660 }
7661 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7662