1 /*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
12 * disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55 #include <sys/cdefs.h>
56 #include "opt_hn.h"
57 #include "opt_inet6.h"
58 #include "opt_inet.h"
59 #include "opt_rss.h"
60
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/bus.h>
64 #include <sys/counter.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/proc.h>
73 #include <sys/rmlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 #include <sys/epoch.h>
85
86 #include <vm/vm.h>
87 #include <vm/vm_extern.h>
88 #include <vm/pmap.h>
89
90 #include <machine/atomic.h>
91 #include <machine/in_cksum.h>
92
93 #include <net/bpf.h>
94 #include <net/ethernet.h>
95 #include <net/if.h>
96 #include <net/if_dl.h>
97 #include <net/if_media.h>
98 #include <net/if_types.h>
99 #include <net/if_var.h>
100 #include <net/rndis.h>
101 #ifdef RSS
102 #include <net/rss_config.h>
103 #endif
104
105 #include <netinet/in_systm.h>
106 #include <netinet/in.h>
107 #include <netinet/ip.h>
108 #include <netinet/ip6.h>
109 #include <netinet/tcp.h>
110 #include <netinet/tcp_lro.h>
111 #include <netinet/udp.h>
112
113 #include <dev/hyperv/include/hyperv.h>
114 #include <dev/hyperv/include/hyperv_busdma.h>
115 #include <dev/hyperv/include/vmbus.h>
116 #include <dev/hyperv/include/vmbus_xact.h>
117
118 #include <dev/hyperv/netvsc/ndis.h>
119 #include <dev/hyperv/netvsc/if_hnreg.h>
120 #include <dev/hyperv/netvsc/if_hnvar.h>
121 #include <dev/hyperv/netvsc/hn_nvs.h>
122 #include <dev/hyperv/netvsc/hn_rndis.h>
123
124 #include "vmbus_if.h"
125
126 #define HN_IFSTART_SUPPORT
127
128 #define HN_RING_CNT_DEF_MAX 8
129
130 #define HN_VFMAP_SIZE_DEF 8
131
132 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */
133
134 /* YYY should get it from the underlying channel */
135 #define HN_TX_DESC_CNT 512
136
137 #define HN_RNDIS_PKT_LEN \
138 (sizeof(struct rndis_packet_msg) + \
139 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
140 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
141 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
142 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
143 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
144 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
145
146 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
147 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
148 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
149 /* -1 for RNDIS packet message */
150 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
151
152 #define HN_DIRECT_TX_SIZE_DEF 128
153
154 #define HN_EARLY_TXEOF_THRESH 8
155
156 #define HN_PKTBUF_LEN_DEF (16 * 1024)
157
158 #define HN_LROENT_CNT_DEF 128
159
160 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
161 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
162 /* YYY 2*MTU is a bit rough, but should be good enough. */
163 #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp))
164
165 #define HN_LRO_ACKCNT_DEF 1
166
167 #define HN_LOCK_INIT(sc) \
168 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
169 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
170 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
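/*
 * NOTE: HN_LOCK spins with sx_try_xlock() and yields the CPU between
 * attempts rather than blocking in sx_xlock(); see the deadlock
 * avoidance comment in the loop body.
 */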
171 #define HN_LOCK(sc) \
172 do { \
173 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \
174 /* Relinquish cpu to avoid deadlock */ \
175 sched_relinquish(curthread); \
176 DELAY(1000); \
177 } \
178 } while (0)
179 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
180
181 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
182 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
183 #define HN_CSUM_IP_HWASSIST(sc) \
184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
185 #define HN_CSUM_IP6_HWASSIST(sc) \
186 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
187
188 #define HN_PKTSIZE_MIN(align) \
189 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
190 HN_RNDIS_PKT_LEN, (align))
191 #define HN_PKTSIZE(m, align) \
192 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
193
194 #ifdef RSS
195 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets())
196 #else
197 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)
198 #endif
199
200 struct hn_txdesc {
201 #ifndef HN_USE_TXDESC_BUFRING
202 SLIST_ENTRY(hn_txdesc) link;
203 #endif
204 STAILQ_ENTRY(hn_txdesc) agg_link;
205
206 /* Aggregated txdescs, in sending order. */
207 STAILQ_HEAD(, hn_txdesc) agg_list;
208
209 /* The oldest packet, if transmission aggregation happens. */
210 struct mbuf *m;
211 struct hn_tx_ring *txr;
212 int refs;
213 uint32_t flags; /* HN_TXD_FLAG_ */
214 struct hn_nvs_sendctx send_ctx;
215 uint32_t chim_index;
216 int chim_size;
217
218 bus_dmamap_t data_dmap;
219
220 bus_addr_t rndis_pkt_paddr;
221 struct rndis_packet_msg *rndis_pkt;
222 bus_dmamap_t rndis_pkt_dmap;
223 };
224
225 #define HN_TXD_FLAG_ONLIST 0x0001
226 #define HN_TXD_FLAG_DMAMAP 0x0002
227 #define HN_TXD_FLAG_ONAGG 0x0004
228
229 #define HN_NDIS_PKTINFO_SUBALLOC 0x01
230 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02
231 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04
232
233 struct packet_info_id {
234 uint8_t ver;
235 uint8_t flag;
236 uint16_t pkt_id;
237 };
238
239 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id)
240
241
242 struct hn_rxinfo {
243 const uint32_t *vlan_info;
244 const uint32_t *csum_info;
245 const uint32_t *hash_info;
246 const uint32_t *hash_value;
247 const struct packet_info_id *pktinfo_id;
248 };
249
250 struct hn_rxvf_setarg {
251 struct hn_rx_ring *rxr;
252 if_t vf_ifp;
253 };
254
255 #define HN_RXINFO_VLAN 0x0001
256 #define HN_RXINFO_CSUM 0x0002
257 #define HN_RXINFO_HASHINF 0x0004
258 #define HN_RXINFO_HASHVAL 0x0008
259 #define HN_RXINFO_PKTINFO_ID 0x0010
260 #define HN_RXINFO_ALL \
261 (HN_RXINFO_VLAN | \
262 HN_RXINFO_CSUM | \
263 HN_RXINFO_HASHINF | \
264 HN_RXINFO_HASHVAL | \
265 HN_RXINFO_PKTINFO_ID)
266
267 static int hn_probe(device_t);
268 static int hn_attach(device_t);
269 static int hn_detach(device_t);
270 static int hn_shutdown(device_t);
271 static void hn_chan_callback(struct vmbus_channel *,
272 void *);
273
274 static void hn_init(void *);
275 static int hn_ioctl(if_t, u_long, caddr_t);
276 #ifdef HN_IFSTART_SUPPORT
277 static void hn_start(if_t);
278 #endif
279 static int hn_transmit(if_t, struct mbuf *);
280 static void hn_xmit_qflush(if_t);
281 static int hn_ifmedia_upd(if_t);
282 static void hn_ifmedia_sts(if_t,
283 struct ifmediareq *);
284
285 static void hn_ifnet_event(void *, if_t, int);
286 static void hn_ifaddr_event(void *, if_t);
287 static void hn_ifnet_attevent(void *, if_t);
288 static void hn_ifnet_detevent(void *, if_t);
289 static void hn_ifnet_lnkevent(void *, if_t, int);
290
291 static bool hn_ismyvf(const struct hn_softc *,
292 const if_t);
293 static void hn_rxvf_change(struct hn_softc *,
294 if_t, bool);
295 static void hn_rxvf_set(struct hn_softc *, if_t);
296 static void hn_rxvf_set_task(void *, int);
297 static void hn_xpnt_vf_input(if_t, struct mbuf *);
298 static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
299 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *,
300 struct ifreq *);
301 static void hn_xpnt_vf_saveifflags(struct hn_softc *);
302 static bool hn_xpnt_vf_isready(struct hn_softc *);
303 static void hn_xpnt_vf_setready(struct hn_softc *);
304 static void hn_xpnt_vf_init_taskfunc(void *, int);
305 static void hn_xpnt_vf_init(struct hn_softc *);
306 static void hn_xpnt_vf_setenable(struct hn_softc *);
307 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
308 static void hn_vf_rss_fixup(struct hn_softc *, bool);
309 static void hn_vf_rss_restore(struct hn_softc *);
310
311 static int hn_rndis_rxinfo(const void *, int,
312 struct hn_rxinfo *);
313 static void hn_rndis_rx_data(struct hn_rx_ring *,
314 const void *, int);
315 static void hn_rndis_rx_status(struct hn_softc *,
316 const void *, int);
317 static void hn_rndis_init_fixat(struct hn_softc *, int);
318
319 static void hn_nvs_handle_notify(struct hn_softc *,
320 const struct vmbus_chanpkt_hdr *);
321 static void hn_nvs_handle_comp(struct hn_softc *,
322 struct vmbus_channel *,
323 const struct vmbus_chanpkt_hdr *);
324 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
325 struct vmbus_channel *,
326 const struct vmbus_chanpkt_hdr *);
327 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
328 struct vmbus_channel *, uint64_t);
329
330 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
331 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
335 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
336 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
337 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
338 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
339 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
340 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
341 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
342 #ifndef RSS
343 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
344 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
345 #endif
346 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
347 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
348 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
349 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
350 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
351 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
352 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
353 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
354 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
355 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
356 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
357 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
358 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
360
361 static void hn_stop(struct hn_softc *, bool);
362 static void hn_init_locked(struct hn_softc *);
363 static int hn_chan_attach(struct hn_softc *,
364 struct vmbus_channel *);
365 static void hn_chan_detach(struct hn_softc *,
366 struct vmbus_channel *);
367 static int hn_attach_subchans(struct hn_softc *);
368 static void hn_detach_allchans(struct hn_softc *);
369 static void hn_chan_rollup(struct hn_rx_ring *,
370 struct hn_tx_ring *);
371 static void hn_set_ring_inuse(struct hn_softc *, int);
372 static int hn_synth_attach(struct hn_softc *, int);
373 static void hn_synth_detach(struct hn_softc *);
374 static int hn_synth_alloc_subchans(struct hn_softc *,
375 int *);
376 static bool hn_synth_attachable(const struct hn_softc *);
377 static void hn_suspend(struct hn_softc *);
378 static void hn_suspend_data(struct hn_softc *);
379 static void hn_suspend_mgmt(struct hn_softc *);
380 static void hn_resume(struct hn_softc *);
381 static void hn_resume_data(struct hn_softc *);
382 static void hn_resume_mgmt(struct hn_softc *);
383 static void hn_suspend_mgmt_taskfunc(void *, int);
384 static void hn_chan_drain(struct hn_softc *,
385 struct vmbus_channel *);
386 static void hn_disable_rx(struct hn_softc *);
387 static void hn_drain_rxtx(struct hn_softc *, int);
388 static void hn_polling(struct hn_softc *, u_int);
389 static void hn_chan_polling(struct vmbus_channel *, u_int);
390 static void hn_mtu_change_fixup(struct hn_softc *);
391
392 static void hn_update_link_status(struct hn_softc *);
393 static void hn_change_network(struct hn_softc *);
394 static void hn_link_taskfunc(void *, int);
395 static void hn_netchg_init_taskfunc(void *, int);
396 static void hn_netchg_status_taskfunc(void *, int);
397 static void hn_link_status(struct hn_softc *);
398
399 static int hn_create_rx_data(struct hn_softc *, int);
400 static void hn_destroy_rx_data(struct hn_softc *);
401 static int hn_check_iplen(const struct mbuf *, int);
402 static void hn_rxpkt_proto(const struct mbuf *, int *, int *);
403 static int hn_set_rxfilter(struct hn_softc *, uint32_t);
404 static int hn_rxfilter_config(struct hn_softc *);
405 static int hn_rss_reconfig(struct hn_softc *);
406 static void hn_rss_ind_fixup(struct hn_softc *);
407 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
408 static int hn_rxpkt(struct hn_rx_ring *);
409 static uint32_t hn_rss_type_fromndis(uint32_t);
410 static uint32_t hn_rss_type_tondis(uint32_t);
411
412 static int hn_tx_ring_create(struct hn_softc *, int);
413 static void hn_tx_ring_destroy(struct hn_tx_ring *);
414 static int hn_create_tx_data(struct hn_softc *, int);
415 static void hn_fixup_tx_data(struct hn_softc *);
416 static void hn_fixup_rx_data(struct hn_softc *);
417 static void hn_destroy_tx_data(struct hn_softc *);
418 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
419 static void hn_txdesc_gc(struct hn_tx_ring *,
420 struct hn_txdesc *);
421 static int hn_encap(if_t, struct hn_tx_ring *,
422 struct hn_txdesc *, struct mbuf **);
423 static int hn_txpkt(if_t, struct hn_tx_ring *,
424 struct hn_txdesc *);
425 static void hn_set_chim_size(struct hn_softc *, int);
426 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
427 static bool hn_tx_ring_pending(struct hn_tx_ring *);
428 static void hn_tx_ring_qflush(struct hn_tx_ring *);
429 static void hn_resume_tx(struct hn_softc *, int);
430 static void hn_set_txagg(struct hn_softc *);
431 static void *hn_try_txagg(if_t,
432 struct hn_tx_ring *, struct hn_txdesc *,
433 int);
434 static int hn_get_txswq_depth(const struct hn_tx_ring *);
435 static void hn_txpkt_done(struct hn_nvs_sendctx *,
436 struct hn_softc *, struct vmbus_channel *,
437 const void *, int);
438 static int hn_txpkt_sglist(struct hn_tx_ring *,
439 struct hn_txdesc *);
440 static int hn_txpkt_chim(struct hn_tx_ring *,
441 struct hn_txdesc *);
442 static int hn_xmit(struct hn_tx_ring *, int);
443 static void hn_xmit_taskfunc(void *, int);
444 static void hn_xmit_txeof(struct hn_tx_ring *);
445 static void hn_xmit_txeof_taskfunc(void *, int);
446 #ifdef HN_IFSTART_SUPPORT
447 static int hn_start_locked(struct hn_tx_ring *, int);
448 static void hn_start_taskfunc(void *, int);
449 static void hn_start_txeof(struct hn_tx_ring *);
450 static void hn_start_txeof_taskfunc(void *, int);
451 #endif
452
453 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
454
455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
456 "Hyper-V network interface");
457
458 /* Trust tcp segment verification on host side. */
459 static int hn_trust_hosttcp = 1;
460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
461 &hn_trust_hosttcp, 0,
462 "Trust tcp segment verification on host side, "
463 "when csum info is missing (global setting)");
464
465 /* Trust udp datagram verification on host side. */
466 static int hn_trust_hostudp = 1;
467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
468 &hn_trust_hostudp, 0,
469 "Trust udp datagram verification on host side, "
470 "when csum info is missing (global setting)");
471
472 /* Trust ip packet verification on host side. */
473 static int hn_trust_hostip = 1;
474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
475 &hn_trust_hostip, 0,
476 "Trust ip packet verification on host side, "
477 "when csum info is missing (global setting)");
478
479 /*
480 * Offload UDP/IPv4 checksum.
481 */
482 static int hn_enable_udp4cs = 1;
483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
484 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
485
486 /*
487 * Offload UDP/IPv6 checksum.
488 */
489 static int hn_enable_udp6cs = 1;
490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
491 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
492
493 /* Stats. */
494 static counter_u64_t hn_udpcs_fixup;
495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
496 &hn_udpcs_fixup, "# of UDP checksum fixup");
497
498 /*
499 * See hn_set_hlen().
500 *
501 * This value is for Azure. For Hyper-V, set this above
502 * 65536 to disable UDP datagram checksum fixup.
503 */
504 static int hn_udpcs_fixup_mtu = 1420;
505 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
506 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
507
508 /* Limit TSO burst size */
509 static int hn_tso_maxlen = IP_MAXPACKET;
510 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
511 &hn_tso_maxlen, 0, "TSO burst limit");
512
513 /* Limit chimney send size */
514 static int hn_tx_chimney_size = 0;
515 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
516 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
517
518 /* Limit the size of packet for direct transmission */
519 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
520 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
521 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
522
523 /* # of LRO entries per RX ring */
524 #if defined(INET) || defined(INET6)
525 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
526 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
527 &hn_lro_entry_count, 0, "LRO entry count");
528 #endif
529
530 static int hn_tx_taskq_cnt = 1;
531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
532 &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
533
534 #define HN_TX_TASKQ_M_INDEP 0
535 #define HN_TX_TASKQ_M_GLOBAL 1
536 #define HN_TX_TASKQ_M_EVTTQ 2
537
538 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
539 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
540 &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
541 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
542
543 #ifndef HN_USE_TXDESC_BUFRING
544 static int hn_use_txdesc_bufring = 0;
545 #else
546 static int hn_use_txdesc_bufring = 1;
547 #endif
548 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
549 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
550
551 #ifdef HN_IFSTART_SUPPORT
552 /* Use ifnet.if_start instead of ifnet.if_transmit */
553 static int hn_use_if_start = 0;
554 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
555 &hn_use_if_start, 0, "Use if_start TX method");
556 #endif
557
558 /* # of channels to use */
559 static int hn_chan_cnt = 0;
560 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
561 &hn_chan_cnt, 0,
562 "# of channels to use; each channel has one RX ring and one TX ring");
563
564 /* # of transmit rings to use */
565 static int hn_tx_ring_cnt = 0;
566 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
567 &hn_tx_ring_cnt, 0, "# of TX rings to use");
568
569 /* Software TX ring depth */
570 static int hn_tx_swq_depth = 0;
571 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
572 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
573
574 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
575 static u_int hn_lro_mbufq_depth = 0;
576 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
577 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
578
579 /* Packet transmission aggregation size limit */
580 static int hn_tx_agg_size = -1;
581 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
582 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
583
584 /* Packet transmission aggregation count limit */
585 static int hn_tx_agg_pkts = -1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
587 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
588
589 /* VF list */
590 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
591 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
592 hn_vflist_sysctl, "A",
593 "VF list");
594
595 /* VF mapping */
596 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
597 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
598 hn_vfmap_sysctl, "A",
599 "VF mapping");
600
601 /* Transparent VF */
602 static int hn_xpnt_vf = 1;
603 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
604     &hn_xpnt_vf, 0, "Transparent VF mode");
605
606 /* Accurate BPF support for Transparent VF */
607 static int hn_xpnt_vf_accbpf = 0;
608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
609 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
610
611 /* Extra wait for transparent VF attach routine; unit: seconds. */
612 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
614 &hn_xpnt_vf_attwait, 0,
615     "Extra wait for transparent VF attach routine; unit: seconds");
616
617 static u_int hn_cpu_index; /* next CPU for channel */
618 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
619
620 static struct rmlock hn_vfmap_lock;
621 static int hn_vfmap_size;
622 static if_t *hn_vfmap;
623
624 #ifndef RSS
625 static const uint8_t
626 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
627 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
628 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
629 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
630 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
631 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
632 };
633 #endif /* !RSS */
634
635 static const struct hyperv_guid hn_guid = {
636 .hv_guid = {
637 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
638 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
639 };
640
641 static device_method_t hn_methods[] = {
642 /* Device interface */
643 DEVMETHOD(device_probe, hn_probe),
644 DEVMETHOD(device_attach, hn_attach),
645 DEVMETHOD(device_detach, hn_detach),
646 DEVMETHOD(device_shutdown, hn_shutdown),
647 DEVMETHOD_END
648 };
649
650 static driver_t hn_driver = {
651 "hn",
652 hn_methods,
653 sizeof(struct hn_softc)
654 };
655
656 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
657 MODULE_VERSION(hn, 1);
658 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
659
660 static void
661 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
662 {
663 int i;
664
665 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
666 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
667 }
668
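/*
 * Send an RNDIS data packet by passing the host a gather list of
 * guest physical address ranges (txr->hn_gpa); no chimney send
 * buffer is involved on this path.
 */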
669 static int
670 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
671 {
672
673 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
674 txd->chim_size == 0, ("invalid rndis sglist txd"));
675 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
676 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
677 }
678
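/*
 * Send an RNDIS data packet that has already been copied into a
 * chimney send buffer; only the chimney index and size are passed
 * to the host.
 */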
679 static int
680 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 struct hn_nvs_rndis rndis;
683
684 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
685 txd->chim_size > 0, ("invalid rndis chim txd"));
686
687 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
688 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
689 rndis.nvs_chim_idx = txd->chim_index;
690 rndis.nvs_chim_sz = txd->chim_size;
691
692 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
693 &rndis, sizeof(rndis), &txd->send_ctx));
694 }
695
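/*
 * Allocate a chimney send buffer slot without holding any lock: scan
 * the bitmap for a clear bit and claim it with an atomic test-and-set.
 * Returns HN_NVS_CHIM_IDX_INVALID if no slot is free.
 */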
696 static __inline uint32_t
697 hn_chim_alloc(struct hn_softc *sc)
698 {
699 int i, bmap_cnt = sc->hn_chim_bmap_cnt;
700 u_long *bmap = sc->hn_chim_bmap;
701 uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
702
703 for (i = 0; i < bmap_cnt; ++i) {
704 int idx;
705
706 idx = ffsl(~bmap[i]);
707 if (idx == 0)
708 continue;
709
710 --idx; /* ffsl is 1-based */
711 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
712 ("invalid i %d and idx %d", i, idx));
713
714 if (atomic_testandset_long(&bmap[i], idx))
715 continue;
716
717 ret = i * LONG_BIT + idx;
718 break;
719 }
720 return (ret);
721 }
722
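/*
 * Return a chimney send buffer slot obtained from hn_chim_alloc() by
 * atomically clearing its bit in the bitmap.
 */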
723 static __inline void
724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
725 {
726 u_long mask;
727 uint32_t idx;
728
729 idx = chim_idx / LONG_BIT;
730 KASSERT(idx < sc->hn_chim_bmap_cnt,
731 ("invalid chimney index 0x%x", chim_idx));
732
733 mask = 1UL << (chim_idx % LONG_BIT);
734 KASSERT(sc->hn_chim_bmap[idx] & mask,
735 ("index bitmap 0x%lx, chimney index %u, "
736 "bitmap idx %d, bitmask 0x%lx",
737 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
738
739 atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
740 }
741
742 #if defined(INET6) || defined(INET)
743
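/*
 * Ensure that at least 'len' bytes are contiguous in the leading mbuf.
 * On m_pullup() failure the chain has already been freed, so the
 * enclosing function just returns NULL.
 */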
744 #define PULLUP_HDR(m, len) \
745 do { \
746 if (__predict_false((m)->m_len < (len))) { \
747 (m) = m_pullup((m), (len)); \
748 if ((m) == NULL) \
749 return (NULL); \
750 } \
751 } while (0)
752
753 /*
754  * NOTE: If this function fails, the m_head will be freed.
755 */
756 static __inline struct mbuf *
757 hn_tso_fixup(struct mbuf *m_head)
758 {
759 struct ether_vlan_header *evl;
760 struct tcphdr *th;
761 int ehlen;
762
763 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
764
765 PULLUP_HDR(m_head, sizeof(*evl));
766 evl = mtod(m_head, struct ether_vlan_header *);
767 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
768 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
769 else
770 ehlen = ETHER_HDR_LEN;
771 m_head->m_pkthdr.l2hlen = ehlen;
772
773 #ifdef INET
774 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
775 struct ip *ip;
776 int iphlen;
777
778 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
779 ip = mtodo(m_head, ehlen);
780 iphlen = ip->ip_hl << 2;
781 m_head->m_pkthdr.l3hlen = iphlen;
782
783 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
784 th = mtodo(m_head, ehlen + iphlen);
785
786 ip->ip_len = 0;
787 ip->ip_sum = 0;
788 th->th_sum = in_pseudo(ip->ip_src.s_addr,
789 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
790 }
791 #endif
792 #if defined(INET6) && defined(INET)
793 else
794 #endif
795 #ifdef INET6
796 {
797 struct ip6_hdr *ip6;
798
799 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
800 ip6 = mtodo(m_head, ehlen);
801 if (ip6->ip6_nxt != IPPROTO_TCP) {
802 m_freem(m_head);
803 return (NULL);
804 }
805 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
806
807 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
808 th = mtodo(m_head, ehlen + sizeof(*ip6));
809
810 ip6->ip6_plen = 0;
811 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
812 }
813 #endif
814 return (m_head);
815 }
816
817 /*
818  * NOTE: If this function fails, the m_head will be freed.
819 */
820 static __inline struct mbuf *
821 hn_set_hlen(struct mbuf *m_head)
822 {
823 const struct ether_vlan_header *evl;
824 int ehlen;
825
826 PULLUP_HDR(m_head, sizeof(*evl));
827 evl = mtod(m_head, const struct ether_vlan_header *);
828 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
829 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
830 else
831 ehlen = ETHER_HDR_LEN;
832 m_head->m_pkthdr.l2hlen = ehlen;
833
834 #ifdef INET
835 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
836 const struct ip *ip;
837 int iphlen;
838
839 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
840 ip = mtodo(m_head, ehlen);
841 iphlen = ip->ip_hl << 2;
842 m_head->m_pkthdr.l3hlen = iphlen;
843
844 /*
845 * UDP checksum offload does not work in Azure, if the
846 	 * following conditions are met:
847 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
848 * - IP_DF is not set in the IP hdr.
849 *
850 * Fallback to software checksum for these UDP datagrams.
851 */
852 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
853 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
854 (ntohs(ip->ip_off) & IP_DF) == 0) {
855 uint16_t off = ehlen + iphlen;
856
857 counter_u64_add(hn_udpcs_fixup, 1);
858 PULLUP_HDR(m_head, off + sizeof(struct udphdr));
859 *(uint16_t *)(m_head->m_data + off +
860 m_head->m_pkthdr.csum_data) = in_cksum_skip(
861 m_head, m_head->m_pkthdr.len, off);
862 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
863 }
864 }
865 #endif
866 #if defined(INET6) && defined(INET)
867 else
868 #endif
869 #ifdef INET6
870 {
871 const struct ip6_hdr *ip6;
872
873 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
874 ip6 = mtodo(m_head, ehlen);
875 if (ip6->ip6_nxt != IPPROTO_TCP &&
876 ip6->ip6_nxt != IPPROTO_UDP) {
877 m_freem(m_head);
878 return (NULL);
879 }
880 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
881 }
882 #endif
883 return (m_head);
884 }
885
886 /*
887  * NOTE: If this function fails, the m_head will be freed.
888 */
889 static __inline struct mbuf *
890 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
891 {
892 const struct tcphdr *th;
893 int ehlen, iphlen;
894
895 *tcpsyn = 0;
896 ehlen = m_head->m_pkthdr.l2hlen;
897 iphlen = m_head->m_pkthdr.l3hlen;
898
899 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
900 th = mtodo(m_head, ehlen + iphlen);
901 if (tcp_get_flags(th) & TH_SYN)
902 *tcpsyn = 1;
903 return (m_head);
904 }
905
906 #undef PULLUP_HDR
907
908 #endif /* INET6 || INET */
909
910 static int
911 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
912 {
913 int error = 0;
914
915 HN_LOCK_ASSERT(sc);
916
917 if (sc->hn_rx_filter != filter) {
918 error = hn_rndis_set_rxfilter(sc, filter);
919 if (!error)
920 sc->hn_rx_filter = filter;
921 }
922 return (error);
923 }
924
925 static int
926 hn_rxfilter_config(struct hn_softc *sc)
927 {
928 if_t ifp = sc->hn_ifp;
929 uint32_t filter;
930
931 HN_LOCK_ASSERT(sc);
932
933 /*
934 * If the non-transparent mode VF is activated, we don't know how
935 * its RX filter is configured, so stick the synthetic device in
936 	 * promiscuous mode.
937 */
938 if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
939 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
940 } else {
941 filter = NDIS_PACKET_TYPE_DIRECTED;
942 if (if_getflags(ifp) & IFF_BROADCAST)
943 filter |= NDIS_PACKET_TYPE_BROADCAST;
944 /* TODO: support multicast list */
945 if ((if_getflags(ifp) & IFF_ALLMULTI) ||
946 !if_maddr_empty(ifp))
947 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
948 }
949 return (hn_set_rxfilter(sc, filter));
950 }
951
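/*
 * Compute the transmit aggregation limits from the tunables
 * (hn_agg_size/hn_agg_pkts) and the limits advertised by RNDIS, then
 * publish them to all TX rings.  Aggregation is disabled if the
 * resulting size or packet count is too small to be useful, or if the
 * required alignment does not fit the per-ring field.
 */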
952 static void
953 hn_set_txagg(struct hn_softc *sc)
954 {
955 uint32_t size, pkts;
956 int i;
957
958 /*
959 * Setup aggregation size.
960 */
961 if (sc->hn_agg_size < 0)
962 size = UINT32_MAX;
963 else
964 size = sc->hn_agg_size;
965
966 if (sc->hn_rndis_agg_size < size)
967 size = sc->hn_rndis_agg_size;
968
969 /* NOTE: We only aggregate packets using chimney sending buffers. */
970 if (size > (uint32_t)sc->hn_chim_szmax)
971 size = sc->hn_chim_szmax;
972
973 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
974 /* Disable */
975 size = 0;
976 pkts = 0;
977 goto done;
978 }
979
980 /* NOTE: Type of the per TX ring setting is 'int'. */
981 if (size > INT_MAX)
982 size = INT_MAX;
983
984 /*
985 * Setup aggregation packet count.
986 */
987 if (sc->hn_agg_pkts < 0)
988 pkts = UINT32_MAX;
989 else
990 pkts = sc->hn_agg_pkts;
991
992 if (sc->hn_rndis_agg_pkts < pkts)
993 pkts = sc->hn_rndis_agg_pkts;
994
995 if (pkts <= 1) {
996 /* Disable */
997 size = 0;
998 pkts = 0;
999 goto done;
1000 }
1001
1002 /* NOTE: Type of the per TX ring setting is 'short'. */
1003 if (pkts > SHRT_MAX)
1004 pkts = SHRT_MAX;
1005
1006 done:
1007 /* NOTE: Type of the per TX ring setting is 'short'. */
1008 if (sc->hn_rndis_agg_align > SHRT_MAX) {
1009 /* Disable */
1010 size = 0;
1011 pkts = 0;
1012 }
1013
1014 if (bootverbose) {
1015 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1016 size, pkts, sc->hn_rndis_agg_align);
1017 }
1018
1019 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1020 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1021
1022 mtx_lock(&txr->hn_tx_lock);
1023 txr->hn_agg_szmax = size;
1024 txr->hn_agg_pktmax = pkts;
1025 txr->hn_agg_align = sc->hn_rndis_agg_align;
1026 mtx_unlock(&txr->hn_tx_lock);
1027 }
1028 }
1029
1030 static int
1031 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1032 {
1033
1034 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1035 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1036 return txr->hn_txdesc_cnt;
1037 return hn_tx_swq_depth;
1038 }
1039
1040 static int
1041 hn_rss_reconfig(struct hn_softc *sc)
1042 {
1043 int error;
1044
1045 HN_LOCK_ASSERT(sc);
1046
1047 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1048 return (ENXIO);
1049
1050 /*
1051 * Disable RSS first.
1052 *
1053 * NOTE:
1054 * Direct reconfiguration by setting the UNCHG flags does
1055 * _not_ work properly.
1056 */
1057 if (bootverbose)
1058 if_printf(sc->hn_ifp, "disable RSS\n");
1059 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1060 if (error) {
1061 if_printf(sc->hn_ifp, "RSS disable failed\n");
1062 return (error);
1063 }
1064
1065 /*
1066 * Reenable the RSS w/ the updated RSS key or indirect
1067 * table.
1068 */
1069 if (bootverbose)
1070 if_printf(sc->hn_ifp, "reconfig RSS\n");
1071 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1072 if (error) {
1073 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1074 return (error);
1075 }
1076 return (0);
1077 }
1078
1079 static void
1080 hn_rss_ind_fixup(struct hn_softc *sc)
1081 {
1082 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1083 int i, nchan;
1084
1085 nchan = sc->hn_rx_ring_inuse;
1086 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1087
1088 /*
1089 * Check indirect table to make sure that all channels in it
1090 * can be used.
1091 */
1092 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1093 if (rss->rss_ind[i] >= nchan) {
1094 if_printf(sc->hn_ifp,
1095 "RSS indirect table %d fixup: %u -> %d\n",
1096 i, rss->rss_ind[i], nchan - 1);
1097 rss->rss_ind[i] = nchan - 1;
1098 }
1099 }
1100 }
1101
1102 static int
1103 hn_ifmedia_upd(if_t ifp __unused)
1104 {
1105
1106 /* Ignore since autoselect is the only defined and valid media */
1107 return (0);
1108 }
1109
1110 static void
1111 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1112 {
1113 struct hn_softc *sc = if_getsoftc(ifp);
1114
1115 ifmr->ifm_status = IFM_AVALID;
1116 ifmr->ifm_active = IFM_ETHER;
1117
1118 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1119 ifmr->ifm_active |= IFM_NONE;
1120 return;
1121 }
1122 ifmr->ifm_status |= IFM_ACTIVE;
1123 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1124 }
1125
1126 static void
1127 hn_rxvf_set_task(void *xarg, int pending __unused)
1128 {
1129 struct hn_rxvf_setarg *arg = xarg;
1130
1131 arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1132 }
1133
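/*
 * Update the RX VF ifnet pointer of every RX ring.  For rings that are
 * currently in use the update runs as a task on the ring's channel, so
 * it is serialized with that channel's RX processing; idle rings are
 * updated directly.
 */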
1134 static void
1135 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1136 {
1137 struct hn_rx_ring *rxr;
1138 struct hn_rxvf_setarg arg;
1139 struct task task;
1140 int i;
1141
1142 HN_LOCK_ASSERT(sc);
1143
1144 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1145
1146 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1147 rxr = &sc->hn_rx_ring[i];
1148
1149 if (i < sc->hn_rx_ring_inuse) {
1150 arg.rxr = rxr;
1151 arg.vf_ifp = vf_ifp;
1152 vmbus_chan_run_task(rxr->hn_chan, &task);
1153 } else {
1154 rxr->hn_rxvf_ifp = vf_ifp;
1155 }
1156 }
1157 }
1158
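/*
 * An ifnet is treated as this device's VF when it is an Ethernet
 * interface (lagg/vlan pseudo interfaces excluded) whose link-level
 * address matches the synthetic interface's.
 */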
1159 static bool
1160 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1161 {
1162 if_t hn_ifp;
1163
1164 hn_ifp = sc->hn_ifp;
1165
1166 if (ifp == hn_ifp)
1167 return (false);
1168
1169 if (if_getalloctype(ifp) != IFT_ETHER)
1170 return (false);
1171
1172 /* Ignore lagg/vlan interfaces */
1173 if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1174 strcmp(if_getdname(ifp), "vlan") == 0)
1175 return (false);
1176
1177 /*
1178 * During detach events if_getifaddr(ifp) might be NULL.
1179 * Make sure the bcmp() below doesn't panic on that:
1180 */
1181 if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1182 return (false);
1183
1184 if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1185 return (false);
1186
1187 return (true);
1188 }
1189
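/*
 * Handle (de)activation of a non-transparent mode RX VF: adjust the
 * RX filter, point the RX rings at the VF (or back at NULL), switch
 * the NVS datapath, and suspend or resume link status management
 * accordingly.
 */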
1190 static void
1191 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1192 {
1193 if_t hn_ifp;
1194
1195 HN_LOCK(sc);
1196
1197 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1198 goto out;
1199
1200 if (!hn_ismyvf(sc, ifp))
1201 goto out;
1202 hn_ifp = sc->hn_ifp;
1203
1204 if (rxvf) {
1205 if (sc->hn_flags & HN_FLAG_RXVF)
1206 goto out;
1207
1208 sc->hn_flags |= HN_FLAG_RXVF;
1209 hn_rxfilter_config(sc);
1210 } else {
1211 if (!(sc->hn_flags & HN_FLAG_RXVF))
1212 goto out;
1213
1214 sc->hn_flags &= ~HN_FLAG_RXVF;
1215 if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1216 hn_rxfilter_config(sc);
1217 else
1218 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1219 }
1220
1221 hn_nvs_set_datapath(sc,
1222 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1223
1224 hn_rxvf_set(sc, rxvf ? ifp : NULL);
1225
1226 if (rxvf) {
1227 hn_vf_rss_fixup(sc, true);
1228 hn_suspend_mgmt(sc);
1229 sc->hn_link_flags &=
1230 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1231 if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1232 } else {
1233 hn_vf_rss_restore(sc);
1234 hn_resume_mgmt(sc);
1235 }
1236
1237 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1238 rxvf ? "VF_UP" : "VF_DOWN", NULL);
1239
1240 if (bootverbose) {
1241 if_printf(hn_ifp, "datapath is switched %s %s\n",
1242 rxvf ? "to" : "from", if_name(ifp));
1243 }
1244 out:
1245 HN_UNLOCK(sc);
1246 }
1247
1248 static void
1249 hn_ifnet_event(void *arg, if_t ifp, int event)
1250 {
1251
1252 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1253 return;
1254 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1255 }
1256
1257 static void
1258 hn_ifaddr_event(void *arg, if_t ifp)
1259 {
1260
1261 hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1262 }
1263
1264 static int
1265 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused)
1266 {
1267 if_t ifp, vf_ifp;
1268
1269 HN_LOCK_ASSERT(sc);
1270 ifp = sc->hn_ifp;
1271 vf_ifp = sc->hn_vf_ifp;
1272
1273 /*
1274 * Just sync up with VF's enabled capabilities.
1275 */
1276 if_setcapenable(ifp, if_getcapenable(vf_ifp));
1277 if_sethwassist(ifp, if_gethwassist(vf_ifp));
1278
1279 return (0);
1280 }
1281
1282 static int
1283 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1284 {
1285 if_t vf_ifp;
1286 struct ifreq ifr;
1287
1288 HN_LOCK_ASSERT(sc);
1289 vf_ifp = sc->hn_vf_ifp;
1290
1291 memset(&ifr, 0, sizeof(ifr));
1292 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1293 ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1294 ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1295 return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1296 }
1297
1298 static void
1299 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1300 {
1301 if_t ifp = sc->hn_ifp;
1302 int allmulti = 0;
1303
1304 HN_LOCK_ASSERT(sc);
1305
1306 /* XXX vlan(4) style mcast addr maintenance */
1307 if (!if_maddr_empty(ifp))
1308 allmulti = IFF_ALLMULTI;
1309
1310 /* Always set the VF's if_flags */
1311 if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1312 }
1313
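/*
 * if_input handler installed on a transparent mode VF: received
 * packets are tapped on the VF for BPF, re-tagged with hn(4)'s ifnet
 * and then fed into hn(4)'s input path.  If no hn(4) mapping exists
 * (e.g. during a transition) the chain is simply freed.
 */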
1314 static void
1315 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1316 {
1317 struct rm_priotracker pt;
1318 if_t hn_ifp = NULL;
1319 struct mbuf *mn;
1320
1321 /*
1322 * XXX racy, if hn(4) ever detached.
1323 */
1324 rm_rlock(&hn_vfmap_lock, &pt);
1325 if (if_getindex(vf_ifp) < hn_vfmap_size)
1326 hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1327 rm_runlock(&hn_vfmap_lock, &pt);
1328
1329 if (hn_ifp != NULL) {
1330 for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1331 /*
1332 * Allow tapping on the VF.
1333 */
1334 ETHER_BPF_MTAP(vf_ifp, mn);
1335
1336 /*
1337 * Update VF stats.
1338 */
1339 if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1340 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1341 mn->m_pkthdr.len);
1342 }
1343 /*
1344 * XXX IFCOUNTER_IMCAST
1345 * This stat updating is kinda invasive, since it
1346 * requires two checks on the mbuf: the length check
1347 			 * and the ethernet header check. As of this writing,
1348 			 * all multicast packets go directly to hn(4), which
1349 			 * makes imcast stat updating in the VF a try in vain.
1350 */
1351
1352 /*
1353 * Fix up rcvif and increase hn(4)'s ipackets.
1354 */
1355 mn->m_pkthdr.rcvif = hn_ifp;
1356 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1357 }
1358 /*
1359 * Go through hn(4)'s if_input.
1360 */
1361 if_input(hn_ifp, m);
1362 } else {
1363 /*
1364 * In the middle of the transition; free this
1365 * mbuf chain.
1366 */
1367 while (m != NULL) {
1368 mn = m->m_nextpkt;
1369 m->m_nextpkt = NULL;
1370 m_freem(m);
1371 m = mn;
1372 }
1373 }
1374 }
1375
1376 static void
1377 hn_mtu_change_fixup(struct hn_softc *sc)
1378 {
1379 if_t ifp;
1380
1381 HN_LOCK_ASSERT(sc);
1382 ifp = sc->hn_ifp;
1383
1384 hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1385 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1386 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1387 }
1388
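/*
 * Translate NDIS_HASH_* hash type bits into the kernel's RSS_TYPE_*
 * bits; hn_rss_type_tondis() below performs the reverse mapping.
 */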
1389 static uint32_t
1390 hn_rss_type_fromndis(uint32_t rss_hash)
1391 {
1392 uint32_t types = 0;
1393
1394 if (rss_hash & NDIS_HASH_IPV4)
1395 types |= RSS_TYPE_IPV4;
1396 if (rss_hash & NDIS_HASH_TCP_IPV4)
1397 types |= RSS_TYPE_TCP_IPV4;
1398 if (rss_hash & NDIS_HASH_IPV6)
1399 types |= RSS_TYPE_IPV6;
1400 if (rss_hash & NDIS_HASH_IPV6_EX)
1401 types |= RSS_TYPE_IPV6_EX;
1402 if (rss_hash & NDIS_HASH_TCP_IPV6)
1403 types |= RSS_TYPE_TCP_IPV6;
1404 if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1405 types |= RSS_TYPE_TCP_IPV6_EX;
1406 if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1407 types |= RSS_TYPE_UDP_IPV4;
1408 return (types);
1409 }
1410
1411 static uint32_t
1412 hn_rss_type_tondis(uint32_t types)
1413 {
1414 uint32_t rss_hash = 0;
1415
1416 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1417 ("UDP6 and UDP6EX are not supported"));
1418
1419 if (types & RSS_TYPE_IPV4)
1420 rss_hash |= NDIS_HASH_IPV4;
1421 if (types & RSS_TYPE_TCP_IPV4)
1422 rss_hash |= NDIS_HASH_TCP_IPV4;
1423 if (types & RSS_TYPE_IPV6)
1424 rss_hash |= NDIS_HASH_IPV6;
1425 if (types & RSS_TYPE_IPV6_EX)
1426 rss_hash |= NDIS_HASH_IPV6_EX;
1427 if (types & RSS_TYPE_TCP_IPV6)
1428 rss_hash |= NDIS_HASH_TCP_IPV6;
1429 if (types & RSS_TYPE_TCP_IPV6_EX)
1430 rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1431 if (types & RSS_TYPE_UDP_IPV4)
1432 rss_hash |= NDIS_HASH_UDP_IPV4_X;
1433 return (rss_hash);
1434 }
1435
1436 static void
1437 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1438 {
1439 int i;
1440
1441 HN_LOCK_ASSERT(sc);
1442
1443 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1444 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1445 }
1446
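/*
 * Align the synthetic RSS configuration with the VF's: adopt the VF's
 * Toeplitz key, restrict the hash types to those supported by both
 * sides, and suppress mbuf hash delivery for types whose hash values
 * could differ between the two paths.
 */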
1447 static void
1448 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1449 {
1450 if_t ifp, vf_ifp;
1451 struct ifrsshash ifrh;
1452 struct ifrsskey ifrk;
1453 int error;
1454 uint32_t my_types, diff_types, mbuf_types = 0;
1455
1456 HN_LOCK_ASSERT(sc);
1457 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1458 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1459
1460 if (sc->hn_rx_ring_inuse == 1) {
1461 /* No RSS on synthetic parts; done. */
1462 return;
1463 }
1464 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1465 /* Synthetic parts do not support Toeplitz; done. */
1466 return;
1467 }
1468
1469 ifp = sc->hn_ifp;
1470 vf_ifp = sc->hn_vf_ifp;
1471
1472 /*
1473 	 * Extract VF's RSS key. Only a 40-byte Toeplitz key is
1474 	 * supported.
1475 */
1476 memset(&ifrk, 0, sizeof(ifrk));
1477 strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1478 error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1479 if (error) {
1480 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1481 if_name(vf_ifp), error);
1482 goto done;
1483 }
1484 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1485 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1486 if_name(vf_ifp), ifrk.ifrk_func);
1487 goto done;
1488 }
1489 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1490 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1491 if_name(vf_ifp), ifrk.ifrk_keylen);
1492 goto done;
1493 }
1494
1495 /*
1496 * Extract VF's RSS hash. Only Toeplitz is supported.
1497 */
1498 memset(&ifrh, 0, sizeof(ifrh));
1499 strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1500 error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1501 if (error) {
1502 		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
1503 if_name(vf_ifp), error);
1504 goto done;
1505 }
1506 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1507 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1508 if_name(vf_ifp), ifrh.ifrh_func);
1509 goto done;
1510 }
1511
1512 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1513 if ((ifrh.ifrh_types & my_types) == 0) {
1514 /* This disables RSS; ignore it then */
1515 if_printf(ifp, "%s intersection of RSS types failed. "
1516 "VF %#x, mine %#x\n", if_name(vf_ifp),
1517 ifrh.ifrh_types, my_types);
1518 goto done;
1519 }
1520
1521 diff_types = my_types ^ ifrh.ifrh_types;
1522 my_types &= ifrh.ifrh_types;
1523 mbuf_types = my_types;
1524
1525 /*
1526 	 * Detect RSS hash value/type conflicts.
1527 	 *
1528 	 * NOTE:
1529 	 * We don't disable the hash type, but stop delivering the hash
1530 	 * value/type through mbufs on the RX path.
1531 *
1532 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1533 * hash is delivered with type of TCP_IPV4. This means if
1534 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1535 * least to hn_mbuf_hash. However, given that _all_ of the
1536 * NICs implement TCP_IPV4, this will _not_ impose any issues
1537 * here.
1538 */
1539 if ((my_types & RSS_TYPE_IPV4) &&
1540 (diff_types & ifrh.ifrh_types &
1541 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1542 /* Conflict; disable IPV4 hash type/value delivery. */
1543 if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1544 mbuf_types &= ~RSS_TYPE_IPV4;
1545 }
1546 if ((my_types & RSS_TYPE_IPV6) &&
1547 (diff_types & ifrh.ifrh_types &
1548 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1549 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1550 RSS_TYPE_IPV6_EX))) {
1551 /* Conflict; disable IPV6 hash type/value delivery. */
1552 if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1553 mbuf_types &= ~RSS_TYPE_IPV6;
1554 }
1555 if ((my_types & RSS_TYPE_IPV6_EX) &&
1556 (diff_types & ifrh.ifrh_types &
1557 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1558 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1559 RSS_TYPE_IPV6))) {
1560 /* Conflict; disable IPV6_EX hash type/value delivery. */
1561 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1562 mbuf_types &= ~RSS_TYPE_IPV6_EX;
1563 }
1564 if ((my_types & RSS_TYPE_TCP_IPV6) &&
1565 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1566 /* Conflict; disable TCP_IPV6 hash type/value delivery. */
1567 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1568 mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1569 }
1570 if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1571 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1572 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1573 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1574 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1575 }
1576 if ((my_types & RSS_TYPE_UDP_IPV6) &&
1577 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1578 /* Conflict; disable UDP_IPV6 hash type/value delivery. */
1579 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1580 mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1581 }
1582 if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1583 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1584 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1585 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1586 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1587 }
1588
1589 /*
1590 * Indirect table does not matter.
1591 */
1592
1593 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1594 hn_rss_type_tondis(my_types);
1595 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1596 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1597
1598 if (reconf) {
1599 error = hn_rss_reconfig(sc);
1600 if (error) {
1601 /* XXX roll-back? */
1602 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1603 /* XXX keep going. */
1604 }
1605 }
1606 done:
1607 /* Hash deliverability for mbufs. */
1608 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1609 }
1610
1611 static void
1612 hn_vf_rss_restore(struct hn_softc *sc)
1613 {
1614
1615 HN_LOCK_ASSERT(sc);
1616 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1617 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1618
1619 if (sc->hn_rx_ring_inuse == 1)
1620 goto done;
1621
1622 /*
1623 * Restore hash types. Key does _not_ matter.
1624 */
1625 if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1626 int error;
1627
1628 sc->hn_rss_hash = sc->hn_rss_hcap;
1629 error = hn_rss_reconfig(sc);
1630 if (error) {
1631 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1632 error);
1633 /* XXX keep going. */
1634 }
1635 }
1636 done:
1637 /* Hash deliverability for mbufs. */
1638 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1639 }
1640
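/*
 * The transparent mode VF has finished attaching: save hn(4)'s current
 * capabilities and TSO limits for later restoration, clamp them to
 * what the VF supports, and propagate the enabled capabilities and the
 * MTU to the VF.
 */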
1641 static void
1642 hn_xpnt_vf_setready(struct hn_softc *sc)
1643 {
1644 if_t ifp, vf_ifp;
1645 struct ifreq ifr;
1646
1647 HN_LOCK_ASSERT(sc);
1648 ifp = sc->hn_ifp;
1649 vf_ifp = sc->hn_vf_ifp;
1650
1651 /*
1652 * Mark the VF ready.
1653 */
1654 sc->hn_vf_rdytick = 0;
1655
1656 /*
1657 * Save information for restoration.
1658 */
1659 sc->hn_saved_caps = if_getcapabilities(ifp);
1660 sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1661 sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1662 sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1663 sc->hn_saved_capenable = if_getcapenable(ifp);
1664 sc->hn_saved_hwassist = if_gethwassist(ifp);
1665
1666 /*
1667 * Intersect supported/enabled capabilities.
1668 *
1669 * NOTE:
1670 * if_hwassist is not changed here.
1671 */
1672 if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1673 if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1674
1675 /*
1676 * Fix TSO settings.
1677 */
1678 if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1679 if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1680 if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1681 if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1682 if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1683 if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1684
1685 /*
1686 * Change VF's enabled capabilities.
1687 */
1688 memset(&ifr, 0, sizeof(ifr));
1689 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1690 ifr.ifr_reqcap = if_getcapenable(ifp);
1691 hn_xpnt_vf_iocsetcaps(sc, &ifr);
1692
1693 if (if_getmtu(ifp) != ETHERMTU) {
1694 int error;
1695
1696 /*
1697 * Change VF's MTU.
1698 */
1699 memset(&ifr, 0, sizeof(ifr));
1700 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1701 ifr.ifr_mtu = if_getmtu(ifp);
1702 error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1703 if (error) {
1704 if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1705 if_name(vf_ifp), if_getmtu(ifp));
1706 if (if_getmtu(ifp) > ETHERMTU) {
1707 if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1708
1709 /*
1710 * XXX
1711 * No need to adjust the synthetic parts' MTU;
1712 * failure of the adjustment will cause us
1713 * infinite headache.
1714 */
1715 if_setmtu(ifp, ETHERMTU);
1716 hn_mtu_change_fixup(sc);
1717 }
1718 }
1719 }
1720 }
1721
1722 static bool
1723 hn_xpnt_vf_isready(struct hn_softc *sc)
1724 {
1725
1726 HN_LOCK_ASSERT(sc);
1727
1728 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1729 return (false);
1730
1731 if (sc->hn_vf_rdytick == 0)
1732 return (true);
1733
1734 if (sc->hn_vf_rdytick > ticks)
1735 return (false);
1736
1737 /* Mark VF as ready. */
1738 hn_xpnt_vf_setready(sc);
1739 return (true);
1740 }
1741
1742 static void
1743 hn_xpnt_vf_setenable(struct hn_softc *sc)
1744 {
1745 int i;
1746
1747 HN_LOCK_ASSERT(sc);
1748
1749 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1750 rm_wlock(&sc->hn_vf_lock);
1751 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1752 rm_wunlock(&sc->hn_vf_lock);
1753
1754 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1755 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1756 }
1757
1758 static void
1759 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1760 {
1761 int i;
1762
1763 HN_LOCK_ASSERT(sc);
1764
1765 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1766 rm_wlock(&sc->hn_vf_lock);
1767 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1768 if (clear_vf)
1769 sc->hn_vf_ifp = NULL;
1770 rm_wunlock(&sc->hn_vf_lock);
1771
1772 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1773 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1774 }
1775
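/*
 * Bring the transparent mode VF up and switch the NVS datapath to it.
 * The RSS fixup is done after the VF is up, since many VFs only
 * generate their RSS key during initialization.
 */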
1776 static void
1777 hn_xpnt_vf_init(struct hn_softc *sc)
1778 {
1779 int error;
1780
1781 HN_LOCK_ASSERT(sc);
1782
1783 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1784 ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1785
1786 if (bootverbose) {
1787 if_printf(sc->hn_ifp, "try bringing up %s\n",
1788 if_name(sc->hn_vf_ifp));
1789 }
1790
1791 /*
1792 * Bring the VF up.
1793 */
1794 hn_xpnt_vf_saveifflags(sc);
1795 if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1796 error = hn_xpnt_vf_iocsetflags(sc);
1797 if (error) {
1798 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1799 if_name(sc->hn_vf_ifp), error);
1800 return;
1801 }
1802
1803 /*
1804 * NOTE:
1805 * Datapath setting must happen _after_ bringing the VF up.
1806 */
1807 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1808
1809 /*
1810 * NOTE:
1811 * Fixup RSS related bits _after_ the VF is brought up, since
1812 	 * many VFs generate their RSS key during initialization.
1813 */
1814 hn_vf_rss_fixup(sc, true);
1815
1816 /* Mark transparent mode VF as enabled. */
1817 hn_xpnt_vf_setenable(sc);
1818 }
1819
1820 static void
1821 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1822 {
1823 struct hn_softc *sc = xsc;
1824
1825 HN_LOCK(sc);
1826
1827 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1828 goto done;
1829 if (sc->hn_vf_ifp == NULL)
1830 goto done;
1831 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1832 goto done;
1833
1834 if (sc->hn_vf_rdytick != 0) {
1835 /* Mark VF as ready. */
1836 hn_xpnt_vf_setready(sc);
1837 }
1838
1839 if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1840 /*
1841 * Delayed VF initialization.
1842 */
1843 if (bootverbose) {
1844 if_printf(sc->hn_ifp, "delayed initialize %s\n",
1845 if_name(sc->hn_vf_ifp));
1846 }
1847 hn_xpnt_vf_init(sc);
1848 }
1849 done:
1850 HN_UNLOCK(sc);
1851 }
1852
1853 static void
1854 hn_ifnet_attevent(void *xsc, if_t ifp)
1855 {
1856 struct hn_softc *sc = xsc;
1857
1858 HN_LOCK(sc);
1859
1860 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1861 goto done;
1862
1863 if (!hn_ismyvf(sc, ifp))
1864 goto done;
1865
1866 if (sc->hn_vf_ifp != NULL) {
1867 if_printf(sc->hn_ifp, "%s was attached as VF\n",
1868 if_name(sc->hn_vf_ifp));
1869 goto done;
1870 }
1871
1872 if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1873 /*
1874 * ifnet.if_start is _not_ supported by transparent
1875 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1876 */
1877 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1878 "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1879
1880 goto done;
1881 }
1882
1883 rm_wlock(&hn_vfmap_lock);
1884
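	/*
	 * Grow the global ifindex -> hn(4) ifnet map if the new VF's
	 * ifindex does not fit, then record which hn(4) interface this
	 * VF belongs to.
	 */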
1885 if (if_getindex(ifp) >= hn_vfmap_size) {
1886 if_t *newmap;
1887 int newsize;
1888
1889 newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1890 newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1891 M_WAITOK | M_ZERO);
1892
1893 memcpy(newmap, hn_vfmap,
1894 sizeof(if_t) * hn_vfmap_size);
1895 free(hn_vfmap, M_DEVBUF);
1896 hn_vfmap = newmap;
1897 hn_vfmap_size = newsize;
1898 }
1899 KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1900 ("%s: ifindex %d was mapped to %s",
1901 if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1902 hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1903
1904 rm_wunlock(&hn_vfmap_lock);
1905
1906 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1907 rm_wlock(&sc->hn_vf_lock);
1908 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1909 ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1910 sc->hn_vf_ifp = ifp;
1911 rm_wunlock(&sc->hn_vf_lock);
1912
1913 if (hn_xpnt_vf) {
1914 int wait_ticks;
1915
1916 /*
1917 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1918 * Save vf_ifp's current if_input for later restoration.
1919 */
1920 sc->hn_vf_input = if_getinputfn(ifp);
1921 if_setinputfn(ifp, hn_xpnt_vf_input);
1922
1923 /*
1924 * Stop link status management; use the VF's.
1925 */
1926 hn_suspend_mgmt(sc);
1927
1928 /*
1929 		 * Give the VF some time to complete its attach routine.
1930 */
1931 wait_ticks = hn_xpnt_vf_attwait * hz;
1932 sc->hn_vf_rdytick = ticks + wait_ticks;
1933
1934 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1935 wait_ticks);
1936 }
1937 done:
1938 HN_UNLOCK(sc);
1939 }
1940
1941 static void
1942 hn_ifnet_detevent(void *xsc, if_t ifp)
1943 {
1944 struct hn_softc *sc = xsc;
1945
1946 HN_LOCK(sc);
1947
1948 if (sc->hn_vf_ifp == NULL)
1949 goto done;
1950
1951 if (!hn_ismyvf(sc, ifp))
1952 goto done;
1953
1954 if (hn_xpnt_vf) {
1955 /*
1956 * Make sure that the delayed initialization is not running.
1957 *
1958 * NOTE:
1959 * - This lock _must_ be released, since the hn_vf_init task
1960 * will try holding this lock.
1961 * - It is safe to release this lock here, since the
1962 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1963 *
1964 * XXX racy, if hn(4) ever detached.
1965 */
1966 HN_UNLOCK(sc);
1967 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1968 HN_LOCK(sc);
1969
1970 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1971 if_name(sc->hn_ifp)));
1972 if_setinputfn(ifp, sc->hn_vf_input);
1973 sc->hn_vf_input = NULL;
1974
1975 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1976 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1977 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1978
1979 if (sc->hn_vf_rdytick == 0) {
1980 /*
1981 * The VF was ready; restore some settings.
1982 */
1983 if_setcapabilities(ifp, sc->hn_saved_caps);
1984
1985 if_sethwtsomax(ifp, sc->hn_saved_tsomax);
1986 if_sethwtsomaxsegcount(sc->hn_ifp,
1987 sc->hn_saved_tsosegcnt);
1988 if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
1989
1990 if_setcapenable(ifp, sc->hn_saved_capenable);
1991 if_sethwassist(ifp, sc->hn_saved_hwassist);
1992 }
1993
1994 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1995 /*
1996 * Restore RSS settings.
1997 */
1998 hn_vf_rss_restore(sc);
1999
2000 /*
2001 * Resume link status management, which was suspended
2002 * by hn_ifnet_attevent().
2003 */
2004 hn_resume_mgmt(sc);
2005 }
2006 }
2007
2008 /* Mark transparent mode VF as disabled. */
2009 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2010
2011 rm_wlock(&hn_vfmap_lock);
2012
2013 KASSERT(if_getindex(ifp) < hn_vfmap_size,
2014 ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2015 if (hn_vfmap[if_getindex(ifp)] != NULL) {
2016 KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2017 ("%s: ifindex %d was mapped to %s",
2018 if_name(ifp), if_getindex(ifp),
2019 if_name(hn_vfmap[if_getindex(ifp)])));
2020 hn_vfmap[if_getindex(ifp)] = NULL;
2021 }
2022
2023 rm_wunlock(&hn_vfmap_lock);
2024 done:
2025 HN_UNLOCK(sc);
2026 }
2027
2028 static void
2029 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2030 {
2031 struct hn_softc *sc = xsc;
2032
2033 if (sc->hn_vf_ifp == ifp)
2034 if_link_state_change(sc->hn_ifp, link_state);
2035 }
2036
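/*
 * Read-only sysctl handlers reporting the interface's current TSO
 * limits (max TSO size, max segment count, max segment size).
 */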
2037 static int
2038 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2039 {
2040 struct hn_softc *sc = arg1;
2041 unsigned int tsomax;
2042 int error;
2043
2044 tsomax = if_gethwtsomax(sc->hn_ifp);
2045 error = sysctl_handle_int(oidp, &tsomax, 0, req);
2046 return error;
2047 }
2048
2049 static int
2050 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2051 {
2052 struct hn_softc *sc = arg1;
2053 unsigned int tsomaxsegcnt;
2054 int error;
2055
2056 tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2057 error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2058 return error;
2059 }
2060
2061 static int
2062 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2063 {
2064 struct hn_softc *sc = arg1;
2065 unsigned int tsomaxsegsz;
2066 int error;
2067
2068 tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2069 error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2070 return error;
2071 }
2072
2073 static int
2074 hn_probe(device_t dev)
2075 {
2076
2077 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2078 device_set_desc(dev, "Hyper-V Network Interface");
2079 return BUS_PROBE_DEFAULT;
2080 }
2081 return ENXIO;
2082 }
2083
2084 static int
2085 hn_attach(device_t dev)
2086 {
2087 struct hn_softc *sc = device_get_softc(dev);
2088 struct sysctl_oid_list *child;
2089 struct sysctl_ctx_list *ctx;
2090 uint8_t eaddr[ETHER_ADDR_LEN];
2091 if_t ifp = NULL;
2092 int error, ring_cnt, tx_ring_cnt;
2093 uint32_t mtu;
2094
2095 sc->hn_dev = dev;
2096 sc->hn_prichan = vmbus_get_channel(dev);
2097 HN_LOCK_INIT(sc);
2098 rm_init(&sc->hn_vf_lock, "hnvf");
2099 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2100 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2101
2102 /*
2103 * Initialize these tunables once.
2104 */
2105 sc->hn_agg_size = hn_tx_agg_size;
2106 sc->hn_agg_pkts = hn_tx_agg_pkts;
2107
2108 /*
2109 * Setup taskqueue for transmission.
2110 */
2111 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2112 int i;
2113
2114 sc->hn_tx_taskqs =
2115 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2116 M_DEVBUF, M_WAITOK);
2117 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2118 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2119 M_WAITOK, taskqueue_thread_enqueue,
2120 &sc->hn_tx_taskqs[i]);
2121 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2122 "%s tx%d", device_get_nameunit(dev), i);
2123 }
2124 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2125 sc->hn_tx_taskqs = hn_tx_taskque;
2126 }
2127
2128 /*
2129 	 * Setup taskqueue for management tasks, e.g. link status.
2130 */
2131 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2132 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2133 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2134 device_get_nameunit(dev));
2135 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2136 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2137 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2138 hn_netchg_status_taskfunc, sc);
2139
2140 if (hn_xpnt_vf) {
2141 /*
2142 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2143 */
2144 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2145 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2146 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2147 device_get_nameunit(dev));
2148 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2149 hn_xpnt_vf_init_taskfunc, sc);
2150 }
2151
2152 /*
2153 * Allocate ifnet and setup its name earlier, so that if_printf
2154 	 * can be used by functions that will be called after
2155 * ether_ifattach().
2156 */
2157 ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2158 if_setsoftc(ifp, sc);
2159 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2160
2161 /*
2162 * Initialize ifmedia earlier so that it can be unconditionally
2163 	 * destroyed, if an error happens later on.
2164 */
2165 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2166
2167 /*
2168 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2169 * to use (tx_ring_cnt).
2170 *
2171 * NOTE:
2172 * The # of RX rings to use is same as the # of channels to use.
2173 */
2174 ring_cnt = hn_chan_cnt;
2175 if (ring_cnt <= 0) {
2176 /* Default */
2177 ring_cnt = mp_ncpus;
2178 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2179 ring_cnt = HN_RING_CNT_DEF_MAX;
2180 } else if (ring_cnt > mp_ncpus) {
2181 ring_cnt = mp_ncpus;
2182 }
2183 #ifdef RSS
2184 if (ring_cnt > rss_getnumbuckets())
2185 ring_cnt = rss_getnumbuckets();
2186 #endif
2187
2188 tx_ring_cnt = hn_tx_ring_cnt;
2189 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2190 tx_ring_cnt = ring_cnt;
2191 #ifdef HN_IFSTART_SUPPORT
2192 if (hn_use_if_start) {
2193 /* ifnet.if_start only needs one TX ring. */
2194 tx_ring_cnt = 1;
2195 }
2196 #endif
2197
2198 /*
2199 * Set the leader CPU for channels.
2200 */
2201 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2202
2203 /*
2204 	 * Create enough TX/RX rings, even if only a limited number of
2205 * channels can be allocated.
2206 */
2207 error = hn_create_tx_data(sc, tx_ring_cnt);
2208 if (error)
2209 goto failed;
2210 error = hn_create_rx_data(sc, ring_cnt);
2211 if (error)
2212 goto failed;
2213
2214 /*
2215 * Create transaction context for NVS and RNDIS transactions.
2216 */
2217 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2218 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2219 if (sc->hn_xact == NULL) {
2220 error = ENXIO;
2221 goto failed;
2222 }
2223
2224 /*
2225 * Install orphan handler for the revocation of this device's
2226 * primary channel.
2227 *
2228 * NOTE:
2229 * The processing order is critical here:
2230 * Install the orphan handler, _before_ testing whether this
2231 * device's primary channel has been revoked or not.
2232 */
2233 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2234 if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2235 error = ENXIO;
2236 goto failed;
2237 }
2238
2239 /*
2240 * Attach the synthetic parts, i.e. NVS and RNDIS.
2241 */
2242 error = hn_synth_attach(sc, ETHERMTU);
2243 if (error)
2244 goto failed;
2245
2246 error = hn_rndis_get_eaddr(sc, eaddr);
2247 if (error)
2248 goto failed;
2249
2250 error = hn_rndis_get_mtu(sc, &mtu);
2251 if (error)
2252 mtu = ETHERMTU;
2253 else if (bootverbose)
2254 device_printf(dev, "RNDIS mtu %u\n", mtu);
2255
2256 if (sc->hn_rx_ring_inuse > 1) {
2257 /*
2258 * Reduce TCP segment aggregation limit for multiple
2259 * RX rings to increase ACK timeliness.
2260 */
2261 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2262 }
2263
2264 /*
2265 	 * Fixup TX/RX stuff after the synthetic parts are attached.
2266 */
2267 hn_fixup_tx_data(sc);
2268 hn_fixup_rx_data(sc);
2269
2270 ctx = device_get_sysctl_ctx(dev);
2271 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2272 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2273 &sc->hn_nvs_ver, 0, "NVS version");
2274 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2275 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2276 hn_ndis_version_sysctl, "A", "NDIS version");
2277 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2278 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2279 hn_caps_sysctl, "A", "capabilities");
2280 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2281 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2282 hn_hwassist_sysctl, "A", "hwassist");
2283 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2284 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2285 "IU", "max TSO size");
2286 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2287 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2288 "IU", "max # of TSO segments");
2289 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2290 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2291 "IU", "max size of TSO segment");
2292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2293 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2294 hn_rxfilter_sysctl, "A", "rxfilter");
2295 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2296 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2297 hn_rss_hash_sysctl, "A", "RSS hash");
2298 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2299 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2300 hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2301 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2302 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2303 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2304 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2305 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2306 #ifndef RSS
2307 /*
2308 * Don't allow RSS key/indirect table changes, if RSS is defined.
2309 */
2310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2311 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2312 hn_rss_key_sysctl, "IU", "RSS key");
2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2314 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 hn_rss_ind_sysctl, "IU", "RSS indirect table");
2316 #endif
2317 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2318 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2319 "RNDIS offered packet transmission aggregation size limit");
2320 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2321 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2322 "RNDIS offered packet transmission aggregation count limit");
2323 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2324 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2325 "RNDIS packet transmission aggregation alignment");
2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2327 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2328 hn_txagg_size_sysctl, "I",
2329 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2330 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2331 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 hn_txagg_pkts_sysctl, "I",
2333 "Packet transmission aggregation packets, "
2334 "0 -- disable, -1 -- auto");
2335 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2336 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2337 hn_polling_sysctl, "I",
2338 "Polling frequency: [100,1000000], 0 disable polling");
2339 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2340 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2341 hn_vf_sysctl, "A", "Virtual Function's name");
2342 if (!hn_xpnt_vf) {
2343 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2344 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2345 hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2346 } else {
2347 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2348 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2349 hn_xpnt_vf_enabled_sysctl, "I",
2350 "Transparent VF enabled");
2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2352 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2353 hn_xpnt_vf_accbpf_sysctl, "I",
2354 "Accurate BPF for transparent VF");
2355 }
2356
2357 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2358 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "I",
2359 "switch to rsc");
2360
2361 /*
2362 * Setup the ifmedia, which has been initialized earlier.
2363 */
2364 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2365 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2366 /* XXX ifmedia_set really should do this for us */
2367 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2368
2369 /*
2370 * Setup the ifnet for this interface.
2371 */
2372
2373 if_setbaudrate(ifp, IF_Gbps(10));
2374 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2375 if_setioctlfn(ifp, hn_ioctl);
2376 if_setinitfn(ifp, hn_init);
2377 #ifdef HN_IFSTART_SUPPORT
2378 if (hn_use_if_start) {
2379 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2380
2381 if_setstartfn(ifp, hn_start);
2382 if_setsendqlen(ifp, qdepth);
2383 if_setsendqready(ifp);
2384 } else
2385 #endif
2386 {
2387 if_settransmitfn(ifp, hn_transmit);
2388 if_setqflushfn(ifp, hn_xmit_qflush);
2389 }
2390
2391 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2392 #ifdef foo
2393 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2394 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2395 #endif
2396 if (sc->hn_caps & HN_CAP_VLAN) {
2397 /* XXX not sure about VLAN_MTU. */
2398 if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2399 }
2400
2401 if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2402 if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2403 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2404 if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2405 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2406 if (sc->hn_caps & HN_CAP_TSO4) {
2407 if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2408 if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2409 }
2410 if (sc->hn_caps & HN_CAP_TSO6) {
2411 if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2412 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2413 }
2414
2415 /* Enable all available capabilities by default. */
2416 if_setcapenable(ifp, if_getcapabilities(ifp));
2417
2418 /*
2419 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2420 	 * be enabled through SIOCSIFCAP.
2421 */
2422 if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2423 if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2424
2425 if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2426 /*
2427 * Lock hn_set_tso_maxsize() to simplify its
2428 * internal logic.
2429 */
2430 HN_LOCK(sc);
2431 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2432 HN_UNLOCK(sc);
2433 if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2434 if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2435 }
2436
2437 ether_ifattach(ifp, eaddr);
2438
2439 if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2440 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2441 if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2442 }
2443 if (mtu < ETHERMTU) {
2444
2445 if_setmtu(ifp, mtu);
2446 }
2447
2448 /* Inform the upper layer about the long frame support. */
2449 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2450
2451 /*
2452 * Kick off link status check.
2453 */
2454 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2455 hn_update_link_status(sc);
2456
2457 if (!hn_xpnt_vf) {
2458 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2459 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2460 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2461 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2462 } else {
2463 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2464 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2465 }
2466
2467 /*
2468 * NOTE:
2469 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2470 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2471 	 * available when the ifnet_arrival event is triggered.
2472 */
2473 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2474 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2475 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2476 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2477
2478 return (0);
2479 failed:
2480 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2481 hn_synth_detach(sc);
2482 hn_detach(dev);
2483 return (error);
2484 }
2485
2486 static int
2487 hn_detach(device_t dev)
2488 {
2489 struct hn_softc *sc = device_get_softc(dev);
2490 if_t ifp = sc->hn_ifp, vf_ifp;
2491
2492 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2493 /*
2494 * In case that the vmbus missed the orphan handler
2495 * installation.
2496 */
2497 vmbus_xact_ctx_orphan(sc->hn_xact);
2498 }
2499
2500 if (sc->hn_ifaddr_evthand != NULL)
2501 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2502 if (sc->hn_ifnet_evthand != NULL)
2503 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2504 if (sc->hn_ifnet_atthand != NULL) {
2505 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2506 sc->hn_ifnet_atthand);
2507 }
2508 if (sc->hn_ifnet_dethand != NULL) {
2509 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2510 sc->hn_ifnet_dethand);
2511 }
2512 if (sc->hn_ifnet_lnkhand != NULL)
2513 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2514
2515 vf_ifp = sc->hn_vf_ifp;
2516 __compiler_membar();
2517 if (vf_ifp != NULL)
2518 hn_ifnet_detevent(sc, vf_ifp);
2519
2520 if (device_is_attached(dev)) {
2521 HN_LOCK(sc);
2522 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2523 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2524 hn_stop(sc, true);
2525 /*
2526 * NOTE:
2527 * hn_stop() only suspends data, so management
2528 			 * stuff has to be suspended manually here.
2529 */
2530 hn_suspend_mgmt(sc);
2531 hn_synth_detach(sc);
2532 }
2533 HN_UNLOCK(sc);
2534 ether_ifdetach(ifp);
2535 }
2536
2537 ifmedia_removeall(&sc->hn_media);
2538 hn_destroy_rx_data(sc);
2539 hn_destroy_tx_data(sc);
2540
2541 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2542 int i;
2543
2544 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2545 taskqueue_free(sc->hn_tx_taskqs[i]);
2546 free(sc->hn_tx_taskqs, M_DEVBUF);
2547 }
2548 taskqueue_free(sc->hn_mgmt_taskq0);
2549 if (sc->hn_vf_taskq != NULL)
2550 taskqueue_free(sc->hn_vf_taskq);
2551
2552 if (sc->hn_xact != NULL) {
2553 /*
2554 * Uninstall the orphan handler _before_ the xact is
2555 * destructed.
2556 */
2557 vmbus_chan_unset_orphan(sc->hn_prichan);
2558 vmbus_xact_ctx_destroy(sc->hn_xact);
2559 }
2560
2561 if_free(ifp);
2562
2563 HN_LOCK_DESTROY(sc);
2564 rm_destroy(&sc->hn_vf_lock);
2565 return (0);
2566 }
2567
2568 static int
2569 hn_shutdown(device_t dev)
2570 {
2571
2572 return (0);
2573 }
2574
2575 static void
2576 hn_link_status(struct hn_softc *sc)
2577 {
2578 uint32_t link_status;
2579 int error;
2580
2581 error = hn_rndis_get_linkstatus(sc, &link_status);
2582 if (error) {
2583 /* XXX what to do? */
2584 return;
2585 }
2586
2587 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2588 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2589 else
2590 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2591 if_link_state_change(sc->hn_ifp,
2592 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2593 LINK_STATE_UP : LINK_STATE_DOWN);
2594 }
2595
2596 static void
2597 hn_link_taskfunc(void *xsc, int pending __unused)
2598 {
2599 struct hn_softc *sc = xsc;
2600
2601 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2602 return;
2603 hn_link_status(sc);
2604 }
2605
2606 static void
2607 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2608 {
2609 struct hn_softc *sc = xsc;
2610
2611 /* Prevent any link status checks from running. */
2612 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2613
2614 /*
2615 	 * Fake up a [link down --> link up] state change; a 5 second
2616 	 * delay is used, which closely simulates the miibus reaction
2617 	 * upon a link down event.
2618 */
2619 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2620 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2621 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2622 &sc->hn_netchg_status, 5 * hz);
2623 }
2624
2625 static void
2626 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2627 {
2628 struct hn_softc *sc = xsc;
2629
2630 /* Re-allow link status checks. */
2631 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2632 hn_link_status(sc);
2633 }
2634
2635 static void
2636 hn_update_link_status(struct hn_softc *sc)
2637 {
2638
2639 if (sc->hn_mgmt_taskq != NULL)
2640 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2641 }
2642
2643 static void
2644 hn_change_network(struct hn_softc *sc)
2645 {
2646
2647 if (sc->hn_mgmt_taskq != NULL)
2648 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2649 }
2650
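/*
 * Load the mbuf chain into the txdesc's DMA map.  If the chain has too
 * many fragments (EFBIG), collapse it down to HN_TX_DATA_SEGCNT_MAX
 * segments and retry once.
 */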
2651 static __inline int
2652 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2653 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2654 {
2655 struct mbuf *m = *m_head;
2656 int error;
2657
2658 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2659
2660 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2661 m, segs, nsegs, BUS_DMA_NOWAIT);
2662 if (error == EFBIG) {
2663 struct mbuf *m_new;
2664
2665 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2666 if (m_new == NULL)
2667 return ENOBUFS;
2668 else
2669 *m_head = m = m_new;
2670 txr->hn_tx_collapsed++;
2671
2672 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2673 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2674 }
2675 if (!error) {
2676 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2677 BUS_DMASYNC_PREWRITE);
2678 txd->flags |= HN_TXD_FLAG_DMAMAP;
2679 }
2680 return error;
2681 }
2682
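/*
 * Drop a reference on the txdesc; only the last reference actually
 * releases it.  On release, any aggregated txdescs, the chimney sending
 * buffer, the DMA map and the mbuf are all freed, and the txdesc goes
 * back to the free list/buf_ring.  Returns 1 if the txdesc was freed,
 * 0 otherwise.
 */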
2683 static __inline int
2684 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2685 {
2686
2687 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2688 ("put an onlist txd %#x", txd->flags));
2689 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2690 ("put an onagg txd %#x", txd->flags));
2691
2692 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2693 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2694 return 0;
2695
2696 if (!STAILQ_EMPTY(&txd->agg_list)) {
2697 struct hn_txdesc *tmp_txd;
2698
2699 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2700 int freed __diagused;
2701
2702 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2703 			    ("recursive aggregation on aggregated txdesc"));
2704 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2705 ("not aggregated txdesc"));
2706 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2707 ("aggregated txdesc uses dmamap"));
2708 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2709 ("aggregated txdesc consumes "
2710 "chimney sending buffer"));
2711 KASSERT(tmp_txd->chim_size == 0,
2712 ("aggregated txdesc has non-zero "
2713 "chimney sending size"));
2714
2715 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2716 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2717 freed = hn_txdesc_put(txr, tmp_txd);
2718 KASSERT(freed, ("failed to free aggregated txdesc"));
2719 }
2720 }
2721
2722 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2723 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2724 ("chim txd uses dmamap"));
2725 hn_chim_free(txr->hn_sc, txd->chim_index);
2726 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2727 txd->chim_size = 0;
2728 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2729 bus_dmamap_sync(txr->hn_tx_data_dtag,
2730 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2731 bus_dmamap_unload(txr->hn_tx_data_dtag,
2732 txd->data_dmap);
2733 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2734 }
2735
2736 if (txd->m != NULL) {
2737 m_freem(txd->m);
2738 txd->m = NULL;
2739 }
2740
2741 txd->flags |= HN_TXD_FLAG_ONLIST;
2742 #ifndef HN_USE_TXDESC_BUFRING
2743 mtx_lock_spin(&txr->hn_txlist_spin);
2744 KASSERT(txr->hn_txdesc_avail >= 0 &&
2745 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2746 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2747 txr->hn_txdesc_avail++;
2748 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2749 mtx_unlock_spin(&txr->hn_txlist_spin);
2750 #else /* HN_USE_TXDESC_BUFRING */
2751 #ifdef HN_DEBUG
2752 atomic_add_int(&txr->hn_txdesc_avail, 1);
2753 #endif
2754 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2755 #endif /* !HN_USE_TXDESC_BUFRING */
2756
2757 return 1;
2758 }
2759
2760 static __inline struct hn_txdesc *
2761 hn_txdesc_get(struct hn_tx_ring *txr)
2762 {
2763 struct hn_txdesc *txd;
2764
2765 #ifndef HN_USE_TXDESC_BUFRING
2766 mtx_lock_spin(&txr->hn_txlist_spin);
2767 txd = SLIST_FIRST(&txr->hn_txlist);
2768 if (txd != NULL) {
2769 KASSERT(txr->hn_txdesc_avail > 0,
2770 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2771 txr->hn_txdesc_avail--;
2772 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2773 }
2774 mtx_unlock_spin(&txr->hn_txlist_spin);
2775 #else
2776 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2777 #endif
2778
2779 if (txd != NULL) {
2780 #ifdef HN_USE_TXDESC_BUFRING
2781 #ifdef HN_DEBUG
2782 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2783 #endif
2784 #endif /* HN_USE_TXDESC_BUFRING */
2785 KASSERT(txd->m == NULL && txd->refs == 0 &&
2786 STAILQ_EMPTY(&txd->agg_list) &&
2787 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2788 txd->chim_size == 0 &&
2789 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2790 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2791 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2792 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2793 txd->refs = 1;
2794 }
2795 return txd;
2796 }
2797
2798 static __inline void
2799 hn_txdesc_hold(struct hn_txdesc *txd)
2800 {
2801
2802 /* 0->1 transition will never work */
2803 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2804 atomic_add_int(&txd->refs, 1);
2805 }
2806
2807 static __inline void
2808 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2809 {
2810
2811 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2812 ("recursive aggregation on aggregating txdesc"));
2813
2814 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2815 ("already aggregated"));
2816 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2817 ("recursive aggregation on to-be-aggregated txdesc"));
2818
2819 txd->flags |= HN_TXD_FLAG_ONAGG;
2820 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2821 }
2822
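/*
 * Returns true if any transmit descriptor of this ring is still
 * outstanding, i.e. not all descriptors are back on the free list.
 */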
2823 static bool
2824 hn_tx_ring_pending(struct hn_tx_ring *txr)
2825 {
2826 bool pending = false;
2827
2828 #ifndef HN_USE_TXDESC_BUFRING
2829 mtx_lock_spin(&txr->hn_txlist_spin);
2830 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2831 pending = true;
2832 mtx_unlock_spin(&txr->hn_txlist_spin);
2833 #else
2834 if (!buf_ring_full(txr->hn_txdesc_br))
2835 pending = true;
2836 #endif
2837 return (pending);
2838 }
2839
2840 static __inline void
2841 hn_txeof(struct hn_tx_ring *txr)
2842 {
2843 txr->hn_has_txeof = 0;
2844 txr->hn_txeof(txr);
2845 }
2846
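/*
 * Transmit completion callback: release the txdesc and, once enough
 * completions have accumulated on an oactive ring, kick hn_txeof()
 * early so transmission can resume.
 */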
2847 static void
2848 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2849 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2850 {
2851 struct hn_txdesc *txd = sndc->hn_cbarg;
2852 struct hn_tx_ring *txr;
2853
2854 txr = txd->txr;
2855 KASSERT(txr->hn_chan == chan,
2856 ("channel mismatch, on chan%u, should be chan%u",
2857 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2858
2859 txr->hn_has_txeof = 1;
2860 hn_txdesc_put(txr, txd);
2861
2862 ++txr->hn_txdone_cnt;
2863 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2864 txr->hn_txdone_cnt = 0;
2865 if (txr->hn_oactive)
2866 hn_txeof(txr);
2867 }
2868 }
2869
2870 static void
2871 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2872 {
2873 #if defined(INET) || defined(INET6)
2874 struct epoch_tracker et;
2875
2876 NET_EPOCH_ENTER(et);
2877 tcp_lro_flush_all(&rxr->hn_lro);
2878 NET_EPOCH_EXIT(et);
2879 #endif
2880
2881 /*
2882 * NOTE:
2883 * 'txr' could be NULL, if multiple channels and
2884 	 * the ifnet.if_start method are enabled.
2885 */
2886 if (txr == NULL || !txr->hn_has_txeof)
2887 return;
2888
2889 txr->hn_txdone_cnt = 0;
2890 hn_txeof(txr);
2891 }
2892
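/*
 * Offsets inside a finalized RNDIS packet message are counted from the
 * rm_dataoffset field rather than from the start of the message; convert
 * an offset from the start of the message accordingly.
 */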
2893 static __inline uint32_t
2894 hn_rndis_pktmsg_offset(uint32_t ofs)
2895 {
2896
2897 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2898 ("invalid RNDIS packet msg offset %u", ofs));
2899 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2900 }
2901
2902 static __inline void *
2903 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2904 size_t pi_dlen, uint32_t pi_type)
2905 {
2906 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2907 struct rndis_pktinfo *pi;
2908
2909 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2910 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2911
2912 /*
2913 * Per-packet-info does not move; it only grows.
2914 *
2915 * NOTE:
2916 * rm_pktinfooffset in this phase counts from the beginning
2917 * of rndis_packet_msg.
2918 */
2919 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2920 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2921 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2922 pkt->rm_pktinfolen);
2923 pkt->rm_pktinfolen += pi_size;
2924
2925 pi->rm_size = pi_size;
2926 pi->rm_type = pi_type;
2927 pi->rm_internal = 0;
2928 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2929
2930 return (pi->rm_data);
2931 }
2932
2933 static __inline int
2934 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2935 {
2936 struct hn_txdesc *txd;
2937 struct mbuf *m;
2938 int error, pkts;
2939
2940 txd = txr->hn_agg_txd;
2941 KASSERT(txd != NULL, ("no aggregate txdesc"));
2942
2943 /*
2944 * Since hn_txpkt() will reset this temporary stat, save
2945 * it now, so that oerrors can be updated properly, if
2946 * hn_txpkt() ever fails.
2947 */
2948 pkts = txr->hn_stat_pkts;
2949
2950 /*
2951 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2952 * failure, save it for later freeing, if hn_txpkt() ever
2953 * fails.
2954 */
2955 m = txd->m;
2956 error = hn_txpkt(ifp, txr, txd);
2957 if (__predict_false(error)) {
2958 /* txd is freed, but m is not. */
2959 m_freem(m);
2960
2961 txr->hn_flush_failed++;
2962 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2963 }
2964
2965 /* Reset all aggregation states. */
2966 txr->hn_agg_txd = NULL;
2967 txr->hn_agg_szleft = 0;
2968 txr->hn_agg_pktleft = 0;
2969 txr->hn_agg_prevpkt = NULL;
2970
2971 return (error);
2972 }
2973
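/*
 * Try to fit this packet into the chimney sending buffer.  If an
 * aggregating txdesc exists and has room, append to it; otherwise flush
 * the aggregation, allocate a fresh chimney buffer and possibly start a
 * new aggregation.  Returns a pointer into the chimney buffer, or NULL
 * if the packet must go out through the sglist path instead.
 */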
2974 static void *
2975 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2976 int pktsize)
2977 {
2978 void *chim;
2979
2980 if (txr->hn_agg_txd != NULL) {
2981 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2982 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2983 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2984 int olen;
2985
2986 /*
2987 * Update the previous RNDIS packet's total length,
2988 * it can be increased due to the mandatory alignment
2989 * padding for this RNDIS packet. And update the
2990 * aggregating txdesc's chimney sending buffer size
2991 * accordingly.
2992 *
2993 * XXX
2994 * Zero-out the padding, as required by the RNDIS spec.
2995 */
2996 olen = pkt->rm_len;
2997 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2998 agg_txd->chim_size += pkt->rm_len - olen;
2999
3000 /* Link this txdesc to the parent. */
3001 hn_txdesc_agg(agg_txd, txd);
3002
3003 chim = (uint8_t *)pkt + pkt->rm_len;
3004 /* Save the current packet for later fixup. */
3005 txr->hn_agg_prevpkt = chim;
3006
3007 txr->hn_agg_pktleft--;
3008 txr->hn_agg_szleft -= pktsize;
3009 if (txr->hn_agg_szleft <=
3010 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3011 /*
3012 * Probably can't aggregate more packets,
3013 * flush this aggregating txdesc proactively.
3014 */
3015 txr->hn_agg_pktleft = 0;
3016 }
3017 /* Done! */
3018 return (chim);
3019 }
3020 hn_flush_txagg(ifp, txr);
3021 }
3022 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3023
3024 txr->hn_tx_chimney_tried++;
3025 txd->chim_index = hn_chim_alloc(txr->hn_sc);
3026 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3027 return (NULL);
3028 txr->hn_tx_chimney++;
3029
3030 chim = txr->hn_sc->hn_chim +
3031 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3032
3033 if (txr->hn_agg_pktmax > 1 &&
3034 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3035 txr->hn_agg_txd = txd;
3036 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3037 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3038 txr->hn_agg_prevpkt = chim;
3039 }
3040 return (chim);
3041 }
3042
3043 /*
3044 * NOTE:
3045 * If this function fails, then both txd and m_head0 will be freed.
3046 */
3047 static int
3048 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3049 struct mbuf **m_head0)
3050 {
3051 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3052 int error, nsegs, i;
3053 struct mbuf *m_head = *m_head0;
3054 struct rndis_packet_msg *pkt;
3055 uint32_t *pi_data;
3056 void *chim = NULL;
3057 int pkt_hlen, pkt_size;
3058
3059 pkt = txd->rndis_pkt;
3060 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3061 if (pkt_size < txr->hn_chim_size) {
3062 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3063 if (chim != NULL)
3064 pkt = chim;
3065 } else {
3066 if (txr->hn_agg_txd != NULL)
3067 hn_flush_txagg(ifp, txr);
3068 }
3069
3070 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3071 pkt->rm_len = m_head->m_pkthdr.len;
3072 pkt->rm_dataoffset = 0;
3073 pkt->rm_datalen = m_head->m_pkthdr.len;
3074 pkt->rm_oobdataoffset = 0;
3075 pkt->rm_oobdatalen = 0;
3076 pkt->rm_oobdataelements = 0;
3077 pkt->rm_pktinfooffset = sizeof(*pkt);
3078 pkt->rm_pktinfolen = 0;
3079 pkt->rm_vchandle = 0;
3080 pkt->rm_reserved = 0;
3081
3082 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3083 /*
3084 * Set the hash value for this packet.
3085 */
3086 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3087 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3088
3089 if (M_HASHTYPE_ISHASH(m_head))
3090 /*
3091 			 * The flowid field contains the hash value the host
3092 			 * set on the RX queue, if this is an IP forwarding pkt.
3093 			 * Set the same hash value so the host can send on the
3094 			 * CPU the packet was received on.
3095 */
3096 *pi_data = m_head->m_pkthdr.flowid;
3097 else
3098 /*
3099 * Otherwise just put the tx queue index.
3100 */
3101 *pi_data = txr->hn_tx_idx;
3102 }
3103
3104 if (m_head->m_flags & M_VLANTAG) {
3105 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3106 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3107 *pi_data = NDIS_VLAN_INFO_MAKE(
3108 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3109 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3110 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3111 }
3112
3113 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3114 #if defined(INET6) || defined(INET)
3115 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3116 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3117 #ifdef INET
3118 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3119 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3120 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3121 m_head->m_pkthdr.tso_segsz);
3122 }
3123 #endif
3124 #if defined(INET6) && defined(INET)
3125 else
3126 #endif
3127 #ifdef INET6
3128 {
3129 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3130 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3131 m_head->m_pkthdr.tso_segsz);
3132 }
3133 #endif
3134 #endif /* INET6 || INET */
3135 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3136 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3137 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3138 if (m_head->m_pkthdr.csum_flags &
3139 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3140 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3141 } else {
3142 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3143 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3144 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3145 }
3146
3147 if (m_head->m_pkthdr.csum_flags &
3148 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3149 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3150 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3151 } else if (m_head->m_pkthdr.csum_flags &
3152 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3153 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3154 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3155 }
3156 }
3157
3158 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3159 /* Fixup RNDIS packet message total length */
3160 pkt->rm_len += pkt_hlen;
3161 /* Convert RNDIS packet message offsets */
3162 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3163 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3164
3165 /*
3166 * Fast path: Chimney sending.
3167 */
3168 if (chim != NULL) {
3169 struct hn_txdesc *tgt_txd = txd;
3170
3171 if (txr->hn_agg_txd != NULL) {
3172 tgt_txd = txr->hn_agg_txd;
3173 #ifdef INVARIANTS
3174 *m_head0 = NULL;
3175 #endif
3176 }
3177
3178 KASSERT(pkt == chim,
3179 ("RNDIS pkt not in chimney sending buffer"));
3180 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3181 ("chimney sending buffer is not used"));
3182 tgt_txd->chim_size += pkt->rm_len;
3183
3184 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3185 ((uint8_t *)chim) + pkt_hlen);
3186
3187 txr->hn_gpa_cnt = 0;
3188 txr->hn_sendpkt = hn_txpkt_chim;
3189 goto done;
3190 }
3191
3192 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3193 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3194 ("chimney buffer is used"));
3195 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3196
3197 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3198 if (__predict_false(error)) {
3199 int freed __diagused;
3200
3201 /*
3202 * This mbuf is not linked w/ the txd yet, so free it now.
3203 */
3204 m_freem(m_head);
3205 *m_head0 = NULL;
3206
3207 freed = hn_txdesc_put(txr, txd);
3208 KASSERT(freed != 0,
3209 ("fail to free txd upon txdma error"));
3210
3211 txr->hn_txdma_failed++;
3212 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3213 return error;
3214 }
3215 *m_head0 = m_head;
3216
3217 /* +1 RNDIS packet message */
3218 txr->hn_gpa_cnt = nsegs + 1;
3219
3220 /* send packet with page buffer */
3221 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3222 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3223 txr->hn_gpa[0].gpa_len = pkt_hlen;
3224
3225 /*
3226 * Fill the page buffers with mbuf info after the page
3227 * buffer for RNDIS packet message.
3228 */
3229 for (i = 0; i < nsegs; ++i) {
3230 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3231
3232 gpa->gpa_page = atop(segs[i].ds_addr);
3233 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3234 gpa->gpa_len = segs[i].ds_len;
3235 }
3236
3237 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3238 txd->chim_size = 0;
3239 txr->hn_sendpkt = hn_txpkt_sglist;
3240 done:
3241 txd->m = m_head;
3242
3243 /* Set the completion routine */
3244 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3245
3246 /* Update temporary stats for later use. */
3247 txr->hn_stat_pkts++;
3248 txr->hn_stat_size += m_head->m_pkthdr.len;
3249 if (m_head->m_flags & M_MCAST)
3250 txr->hn_stat_mcasts++;
3251
3252 return 0;
3253 }
3254
3255 /*
3256 * NOTE:
3257 * If this function fails, then txd will be freed, but the mbuf
3258 * associated w/ the txd will _not_ be freed.
3259 */
3260 static int
3261 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3262 {
3263 int error, send_failed = 0, has_bpf;
3264
3265 again:
3266 has_bpf = bpf_peers_present_if(ifp);
3267 if (has_bpf) {
3268 /*
3269 * Make sure that this txd and any aggregated txds are not
3270 * freed before ETHER_BPF_MTAP.
3271 */
3272 hn_txdesc_hold(txd);
3273 }
3274 error = txr->hn_sendpkt(txr, txd);
3275 if (!error) {
3276 if (has_bpf) {
3277 const struct hn_txdesc *tmp_txd;
3278
3279 ETHER_BPF_MTAP(ifp, txd->m);
3280 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3281 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3282 }
3283
3284 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3285 #ifdef HN_IFSTART_SUPPORT
3286 if (!hn_use_if_start)
3287 #endif
3288 {
3289 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3290 txr->hn_stat_size);
3291 if (txr->hn_stat_mcasts != 0) {
3292 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3293 txr->hn_stat_mcasts);
3294 }
3295 }
3296 txr->hn_pkts += txr->hn_stat_pkts;
3297 txr->hn_sends++;
3298 }
3299 if (has_bpf)
3300 hn_txdesc_put(txr, txd);
3301
3302 if (__predict_false(error)) {
3303 int freed __diagused;
3304
3305 /*
3306 * This should "really rarely" happen.
3307 *
3308 * XXX Too many RX to be acked or too many sideband
3309 * commands to run? Ask netvsc_channel_rollup()
3310 * to kick start later.
3311 */
3312 txr->hn_has_txeof = 1;
3313 if (!send_failed) {
3314 txr->hn_send_failed++;
3315 send_failed = 1;
3316 /*
3317 			 * Try sending again after setting hn_has_txeof;
3318 * in case that we missed the last
3319 * netvsc_channel_rollup().
3320 */
3321 goto again;
3322 }
3323 if_printf(ifp, "send failed\n");
3324
3325 /*
3326 * Caller will perform further processing on the
3327 * associated mbuf, so don't free it in hn_txdesc_put();
3328 * only unload it from the DMA map in hn_txdesc_put(),
3329 * if it was loaded.
3330 */
3331 txd->m = NULL;
3332 freed = hn_txdesc_put(txr, txd);
3333 KASSERT(freed != 0,
3334 ("fail to free txd upon send error"));
3335
3336 txr->hn_send_failed++;
3337 }
3338
3339 /* Reset temporary stats, after this sending is done. */
3340 txr->hn_stat_size = 0;
3341 txr->hn_stat_pkts = 0;
3342 txr->hn_stat_mcasts = 0;
3343
3344 return (error);
3345 }
3346
3347 /*
3348  * Append the specified data to the indicated mbuf chain.
3349 * Extend the mbuf chain if the new data does not fit in
3350 * existing space.
3351 *
3352 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3353 * There should be an equivalent in the kernel mbuf code,
3354 * but there does not appear to be one yet.
3355 *
3356 * Differs from m_append() in that additional mbufs are
3357 * allocated with cluster size MJUMPAGESIZE, and filled
3358 * accordingly.
3359 *
3360 * Return the last mbuf in the chain or NULL if failed to
3361 * allocate new mbuf.
3362 */
3363 static struct mbuf *
3364 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3365 {
3366 struct mbuf *m, *n;
3367 int remainder, space;
3368
3369 for (m = m0; m->m_next != NULL; m = m->m_next)
3370 ;
3371 remainder = len;
3372 space = M_TRAILINGSPACE(m);
3373 if (space > 0) {
3374 /*
3375 * Copy into available space.
3376 */
3377 if (space > remainder)
3378 space = remainder;
3379 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3380 m->m_len += space;
3381 cp += space;
3382 remainder -= space;
3383 }
3384 while (remainder > 0) {
3385 /*
3386 * Allocate a new mbuf; could check space
3387 * and allocate a cluster instead.
3388 */
3389 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3390 if (n == NULL)
3391 return NULL;
3392 n->m_len = min(MJUMPAGESIZE, remainder);
3393 bcopy(cp, mtod(n, caddr_t), n->m_len);
3394 cp += n->m_len;
3395 remainder -= n->m_len;
3396 m->m_next = n;
3397 m = n;
3398 }
3399
3400 return m;
3401 }
3402
3403 #if defined(INET) || defined(INET6)
3404 static __inline int
3405 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3406 {
3407 if (hn_lro_mbufq_depth) {
3408 tcp_lro_queue_mbuf(lc, m);
3409 return 0;
3410 }
3411 return tcp_lro_rx(lc, m, 0);
3412 }
3413 #endif
3414
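/*
 * Receive one packet: assemble the RSC fragments into an mbuf chain,
 * apply RX checksum offload (or trusted-checksum heuristics), VLAN tag
 * and RSS hash information, then hand the packet to LRO or if_input().
 * When a VF is active, the packet is attributed to the VF ifnet and LRO
 * is skipped.
 */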
3415 static int
3416 hn_rxpkt(struct hn_rx_ring *rxr)
3417 {
3418 if_t ifp, hn_ifp = rxr->hn_ifp;
3419 struct mbuf *m_new, *n;
3420 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3421 int hash_type = M_HASHTYPE_NONE;
3422 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3423 int i;
3424
3425 ifp = hn_ifp;
3426 if (rxr->hn_rxvf_ifp != NULL) {
3427 /*
3428 * Non-transparent mode VF; pretend this packet is from
3429 * the VF.
3430 */
3431 ifp = rxr->hn_rxvf_ifp;
3432 is_vf = 1;
3433 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3434 /* Transparent mode VF. */
3435 is_vf = 1;
3436 }
3437
3438 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3439 /*
3440 * NOTE:
3441 * See the NOTE of hn_rndis_init_fixat(). This
3442 * function can be reached, immediately after the
3443 * RNDIS is initialized but before the ifnet is
3444 * setup on the hn_attach() path; drop the unexpected
3445 * packets.
3446 */
3447 return (0);
3448 }
3449
3450 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3451 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3452 return (0);
3453 }
3454
3455 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3456 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3457 if (m_new == NULL) {
3458 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3459 return (0);
3460 }
3461 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3462 rxr->rsc.frag_len[0]);
3463 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3464 } else {
3465 /*
3466 * Get an mbuf with a cluster. For packets 2K or less,
3467 * get a standard 2K cluster. For anything larger, get a
3468 * 4K cluster. Any buffers larger than 4K can cause problems
3469 * if looped around to the Hyper-V TX channel, so avoid them.
3470 */
3471 size = MCLBYTES;
3472 if (rxr->rsc.pktlen > MCLBYTES) {
3473 /* 4096 */
3474 size = MJUMPAGESIZE;
3475 }
3476
3477 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3478 if (m_new == NULL) {
3479 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3480 return (0);
3481 }
3482
3483 n = m_new;
3484 for (i = 0; i < rxr->rsc.cnt; i++) {
3485 n = hv_m_append(n, rxr->rsc.frag_len[i],
3486 rxr->rsc.frag_data[i]);
3487 if (n == NULL) {
3488 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3489 return (0);
3490 } else {
3491 m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3492 }
3493 }
3494 }
3495 if (rxr->rsc.pktlen <= MHLEN)
3496 rxr->hn_small_pkts++;
3497
3498 m_new->m_pkthdr.rcvif = ifp;
3499
3500 if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3501 do_csum = 0;
3502
3503 /* receive side checksum offload */
3504 if (rxr->rsc.csum_info != NULL) {
3505 /* IP csum offload */
3506 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3507 m_new->m_pkthdr.csum_flags |=
3508 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3509 rxr->hn_csum_ip++;
3510 }
3511
3512 /* TCP/UDP csum offload */
3513 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3514 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3515 m_new->m_pkthdr.csum_flags |=
3516 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3517 m_new->m_pkthdr.csum_data = 0xffff;
3518 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3519 rxr->hn_csum_tcp++;
3520 else
3521 rxr->hn_csum_udp++;
3522 }
3523
3524 /*
3525 * XXX
3526 		 * As of this writing (Oct 28th, 2016), the host side will turn
3527 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3528 * the do_lro setting here is actually _not_ accurate. We
3529 * depend on the RSS hash type check to reset do_lro.
3530 */
3531 if ((*(rxr->rsc.csum_info) &
3532 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3533 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3534 do_lro = 1;
3535 } else {
3536 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3537 if (l3proto == ETHERTYPE_IP) {
3538 if (l4proto == IPPROTO_TCP) {
3539 if (do_csum &&
3540 (rxr->hn_trust_hcsum &
3541 HN_TRUST_HCSUM_TCP)) {
3542 rxr->hn_csum_trusted++;
3543 m_new->m_pkthdr.csum_flags |=
3544 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3545 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3546 m_new->m_pkthdr.csum_data = 0xffff;
3547 }
3548 do_lro = 1;
3549 } else if (l4proto == IPPROTO_UDP) {
3550 if (do_csum &&
3551 (rxr->hn_trust_hcsum &
3552 HN_TRUST_HCSUM_UDP)) {
3553 rxr->hn_csum_trusted++;
3554 m_new->m_pkthdr.csum_flags |=
3555 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3556 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3557 m_new->m_pkthdr.csum_data = 0xffff;
3558 }
3559 } else if (l4proto != IPPROTO_DONE && do_csum &&
3560 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3561 rxr->hn_csum_trusted++;
3562 m_new->m_pkthdr.csum_flags |=
3563 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3564 }
3565 }
3566 }
3567
3568 if (rxr->rsc.vlan_info != NULL) {
3569 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3570 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3571 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3572 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3573 m_new->m_flags |= M_VLANTAG;
3574 }
3575
3576 /*
3577 	 * If VF is activated (transparent/non-transparent mode does not
3578 * matter here).
3579 *
3580 * - Disable LRO
3581 *
3582 * hn(4) will only receive broadcast packets, multicast packets,
3583 	 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3584 * packet types.
3585 *
3586 * For non-transparent, we definitely _cannot_ enable LRO at
3587 * all, since the LRO flush will use hn(4) as the receiving
3588 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3589 */
3590 if (is_vf)
3591 do_lro = 0;
3592
3593 /*
3594 	 * If VF is activated (transparent/non-transparent mode does not
3595 * matter here), do _not_ mess with unsupported hash types or
3596 * functions.
3597 */
3598 if (rxr->rsc.hash_info != NULL) {
3599 rxr->hn_rss_pkts++;
3600 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3601 if (!is_vf)
3602 hash_type = M_HASHTYPE_OPAQUE_HASH;
3603 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3604 NDIS_HASH_FUNCTION_TOEPLITZ) {
3605 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3606 rxr->hn_mbuf_hash);
3607
3608 /*
3609 * NOTE:
3610 			 * do_lro is reset, if the hash types are not TCP
3611 * related. See the comment in the above csum_flags
3612 * setup section.
3613 */
3614 switch (type) {
3615 case NDIS_HASH_IPV4:
3616 hash_type = M_HASHTYPE_RSS_IPV4;
3617 do_lro = 0;
3618 break;
3619
3620 case NDIS_HASH_TCP_IPV4:
3621 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3622 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3623 int def_htype = M_HASHTYPE_OPAQUE_HASH;
3624
3625 if (is_vf)
3626 def_htype = M_HASHTYPE_NONE;
3627
3628 /*
3629 * UDP 4-tuple hash is delivered as
3630 * TCP 4-tuple hash.
3631 */
3632 if (l3proto == ETHERTYPE_MAX) {
3633 hn_rxpkt_proto(m_new,
3634 &l3proto, &l4proto);
3635 }
3636 if (l3proto == ETHERTYPE_IP) {
3637 if (l4proto == IPPROTO_UDP &&
3638 (rxr->hn_mbuf_hash &
3639 NDIS_HASH_UDP_IPV4_X)) {
3640 hash_type =
3641 M_HASHTYPE_RSS_UDP_IPV4;
3642 do_lro = 0;
3643 } else if (l4proto !=
3644 IPPROTO_TCP) {
3645 hash_type = def_htype;
3646 do_lro = 0;
3647 }
3648 } else {
3649 hash_type = def_htype;
3650 do_lro = 0;
3651 }
3652 }
3653 break;
3654
3655 case NDIS_HASH_IPV6:
3656 hash_type = M_HASHTYPE_RSS_IPV6;
3657 do_lro = 0;
3658 break;
3659
3660 case NDIS_HASH_IPV6_EX:
3661 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3662 do_lro = 0;
3663 break;
3664
3665 case NDIS_HASH_TCP_IPV6:
3666 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3667 break;
3668
3669 case NDIS_HASH_TCP_IPV6_EX:
3670 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3671 break;
3672 }
3673 }
3674 } else if (!is_vf) {
3675 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3676 hash_type = M_HASHTYPE_OPAQUE;
3677 }
3678 M_HASHTYPE_SET(m_new, hash_type);
3679
3680 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3681 if (hn_ifp != ifp) {
3682 const struct ether_header *eh;
3683
3684 /*
3685 * Non-transparent mode VF is activated.
3686 */
3687
3688 /*
3689 * Allow tapping on hn(4).
3690 */
3691 ETHER_BPF_MTAP(hn_ifp, m_new);
3692
3693 /*
3694 * Update hn(4)'s stats.
3695 */
3696 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3697 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3698 /* Checked at the beginning of this function. */
3699 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3700 eh = mtod(m_new, struct ether_header *);
3701 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3702 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3703 }
3704 rxr->hn_pkts++;
3705
3706 if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3707 #if defined(INET) || defined(INET6)
3708 struct lro_ctrl *lro = &rxr->hn_lro;
3709
3710 if (lro->lro_cnt) {
3711 rxr->hn_lro_tried++;
3712 if (hn_lro_rx(lro, m_new) == 0) {
3713 /* DONE! */
3714 return 0;
3715 }
3716 }
3717 #endif
3718 }
3719 if_input(ifp, m_new);
3720
3721 return (0);
3722 }
3723
3724 static int
3725 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3726 {
3727 struct hn_softc *sc = if_getsoftc(ifp);
3728 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3729 if_t vf_ifp;
3730 int mask, error = 0;
3731 struct ifrsskey *ifrk;
3732 struct ifrsshash *ifrh;
3733 uint32_t mtu;
3734
3735 switch (cmd) {
3736 case SIOCSIFMTU:
3737 if (ifr->ifr_mtu > HN_MTU_MAX) {
3738 error = EINVAL;
3739 break;
3740 }
3741
3742 HN_LOCK(sc);
3743
3744 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3745 HN_UNLOCK(sc);
3746 break;
3747 }
3748
3749 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3750 /* Can't change MTU */
3751 HN_UNLOCK(sc);
3752 error = EOPNOTSUPP;
3753 break;
3754 }
3755
3756 if (if_getmtu(ifp) == ifr->ifr_mtu) {
3757 HN_UNLOCK(sc);
3758 break;
3759 }
3760
3761 if (hn_xpnt_vf_isready(sc)) {
3762 vf_ifp = sc->hn_vf_ifp;
3763 ifr_vf = *ifr;
3764 strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3765 sizeof(ifr_vf.ifr_name));
3766 			error = ifhwioctl(SIOCSIFMTU, vf_ifp,
3767 (caddr_t)&ifr_vf, curthread);
3768 if (error) {
3769 HN_UNLOCK(sc);
3770 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3771 if_name(vf_ifp), ifr->ifr_mtu, error);
3772 break;
3773 }
3774 }
3775
3776 /*
3777 * Suspend this interface before the synthetic parts
3778 * are ripped.
3779 */
3780 hn_suspend(sc);
3781
3782 /*
3783 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3784 */
3785 hn_synth_detach(sc);
3786
3787 /*
3788 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3789 * with the new MTU setting.
3790 */
3791 error = hn_synth_attach(sc, ifr->ifr_mtu);
3792 if (error) {
3793 HN_UNLOCK(sc);
3794 break;
3795 }
3796
3797 error = hn_rndis_get_mtu(sc, &mtu);
3798 if (error)
3799 mtu = ifr->ifr_mtu;
3800 else if (bootverbose)
3801 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3802
3803 /*
3804 * Commit the requested MTU, after the synthetic parts
3805 * have been successfully attached.
3806 */
3807 if (mtu >= ifr->ifr_mtu) {
3808 mtu = ifr->ifr_mtu;
3809 } else {
3810 if_printf(ifp, "fixup mtu %d -> %u\n",
3811 ifr->ifr_mtu, mtu);
3812 }
3813 if_setmtu(ifp, mtu);
3814
3815 /*
3816 * Synthetic parts' reattach may change the chimney
3817 * sending size; update it.
3818 */
3819 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3820 hn_set_chim_size(sc, sc->hn_chim_szmax);
3821
3822 /*
3823 * Make sure that various parameters based on MTU are
3824 * still valid, after the MTU change.
3825 */
3826 hn_mtu_change_fixup(sc);
3827
3828 /*
3829 * All done! Resume the interface now.
3830 */
3831 hn_resume(sc);
3832
3833 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3834 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3835 /*
3836 * Since we have reattached the NVS part,
3837 			 * change the datapath to VF again, in case it
3838 			 * was lost when the NVS was detached.
3839 */
3840 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3841 }
3842
3843 HN_UNLOCK(sc);
3844 break;
3845
3846 case SIOCSIFFLAGS:
3847 HN_LOCK(sc);
3848
3849 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3850 HN_UNLOCK(sc);
3851 break;
3852 }
3853
3854 if (hn_xpnt_vf_isready(sc))
3855 hn_xpnt_vf_saveifflags(sc);
3856
3857 if (if_getflags(ifp) & IFF_UP) {
3858 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3859 /*
3860 				 * Caller might hold a mutex, e.g.
3861 * bpf; use busy-wait for the RNDIS
3862 * reply.
3863 */
3864 HN_NO_SLEEPING(sc);
3865 hn_rxfilter_config(sc);
3866 HN_SLEEPING_OK(sc);
3867
3868 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3869 error = hn_xpnt_vf_iocsetflags(sc);
3870 } else {
3871 hn_init_locked(sc);
3872 }
3873 } else {
3874 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3875 hn_stop(sc, false);
3876 }
3877 sc->hn_if_flags = if_getflags(ifp);
3878
3879 HN_UNLOCK(sc);
3880 break;
3881
3882 case SIOCSIFCAP:
3883 HN_LOCK(sc);
3884
3885 if (hn_xpnt_vf_isready(sc)) {
3886 ifr_vf = *ifr;
3887 strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3888 sizeof(ifr_vf.ifr_name));
3889 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3890 HN_UNLOCK(sc);
3891 break;
3892 }
3893
3894 /*
3895 * Fix up requested capabilities w/ supported capabilities,
3896 * since the supported capabilities could have been changed.
3897 */
3898 mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3899 if_getcapenable(ifp);
3900
3901 if (mask & IFCAP_TXCSUM) {
3902 if_togglecapenable(ifp, IFCAP_TXCSUM);
3903 if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3904 if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3905 else
3906 if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3907 }
3908 if (mask & IFCAP_TXCSUM_IPV6) {
3909 if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3910 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3911 if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3912 else
3913 if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3914 }
3915
3916 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3917 if (mask & IFCAP_RXCSUM)
3918 if_togglecapenable(ifp, IFCAP_RXCSUM);
3919 #ifdef foo
3920 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
3921 if (mask & IFCAP_RXCSUM_IPV6)
3922 if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3923 #endif
3924
3925 if (mask & IFCAP_LRO)
3926 if_togglecapenable(ifp, IFCAP_LRO);
3927
3928 if (mask & IFCAP_TSO4) {
3929 if_togglecapenable(ifp, IFCAP_TSO4);
3930 if (if_getcapenable(ifp) & IFCAP_TSO4)
3931 if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3932 else
3933 if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3934 }
3935 if (mask & IFCAP_TSO6) {
3936 if_togglecapenable(ifp, IFCAP_TSO6);
3937 if (if_getcapenable(ifp) & IFCAP_TSO6)
3938 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3939 else
3940 if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3941 }
3942
3943 HN_UNLOCK(sc);
3944 break;
3945
3946 case SIOCADDMULTI:
3947 case SIOCDELMULTI:
3948 HN_LOCK(sc);
3949
3950 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3951 HN_UNLOCK(sc);
3952 break;
3953 }
3954 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3955 /*
3956 * Multicast uses mutex; use busy-wait for
3957 * the RNDIS reply.
3958 */
3959 HN_NO_SLEEPING(sc);
3960 hn_rxfilter_config(sc);
3961 HN_SLEEPING_OK(sc);
3962 }
3963
3964 /* XXX vlan(4) style mcast addr maintenance */
3965 if (hn_xpnt_vf_isready(sc)) {
3966 int old_if_flags;
3967
3968 old_if_flags = if_getflags(sc->hn_vf_ifp);
3969 hn_xpnt_vf_saveifflags(sc);
3970
3971 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3972 ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
3973 IFF_ALLMULTI))
3974 error = hn_xpnt_vf_iocsetflags(sc);
3975 }
3976
3977 HN_UNLOCK(sc);
3978 break;
3979
3980 case SIOCSIFMEDIA:
3981 case SIOCGIFMEDIA:
3982 HN_LOCK(sc);
3983 if (hn_xpnt_vf_isready(sc)) {
3984 /*
3985 * SIOCGIFMEDIA expects ifmediareq, so don't
3986 * create and pass ifr_vf to the VF here; just
3987 * replace the ifr_name.
3988 */
3989 vf_ifp = sc->hn_vf_ifp;
3990 strlcpy(ifr->ifr_name, if_name(vf_ifp),
3991 sizeof(ifr->ifr_name));
3992 error = ifhwioctl(cmd, vf_ifp, data, curthread);
3993 /* Restore the ifr_name. */
3994 strlcpy(ifr->ifr_name, if_name(ifp),
3995 sizeof(ifr->ifr_name));
3996 HN_UNLOCK(sc);
3997 break;
3998 }
3999 HN_UNLOCK(sc);
4000 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4001 break;
4002
4003 case SIOCGIFRSSHASH:
4004 ifrh = (struct ifrsshash *)data;
4005 HN_LOCK(sc);
4006 if (sc->hn_rx_ring_inuse == 1) {
4007 HN_UNLOCK(sc);
4008 ifrh->ifrh_func = RSS_FUNC_NONE;
4009 ifrh->ifrh_types = 0;
4010 break;
4011 }
4012
4013 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4014 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4015 else
4016 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4017 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4018 HN_UNLOCK(sc);
4019 break;
4020
4021 case SIOCGIFRSSKEY:
4022 ifrk = (struct ifrsskey *)data;
4023 HN_LOCK(sc);
4024 if (sc->hn_rx_ring_inuse == 1) {
4025 HN_UNLOCK(sc);
4026 ifrk->ifrk_func = RSS_FUNC_NONE;
4027 ifrk->ifrk_keylen = 0;
4028 break;
4029 }
4030 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4031 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4032 else
4033 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4034 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4035 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4036 NDIS_HASH_KEYSIZE_TOEPLITZ);
4037 HN_UNLOCK(sc);
4038 break;
4039
4040 default:
4041 error = ether_ioctl(ifp, cmd, data);
4042 break;
4043 }
4044 return (error);
4045 }
4046
4047 static void
4048 hn_stop(struct hn_softc *sc, bool detaching)
4049 {
4050 if_t ifp = sc->hn_ifp;
4051 int i;
4052
4053 HN_LOCK_ASSERT(sc);
4054
4055 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4056 ("synthetic parts were not attached"));
4057
4058 /* Clear RUNNING bit ASAP. */
4059 if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4060
4061 /* Disable polling. */
4062 hn_polling(sc, 0);
4063
4064 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4065 KASSERT(sc->hn_vf_ifp != NULL,
4066 ("%s: VF is not attached", if_name(ifp)));
4067
4068 /* Mark transparent mode VF as disabled. */
4069 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4070
4071 /*
4072 * NOTE:
4073 * Datapath setting must happen _before_ bringing
4074 * the VF down.
4075 */
4076 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4077
4078 /*
4079 * Bring the VF down.
4080 */
4081 hn_xpnt_vf_saveifflags(sc);
4082 if_setflagbits(ifp, 0, IFF_UP);
4083 hn_xpnt_vf_iocsetflags(sc);
4084 }
4085
4086 /* Suspend data transfers. */
4087 hn_suspend_data(sc);
4088
4089 /* Clear OACTIVE bit. */
4090 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4091 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4092 sc->hn_tx_ring[i].hn_oactive = 0;
4093
4094 /*
4095 * If the non-transparent mode VF is active, make sure
4096 * that the RX filter still allows packet reception.
4097 */
4098 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4099 hn_rxfilter_config(sc);
4100 }
4101
4102 static void
4103 hn_init_locked(struct hn_softc *sc)
4104 {
4105 if_t ifp = sc->hn_ifp;
4106 int i;
4107
4108 HN_LOCK_ASSERT(sc);
4109
4110 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4111 return;
4112
4113 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4114 return;
4115
4116 /* Configure RX filter */
4117 hn_rxfilter_config(sc);
4118
4119 /* Clear OACTIVE bit. */
4120 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4121 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4122 sc->hn_tx_ring[i].hn_oactive = 0;
4123
4124 /* Clear TX 'suspended' bit. */
4125 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4126
4127 if (hn_xpnt_vf_isready(sc)) {
4128 /* Initialize transparent VF. */
4129 hn_xpnt_vf_init(sc);
4130 }
4131
4132 /* Everything is ready; unleash! */
4133 if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4134
4135 /* Re-enable polling if requested. */
4136 if (sc->hn_pollhz > 0)
4137 hn_polling(sc, sc->hn_pollhz);
4138 }
4139
4140 static void
4141 hn_init(void *xsc)
4142 {
4143 struct hn_softc *sc = xsc;
4144
4145 HN_LOCK(sc);
4146 hn_init_locked(sc);
4147 HN_UNLOCK(sc);
4148 }
4149
4150 static int
4151 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4152 {
4153 struct hn_softc *sc = arg1;
4154 unsigned int lenlim;
4155 int error;
4156
4157 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4158 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4159 if (error || req->newptr == NULL)
4160 return error;
4161
4162 HN_LOCK(sc);
4163 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4164 lenlim > TCP_LRO_LENGTH_MAX) {
4165 HN_UNLOCK(sc);
4166 return EINVAL;
4167 }
4168 hn_set_lro_lenlim(sc, lenlim);
4169 HN_UNLOCK(sc);
4170
4171 return 0;
4172 }
4173
4174 static int
4175 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4176 {
4177 struct hn_softc *sc = arg1;
4178 int ackcnt, error, i;
4179
4180 /*
4181 	 * lro_ackcnt_lim is the append count limit;
4182 	 * +1 turns it into the aggregation limit.
4183 */
4184 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4185 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4186 if (error || req->newptr == NULL)
4187 return error;
4188
4189 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4190 return EINVAL;
4191
4192 /*
4193 * Convert aggregation limit back to append
4194 * count limit.
4195 */
4196 --ackcnt;
4197 HN_LOCK(sc);
4198 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4199 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4200 HN_UNLOCK(sc);
4201 return 0;
4202 }
4203
4204 static int
4205 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4206 {
4207 struct hn_softc *sc = arg1;
4208 int hcsum = arg2;
4209 int on, error, i;
4210
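	/*
	 * arg2 carries the HN_TRUST_HCSUM_* bit controlled by this
	 * sysctl; report it as a boolean and apply any change to
	 * every RX ring.
	 */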
4211 on = 0;
4212 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4213 on = 1;
4214
4215 error = sysctl_handle_int(oidp, &on, 0, req);
4216 if (error || req->newptr == NULL)
4217 return error;
4218
4219 HN_LOCK(sc);
4220 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4221 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4222
4223 if (on)
4224 rxr->hn_trust_hcsum |= hcsum;
4225 else
4226 rxr->hn_trust_hcsum &= ~hcsum;
4227 }
4228 HN_UNLOCK(sc);
4229 return 0;
4230 }
4231
4232 static int
4233 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4234 {
4235 struct hn_softc *sc = arg1;
4236 int chim_size, error;
4237
4238 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4239 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4240 if (error || req->newptr == NULL)
4241 return error;
4242
4243 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4244 return EINVAL;
4245
4246 HN_LOCK(sc);
4247 hn_set_chim_size(sc, chim_size);
4248 HN_UNLOCK(sc);
4249 return 0;
4250 }
4251
4252 static int
4253 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4254 {
4255 struct hn_softc *sc = arg1;
4256 int ofs = arg2, i, error;
4257 struct hn_rx_ring *rxr;
4258 uint64_t stat;
4259
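	/*
	 * arg2 ('ofs') is the byte offset of a 64-bit counter inside
	 * struct hn_rx_ring; sum that counter across all RX rings for
	 * the read.  Writing any value to this sysctl zeroes the
	 * counter on every ring.
	 */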
4260 stat = 0;
4261 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4262 rxr = &sc->hn_rx_ring[i];
4263 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4264 }
4265
4266 error = sysctl_handle_64(oidp, &stat, 0, req);
4267 if (error || req->newptr == NULL)
4268 return error;
4269
4270 /* Zero out this stat. */
4271 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4272 rxr = &sc->hn_rx_ring[i];
4273 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4274 }
4275 return 0;
4276 }
4277
4278 static int
4279 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4280 {
4281 struct hn_softc *sc = arg1;
4282 int ofs = arg2, i, error;
4283 struct hn_rx_ring *rxr;
4284 u_long stat;
4285
4286 stat = 0;
4287 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4288 rxr = &sc->hn_rx_ring[i];
4289 stat += *((u_long *)((uint8_t *)rxr + ofs));
4290 }
4291
4292 error = sysctl_handle_long(oidp, &stat, 0, req);
4293 if (error || req->newptr == NULL)
4294 return error;
4295
4296 /* Zero out this stat. */
4297 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4298 rxr = &sc->hn_rx_ring[i];
4299 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4300 }
4301 return 0;
4302 }
4303
4304 static int
4305 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4306 {
4307 struct hn_softc *sc = arg1;
4308 int ofs = arg2, i, error;
4309 struct hn_tx_ring *txr;
4310 u_long stat;
4311
4312 stat = 0;
4313 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4314 txr = &sc->hn_tx_ring[i];
4315 stat += *((u_long *)((uint8_t *)txr + ofs));
4316 }
4317
4318 error = sysctl_handle_long(oidp, &stat, 0, req);
4319 if (error || req->newptr == NULL)
4320 return error;
4321
4322 /* Zero out this stat. */
4323 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4324 txr = &sc->hn_tx_ring[i];
4325 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4326 }
4327 return 0;
4328 }
4329
4330 static int
4331 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4332 {
4333 struct hn_softc *sc = arg1;
4334 int ofs = arg2, i, error, conf;
4335 struct hn_tx_ring *txr;
4336
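	/*
	 * arg2 ('ofs') is the byte offset of an int setting inside
	 * struct hn_tx_ring; the value from ring 0 is reported, and a
	 * new value is propagated to every TX ring.
	 */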
4337 txr = &sc->hn_tx_ring[0];
4338 conf = *((int *)((uint8_t *)txr + ofs));
4339
4340 error = sysctl_handle_int(oidp, &conf, 0, req);
4341 if (error || req->newptr == NULL)
4342 return error;
4343
4344 HN_LOCK(sc);
4345 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4346 txr = &sc->hn_tx_ring[i];
4347 *((int *)((uint8_t *)txr + ofs)) = conf;
4348 }
4349 HN_UNLOCK(sc);
4350
4351 return 0;
4352 }
4353
4354 static int
4355 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4356 {
4357 struct hn_softc *sc = arg1;
4358 int error, size;
4359
4360 size = sc->hn_agg_size;
4361 error = sysctl_handle_int(oidp, &size, 0, req);
4362 if (error || req->newptr == NULL)
4363 return (error);
4364
4365 HN_LOCK(sc);
4366 sc->hn_agg_size = size;
4367 hn_set_txagg(sc);
4368 HN_UNLOCK(sc);
4369
4370 return (0);
4371 }
4372
4373 static int
4374 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4375 {
4376 struct hn_softc *sc = arg1;
4377 int error, pkts;
4378
4379 pkts = sc->hn_agg_pkts;
4380 error = sysctl_handle_int(oidp, &pkts, 0, req);
4381 if (error || req->newptr == NULL)
4382 return (error);
4383
4384 HN_LOCK(sc);
4385 sc->hn_agg_pkts = pkts;
4386 hn_set_txagg(sc);
4387 HN_UNLOCK(sc);
4388
4389 return (0);
4390 }
4391
4392 static int
4393 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4394 {
4395 struct hn_softc *sc = arg1;
4396 int pkts;
4397
4398 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4399 return (sysctl_handle_int(oidp, &pkts, 0, req));
4400 }
4401
4402 static int
4403 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4404 {
4405 struct hn_softc *sc = arg1;
4406 int align;
4407
4408 align = sc->hn_tx_ring[0].hn_agg_align;
4409 return (sysctl_handle_int(oidp, &align, 0, req));
4410 }
4411
4412 static void
4413 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4414 {
4415 if (pollhz == 0)
4416 vmbus_chan_poll_disable(chan);
4417 else
4418 vmbus_chan_poll_enable(chan, pollhz);
4419 }
4420
4421 static void
4422 hn_polling(struct hn_softc *sc, u_int pollhz)
4423 {
4424 int nsubch = sc->hn_rx_ring_inuse - 1;
4425
4426 HN_LOCK_ASSERT(sc);
4427
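	/*
	 * Apply the polling rate to all sub-channels first, then to
	 * the primary channel; pollhz == 0 disables channel polling.
	 */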
4428 if (nsubch > 0) {
4429 struct vmbus_channel **subch;
4430 int i;
4431
4432 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4433 for (i = 0; i < nsubch; ++i)
4434 hn_chan_polling(subch[i], pollhz);
4435 vmbus_subchan_rel(subch, nsubch);
4436 }
4437 hn_chan_polling(sc->hn_prichan, pollhz);
4438 }
4439
4440 static int
4441 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4442 {
4443 struct hn_softc *sc = arg1;
4444 int pollhz, error;
4445
4446 pollhz = sc->hn_pollhz;
4447 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4448 if (error || req->newptr == NULL)
4449 return (error);
4450
4451 if (pollhz != 0 &&
4452 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4453 return (EINVAL);
4454
4455 HN_LOCK(sc);
4456 if (sc->hn_pollhz != pollhz) {
4457 sc->hn_pollhz = pollhz;
4458 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4459 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4460 hn_polling(sc, sc->hn_pollhz);
4461 }
4462 HN_UNLOCK(sc);
4463
4464 return (0);
4465 }
4466
4467 static int
4468 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4469 {
4470 struct hn_softc *sc = arg1;
4471 char verstr[16];
4472
4473 snprintf(verstr, sizeof(verstr), "%u.%u",
4474 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4475 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4476 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4477 }
4478
4479 static int
4480 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4481 {
4482 struct hn_softc *sc = arg1;
4483 char caps_str[128];
4484 uint32_t caps;
4485
4486 HN_LOCK(sc);
4487 caps = sc->hn_caps;
4488 HN_UNLOCK(sc);
4489 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4490 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4491 }
4492
4493 static int
4494 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4495 {
4496 struct hn_softc *sc = arg1;
4497 char assist_str[128];
4498 uint32_t hwassist;
4499
4500 HN_LOCK(sc);
4501 hwassist = if_gethwassist(sc->hn_ifp);
4502 HN_UNLOCK(sc);
4503 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4504 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4505 }
4506
4507 static int
4508 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4509 {
4510 struct hn_softc *sc = arg1;
4511 char filter_str[128];
4512 uint32_t filter;
4513
4514 HN_LOCK(sc);
4515 filter = sc->hn_rx_filter;
4516 HN_UNLOCK(sc);
4517 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4518 NDIS_PACKET_TYPES);
4519 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4520 }
4521
4522 static int
4523 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4524 {
4525 struct hn_softc *sc = arg1;
4526 int rsc_ctrl, mtu;
4527 int error;
4528
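	/*
	 * Changing the RSC (receive segment coalescing) control value
	 * requires reprogramming the RNDIS offload parameters for the
	 * current MTU, hence the reconfiguration below.
	 */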
4529 rsc_ctrl = sc->hn_rsc_ctrl;
4530 error = sysctl_handle_int(oidp, &rsc_ctrl, 0, req);
4531 if (error || req->newptr == NULL)
4532 return (error);
4533
4534 if (sc->hn_rsc_ctrl != rsc_ctrl) {
4535 HN_LOCK(sc);
4536 sc->hn_rsc_ctrl = rsc_ctrl;
4537 mtu = if_getmtu(sc->hn_ifp);
4538 error = hn_rndis_reconf_offload(sc, mtu);
4539 HN_UNLOCK(sc);
4540 }
4541
4542 return (error);
4543 }
4544 #ifndef RSS
4545
4546 static int
4547 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4548 {
4549 struct hn_softc *sc = arg1;
4550 int error;
4551
4552 HN_LOCK(sc);
4553
4554 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4555 if (error || req->newptr == NULL)
4556 goto back;
4557
4558 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4559 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4560 /*
4561 		 * RSS key is synchronized w/ VF's; don't allow users
4562 * to change it.
4563 */
4564 error = EBUSY;
4565 goto back;
4566 }
4567
4568 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4569 if (error)
4570 goto back;
4571 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4572
4573 if (sc->hn_rx_ring_inuse > 1) {
4574 error = hn_rss_reconfig(sc);
4575 } else {
4576 /* Not RSS capable, at least for now; just save the RSS key. */
4577 error = 0;
4578 }
4579 back:
4580 HN_UNLOCK(sc);
4581 return (error);
4582 }
4583
4584 static int
4585 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4586 {
4587 struct hn_softc *sc = arg1;
4588 int error;
4589
4590 HN_LOCK(sc);
4591
4592 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4593 if (error || req->newptr == NULL)
4594 goto back;
4595
4596 /*
4597 	 * Don't allow RSS indirect table changes if this interface is
4598 	 * not currently RSS capable.
4599 */
4600 if (sc->hn_rx_ring_inuse == 1) {
4601 error = EOPNOTSUPP;
4602 goto back;
4603 }
4604
4605 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4606 if (error)
4607 goto back;
4608 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4609
4610 hn_rss_ind_fixup(sc);
4611 error = hn_rss_reconfig(sc);
4612 back:
4613 HN_UNLOCK(sc);
4614 return (error);
4615 }
4616
4617 #endif /* !RSS */
4618
4619 static int
4620 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4621 {
4622 struct hn_softc *sc = arg1;
4623 char hash_str[128];
4624 uint32_t hash;
4625
4626 HN_LOCK(sc);
4627 hash = sc->hn_rss_hash;
4628 HN_UNLOCK(sc);
4629 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4630 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4631 }
4632
4633 static int
4634 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4635 {
4636 struct hn_softc *sc = arg1;
4637 char hash_str[128];
4638 uint32_t hash;
4639
4640 HN_LOCK(sc);
4641 hash = sc->hn_rss_hcap;
4642 HN_UNLOCK(sc);
4643 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4644 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4645 }
4646
4647 static int
4648 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4649 {
4650 struct hn_softc *sc = arg1;
4651 char hash_str[128];
4652 uint32_t hash;
4653
4654 HN_LOCK(sc);
4655 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4656 HN_UNLOCK(sc);
4657 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4658 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4659 }
4660
4661 static int
4662 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4663 {
4664 struct hn_softc *sc = arg1;
4665 char vf_name[IFNAMSIZ + 1];
4666 if_t vf_ifp;
4667
4668 HN_LOCK(sc);
4669 vf_name[0] = '\0';
4670 vf_ifp = sc->hn_vf_ifp;
4671 if (vf_ifp != NULL)
4672 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4673 HN_UNLOCK(sc);
4674 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4675 }
4676
4677 static int
4678 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4679 {
4680 struct hn_softc *sc = arg1;
4681 char vf_name[IFNAMSIZ + 1];
4682 if_t vf_ifp;
4683
4684 HN_LOCK(sc);
4685 vf_name[0] = '\0';
4686 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4687 if (vf_ifp != NULL)
4688 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4689 HN_UNLOCK(sc);
4690 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4691 }
4692
4693 static int
4694 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4695 {
4696 struct rm_priotracker pt;
4697 struct sbuf *sb;
4698 int error, i;
4699 bool first;
4700
4701 error = sysctl_wire_old_buffer(req, 0);
4702 if (error != 0)
4703 return (error);
4704
4705 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4706 if (sb == NULL)
4707 return (ENOMEM);
4708
4709 rm_rlock(&hn_vfmap_lock, &pt);
4710
4711 first = true;
4712 for (i = 0; i < hn_vfmap_size; ++i) {
4713 struct epoch_tracker et;
4714 if_t ifp;
4715
4716 if (hn_vfmap[i] == NULL)
4717 continue;
4718
4719 NET_EPOCH_ENTER(et);
4720 ifp = ifnet_byindex(i);
4721 if (ifp != NULL) {
4722 if (first)
4723 sbuf_printf(sb, "%s", if_name(ifp));
4724 else
4725 sbuf_printf(sb, " %s", if_name(ifp));
4726 first = false;
4727 }
4728 NET_EPOCH_EXIT(et);
4729 }
4730
4731 rm_runlock(&hn_vfmap_lock, &pt);
4732
4733 error = sbuf_finish(sb);
4734 sbuf_delete(sb);
4735 return (error);
4736 }
4737
4738 static int
4739 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4740 {
4741 struct rm_priotracker pt;
4742 struct sbuf *sb;
4743 int error, i;
4744 bool first;
4745
4746 error = sysctl_wire_old_buffer(req, 0);
4747 if (error != 0)
4748 return (error);
4749
4750 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4751 if (sb == NULL)
4752 return (ENOMEM);
4753
4754 rm_rlock(&hn_vfmap_lock, &pt);
4755
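	/*
	 * Emit a space-separated list of "VF:hn" interface name pairs,
	 * one pair for each VF currently mapped to an hn(4) interface.
	 */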
4756 first = true;
4757 for (i = 0; i < hn_vfmap_size; ++i) {
4758 struct epoch_tracker et;
4759 if_t ifp, hn_ifp;
4760
4761 hn_ifp = hn_vfmap[i];
4762 if (hn_ifp == NULL)
4763 continue;
4764
4765 NET_EPOCH_ENTER(et);
4766 ifp = ifnet_byindex(i);
4767 if (ifp != NULL) {
4768 if (first) {
4769 sbuf_printf(sb, "%s:%s", if_name(ifp),
4770 if_name(hn_ifp));
4771 } else {
4772 sbuf_printf(sb, " %s:%s", if_name(ifp),
4773 if_name(hn_ifp));
4774 }
4775 first = false;
4776 }
4777 NET_EPOCH_EXIT(et);
4778 }
4779
4780 rm_runlock(&hn_vfmap_lock, &pt);
4781
4782 error = sbuf_finish(sb);
4783 sbuf_delete(sb);
4784 return (error);
4785 }
4786
4787 static int
4788 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4789 {
4790 struct hn_softc *sc = arg1;
4791 int error, onoff = 0;
4792
4793 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4794 onoff = 1;
4795 error = sysctl_handle_int(oidp, &onoff, 0, req);
4796 if (error || req->newptr == NULL)
4797 return (error);
4798
4799 HN_LOCK(sc);
4800 /* NOTE: hn_vf_lock for hn_transmit() */
4801 rm_wlock(&sc->hn_vf_lock);
4802 if (onoff)
4803 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4804 else
4805 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4806 rm_wunlock(&sc->hn_vf_lock);
4807 HN_UNLOCK(sc);
4808
4809 return (0);
4810 }
4811
4812 static int
4813 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4814 {
4815 struct hn_softc *sc = arg1;
4816 int enabled = 0;
4817
4818 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4819 enabled = 1;
4820 return (sysctl_handle_int(oidp, &enabled, 0, req));
4821 }
4822
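/*
 * Sanity check an IPv4 packet starting at byte offset 'hoff' of the mbuf.
 * Returns the IP protocol (e.g. IPPROTO_TCP or IPPROTO_UDP) if the headers
 * are complete and usable, or IPPROTO_DONE if the packet is too short,
 * fragmented, or otherwise unsuitable for further inspection.
 */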
4823 static int
4824 hn_check_iplen(const struct mbuf *m, int hoff)
4825 {
4826 const struct ip *ip;
4827 int len, iphlen, iplen;
4828 const struct tcphdr *th;
4829 int thoff; /* TCP data offset */
4830
4831 len = hoff + sizeof(struct ip);
4832
4833 /* The packet must be at least the size of an IP header. */
4834 if (m->m_pkthdr.len < len)
4835 return IPPROTO_DONE;
4836
4837 /* The fixed IP header must reside completely in the first mbuf. */
4838 if (m->m_len < len)
4839 return IPPROTO_DONE;
4840
4841 ip = mtodo(m, hoff);
4842
4843 /* Bound check the packet's stated IP header length. */
4844 iphlen = ip->ip_hl << 2;
4845 if (iphlen < sizeof(struct ip)) /* minimum header length */
4846 return IPPROTO_DONE;
4847
4848 /* The full IP header must reside completely in the one mbuf. */
4849 if (m->m_len < hoff + iphlen)
4850 return IPPROTO_DONE;
4851
4852 iplen = ntohs(ip->ip_len);
4853
4854 /*
4855 	 * Check that the amount of data in the buffers is
4856 	 * at least as much as the IP header would have us expect.
4857 */
4858 if (m->m_pkthdr.len < hoff + iplen)
4859 return IPPROTO_DONE;
4860
4861 /*
4862 * Ignore IP fragments.
4863 */
4864 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4865 return IPPROTO_DONE;
4866
4867 /*
4868 * The TCP/IP or UDP/IP header must be entirely contained within
4869 * the first fragment of a packet.
4870 */
4871 switch (ip->ip_p) {
4872 case IPPROTO_TCP:
4873 if (iplen < iphlen + sizeof(struct tcphdr))
4874 return IPPROTO_DONE;
4875 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4876 return IPPROTO_DONE;
4877 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4878 thoff = th->th_off << 2;
4879 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4880 return IPPROTO_DONE;
4881 if (m->m_len < hoff + iphlen + thoff)
4882 return IPPROTO_DONE;
4883 break;
4884 case IPPROTO_UDP:
4885 if (iplen < iphlen + sizeof(struct udphdr))
4886 return IPPROTO_DONE;
4887 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4888 return IPPROTO_DONE;
4889 break;
4890 default:
4891 if (iplen < iphlen)
4892 return IPPROTO_DONE;
4893 break;
4894 }
4895 return ip->ip_p;
4896 }
4897
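/*
 * Extract the L3 ethertype and L4 IP protocol of a received frame,
 * skipping an 802.1Q VLAN header if present.  *l4proto is set to
 * IPPROTO_DONE for non-IPv4 frames or frames rejected by
 * hn_check_iplen(); a truncated VLAN header leaves both outputs
 * untouched.
 */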
4898 static void
4899 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4900 {
4901 const struct ether_header *eh;
4902 uint16_t etype;
4903 int hoff;
4904
4905 hoff = sizeof(*eh);
4906 	/* Checked by the caller of this function. */
4907 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4908
4909 eh = mtod(m_new, const struct ether_header *);
4910 etype = ntohs(eh->ether_type);
4911 if (etype == ETHERTYPE_VLAN) {
4912 const struct ether_vlan_header *evl;
4913
4914 hoff = sizeof(*evl);
4915 if (m_new->m_len < hoff)
4916 return;
4917 evl = mtod(m_new, const struct ether_vlan_header *);
4918 etype = ntohs(evl->evl_proto);
4919 }
4920 *l3proto = etype;
4921
4922 if (etype == ETHERTYPE_IP)
4923 *l4proto = hn_check_iplen(m_new, hoff);
4924 else
4925 *l4proto = IPPROTO_DONE;
4926 }
4927
4928 static int
4929 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4930 {
4931 struct sysctl_oid_list *child;
4932 struct sysctl_ctx_list *ctx;
4933 device_t dev = sc->hn_dev;
4934 #if defined(INET) || defined(INET6)
4935 int lroent_cnt;
4936 #endif
4937 int i;
4938
4939 /*
4940 * Create RXBUF for reception.
4941 *
4942 * NOTE:
4943 * - It is shared by all channels.
4944 	 * - A large enough buffer is allocated; certain versions of NVS
4945 	 *   may further limit the usable space.
4946 */
4947 sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4948 0ul, ~0ul, PAGE_SIZE, 0);
4949 if (sc->hn_rxbuf == NULL) {
4950 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4951 return (ENOMEM);
4952 }
4953
4954 sc->hn_rx_ring_cnt = ring_cnt;
4955 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4956
4957 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4958 M_DEVBUF, M_WAITOK | M_ZERO);
4959
4960 #if defined(INET) || defined(INET6)
4961 lroent_cnt = hn_lro_entry_count;
4962 if (lroent_cnt < TCP_LRO_ENTRIES)
4963 lroent_cnt = TCP_LRO_ENTRIES;
4964 if (bootverbose)
4965 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4966 #endif /* INET || INET6 */
4967
4968 ctx = device_get_sysctl_ctx(dev);
4969 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4970
4971 /* Create dev.hn.UNIT.rx sysctl tree */
4972 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4973 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4974
4975 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4976 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4977
4978 rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
4979 M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
4980 if (rxr->hn_br == NULL) {
4981 device_printf(dev, "allocate bufring failed\n");
4982 return (ENOMEM);
4983 }
4984
4985 if (hn_trust_hosttcp)
4986 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4987 if (hn_trust_hostudp)
4988 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4989 if (hn_trust_hostip)
4990 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4991 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4992 rxr->hn_ifp = sc->hn_ifp;
4993 if (i < sc->hn_tx_ring_cnt)
4994 rxr->hn_txr = &sc->hn_tx_ring[i];
4995 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4996 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4997 rxr->hn_rx_idx = i;
4998 rxr->hn_rxbuf = sc->hn_rxbuf;
4999
5000 /*
5001 * Initialize LRO.
5002 */
5003 #if defined(INET) || defined(INET6)
5004 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5005 hn_lro_mbufq_depth);
5006 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5007 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5008 #endif /* INET || INET6 */
5009
5010 if (sc->hn_rx_sysctl_tree != NULL) {
5011 char name[16];
5012
5013 /*
5014 * Create per RX ring sysctl tree:
5015 * dev.hn.UNIT.rx.RINGID
5016 */
5017 snprintf(name, sizeof(name), "%d", i);
5018 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5019 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5020 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5021
5022 if (rxr->hn_rx_sysctl_tree != NULL) {
5023 SYSCTL_ADD_ULONG(ctx,
5024 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5025 OID_AUTO, "packets",
5026 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5027 "# of packets received");
5028 SYSCTL_ADD_ULONG(ctx,
5029 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5030 OID_AUTO, "rss_pkts",
5031 CTLFLAG_RW | CTLFLAG_STATS,
5032 &rxr->hn_rss_pkts,
5033 "# of packets w/ RSS info received");
5034 SYSCTL_ADD_ULONG(ctx,
5035 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5036 OID_AUTO, "rsc_pkts",
5037 CTLFLAG_RW | CTLFLAG_STATS,
5038 &rxr->hn_rsc_pkts,
5039 "# of RSC packets received");
5040 SYSCTL_ADD_ULONG(ctx,
5041 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5042 OID_AUTO, "rsc_drop",
5043 CTLFLAG_RW | CTLFLAG_STATS,
5044 &rxr->hn_rsc_drop,
5045 "# of RSC fragments dropped");
5046 SYSCTL_ADD_INT(ctx,
5047 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5048 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5049 &rxr->hn_pktbuf_len, 0,
5050 "Temporary channel packet buffer length");
5051 }
5052 }
5053 }
5054
5055 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5056 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5057 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5058 hn_rx_stat_u64_sysctl,
5059 "LU", "LRO queued");
5060 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5061 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5062 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5063 hn_rx_stat_u64_sysctl,
5064 "LU", "LRO flushed");
5065 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5066 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5067 __offsetof(struct hn_rx_ring, hn_lro_tried),
5068 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5069 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5070 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5071 hn_lro_lenlim_sysctl, "IU",
5072 "Max # of data bytes to be aggregated by LRO");
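	/*
	 * Example usage (unit number hypothetical):
	 *     sysctl dev.hn.0.lro_length_lim=65535
	 */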
5073 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5074 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5075 hn_lro_ackcnt_sysctl, "I",
5076 "Max # of ACKs to be aggregated by LRO");
5077 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5078 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5079 hn_trust_hcsum_sysctl, "I",
5080 "Trust tcp segment verification on host side, "
5081 "when csum info is missing");
5082 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5083 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5084 hn_trust_hcsum_sysctl, "I",
5085 "Trust udp datagram verification on host side, "
5086 "when csum info is missing");
5087 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5088 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5089 hn_trust_hcsum_sysctl, "I",
5090 "Trust ip packet verification on host side, "
5091 "when csum info is missing");
5092 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5093 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5094 __offsetof(struct hn_rx_ring, hn_csum_ip),
5095 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5096 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5097 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5098 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5099 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5100 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5101 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5102 __offsetof(struct hn_rx_ring, hn_csum_udp),
5103 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5104 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5105 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5106 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5107 hn_rx_stat_ulong_sysctl, "LU",
5108 "# of packets that we trust host's csum verification");
5109 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5110 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5111 __offsetof(struct hn_rx_ring, hn_small_pkts),
5112 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5113 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5114 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5115 __offsetof(struct hn_rx_ring, hn_ack_failed),
5116 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5117 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5118 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5119 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5120 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5121
5122 return (0);
5123 }
5124
5125 static void
5126 hn_destroy_rx_data(struct hn_softc *sc)
5127 {
5128 int i;
5129
5130 if (sc->hn_rxbuf != NULL) {
5131 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5132 free(sc->hn_rxbuf, M_DEVBUF);
5133 else
5134 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5135 sc->hn_rxbuf = NULL;
5136 }
5137
5138 if (sc->hn_rx_ring_cnt == 0)
5139 return;
5140
5141 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5142 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5143
5144 if (rxr->hn_br == NULL)
5145 continue;
5146 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5147 free(rxr->hn_br, M_DEVBUF);
5148 } else {
5149 device_printf(sc->hn_dev,
5150 			    "%dth channel bufring is referenced\n", i);
5151 }
5152 rxr->hn_br = NULL;
5153
5154 #if defined(INET) || defined(INET6)
5155 tcp_lro_free(&rxr->hn_lro);
5156 #endif
5157 free(rxr->hn_pktbuf, M_DEVBUF);
5158 }
5159 free(sc->hn_rx_ring, M_DEVBUF);
5160 sc->hn_rx_ring = NULL;
5161
5162 sc->hn_rx_ring_cnt = 0;
5163 sc->hn_rx_ring_inuse = 0;
5164 }
5165
5166 static int
5167 hn_tx_ring_create(struct hn_softc *sc, int id)
5168 {
5169 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5170 device_t dev = sc->hn_dev;
5171 bus_dma_tag_t parent_dtag;
5172 int error, i;
5173
5174 txr->hn_sc = sc;
5175 txr->hn_tx_idx = id;
5176
5177 #ifndef HN_USE_TXDESC_BUFRING
5178 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5179 #endif
5180 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5181
5182 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5183 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5184 M_DEVBUF, M_WAITOK | M_ZERO);
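	/*
	 * Free TX descriptors live either on an SLIST guarded by a
	 * spin mutex or on a buf_ring, depending on
	 * HN_USE_TXDESC_BUFRING.
	 */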
5185 #ifndef HN_USE_TXDESC_BUFRING
5186 SLIST_INIT(&txr->hn_txlist);
5187 #else
5188 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5189 M_WAITOK, &txr->hn_tx_lock);
5190 #endif
5191
5192 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5193 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5194 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5195 } else {
5196 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5197 }
5198
5199 #ifdef HN_IFSTART_SUPPORT
5200 if (hn_use_if_start) {
5201 txr->hn_txeof = hn_start_txeof;
5202 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5203 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5204 } else
5205 #endif
5206 {
5207 int br_depth;
5208
5209 txr->hn_txeof = hn_xmit_txeof;
5210 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5211 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5212
5213 br_depth = hn_get_txswq_depth(txr);
5214 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5215 M_WAITOK, &txr->hn_tx_lock);
5216 }
5217
5218 txr->hn_direct_tx_size = hn_direct_tx_size;
5219
5220 /*
5221 * Always schedule transmission instead of trying to do direct
5222 * transmission. This one gives the best performance so far.
5223 */
5224 txr->hn_sched_tx = 1;
5225
5226 parent_dtag = bus_get_dma_tag(dev);
5227
5228 /* DMA tag for RNDIS packet messages. */
5229 error = bus_dma_tag_create(parent_dtag, /* parent */
5230 HN_RNDIS_PKT_ALIGN, /* alignment */
5231 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5232 BUS_SPACE_MAXADDR, /* lowaddr */
5233 BUS_SPACE_MAXADDR, /* highaddr */
5234 NULL, NULL, /* filter, filterarg */
5235 HN_RNDIS_PKT_LEN, /* maxsize */
5236 1, /* nsegments */
5237 HN_RNDIS_PKT_LEN, /* maxsegsize */
5238 0, /* flags */
5239 NULL, /* lockfunc */
5240 NULL, /* lockfuncarg */
5241 &txr->hn_tx_rndis_dtag);
5242 if (error) {
5243 device_printf(dev, "failed to create rndis dmatag\n");
5244 return error;
5245 }
5246
5247 /* DMA tag for data. */
5248 error = bus_dma_tag_create(parent_dtag, /* parent */
5249 1, /* alignment */
5250 HN_TX_DATA_BOUNDARY, /* boundary */
5251 BUS_SPACE_MAXADDR, /* lowaddr */
5252 BUS_SPACE_MAXADDR, /* highaddr */
5253 NULL, NULL, /* filter, filterarg */
5254 HN_TX_DATA_MAXSIZE, /* maxsize */
5255 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5256 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5257 0, /* flags */
5258 NULL, /* lockfunc */
5259 NULL, /* lockfuncarg */
5260 &txr->hn_tx_data_dtag);
5261 if (error) {
5262 device_printf(dev, "failed to create data dmatag\n");
5263 return error;
5264 }
5265
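	/*
	 * Pre-allocate the RNDIS packet message and DMA maps for every
	 * TX descriptor, then park the descriptors on the free list
	 * (or buf_ring), so the transmit path only needs to dequeue a
	 * ready descriptor.
	 */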
5266 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5267 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5268
5269 txd->txr = txr;
5270 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5271 STAILQ_INIT(&txd->agg_list);
5272
5273 /*
5274 * Allocate and load RNDIS packet message.
5275 */
5276 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5277 (void **)&txd->rndis_pkt,
5278 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5279 &txd->rndis_pkt_dmap);
5280 if (error) {
5281 device_printf(dev,
5282 "failed to allocate rndis_packet_msg, %d\n", i);
5283 return error;
5284 }
5285
5286 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5287 txd->rndis_pkt_dmap,
5288 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5289 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5290 BUS_DMA_NOWAIT);
5291 if (error) {
5292 device_printf(dev,
5293 "failed to load rndis_packet_msg, %d\n", i);
5294 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5295 txd->rndis_pkt, txd->rndis_pkt_dmap);
5296 return error;
5297 }
5298
5299 /* DMA map for TX data. */
5300 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5301 &txd->data_dmap);
5302 if (error) {
5303 device_printf(dev,
5304 "failed to allocate tx data dmamap\n");
5305 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5306 txd->rndis_pkt_dmap);
5307 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5308 txd->rndis_pkt, txd->rndis_pkt_dmap);
5309 return error;
5310 }
5311
5312 /* All set, put it to list */
5313 txd->flags |= HN_TXD_FLAG_ONLIST;
5314 #ifndef HN_USE_TXDESC_BUFRING
5315 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5316 #else
5317 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5318 #endif
5319 }
5320 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5321
5322 if (sc->hn_tx_sysctl_tree != NULL) {
5323 struct sysctl_oid_list *child;
5324 struct sysctl_ctx_list *ctx;
5325 char name[16];
5326
5327 /*
5328 * Create per TX ring sysctl tree:
5329 * dev.hn.UNIT.tx.RINGID
5330 */
5331 ctx = device_get_sysctl_ctx(dev);
5332 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5333
5334 snprintf(name, sizeof(name), "%d", id);
5335 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5336 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5337
5338 if (txr->hn_tx_sysctl_tree != NULL) {
5339 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5340
5341 #ifdef HN_DEBUG
5342 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5343 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5344 "# of available TX descs");
5345 #endif
5346 #ifdef HN_IFSTART_SUPPORT
5347 if (!hn_use_if_start)
5348 #endif
5349 {
5350 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5351 CTLFLAG_RD, &txr->hn_oactive, 0,
5352 "over active");
5353 }
5354 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5355 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5356 "# of packets transmitted");
5357 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5358 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5359 "# of sends");
5360 }
5361 }
5362
5363 return 0;
5364 }
5365
5366 static void
5367 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5368 {
5369 struct hn_tx_ring *txr = txd->txr;
5370
5371 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5372 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5373
5374 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5375 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5376 txd->rndis_pkt_dmap);
5377 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5378 }
5379
5380 static void
5381 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5382 {
5383
5384 KASSERT(txd->refs == 0 || txd->refs == 1,
5385 ("invalid txd refs %d", txd->refs));
5386
5387 /* Aggregated txds will be freed by their aggregating txd. */
5388 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5389 int freed __diagused;
5390
5391 freed = hn_txdesc_put(txr, txd);
5392 KASSERT(freed, ("can't free txdesc"));
5393 }
5394 }
5395
5396 static void
5397 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5398 {
5399 int i;
5400
5401 if (txr->hn_txdesc == NULL)
5402 return;
5403
5404 /*
5405 * NOTE:
5406 * Because the freeing of aggregated txds will be deferred
5407 * to the aggregating txd, two passes are used here:
5408 * - The first pass GCes any pending txds. This GC is necessary,
5409 * since if the channels are revoked, hypervisor will not
5410 * deliver send-done for all pending txds.
5411 	 * - The second pass frees the busdma resources, i.e. after all
5412 	 *   txds have been freed.
5413 */
5414 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5415 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5416 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5417 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5418
5419 if (txr->hn_tx_data_dtag != NULL)
5420 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5421 if (txr->hn_tx_rndis_dtag != NULL)
5422 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5423
5424 #ifdef HN_USE_TXDESC_BUFRING
5425 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5426 #endif
5427
5428 free(txr->hn_txdesc, M_DEVBUF);
5429 txr->hn_txdesc = NULL;
5430
5431 if (txr->hn_mbuf_br != NULL)
5432 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5433
5434 #ifndef HN_USE_TXDESC_BUFRING
5435 mtx_destroy(&txr->hn_txlist_spin);
5436 #endif
5437 mtx_destroy(&txr->hn_tx_lock);
5438 }
5439
5440 static int
5441 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5442 {
5443 struct sysctl_oid_list *child;
5444 struct sysctl_ctx_list *ctx;
5445 int i;
5446
5447 /*
5448 * Create TXBUF for chimney sending.
5449 *
5450 * NOTE: It is shared by all channels.
5451 */
5452 sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5453 0ul, ~0ul, PAGE_SIZE, 0);
5454 if (sc->hn_chim == NULL) {
5455 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5456 return (ENOMEM);
5457 }
5458
5459 sc->hn_tx_ring_cnt = ring_cnt;
5460 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5461
5462 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5463 M_DEVBUF, M_WAITOK | M_ZERO);
5464
5465 ctx = device_get_sysctl_ctx(sc->hn_dev);
5466 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5467
5468 /* Create dev.hn.UNIT.tx sysctl tree */
5469 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5470 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5471
5472 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5473 int error;
5474
5475 error = hn_tx_ring_create(sc, i);
5476 if (error)
5477 return error;
5478 }
5479
5480 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5481 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5482 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5483 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5484 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5485 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5486 __offsetof(struct hn_tx_ring, hn_send_failed),
5487 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5488 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5489 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5490 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5491 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5492 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5493 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5494 __offsetof(struct hn_tx_ring, hn_flush_failed),
5495 hn_tx_stat_ulong_sysctl, "LU",
5496 "# of packet transmission aggregation flush failure");
5497 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5498 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5499 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5500 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5501 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5502 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5503 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5504 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5505 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5506 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5507 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5508 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5509 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5510 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5511 "# of total TX descs");
5512 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5513 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5514 "Chimney send packet size upper boundary");
5515 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5516 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5517 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5518 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5519 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5520 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5521 hn_tx_conf_int_sysctl, "I",
5522 "Size of the packet for direct transmission");
5523 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5524 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5525 __offsetof(struct hn_tx_ring, hn_sched_tx),
5526 hn_tx_conf_int_sysctl, "I",
5527 "Always schedule transmission "
5528 "instead of doing direct transmission");
5529 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5530 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5531 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5532 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5533 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5534 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5535 "Applied packet transmission aggregation size");
5536 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5537 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5538 hn_txagg_pktmax_sysctl, "I",
5539 "Applied packet transmission aggregation packets");
5540 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5541 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5542 hn_txagg_align_sysctl, "I",
5543 "Applied packet transmission aggregation alignment");
5544
5545 return 0;
5546 }
5547
5548 static void
5549 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5550 {
5551 int i;
5552
5553 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5554 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5555 }
5556
5557 static void
5558 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5559 {
5560 if_t ifp = sc->hn_ifp;
5561 u_int hw_tsomax;
5562 int tso_minlen;
5563
5564 HN_LOCK_ASSERT(sc);
5565
5566 if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5567 return;
5568
5569 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5570 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5571 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5572
5573 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5574 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5575 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5576
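	/*
	 * Clamp the requested TSO length into [tso_minlen, IP_MAXPACKET],
	 * further bounded by the NDIS advertised maximum, then subtract
	 * the Ethernet/VLAN header size to get the usable if_hw_tsomax.
	 */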
5577 if (tso_maxlen < tso_minlen)
5578 tso_maxlen = tso_minlen;
5579 else if (tso_maxlen > IP_MAXPACKET)
5580 tso_maxlen = IP_MAXPACKET;
5581 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5582 tso_maxlen = sc->hn_ndis_tso_szmax;
5583 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5584
5585 if (hn_xpnt_vf_isready(sc)) {
5586 if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5587 hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5588 }
5589 if_sethwtsomax(ifp, hw_tsomax);
5590 if (bootverbose)
5591 if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5592 }
5593
5594 static void
5595 hn_fixup_tx_data(struct hn_softc *sc)
5596 {
5597 uint64_t csum_assist;
5598 int i;
5599
5600 hn_set_chim_size(sc, sc->hn_chim_szmax);
5601 if (hn_tx_chimney_size > 0 &&
5602 hn_tx_chimney_size < sc->hn_chim_szmax)
5603 hn_set_chim_size(sc, hn_tx_chimney_size);
5604
5605 csum_assist = 0;
5606 if (sc->hn_caps & HN_CAP_IPCS)
5607 csum_assist |= CSUM_IP;
5608 if (sc->hn_caps & HN_CAP_TCP4CS)
5609 csum_assist |= CSUM_IP_TCP;
5610 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5611 csum_assist |= CSUM_IP_UDP;
5612 if (sc->hn_caps & HN_CAP_TCP6CS)
5613 csum_assist |= CSUM_IP6_TCP;
5614 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5615 csum_assist |= CSUM_IP6_UDP;
5616 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5617 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5618
5619 if (sc->hn_caps & HN_CAP_HASHVAL) {
5620 /*
5621 * Support HASHVAL pktinfo on TX path.
5622 */
5623 if (bootverbose)
5624 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5625 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5626 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5627 }
5628 }
5629
5630 static void
5631 hn_fixup_rx_data(struct hn_softc *sc)
5632 {
5633
5634 if (sc->hn_caps & HN_CAP_UDPHASH) {
5635 int i;
5636
5637 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5638 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5639 }
5640 }
5641
5642 static void
5643 hn_destroy_tx_data(struct hn_softc *sc)
5644 {
5645 int i;
5646
5647 if (sc->hn_chim != NULL) {
5648 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5649 free(sc->hn_chim, M_DEVBUF);
5650 } else {
5651 device_printf(sc->hn_dev,
5652 "chimney sending buffer is referenced");
5653 }
5654 sc->hn_chim = NULL;
5655 }
5656
5657 if (sc->hn_tx_ring_cnt == 0)
5658 return;
5659
5660 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5661 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5662
5663 free(sc->hn_tx_ring, M_DEVBUF);
5664 sc->hn_tx_ring = NULL;
5665
5666 sc->hn_tx_ring_cnt = 0;
5667 sc->hn_tx_ring_inuse = 0;
5668 }
5669
5670 #ifdef HN_IFSTART_SUPPORT
5671
5672 static void
5673 hn_start_taskfunc(void *xtxr, int pending __unused)
5674 {
5675 struct hn_tx_ring *txr = xtxr;
5676
5677 mtx_lock(&txr->hn_tx_lock);
5678 hn_start_locked(txr, 0);
5679 mtx_unlock(&txr->hn_tx_lock);
5680 }
5681
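/*
 * Dequeue and transmit packets from the if_start send queue of the first
 * TX ring.  Returns non-zero when a packet larger than 'len' is left on
 * the queue, i.e. the caller should defer the remaining work to the TX
 * taskqueue (see hn_start()).
 */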
5682 static int
5683 hn_start_locked(struct hn_tx_ring *txr, int len)
5684 {
5685 struct hn_softc *sc = txr->hn_sc;
5686 if_t ifp = sc->hn_ifp;
5687 int sched = 0;
5688
5689 KASSERT(hn_use_if_start,
5690 ("hn_start_locked is called, when if_start is disabled"));
5691 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5692 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5693 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5694
5695 if (__predict_false(txr->hn_suspended))
5696 return (0);
5697
5698 if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5699 IFF_DRV_RUNNING)
5700 return (0);
5701
5702 while (!if_sendq_empty(ifp)) {
5703 struct hn_txdesc *txd;
5704 struct mbuf *m_head;
5705 int error;
5706
5707 m_head = if_dequeue(ifp);
5708 if (m_head == NULL)
5709 break;
5710
5711 if (len > 0 && m_head->m_pkthdr.len > len) {
5712 /*
5713 * This send could be time consuming; let the caller
5714 * dispatch this packet (and any follow-up packets)
5715 * to the TX taskqueue.
5716 */
5717 if_sendq_prepend(ifp, m_head);
5718 sched = 1;
5719 break;
5720 }
5721
5722 #if defined(INET6) || defined(INET)
5723 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5724 m_head = hn_tso_fixup(m_head);
5725 if (__predict_false(m_head == NULL)) {
5726 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5727 continue;
5728 }
5729 } else if (m_head->m_pkthdr.csum_flags &
5730 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5731 m_head = hn_set_hlen(m_head);
5732 if (__predict_false(m_head == NULL)) {
5733 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5734 continue;
5735 }
5736 }
5737 #endif
5738
5739 txd = hn_txdesc_get(txr);
5740 if (txd == NULL) {
5741 txr->hn_no_txdescs++;
5742 if_sendq_prepend(ifp, m_head);
5743 if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5744 break;
5745 }
5746
5747 error = hn_encap(ifp, txr, txd, &m_head);
5748 if (error) {
5749 /* Both txd and m_head are freed */
5750 KASSERT(txr->hn_agg_txd == NULL,
5751 ("encap failed w/ pending aggregating txdesc"));
5752 continue;
5753 }
5754
5755 if (txr->hn_agg_pktleft == 0) {
5756 if (txr->hn_agg_txd != NULL) {
5757 KASSERT(m_head == NULL,
5758 ("pending mbuf for aggregating txdesc"));
5759 error = hn_flush_txagg(ifp, txr);
5760 if (__predict_false(error)) {
5761 if_setdrvflagbits(ifp,
5762 IFF_DRV_OACTIVE, 0);
5763 break;
5764 }
5765 } else {
5766 KASSERT(m_head != NULL, ("mbuf was freed"));
5767 error = hn_txpkt(ifp, txr, txd);
5768 if (__predict_false(error)) {
5769 /* txd is freed, but m_head is not */
5770 if_sendq_prepend(ifp, m_head);
5771 if_setdrvflagbits(ifp,
5772 IFF_DRV_OACTIVE, 0);
5773 break;
5774 }
5775 }
5776 }
5777 #ifdef INVARIANTS
5778 else {
5779 KASSERT(txr->hn_agg_txd != NULL,
5780 ("no aggregating txdesc"));
5781 KASSERT(m_head == NULL,
5782 ("pending mbuf for aggregating txdesc"));
5783 }
5784 #endif
5785 }
5786
5787 /* Flush pending aggregated transmission. */
5788 if (txr->hn_agg_txd != NULL)
5789 hn_flush_txagg(ifp, txr);
5790 return (sched);
5791 }
5792
5793 static void
5794 hn_start(if_t ifp)
5795 {
5796 struct hn_softc *sc = if_getsoftc(ifp);
5797 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5798
5799 if (txr->hn_sched_tx)
5800 goto do_sched;
5801
5802 if (mtx_trylock(&txr->hn_tx_lock)) {
5803 int sched;
5804
5805 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5806 mtx_unlock(&txr->hn_tx_lock);
5807 if (!sched)
5808 return;
5809 }
5810 do_sched:
5811 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5812 }
5813
5814 static void
5815 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5816 {
5817 struct hn_tx_ring *txr = xtxr;
5818
5819 mtx_lock(&txr->hn_tx_lock);
5820 if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5821 hn_start_locked(txr, 0);
5822 mtx_unlock(&txr->hn_tx_lock);
5823 }
5824
5825 static void
5826 hn_start_txeof(struct hn_tx_ring *txr)
5827 {
5828 struct hn_softc *sc = txr->hn_sc;
5829 if_t ifp = sc->hn_ifp;
5830
5831 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5832
5833 if (txr->hn_sched_tx)
5834 goto do_sched;
5835
5836 if (mtx_trylock(&txr->hn_tx_lock)) {
5837 int sched;
5838
5839 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5840 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5841 mtx_unlock(&txr->hn_tx_lock);
5842 if (sched) {
5843 taskqueue_enqueue(txr->hn_tx_taskq,
5844 &txr->hn_tx_task);
5845 }
5846 } else {
5847 do_sched:
5848 /*
5849 * Release OACTIVE early, in the hope that others
5850 * could catch up. The task will clear the flag
5851 * again under hn_tx_lock to avoid possible
5852 * races.
5853 */
5854 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5855 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5856 }
5857 }
5858
5859 #endif /* HN_IFSTART_SUPPORT */
5860
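/*
 * if_transmit path: drain the per-ring buf_ring (drbr) under hn_tx_lock.
 * Mbufs are only drbr_advance()d after a successful encapsulation, so a
 * failed hn_txpkt() can put the mbuf back with drbr_putback().  Like
 * hn_start_locked(), a non-zero return asks the caller to reschedule the
 * remaining work onto the TX taskqueue.
 */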
5861 static int
5862 hn_xmit(struct hn_tx_ring *txr, int len)
5863 {
5864 struct hn_softc *sc = txr->hn_sc;
5865 if_t ifp = sc->hn_ifp;
5866 struct mbuf *m_head;
5867 int sched = 0;
5868
5869 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5870 #ifdef HN_IFSTART_SUPPORT
5871 KASSERT(hn_use_if_start == 0,
5872 ("hn_xmit is called, when if_start is enabled"));
5873 #endif
5874 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5875
5876 if (__predict_false(txr->hn_suspended))
5877 return (0);
5878
5879 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5880 return (0);
5881
5882 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5883 struct hn_txdesc *txd;
5884 int error;
5885
5886 if (len > 0 && m_head->m_pkthdr.len > len) {
5887 /*
5888 * This send could be time consuming; let the caller
5889 * dispatch this packet (and any follow-up packets)
5890 * to the TX taskqueue.
5891 */
5892 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5893 sched = 1;
5894 break;
5895 }
5896
5897 txd = hn_txdesc_get(txr);
5898 if (txd == NULL) {
5899 txr->hn_no_txdescs++;
5900 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5901 txr->hn_oactive = 1;
5902 break;
5903 }
5904
5905 error = hn_encap(ifp, txr, txd, &m_head);
5906 if (error) {
5907 /* Both txd and m_head are freed; discard */
5908 KASSERT(txr->hn_agg_txd == NULL,
5909 ("encap failed w/ pending aggregating txdesc"));
5910 drbr_advance(ifp, txr->hn_mbuf_br);
5911 continue;
5912 }
5913
5914 if (txr->hn_agg_pktleft == 0) {
5915 if (txr->hn_agg_txd != NULL) {
5916 KASSERT(m_head == NULL,
5917 ("pending mbuf for aggregating txdesc"));
5918 error = hn_flush_txagg(ifp, txr);
5919 if (__predict_false(error)) {
5920 txr->hn_oactive = 1;
5921 break;
5922 }
5923 } else {
5924 KASSERT(m_head != NULL, ("mbuf was freed"));
5925 error = hn_txpkt(ifp, txr, txd);
5926 if (__predict_false(error)) {
5927 /* txd is freed, but m_head is not */
5928 drbr_putback(ifp, txr->hn_mbuf_br,
5929 m_head);
5930 txr->hn_oactive = 1;
5931 break;
5932 }
5933 }
5934 }
5935 #ifdef INVARIANTS
5936 else {
5937 KASSERT(txr->hn_agg_txd != NULL,
5938 ("no aggregating txdesc"));
5939 KASSERT(m_head == NULL,
5940 ("pending mbuf for aggregating txdesc"));
5941 }
5942 #endif
5943
5944 /* Sent */
5945 drbr_advance(ifp, txr->hn_mbuf_br);
5946 }
5947
5948 /* Flush pending aggregated transmission. */
5949 if (txr->hn_agg_txd != NULL)
5950 hn_flush_txagg(ifp, txr);
5951 return (sched);
5952 }
5953
5954 static int
5955 hn_transmit(if_t ifp, struct mbuf *m)
5956 {
5957 struct hn_softc *sc = if_getsoftc(ifp);
5958 struct hn_tx_ring *txr;
5959 int error, idx = 0;
5960
5961 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5962 struct rm_priotracker pt;
5963
5964 rm_rlock(&sc->hn_vf_lock, &pt);
5965 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5966 struct mbuf *m_bpf = NULL;
5967 int obytes, omcast;
5968
5969 obytes = m->m_pkthdr.len;
5970 omcast = (m->m_flags & M_MCAST) != 0;
5971
5972 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5973 if (bpf_peers_present_if(ifp)) {
5974 m_bpf = m_copypacket(m, M_NOWAIT);
5975 if (m_bpf == NULL) {
5976 /*
5977 * Failed to grab a shallow
5978 * copy; tap now.
5979 */
5980 ETHER_BPF_MTAP(ifp, m);
5981 }
5982 }
5983 } else {
5984 ETHER_BPF_MTAP(ifp, m);
5985 }
5986
5987 error = if_transmit(sc->hn_vf_ifp, m);
5988 rm_runlock(&sc->hn_vf_lock, &pt);
5989
5990 if (m_bpf != NULL) {
5991 if (!error)
5992 ETHER_BPF_MTAP(ifp, m_bpf);
5993 m_freem(m_bpf);
5994 }
5995
5996 if (error == ENOBUFS) {
5997 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5998 } else if (error) {
5999 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6000 } else {
6001 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6002 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6003 if (omcast) {
6004 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6005 omcast);
6006 }
6007 }
6008 return (error);
6009 }
6010 rm_runlock(&sc->hn_vf_lock, &pt);
6011 }
6012
6013 #if defined(INET6) || defined(INET)
6014 /*
6015 * Perform TSO packet header fixup or get l2/l3 header length now,
6016 * since packet headers should be cache-hot.
6017 */
6018 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6019 m = hn_tso_fixup(m);
6020 if (__predict_false(m == NULL)) {
6021 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6022 return EIO;
6023 }
6024 } else if (m->m_pkthdr.csum_flags &
6025 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6026 m = hn_set_hlen(m);
6027 if (__predict_false(m == NULL)) {
6028 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6029 return EIO;
6030 }
6031 }
6032 #endif
6033
6034 /*
6035 * Select the TX ring based on flowid
6036 */
6037 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6038 #ifdef RSS
6039 uint32_t bid;
6040
6041 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6042 &bid) == 0)
6043 idx = bid % sc->hn_tx_ring_inuse;
6044 else
6045 #endif
6046 {
6047 #if defined(INET6) || defined(INET)
6048 int tcpsyn = 0;
6049
6050 if (m->m_pkthdr.len < 128 &&
6051 (m->m_pkthdr.csum_flags &
6052 (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6053 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6054 m = hn_check_tcpsyn(m, &tcpsyn);
6055 if (__predict_false(m == NULL)) {
6056 if_inc_counter(ifp,
6057 IFCOUNTER_OERRORS, 1);
6058 return (EIO);
6059 }
6060 }
6061 #else
6062 const int tcpsyn = 0;
6063 #endif
6064 if (tcpsyn)
6065 idx = 0;
6066 else
6067 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6068 }
6069 }
6070 txr = &sc->hn_tx_ring[idx];
6071
6072 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6073 if (error) {
6074 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6075 return error;
6076 }
6077
6078 if (txr->hn_oactive)
6079 return 0;
6080
6081 if (txr->hn_sched_tx)
6082 goto do_sched;
6083
6084 if (mtx_trylock(&txr->hn_tx_lock)) {
6085 int sched;
6086
6087 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6088 mtx_unlock(&txr->hn_tx_lock);
6089 if (!sched)
6090 return 0;
6091 }
6092 do_sched:
6093 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6094 return 0;
6095 }
6096
6097 static void
6098 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6099 {
6100 struct mbuf *m;
6101
6102 mtx_lock(&txr->hn_tx_lock);
6103 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6104 m_freem(m);
6105 mtx_unlock(&txr->hn_tx_lock);
6106 }
6107
6108 static void
6109 hn_xmit_qflush(if_t ifp)
6110 {
6111 struct hn_softc *sc = if_getsoftc(ifp);
6112 struct rm_priotracker pt;
6113 int i;
6114
6115 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6116 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6117 if_qflush(ifp);
6118
6119 rm_rlock(&sc->hn_vf_lock, &pt);
6120 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6121 if_qflush(sc->hn_vf_ifp);
6122 rm_runlock(&sc->hn_vf_lock, &pt);
6123 }
6124
6125 static void
6126 hn_xmit_txeof(struct hn_tx_ring *txr)
6127 {
6128
6129 if (txr->hn_sched_tx)
6130 goto do_sched;
6131
6132 if (mtx_trylock(&txr->hn_tx_lock)) {
6133 int sched;
6134
6135 txr->hn_oactive = 0;
6136 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6137 mtx_unlock(&txr->hn_tx_lock);
6138 if (sched) {
6139 taskqueue_enqueue(txr->hn_tx_taskq,
6140 &txr->hn_tx_task);
6141 }
6142 } else {
6143 do_sched:
6144 /*
6145 * Release oactive early, in the hope that others
6146 * could catch up. The task will clear oactive
6147 * again under hn_tx_lock to avoid possible
6148 * races.
6149 */
6150 txr->hn_oactive = 0;
6151 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6152 }
6153 }
6154
6155 static void
6156 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6157 {
6158 struct hn_tx_ring *txr = xtxr;
6159
6160 mtx_lock(&txr->hn_tx_lock);
6161 hn_xmit(txr, 0);
6162 mtx_unlock(&txr->hn_tx_lock);
6163 }
6164
6165 static void
6166 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6167 {
6168 struct hn_tx_ring *txr = xtxr;
6169
6170 mtx_lock(&txr->hn_tx_lock);
6171 txr->hn_oactive = 0;
6172 hn_xmit(txr, 0);
6173 mtx_unlock(&txr->hn_tx_lock);
6174 }
6175
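/*
 * Each vmbus channel is linked to the RX ring (and, if one exists, the TX
 * ring) whose index equals the channel's sub-channel index; the channel is
 * then bound to a CPU via HN_RING_IDX2CPU() and opened with
 * hn_chan_callback() as its RX handler.
 */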
6176 static int
6177 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6178 {
6179 struct vmbus_chan_br cbr;
6180 struct hn_rx_ring *rxr;
6181 struct hn_tx_ring *txr = NULL;
6182 int idx, error;
6183
6184 idx = vmbus_chan_subidx(chan);
6185
6186 /*
6187 * Link this channel to RX/TX ring.
6188 */
6189 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6190 ("invalid channel index %d, should > 0 && < %d",
6191 idx, sc->hn_rx_ring_inuse));
6192 rxr = &sc->hn_rx_ring[idx];
6193 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6194 ("RX ring %d already attached", idx));
6195 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6196 rxr->hn_chan = chan;
6197
6198 if (bootverbose) {
6199 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6200 idx, vmbus_chan_id(chan));
6201 }
6202
6203 if (idx < sc->hn_tx_ring_inuse) {
6204 txr = &sc->hn_tx_ring[idx];
6205 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6206 ("TX ring %d already attached", idx));
6207 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6208
6209 txr->hn_chan = chan;
6210 if (bootverbose) {
6211 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6212 idx, vmbus_chan_id(chan));
6213 }
6214 }
6215
6216 /* Bind this channel to a proper CPU. */
6217 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6218
6219 /*
6220 * Open this channel
6221 */
6222 cbr.cbr = rxr->hn_br;
6223 cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6224 cbr.cbr_txsz = HN_TXBR_SIZE;
6225 cbr.cbr_rxsz = HN_RXBR_SIZE;
6226 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6227 if (error) {
6228 if (error == EISCONN) {
6229 if_printf(sc->hn_ifp, "bufring is connected after "
6230 "chan%u open failure\n", vmbus_chan_id(chan));
6231 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6232 } else {
6233 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6234 vmbus_chan_id(chan), error);
6235 }
6236 }
6237 return (error);
6238 }
6239
6240 static void
6241 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6242 {
6243 struct hn_rx_ring *rxr;
6244 int idx, error;
6245
6246 idx = vmbus_chan_subidx(chan);
6247
6248 /*
6249 * Link this channel to RX/TX ring.
6250 */
6251 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6252 ("invalid channel index %d, should > 0 && < %d",
6253 idx, sc->hn_rx_ring_inuse));
6254 rxr = &sc->hn_rx_ring[idx];
6255 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6256 ("RX ring %d is not attached", idx));
6257 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6258
6259 if (idx < sc->hn_tx_ring_inuse) {
6260 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6261
6262 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6263 ("TX ring %d is not attached attached", idx));
6264 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6265 }
6266
6267 /*
6268 * Close this channel.
6269 *
6270 * NOTE:
6271 * Channel closing does _not_ destroy the target channel.
6272 */
6273 error = vmbus_chan_close_direct(chan);
6274 if (error == EISCONN) {
6275 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6276 "after being closed\n", vmbus_chan_id(chan));
6277 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6278 } else if (error) {
6279 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6280 vmbus_chan_id(chan), error);
6281 }
6282 }
6283
6284 static int
6285 hn_attach_subchans(struct hn_softc *sc)
6286 {
6287 struct vmbus_channel **subchans;
6288 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6289 int i, error = 0;
6290
6291 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6292
6293 /* Attach the sub-channels. */
6294 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6295 for (i = 0; i < subchan_cnt; ++i) {
6296 int error1;
6297
6298 error1 = hn_chan_attach(sc, subchans[i]);
6299 if (error1) {
6300 error = error1;
6301 /* Move on; all channels will be detached later. */
6302 }
6303 }
6304 vmbus_subchan_rel(subchans, subchan_cnt);
6305
6306 if (error) {
6307 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6308 } else {
6309 if (bootverbose) {
6310 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6311 subchan_cnt);
6312 }
6313 }
6314 return (error);
6315 }
6316
6317 static void
6318 hn_detach_allchans(struct hn_softc *sc)
6319 {
6320 struct vmbus_channel **subchans;
6321 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6322 int i;
6323
6324 if (subchan_cnt == 0)
6325 goto back;
6326
6327 /* Detach the sub-channels. */
6328 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6329 for (i = 0; i < subchan_cnt; ++i)
6330 hn_chan_detach(sc, subchans[i]);
6331 vmbus_subchan_rel(subchans, subchan_cnt);
6332
6333 back:
6334 /*
6335 * Detach the primary channel, _after_ all sub-channels
6336 * are detached.
6337 */
6338 hn_chan_detach(sc, sc->hn_prichan);
6339
6340 /* Wait for sub-channels to be destroyed, if any. */
6341 vmbus_subchan_drain(sc->hn_prichan);
6342
6343 #ifdef INVARIANTS
6344 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6345 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6346 HN_RX_FLAG_ATTACHED) == 0,
6347 ("%dth RX ring is still attached", i));
6348 }
6349 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6350 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6351 HN_TX_FLAG_ATTACHED) == 0,
6352 ("%dth TX ring is still attached", i));
6353 }
6354 #endif
6355 }
6356
6357 static int
6358 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6359 {
6360 struct vmbus_channel **subchans;
6361 int nchan, rxr_cnt, error;
6362
6363 nchan = *nsubch + 1;
6364 if (nchan == 1) {
6365 /*
6366 * Multiple RX/TX rings are not requested.
6367 */
6368 *nsubch = 0;
6369 return (0);
6370 }
6371
6372 /*
6373 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6374 * table entries.
6375 */
6376 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6377 if (error) {
6378 /* No RSS; this is benign. */
6379 *nsubch = 0;
6380 return (0);
6381 }
6382 if (bootverbose) {
6383 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6384 rxr_cnt, nchan);
6385 }
6386
6387 if (nchan > rxr_cnt)
6388 nchan = rxr_cnt;
6389 if (nchan == 1) {
6390 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6391 *nsubch = 0;
6392 return (0);
6393 }
6394
6395 /*
6396 * Allocate sub-channels from NVS.
6397 */
6398 *nsubch = nchan - 1;
6399 error = hn_nvs_alloc_subchans(sc, nsubch);
6400 if (error || *nsubch == 0) {
6401 /* Failed to allocate sub-channels. */
6402 *nsubch = 0;
6403 return (0);
6404 }
6405
6406 /*
6407 * Wait for all sub-channels to become ready before moving on.
6408 */
6409 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6410 vmbus_subchan_rel(subchans, *nsubch);
6411 return (0);
6412 }
6413
6414 static bool
6415 hn_synth_attachable(const struct hn_softc *sc)
6416 {
6417 int i;
6418
6419 if (sc->hn_flags & HN_FLAG_ERRORS)
6420 return (false);
6421
6422 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6423 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6424
6425 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6426 return (false);
6427 }
6428 return (true);
6429 }
6430
6431 /*
6432 * Make sure that the RX filter is zero after the successful
6433 * RNDIS initialization.
6434 *
6435 * NOTE:
6436 * Under certain conditions on certain versions of Hyper-V,
6437 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6438 * after the successful RNDIS initialization, which breaks
6439 * the assumption of any following code (well, it breaks the
6440 * RNDIS API contract actually). Clear the RNDIS rxfilter
6441 * explicitly, drain packets sneaking through, and drain the
6442 * interrupt taskqueues scheduled due to the stealth packets.
6443 */
6444 static void
6445 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6446 {
6447
6448 hn_disable_rx(sc);
6449 hn_drain_rxtx(sc, nchan);
6450 }
6451
6452 static int
6453 hn_synth_attach(struct hn_softc *sc, int mtu)
6454 {
6455 #define ATTACHED_NVS 0x0002
6456 #define ATTACHED_RNDIS 0x0004
6457
6458 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6459 int error, nsubch, nchan = 1, i, rndis_inited;
6460 uint32_t old_caps, attached = 0;
6461
6462 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6463 ("synthetic parts were attached"));
6464
6465 if (!hn_synth_attachable(sc))
6466 return (ENXIO);
6467
6468 /* Save capabilities for later verification. */
6469 old_caps = sc->hn_caps;
6470 sc->hn_caps = 0;
6471
6472 /* Clear RSS stuffs. */
6473 sc->hn_rss_ind_size = 0;
6474 sc->hn_rss_hash = 0;
6475 sc->hn_rss_hcap = 0;
6476
6477 /*
6478 * Attach the primary channel _before_ attaching NVS and RNDIS.
6479 */
6480 error = hn_chan_attach(sc, sc->hn_prichan);
6481 if (error)
6482 goto failed;
6483
6484 /*
6485 * Attach NVS.
6486 */
6487 error = hn_nvs_attach(sc, mtu);
6488 if (error)
6489 goto failed;
6490 attached |= ATTACHED_NVS;
6491
6492 /*
6493 * Attach RNDIS _after_ NVS is attached.
6494 */
6495 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6496 if (rndis_inited)
6497 attached |= ATTACHED_RNDIS;
6498 if (error)
6499 goto failed;
6500
6501 /*
6502 * Make sure capabilities are not changed.
6503 */
6504 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6505 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6506 old_caps, sc->hn_caps);
6507 error = ENXIO;
6508 goto failed;
6509 }
6510
6511 /*
6512 * Allocate sub-channels for multi-TX/RX rings.
6513 *
6514 * NOTE:
6515 * The # of RX rings that can be used is equivalent to the # of
6516 * channels to be requested.
6517 */
6518 nsubch = sc->hn_rx_ring_cnt - 1;
6519 error = hn_synth_alloc_subchans(sc, &nsubch);
6520 if (error)
6521 goto failed;
6522 /* NOTE: _Full_ synthetic parts detach is required now. */
6523 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6524
6525 /*
6526 * Set the # of TX/RX rings that could be used according to
6527 * the # of channels that NVS offered.
6528 */
6529 nchan = nsubch + 1;
6530 hn_set_ring_inuse(sc, nchan);
6531 if (nchan == 1) {
6532 /* Only the primary channel can be used; done */
6533 goto back;
6534 }
6535
6536 /*
6537 * Attach the sub-channels.
6538 *
6539 * NOTE: hn_set_ring_inuse() _must_ have been called.
6540 */
6541 error = hn_attach_subchans(sc);
6542 if (error)
6543 goto failed;
6544
6545 /*
6546 * Configure RSS key and indirect table _after_ all sub-channels
6547 * are attached.
6548 */
6549 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6550 /*
6551 * RSS key is not set yet; set it to the default RSS key.
6552 */
6553 if (bootverbose)
6554 if_printf(sc->hn_ifp, "setup default RSS key\n");
6555 #ifdef RSS
6556 rss_getkey(rss->rss_key);
6557 #else
6558 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6559 #endif
6560 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6561 }
6562
6563 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6564 /*
6565 * RSS indirect table is not set yet; set it up in round-
6566 * robin fashion.
6567 */
6568 if (bootverbose) {
6569 if_printf(sc->hn_ifp, "setup default RSS indirect "
6570 "table\n");
6571 }
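/*
 * Illustrative example: with nchan == 4 the indirect table is filled
 * round-robin as 0, 1, 2, 3, 0, 1, 2, 3, ... for all NDIS_HASH_INDCNT
 * entries (with "options RSS" the kernel's bucket mapping is used
 * instead).
 */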
6572 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6573 uint32_t subidx;
6574
6575 #ifdef RSS
6576 subidx = rss_get_indirection_to_bucket(i);
6577 #else
6578 subidx = i;
6579 #endif
6580 rss->rss_ind[i] = subidx % nchan;
6581 }
6582 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6583 } else {
6584 /*
6585 * # of usable channels may be changed, so we have to
6586 * make sure that all entries in RSS indirect table
6587 * are valid.
6588 *
6589 * NOTE: hn_set_ring_inuse() _must_ have been called.
6590 */
6591 hn_rss_ind_fixup(sc);
6592 }
6593
6594 sc->hn_rss_hash = sc->hn_rss_hcap;
6595 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6596 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6597 /* NOTE: Don't reconfigure RSS; will do immediately. */
6598 hn_vf_rss_fixup(sc, false);
6599 }
6600 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6601 if (error)
6602 goto failed;
6603 back:
6604 /*
6605 * Fixup transmission aggregation setup.
6606 */
6607 hn_set_txagg(sc);
6608 hn_rndis_init_fixat(sc, nchan);
6609 return (0);
6610
6611 failed:
6612 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6613 hn_rndis_init_fixat(sc, nchan);
6614 hn_synth_detach(sc);
6615 } else {
6616 if (attached & ATTACHED_RNDIS) {
6617 hn_rndis_init_fixat(sc, nchan);
6618 hn_rndis_detach(sc);
6619 }
6620 if (attached & ATTACHED_NVS)
6621 hn_nvs_detach(sc);
6622 hn_chan_detach(sc, sc->hn_prichan);
6623 /* Restore old capabilities. */
6624 sc->hn_caps = old_caps;
6625 }
6626 return (error);
6627
6628 #undef ATTACHED_RNDIS
6629 #undef ATTACHED_NVS
6630 }
6631
6632 /*
6633 * NOTE:
6634 * The interface must have been suspended through hn_suspend() before
6635 * this function gets called.
6636 */
6637 static void
6638 hn_synth_detach(struct hn_softc *sc)
6639 {
6640
6641 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6642 ("synthetic parts were not attached"));
6643
6644 /* Detach the RNDIS first. */
6645 hn_rndis_detach(sc);
6646
6647 /* Detach NVS. */
6648 hn_nvs_detach(sc);
6649
6650 /* Detach all of the channels. */
6651 hn_detach_allchans(sc);
6652
6653 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6654 /*
6655 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6656 */
6657 int error;
6658
6659 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6660 sc->hn_rxbuf_gpadl);
6661 if (error) {
6662 if_printf(sc->hn_ifp,
6663 "rxbuf gpadl disconn failed: %d\n", error);
6664 sc->hn_flags |= HN_FLAG_RXBUF_REF;
6665 }
6666 sc->hn_rxbuf_gpadl = 0;
6667 }
6668
6669 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6670 /*
6671 * Host is post-Win2016, disconnect chimney sending buffer from
6672 * primary channel here.
6673 */
6674 int error;
6675
6676 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6677 sc->hn_chim_gpadl);
6678 if (error) {
6679 if_printf(sc->hn_ifp,
6680 "chim gpadl disconn failed: %d\n", error);
6681 sc->hn_flags |= HN_FLAG_CHIM_REF;
6682 }
6683 sc->hn_chim_gpadl = 0;
6684 }
6685 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6686 }
6687
6688 static void
6689 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6690 {
6691 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6692 ("invalid ring count %d", ring_cnt));
6693
6694 if (sc->hn_tx_ring_cnt > ring_cnt)
6695 sc->hn_tx_ring_inuse = ring_cnt;
6696 else
6697 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6698 sc->hn_rx_ring_inuse = ring_cnt;
6699
6700 #ifdef RSS
6701 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6702 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6703 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6704 rss_getnumbuckets());
6705 }
6706 #endif
6707
6708 if (bootverbose) {
6709 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6710 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6711 }
6712 }
6713
6714 static void
6715 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6716 {
6717
6718 /*
6719 * NOTE:
6720 * The TX bufring will not be drained by the hypervisor,
6721 * if the primary channel is revoked.
6722 */
6723 while (!vmbus_chan_rx_empty(chan) ||
6724 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6725 !vmbus_chan_tx_empty(chan)))
6726 pause("waitch", 1);
6727 vmbus_chan_intr_drain(chan);
6728 }
6729
6730 static void
6731 hn_disable_rx(struct hn_softc *sc)
6732 {
6733
6734 /*
6735 * Disable RX by clearing RX filter forcefully.
6736 */
6737 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6738 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6739
6740 /*
6741 * Give RNDIS enough time to flush all pending data packets.
6742 */
6743 pause("waitrx", (200 * hz) / 1000);
6744 }
6745
6746 /*
6747 * NOTE:
6748 * RX/TX _must_ have been suspended/disabled, before this function
6749 * is called.
6750 */
6751 static void
6752 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6753 {
6754 struct vmbus_channel **subch = NULL;
6755 int nsubch;
6756
6757 /*
6758 * Drain RX/TX bufrings and interrupts.
6759 */
6760 nsubch = nchan - 1;
6761 if (nsubch > 0)
6762 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6763
6764 if (subch != NULL) {
6765 int i;
6766
6767 for (i = 0; i < nsubch; ++i)
6768 hn_chan_drain(sc, subch[i]);
6769 }
6770 hn_chan_drain(sc, sc->hn_prichan);
6771
6772 if (subch != NULL)
6773 vmbus_subchan_rel(subch, nsubch);
6774 }
6775
6776 static void
6777 hn_suspend_data(struct hn_softc *sc)
6778 {
6779 struct hn_tx_ring *txr;
6780 int i;
6781
6782 HN_LOCK_ASSERT(sc);
6783
6784 /*
6785 * Suspend TX.
6786 */
6787 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6788 txr = &sc->hn_tx_ring[i];
6789
6790 mtx_lock(&txr->hn_tx_lock);
6791 txr->hn_suspended = 1;
6792 mtx_unlock(&txr->hn_tx_lock);
6793 /* No one is able to send more packets now. */
6794
6795 /*
6796 * Wait for all pending sends to finish.
6797 *
6798 * NOTE:
6799 * We will _not_ receive all pending send-done, if the
6800 * primary channel is revoked.
6801 */
6802 while (hn_tx_ring_pending(txr) &&
6803 !vmbus_chan_is_revoked(sc->hn_prichan))
6804 pause("hnwtx", 1 /* 1 tick */);
6805 }
6806
6807 /*
6808 * Disable RX.
6809 */
6810 hn_disable_rx(sc);
6811
6812 /*
6813 * Drain RX/TX.
6814 */
6815 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6816
6817 /*
6818 * Drain any pending TX tasks.
6819 *
6820 * NOTE:
6821 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6822 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6823 */
6824 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6825 txr = &sc->hn_tx_ring[i];
6826
6827 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6828 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6829 }
6830 }
6831
6832 static void
6833 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6834 {
6835
6836 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6837 }
6838
6839 static void
6840 hn_suspend_mgmt(struct hn_softc *sc)
6841 {
6842 struct task task;
6843
6844 HN_LOCK_ASSERT(sc);
6845
6846 /*
6847 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6848 * through hn_mgmt_taskq.
6849 */
6850 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6851 vmbus_chan_run_task(sc->hn_prichan, &task);
6852
6853 /*
6854 * Make sure that all pending management tasks are completed.
6855 */
6856 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6857 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6858 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6859 }
6860
6861 static void
6862 hn_suspend(struct hn_softc *sc)
6863 {
6864
6865 /* Disable polling. */
6866 hn_polling(sc, 0);
6867
6868 /*
6869 * If the non-transparent mode VF is activated, the synthetic
6870 * device is receiving packets, so the data path of the
6871 * synthetic device must be suspended.
6872 */
6873 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6874 (sc->hn_flags & HN_FLAG_RXVF))
6875 hn_suspend_data(sc);
6876 hn_suspend_mgmt(sc);
6877 }
6878
6879 static void
6880 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6881 {
6882 int i;
6883
6884 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6885 ("invalid TX ring count %d", tx_ring_cnt));
6886
6887 for (i = 0; i < tx_ring_cnt; ++i) {
6888 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6889
6890 mtx_lock(&txr->hn_tx_lock);
6891 txr->hn_suspended = 0;
6892 mtx_unlock(&txr->hn_tx_lock);
6893 }
6894 }
6895
6896 static void
6897 hn_resume_data(struct hn_softc *sc)
6898 {
6899 int i;
6900
6901 HN_LOCK_ASSERT(sc);
6902
6903 /*
6904 * Re-enable RX.
6905 */
6906 hn_rxfilter_config(sc);
6907
6908 /*
6909 * Make sure to clear suspend status on "all" TX rings,
6910 * since hn_tx_ring_inuse can be changed after
6911 * hn_suspend_data().
6912 */
6913 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6914
6915 #ifdef HN_IFSTART_SUPPORT
6916 if (!hn_use_if_start)
6917 #endif
6918 {
6919 /*
6920 * Flush unused drbrs, since hn_tx_ring_inuse may be
6921 * reduced.
6922 */
6923 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6924 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6925 }
6926
6927 /*
6928 * Kick start TX.
6929 */
6930 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6931 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6932
6933 /*
6934 * Use txeof task, so that any pending oactive can be
6935 * cleared properly.
6936 */
6937 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6938 }
6939 }
6940
6941 static void
6942 hn_resume_mgmt(struct hn_softc *sc)
6943 {
6944
6945 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6946
6947 /*
6948 * Kick off network change detection, if it was pending.
6949 * If no network change was pending, start link status
6950 * checks, which is more lightweight than network change
6951 * detection.
6952 */
6953 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6954 hn_change_network(sc);
6955 else
6956 hn_update_link_status(sc);
6957 }
6958
6959 static void
6960 hn_resume(struct hn_softc *sc)
6961 {
6962
6963 /*
6964 * If the non-transparent mode VF is activated, the synthetic
6965 * device has to receive packets, so the data path of the
6966 * synthetic device must be resumed.
6967 */
6968 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6969 (sc->hn_flags & HN_FLAG_RXVF))
6970 hn_resume_data(sc);
6971
6972 /*
6973 * Don't resume link status change if VF is attached/activated.
6974 * - In the non-transparent VF mode, the synthetic device marks
6975 * link down until the VF is deactivated; i.e. VF is down.
6976 * - In transparent VF mode, VF's media status is used until
6977 * the VF is detached.
6978 */
6979 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6980 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6981 hn_resume_mgmt(sc);
6982
6983 /*
6984 * Re-enable polling if this interface is running and
6985 * the polling is requested.
6986 */
6987 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6988 hn_polling(sc, sc->hn_pollhz);
6989 }
6990
6991 static void
6992 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6993 {
6994 const struct rndis_status_msg *msg;
6995 int ofs;
6996
6997 if (dlen < sizeof(*msg)) {
6998 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6999 return;
7000 }
7001 msg = data;
7002
7003 switch (msg->rm_status) {
7004 case RNDIS_STATUS_MEDIA_CONNECT:
7005 case RNDIS_STATUS_MEDIA_DISCONNECT:
7006 hn_update_link_status(sc);
7007 break;
7008
7009 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7010 case RNDIS_STATUS_LINK_SPEED_CHANGE:
7011 /* Not really useful; ignore. */
7012 break;
7013
7014 case RNDIS_STATUS_NETWORK_CHANGE:
7015 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7016 if (dlen < ofs + msg->rm_stbuflen ||
7017 msg->rm_stbuflen < sizeof(uint32_t)) {
7018 if_printf(sc->hn_ifp, "network changed\n");
7019 } else {
7020 uint32_t change;
7021
7022 memcpy(&change, ((const uint8_t *)msg) + ofs,
7023 sizeof(change));
7024 if_printf(sc->hn_ifp, "network changed, change %u\n",
7025 change);
7026 }
7027 hn_change_network(sc);
7028 break;
7029
7030 default:
7031 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7032 msg->rm_status);
7033 break;
7034 }
7035 }
7036
7037 static int
7038 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7039 {
7040 const struct rndis_pktinfo *pi = info_data;
7041 uint32_t mask = 0;
7042
7043 while (info_dlen != 0) {
7044 const void *data;
7045 uint32_t dlen;
7046
7047 if (__predict_false(info_dlen < sizeof(*pi)))
7048 return (EINVAL);
7049 if (__predict_false(info_dlen < pi->rm_size))
7050 return (EINVAL);
7051 info_dlen -= pi->rm_size;
7052
7053 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7054 return (EINVAL);
7055 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7056 return (EINVAL);
7057 dlen = pi->rm_size - pi->rm_pktinfooffset;
7058 data = pi->rm_data;
7059
7060 if (pi->rm_internal == 1) {
7061 switch (pi->rm_type) {
7062 case NDIS_PKTINFO_IT_PKTINFO_ID:
7063 if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7064 return (EINVAL);
7065 info->pktinfo_id =
7066 (const struct packet_info_id *)data;
7067 mask |= HN_RXINFO_PKTINFO_ID;
7068 break;
7069
7070 default:
7071 goto next;
7072 }
7073 } else {
7074 switch (pi->rm_type) {
7075 case NDIS_PKTINFO_TYPE_VLAN:
7076 if (__predict_false(dlen
7077 < NDIS_VLAN_INFO_SIZE))
7078 return (EINVAL);
7079 info->vlan_info = (const uint32_t *)data;
7080 mask |= HN_RXINFO_VLAN;
7081 break;
7082
7083 case NDIS_PKTINFO_TYPE_CSUM:
7084 if (__predict_false(dlen
7085 < NDIS_RXCSUM_INFO_SIZE))
7086 return (EINVAL);
7087 info->csum_info = (const uint32_t *)data;
7088 mask |= HN_RXINFO_CSUM;
7089 break;
7090
7091 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7092 if (__predict_false(dlen
7093 < HN_NDIS_HASH_VALUE_SIZE))
7094 return (EINVAL);
7095 info->hash_value = (const uint32_t *)data;
7096 mask |= HN_RXINFO_HASHVAL;
7097 break;
7098
7099 case HN_NDIS_PKTINFO_TYPE_HASHINF:
7100 if (__predict_false(dlen
7101 < HN_NDIS_HASH_INFO_SIZE))
7102 return (EINVAL);
7103 info->hash_info = (const uint32_t *)data;
7104 mask |= HN_RXINFO_HASHINF;
7105 break;
7106
7107 default:
7108 goto next;
7109 }
7110 }
7111
7112 if (mask == HN_RXINFO_ALL) {
7113 /* All found; done */
7114 break;
7115 }
7116 next:
7117 pi = (const struct rndis_pktinfo *)
7118 ((const uint8_t *)pi + pi->rm_size);
7119 }
7120
7121 /*
7122 * Final fixup.
7123 * - If there is no hash value, invalidate the hash info.
7124 */
7125 if ((mask & HN_RXINFO_HASHVAL) == 0)
7126 info->hash_info = NULL;
7127 return (0);
7128 }
7129
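/*
 * Returns true when the byte range [off, off + len) intersects
 * [check_off, check_off + check_len); used below to reject RNDIS
 * messages whose data, OOB and pktinfo regions overlap.
 */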
7130 static __inline bool
7131 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7132 {
7133
7134 if (off < check_off) {
7135 if (__predict_true(off + len <= check_off))
7136 return (false);
7137 } else if (off > check_off) {
7138 if (__predict_true(check_off + check_len <= off))
7139 return (false);
7140 }
7141 return (true);
7142 }
7143
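/*
 * Append one RSC (receive segment coalescing) fragment to the per-ring
 * rsc state.  The per-packet info (VLAN, checksum, hash) is taken from
 * the first fragment only; subsequent fragments just grow rsc.pktlen and
 * the frag_data/frag_len arrays.
 */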
7144 static __inline void
7145 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7146 uint32_t len, struct hn_rxinfo *info)
7147 {
7148 uint32_t cnt = rxr->rsc.cnt;
7149
7150 if (cnt) {
7151 rxr->rsc.pktlen += len;
7152 } else {
7153 rxr->rsc.vlan_info = info->vlan_info;
7154 rxr->rsc.csum_info = info->csum_info;
7155 rxr->rsc.hash_info = info->hash_info;
7156 rxr->rsc.hash_value = info->hash_value;
7157 rxr->rsc.pktlen = len;
7158 }
7159
7160 rxr->rsc.frag_data[cnt] = data;
7161 rxr->rsc.frag_len[cnt] = len;
7162 rxr->rsc.cnt++;
7163 }
7164
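/*
 * Validate one REMOTE_NDIS_PACKET_MSG: the data, OOB-data and pktinfo
 * regions must all fit inside rm_len and must not overlap each other.
 * Valid fragments are accumulated via hn_rsc_add_data() and handed to
 * hn_rxpkt() once the last RSC fragment (or a non-RSC packet) arrives.
 */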
7165 static void
7166 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7167 {
7168 const struct rndis_packet_msg *pkt;
7169 struct hn_rxinfo info;
7170 int data_off, pktinfo_off, data_len, pktinfo_len;
7171 bool rsc_more = false;
7172
7173 /*
7174 * Check length.
7175 */
7176 if (__predict_false(dlen < sizeof(*pkt))) {
7177 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7178 return;
7179 }
7180 pkt = data;
7181
7182 if (__predict_false(dlen < pkt->rm_len)) {
7183 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7184 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7185 return;
7186 }
7187 if (__predict_false(pkt->rm_len <
7188 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7189 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7190 "msglen %u, data %u, oob %u, pktinfo %u\n",
7191 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7192 pkt->rm_pktinfolen);
7193 return;
7194 }
7195 if (__predict_false(pkt->rm_datalen == 0)) {
7196 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7197 return;
7198 }
7199
7200 /*
7201 * Check offsets.
7202 */
7203 #define IS_OFFSET_INVALID(ofs) \
7204 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7205 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7206
7207 /* XXX Hyper-V does not meet data offset alignment requirement */
7208 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7209 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7210 "data offset %u\n", pkt->rm_dataoffset);
7211 return;
7212 }
7213 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7214 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7215 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7216 "oob offset %u\n", pkt->rm_oobdataoffset);
7217 return;
7218 }
7219 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7220 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7221 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7222 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7223 return;
7224 }
7225
7226 #undef IS_OFFSET_INVALID
7227
7228 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7229 data_len = pkt->rm_datalen;
7230 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7231 pktinfo_len = pkt->rm_pktinfolen;
7232
7233 /*
7234 * Check OOB coverage.
7235 */
7236 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7237 int oob_off, oob_len;
7238
7239 if_printf(rxr->hn_ifp, "got oobdata\n");
7240 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7241 oob_len = pkt->rm_oobdatalen;
7242
7243 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7244 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7245 "oob overflow, msglen %u, oob abs %d len %d\n",
7246 pkt->rm_len, oob_off, oob_len);
7247 return;
7248 }
7249
7250 /*
7251 * Check against data.
7252 */
7253 if (hn_rndis_check_overlap(oob_off, oob_len,
7254 data_off, data_len)) {
7255 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7256 "oob overlaps data, oob abs %d len %d, "
7257 "data abs %d len %d\n",
7258 oob_off, oob_len, data_off, data_len);
7259 return;
7260 }
7261
7262 /*
7263 * Check against pktinfo.
7264 */
7265 if (pktinfo_len != 0 &&
7266 hn_rndis_check_overlap(oob_off, oob_len,
7267 pktinfo_off, pktinfo_len)) {
7268 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7269 "oob overlaps pktinfo, oob abs %d len %d, "
7270 "pktinfo abs %d len %d\n",
7271 oob_off, oob_len, pktinfo_off, pktinfo_len);
7272 return;
7273 }
7274 }
7275
7276 /*
7277 * Check per-packet-info coverage and find useful per-packet-info.
7278 */
7279 info.vlan_info = NULL;
7280 info.csum_info = NULL;
7281 info.hash_info = NULL;
7282 info.pktinfo_id = NULL;
7283
7284 if (__predict_true(pktinfo_len != 0)) {
7285 bool overlap;
7286 int error;
7287
7288 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7289 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7290 "pktinfo overflow, msglen %u, "
7291 "pktinfo abs %d len %d\n",
7292 pkt->rm_len, pktinfo_off, pktinfo_len);
7293 return;
7294 }
7295
7296 /*
7297 * Check packet info coverage.
7298 */
7299 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7300 data_off, data_len);
7301 if (__predict_false(overlap)) {
7302 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7303 "pktinfo overlap data, pktinfo abs %d len %d, "
7304 "data abs %d len %d\n",
7305 pktinfo_off, pktinfo_len, data_off, data_len);
7306 return;
7307 }
7308
7309 /*
7310 * Find useful per-packet-info.
7311 */
7312 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7313 pktinfo_len, &info);
7314 if (__predict_false(error)) {
7315 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7316 "pktinfo\n");
7317 return;
7318 }
7319 }
7320
7321 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7322 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7323 "data overflow, msglen %u, data abs %d len %d\n",
7324 pkt->rm_len, data_off, data_len);
7325 return;
7326 }
7327
7328 /* Identify RSC fragments, drop invalid packets */
7329 if ((info.pktinfo_id != NULL) &&
7330 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7331 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7332 rxr->rsc.cnt = 0;
7333 rxr->hn_rsc_pkts++;
7334 } else if (rxr->rsc.cnt == 0)
7335 goto drop;
7336
7337 rsc_more = true;
7338
7339 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7340 rsc_more = false;
7341
7342 if (rsc_more && rxr->rsc.is_last)
7343 goto drop;
7344 } else {
7345 rxr->rsc.cnt = 0;
7346 }
7347
7348 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7349 goto drop;
7350
7351 /* Store data in per rx ring structure */
7352 hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7353 data_len, &info);
7354
7355 if (rsc_more)
7356 return;
7357
7358 hn_rxpkt(rxr);
7359 rxr->rsc.cnt = 0;
7360 return;
7361 drop:
7362 rxr->hn_rsc_drop++;
7363 return;
7364 }
7365
7366 static __inline void
7367 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7368 {
7369 const struct rndis_msghdr *hdr;
7370
7371 if (__predict_false(dlen < sizeof(*hdr))) {
7372 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7373 return;
7374 }
7375 hdr = data;
7376
7377 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7378 /* Hot data path. */
7379 hn_rndis_rx_data(rxr, data, dlen);
7380 /* Done! */
7381 return;
7382 }
7383
7384 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7385 hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7386 else
7387 hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7388 }
7389
7390 static void
7391 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7392 {
7393 const struct hn_nvs_hdr *hdr;
7394
7395 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7396 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7397 return;
7398 }
7399 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7400
7401 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7402 /* Useless; ignore */
7403 return;
7404 }
7405 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7406 }
7407
7408 static void
7409 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7410 const struct vmbus_chanpkt_hdr *pkt)
7411 {
7412 struct hn_nvs_sendctx *sndc;
7413
7414 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7415 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7416 VMBUS_CHANPKT_DATALEN(pkt));
7417 /*
7418 * NOTE:
7419 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7420 * its callback.
7421 */
7422 }
7423
7424 static void
7425 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7426 const struct vmbus_chanpkt_hdr *pkthdr)
7427 {
7428 struct epoch_tracker et;
7429 const struct vmbus_chanpkt_rxbuf *pkt;
7430 const struct hn_nvs_hdr *nvs_hdr;
7431 int count, i, hlen;
7432
7433 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7434 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7435 return;
7436 }
7437 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7438
7439 /* Make sure that this is a RNDIS message. */
7440 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7441 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7442 nvs_hdr->nvs_type);
7443 return;
7444 }
7445
7446 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7447 if (__predict_false(hlen < sizeof(*pkt))) {
7448 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7449 return;
7450 }
7451 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7452
7453 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7454 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7455 pkt->cp_rxbuf_id);
7456 return;
7457 }
7458
7459 count = pkt->cp_rxbuf_cnt;
7460 if (__predict_false(hlen <
7461 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7462 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7463 return;
7464 }
7465
7466 NET_EPOCH_ENTER(et);
7467 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7468 for (i = 0; i < count; ++i) {
7469 int ofs, len;
7470
7471 ofs = pkt->cp_rxbuf[i].rb_ofs;
7472 len = pkt->cp_rxbuf[i].rb_len;
7473 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7474 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7475 "ofs %d, len %d\n", i, ofs, len);
7476 continue;
7477 }
7478
7479 rxr->rsc.is_last = (i == (count - 1));
7480 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7481 }
7482 NET_EPOCH_EXIT(et);
7483
7484 /*
7485 * Ack the consumed RXBUF associated w/ this channel packet,
7486 * so that this RXBUF can be recycled by the hypervisor.
7487 */
7488 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7489 }
7490
7491 static void
7492 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7493 uint64_t tid)
7494 {
7495 struct hn_nvs_rndis_ack ack;
7496 int retries, error;
7497
7498 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7499 ack.nvs_status = HN_NVS_STATUS_OK;
7500
7501 retries = 0;
7502 again:
7503 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7504 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7505 if (__predict_false(error == EAGAIN)) {
7506 /*
7507 * NOTE:
7508 * This should _not_ happen in real world, since the
7509 * consumption of the TX bufring from the TX path is
7510 * controlled.
7511 */
7512 if (rxr->hn_ack_failed == 0)
7513 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7514 rxr->hn_ack_failed++;
7515 retries++;
7516 if (retries < 10) {
7517 DELAY(100);
7518 goto again;
7519 }
7520 /* RXBUF leaks! */
7521 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7522 }
7523 }
7524
7525 static void
7526 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7527 {
7528 struct hn_rx_ring *rxr = xrxr;
7529 struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7530
7531 for (;;) {
7532 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7533 int error, pktlen;
7534
7535 pktlen = rxr->hn_pktbuf_len;
7536 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7537 if (__predict_false(error == ENOBUFS)) {
7538 void *nbuf;
7539 int nlen;
7540
7541 /*
7542 * Expand channel packet buffer.
7543 *
7544 * XXX
7545 * Use M_WAITOK here, since allocation failure
7546 * is fatal.
7547 */
7548 nlen = rxr->hn_pktbuf_len * 2;
7549 while (nlen < pktlen)
7550 nlen *= 2;
7551 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7552
7553 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7554 rxr->hn_pktbuf_len, nlen);
7555
7556 free(rxr->hn_pktbuf, M_DEVBUF);
7557 rxr->hn_pktbuf = nbuf;
7558 rxr->hn_pktbuf_len = nlen;
7559 /* Retry! */
7560 continue;
7561 } else if (__predict_false(error == EAGAIN)) {
7562 /* No more channel packets; done! */
7563 break;
7564 }
7565 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7566
7567 switch (pkt->cph_type) {
7568 case VMBUS_CHANPKT_TYPE_COMP:
7569 hn_nvs_handle_comp(sc, chan, pkt);
7570 break;
7571
7572 case VMBUS_CHANPKT_TYPE_RXBUF:
7573 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7574 break;
7575
7576 case VMBUS_CHANPKT_TYPE_INBAND:
7577 hn_nvs_handle_notify(sc, pkt);
7578 break;
7579
7580 default:
7581 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7582 pkt->cph_type);
7583 break;
7584 }
7585 }
7586 hn_chan_rollup(rxr, rxr->hn_txr);
7587 }
7588
7589 static void
7590 hn_sysinit(void *arg __unused)
7591 {
7592 int i;
7593
7594 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7595
7596 #ifdef HN_IFSTART_SUPPORT
7597 /*
7598 * Don't use ifnet.if_start if transparent VF mode is requested;
7599 * mainly due to the IFF_DRV_OACTIVE flag.
7600 */
7601 if (hn_xpnt_vf && hn_use_if_start) {
7602 hn_use_if_start = 0;
7603 printf("hn: tranparent VF mode, if_transmit will be used, "
7604 "instead of if_start\n");
7605 }
7606 #endif
7607 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7608 printf("hn: invalid transparent VF attach routing "
7609 "wait timeout %d, reset to %d\n",
7610 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7611 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7612 }
7613
7614 /*
7615 * Initialize VF map.
7616 */
7617 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7618 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7619 hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7620 M_WAITOK | M_ZERO);
7621
7622 /*
7623 * Fix the # of TX taskqueues.
7624 */
7625 if (hn_tx_taskq_cnt <= 0)
7626 hn_tx_taskq_cnt = 1;
7627 else if (hn_tx_taskq_cnt > mp_ncpus)
7628 hn_tx_taskq_cnt = mp_ncpus;
7629
7630 /*
7631 * Fix the TX taskqueue mode.
7632 */
7633 switch (hn_tx_taskq_mode) {
7634 case HN_TX_TASKQ_M_INDEP:
7635 case HN_TX_TASKQ_M_GLOBAL:
7636 case HN_TX_TASKQ_M_EVTTQ:
7637 break;
7638 default:
7639 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7640 break;
7641 }
7642
7643 if (vm_guest != VM_GUEST_HV)
7644 return;
7645
7646 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7647 return;
7648
7649 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7650 M_DEVBUF, M_WAITOK);
7651 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7652 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7653 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7654 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7655 "hn tx%d", i);
7656 }
7657 }
7658 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7659
7660 static void
7661 hn_sysuninit(void *arg __unused)
7662 {
7663
7664 if (hn_tx_taskque != NULL) {
7665 int i;
7666
7667 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7668 taskqueue_free(hn_tx_taskque[i]);
7669 free(hn_tx_taskque, M_DEVBUF);
7670 }
7671
7672 if (hn_vfmap != NULL)
7673 free(hn_vfmap, M_DEVBUF);
7674 rm_destroy(&hn_vfmap_lock);
7675
7676 counter_u64_free(hn_udpcs_fixup);
7677 }
7678 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7679