1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/rmlock.h> 75 #include <sys/sbuf.h> 76 #include <sys/smp.h> 77 #include <sys/socket.h> 78 #include <sys/sockio.h> 79 #include <sys/sx.h> 80 #include <sys/sysctl.h> 81 #include <sys/taskqueue.h> 82 #include <sys/buf_ring.h> 83 #include <sys/eventhandler.h> 84 85 #include <machine/atomic.h> 86 #include <machine/in_cksum.h> 87 88 #include <net/bpf.h> 89 #include <net/ethernet.h> 90 #include <net/if.h> 91 #include <net/if_dl.h> 92 #include <net/if_media.h> 93 #include <net/if_types.h> 94 #include <net/if_var.h> 95 #include <net/rndis.h> 96 #ifdef RSS 97 #include <net/rss_config.h> 98 #endif 99 100 #include <netinet/in_systm.h> 101 #include <netinet/in.h> 102 #include <netinet/ip.h> 103 #include <netinet/ip6.h> 104 #include <netinet/tcp.h> 105 #include <netinet/tcp_lro.h> 106 #include <netinet/udp.h> 107 108 #include <dev/hyperv/include/hyperv.h> 109 #include <dev/hyperv/include/hyperv_busdma.h> 110 #include <dev/hyperv/include/vmbus.h> 111 #include <dev/hyperv/include/vmbus_xact.h> 112 113 #include <dev/hyperv/netvsc/ndis.h> 114 #include <dev/hyperv/netvsc/if_hnreg.h> 115 #include <dev/hyperv/netvsc/if_hnvar.h> 116 #include <dev/hyperv/netvsc/hn_nvs.h> 117 #include <dev/hyperv/netvsc/hn_rndis.h> 118 119 #include "vmbus_if.h" 120 121 #define HN_IFSTART_SUPPORT 122 123 #define HN_RING_CNT_DEF_MAX 8 124 125 #define HN_VFMAP_SIZE_DEF 8 126 127 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 128 129 /* YYY should get it from the underlying channel */ 130 #define HN_TX_DESC_CNT 512 131 132 #define HN_RNDIS_PKT_LEN \ 133 (sizeof(struct rndis_packet_msg) + \ 134 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 135 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 136 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 137 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 138 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 139 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 140 141 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 142 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 143 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 144 /* -1 for RNDIS packet message */ 145 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 146 147 #define HN_DIRECT_TX_SIZE_DEF 128 148 149 #define HN_EARLY_TXEOF_THRESH 8 150 151 #define HN_PKTBUF_LEN_DEF (16 * 1024) 152 153 #define HN_LROENT_CNT_DEF 128 154 155 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 156 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 157 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 158 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 159 160 #define HN_LRO_ACKCNT_DEF 1 161 162 #define HN_LOCK_INIT(sc) \ 163 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 164 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 165 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 166 #define HN_LOCK(sc) \ 167 do { \ 168 while (sx_try_xlock(&(sc)->hn_lock) == 0) \ 169 DELAY(1000); \ 170 } while (0) 171 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 172 173 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 174 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 175 #define HN_CSUM_IP_HWASSIST(sc) \ 176 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 177 #define HN_CSUM_IP6_HWASSIST(sc) \ 178 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 179 180 #define HN_PKTSIZE_MIN(align) \ 181 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 182 HN_RNDIS_PKT_LEN, (align)) 183 #define HN_PKTSIZE(m, align) \ 184 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 185 186 #ifdef RSS 187 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 188 #else 189 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 190 #endif 191 192 struct hn_txdesc { 193 #ifndef HN_USE_TXDESC_BUFRING 194 SLIST_ENTRY(hn_txdesc) link; 195 #endif 196 STAILQ_ENTRY(hn_txdesc) agg_link; 197 198 /* Aggregated txdescs, in sending order. */ 199 STAILQ_HEAD(, hn_txdesc) agg_list; 200 201 /* The oldest packet, if transmission aggregation happens. */ 202 struct mbuf *m; 203 struct hn_tx_ring *txr; 204 int refs; 205 uint32_t flags; /* HN_TXD_FLAG_ */ 206 struct hn_nvs_sendctx send_ctx; 207 uint32_t chim_index; 208 int chim_size; 209 210 bus_dmamap_t data_dmap; 211 212 bus_addr_t rndis_pkt_paddr; 213 struct rndis_packet_msg *rndis_pkt; 214 bus_dmamap_t rndis_pkt_dmap; 215 }; 216 217 #define HN_TXD_FLAG_ONLIST 0x0001 218 #define HN_TXD_FLAG_DMAMAP 0x0002 219 #define HN_TXD_FLAG_ONAGG 0x0004 220 221 struct hn_rxinfo { 222 uint32_t vlan_info; 223 uint32_t csum_info; 224 uint32_t hash_info; 225 uint32_t hash_value; 226 }; 227 228 struct hn_rxvf_setarg { 229 struct hn_rx_ring *rxr; 230 struct ifnet *vf_ifp; 231 }; 232 233 #define HN_RXINFO_VLAN 0x0001 234 #define HN_RXINFO_CSUM 0x0002 235 #define HN_RXINFO_HASHINF 0x0004 236 #define HN_RXINFO_HASHVAL 0x0008 237 #define HN_RXINFO_ALL \ 238 (HN_RXINFO_VLAN | \ 239 HN_RXINFO_CSUM | \ 240 HN_RXINFO_HASHINF | \ 241 HN_RXINFO_HASHVAL) 242 243 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff 244 #define HN_NDIS_RXCSUM_INFO_INVALID 0 245 #define HN_NDIS_HASH_INFO_INVALID 0 246 247 static int hn_probe(device_t); 248 static int hn_attach(device_t); 249 static int hn_detach(device_t); 250 static int hn_shutdown(device_t); 251 static void hn_chan_callback(struct vmbus_channel *, 252 void *); 253 254 static void hn_init(void *); 255 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 256 #ifdef HN_IFSTART_SUPPORT 257 static void hn_start(struct ifnet *); 258 #endif 259 static int hn_transmit(struct ifnet *, struct mbuf *); 260 static void hn_xmit_qflush(struct ifnet *); 261 static int hn_ifmedia_upd(struct ifnet *); 262 static void hn_ifmedia_sts(struct ifnet *, 263 struct ifmediareq *); 264 265 static void hn_ifnet_event(void *, struct ifnet *, int); 266 static void hn_ifaddr_event(void *, struct ifnet *); 267 static void hn_ifnet_attevent(void *, struct ifnet *); 268 static void hn_ifnet_detevent(void *, struct ifnet *); 269 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 270 
271 static bool hn_ismyvf(const struct hn_softc *, 272 const struct ifnet *); 273 static void hn_rxvf_change(struct hn_softc *, 274 struct ifnet *, bool); 275 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 276 static void hn_rxvf_set_task(void *, int); 277 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 278 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 279 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 280 struct ifreq *); 281 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 282 static bool hn_xpnt_vf_isready(struct hn_softc *); 283 static void hn_xpnt_vf_setready(struct hn_softc *); 284 static void hn_xpnt_vf_init_taskfunc(void *, int); 285 static void hn_xpnt_vf_init(struct hn_softc *); 286 static void hn_xpnt_vf_setenable(struct hn_softc *); 287 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 288 static void hn_vf_rss_fixup(struct hn_softc *, bool); 289 static void hn_vf_rss_restore(struct hn_softc *); 290 291 static int hn_rndis_rxinfo(const void *, int, 292 struct hn_rxinfo *); 293 static void hn_rndis_rx_data(struct hn_rx_ring *, 294 const void *, int); 295 static void hn_rndis_rx_status(struct hn_softc *, 296 const void *, int); 297 static void hn_rndis_init_fixat(struct hn_softc *, int); 298 299 static void hn_nvs_handle_notify(struct hn_softc *, 300 const struct vmbus_chanpkt_hdr *); 301 static void hn_nvs_handle_comp(struct hn_softc *, 302 struct vmbus_channel *, 303 const struct vmbus_chanpkt_hdr *); 304 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 305 struct vmbus_channel *, 306 const struct vmbus_chanpkt_hdr *); 307 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 308 struct vmbus_channel *, uint64_t); 309 310 #if __FreeBSD_version >= 1100099 311 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 312 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 313 #endif 314 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 315 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 316 #if __FreeBSD_version < 1100095 317 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 318 #else 319 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 320 #endif 321 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 322 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 323 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 324 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 325 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 326 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 327 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 328 #ifndef RSS 329 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 331 #endif 332 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 346 347 static void hn_stop(struct hn_softc *, 
bool); 348 static void hn_init_locked(struct hn_softc *); 349 static int hn_chan_attach(struct hn_softc *, 350 struct vmbus_channel *); 351 static void hn_chan_detach(struct hn_softc *, 352 struct vmbus_channel *); 353 static int hn_attach_subchans(struct hn_softc *); 354 static void hn_detach_allchans(struct hn_softc *); 355 static void hn_chan_rollup(struct hn_rx_ring *, 356 struct hn_tx_ring *); 357 static void hn_set_ring_inuse(struct hn_softc *, int); 358 static int hn_synth_attach(struct hn_softc *, int); 359 static void hn_synth_detach(struct hn_softc *); 360 static int hn_synth_alloc_subchans(struct hn_softc *, 361 int *); 362 static bool hn_synth_attachable(const struct hn_softc *); 363 static void hn_suspend(struct hn_softc *); 364 static void hn_suspend_data(struct hn_softc *); 365 static void hn_suspend_mgmt(struct hn_softc *); 366 static void hn_resume(struct hn_softc *); 367 static void hn_resume_data(struct hn_softc *); 368 static void hn_resume_mgmt(struct hn_softc *); 369 static void hn_suspend_mgmt_taskfunc(void *, int); 370 static void hn_chan_drain(struct hn_softc *, 371 struct vmbus_channel *); 372 static void hn_disable_rx(struct hn_softc *); 373 static void hn_drain_rxtx(struct hn_softc *, int); 374 static void hn_polling(struct hn_softc *, u_int); 375 static void hn_chan_polling(struct vmbus_channel *, u_int); 376 static void hn_mtu_change_fixup(struct hn_softc *); 377 378 static void hn_update_link_status(struct hn_softc *); 379 static void hn_change_network(struct hn_softc *); 380 static void hn_link_taskfunc(void *, int); 381 static void hn_netchg_init_taskfunc(void *, int); 382 static void hn_netchg_status_taskfunc(void *, int); 383 static void hn_link_status(struct hn_softc *); 384 385 static int hn_create_rx_data(struct hn_softc *, int); 386 static void hn_destroy_rx_data(struct hn_softc *); 387 static int hn_check_iplen(const struct mbuf *, int); 388 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 389 static int hn_rxfilter_config(struct hn_softc *); 390 #ifndef RSS 391 static int hn_rss_reconfig(struct hn_softc *); 392 #endif 393 static void hn_rss_ind_fixup(struct hn_softc *); 394 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 395 static int hn_rxpkt(struct hn_rx_ring *, const void *, 396 int, const struct hn_rxinfo *); 397 static uint32_t hn_rss_type_fromndis(uint32_t); 398 static uint32_t hn_rss_type_tondis(uint32_t); 399 400 static int hn_tx_ring_create(struct hn_softc *, int); 401 static void hn_tx_ring_destroy(struct hn_tx_ring *); 402 static int hn_create_tx_data(struct hn_softc *, int); 403 static void hn_fixup_tx_data(struct hn_softc *); 404 static void hn_destroy_tx_data(struct hn_softc *); 405 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 406 static void hn_txdesc_gc(struct hn_tx_ring *, 407 struct hn_txdesc *); 408 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 409 struct hn_txdesc *, struct mbuf **); 410 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 411 struct hn_txdesc *); 412 static void hn_set_chim_size(struct hn_softc *, int); 413 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 414 static bool hn_tx_ring_pending(struct hn_tx_ring *); 415 static void hn_tx_ring_qflush(struct hn_tx_ring *); 416 static void hn_resume_tx(struct hn_softc *, int); 417 static void hn_set_txagg(struct hn_softc *); 418 static void *hn_try_txagg(struct ifnet *, 419 struct hn_tx_ring *, struct hn_txdesc *, 420 int); 421 static int hn_get_txswq_depth(const struct hn_tx_ring *); 422 static void 
hn_txpkt_done(struct hn_nvs_sendctx *, 423 struct hn_softc *, struct vmbus_channel *, 424 const void *, int); 425 static int hn_txpkt_sglist(struct hn_tx_ring *, 426 struct hn_txdesc *); 427 static int hn_txpkt_chim(struct hn_tx_ring *, 428 struct hn_txdesc *); 429 static int hn_xmit(struct hn_tx_ring *, int); 430 static void hn_xmit_taskfunc(void *, int); 431 static void hn_xmit_txeof(struct hn_tx_ring *); 432 static void hn_xmit_txeof_taskfunc(void *, int); 433 #ifdef HN_IFSTART_SUPPORT 434 static int hn_start_locked(struct hn_tx_ring *, int); 435 static void hn_start_taskfunc(void *, int); 436 static void hn_start_txeof(struct hn_tx_ring *); 437 static void hn_start_txeof_taskfunc(void *, int); 438 #endif 439 440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 441 "Hyper-V network interface"); 442 443 /* Trust tcp segment verification on host side. */ 444 static int hn_trust_hosttcp = 1; 445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 446 &hn_trust_hosttcp, 0, 447 "Trust tcp segment verification on host side, " 448 "when csum info is missing (global setting)"); 449 450 /* Trust udp datagram verification on host side. */ 451 static int hn_trust_hostudp = 1; 452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 453 &hn_trust_hostudp, 0, 454 "Trust udp datagram verification on host side, " 455 "when csum info is missing (global setting)"); 456 457 /* Trust ip packet verification on host side. */ 458 static int hn_trust_hostip = 1; 459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 460 &hn_trust_hostip, 0, 461 "Trust ip packet verification on host side, " 462 "when csum info is missing (global setting)"); 463 464 /* 465 * Offload UDP/IPv4 checksum. 466 */ 467 static int hn_enable_udp4cs = 1; 468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 469 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 470 471 /* 472 * Offload UDP/IPv6 checksum. 473 */ 474 static int hn_enable_udp6cs = 1; 475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 476 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 477 478 /* Stats. */ 479 static counter_u64_t hn_udpcs_fixup; 480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 481 &hn_udpcs_fixup, "# of UDP checksum fixups"); 482 483 /* 484 * See hn_set_hlen(). 485 * 486 * This value is for Azure. For Hyper-V, set this above 487 * 65536 to disable UDP datagram checksum fixup.
488 */ 489 static int hn_udpcs_fixup_mtu = 1420; 490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 491 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 492 493 /* Limit TSO burst size */ 494 static int hn_tso_maxlen = IP_MAXPACKET; 495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 496 &hn_tso_maxlen, 0, "TSO burst limit"); 497 498 /* Limit chimney send size */ 499 static int hn_tx_chimney_size = 0; 500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 501 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 502 503 /* Limit the size of packet for direct transmission */ 504 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 506 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 507 508 /* # of LRO entries per RX ring */ 509 #if defined(INET) || defined(INET6) 510 #if __FreeBSD_version >= 1100095 511 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 513 &hn_lro_entry_count, 0, "LRO entry count"); 514 #endif 515 #endif 516 517 static int hn_tx_taskq_cnt = 1; 518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 519 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 520 521 #define HN_TX_TASKQ_M_INDEP 0 522 #define HN_TX_TASKQ_M_GLOBAL 1 523 #define HN_TX_TASKQ_M_EVTTQ 2 524 525 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 527 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 528 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 529 530 #ifndef HN_USE_TXDESC_BUFRING 531 static int hn_use_txdesc_bufring = 0; 532 #else 533 static int hn_use_txdesc_bufring = 1; 534 #endif 535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 536 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 537 538 #ifdef HN_IFSTART_SUPPORT 539 /* Use ifnet.if_start instead of ifnet.if_transmit */ 540 static int hn_use_if_start = 0; 541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 542 &hn_use_if_start, 0, "Use if_start TX method"); 543 #endif 544 545 /* # of channels to use */ 546 static int hn_chan_cnt = 0; 547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 548 &hn_chan_cnt, 0, 549 "# of channels to use; each channel has one RX ring and one TX ring"); 550 551 /* # of transmit rings to use */ 552 static int hn_tx_ring_cnt = 0; 553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 554 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 555 556 /* Software TX ring deptch */ 557 static int hn_tx_swq_depth = 0; 558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 559 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 560 561 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 562 #if __FreeBSD_version >= 1100095 563 static u_int hn_lro_mbufq_depth = 0; 564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 565 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 566 #endif 567 568 /* Packet transmission aggregation size limit */ 569 static int hn_tx_agg_size = -1; 570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 571 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 572 573 /* Packet transmission aggregation count limit */ 574 static int hn_tx_agg_pkts = -1; 575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 576 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 577 578 /* VF list */ 579 SYSCTL_PROC(_hw_hn, 
OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING, 580 0, 0, hn_vflist_sysctl, "A", "VF list"); 581 582 /* VF mapping */ 583 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING, 584 0, 0, hn_vfmap_sysctl, "A", "VF mapping"); 585 586 /* Transparent VF */ 587 static int hn_xpnt_vf = 0; 588 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, 589 &hn_xpnt_vf, 0, "Transparent VF mode"); 590 591 /* Accurate BPF support for Transparent VF */ 592 static int hn_xpnt_vf_accbpf = 0; 593 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, 594 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); 595 596 /* Extra wait for transparent VF attach routine; unit: seconds. */ 597 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 598 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, 599 &hn_xpnt_vf_attwait, 0, 600 "Extra wait for transparent VF attach routine; unit: seconds"); 601 602 static u_int hn_cpu_index; /* next CPU for channel */ 603 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 604 605 static struct rmlock hn_vfmap_lock; 606 static int hn_vfmap_size; 607 static struct ifnet **hn_vfmap; 608 609 #ifndef RSS 610 static const uint8_t 611 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 612 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 613 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 614 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 615 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 616 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 617 }; 618 #endif /* !RSS */ 619 620 static const struct hyperv_guid hn_guid = { 621 .hv_guid = { 622 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 623 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } 624 }; 625 626 static device_method_t hn_methods[] = { 627 /* Device interface */ 628 DEVMETHOD(device_probe, hn_probe), 629 DEVMETHOD(device_attach, hn_attach), 630 DEVMETHOD(device_detach, hn_detach), 631 DEVMETHOD(device_shutdown, hn_shutdown), 632 DEVMETHOD_END 633 }; 634 635 static driver_t hn_driver = { 636 "hn", 637 hn_methods, 638 sizeof(struct hn_softc) 639 }; 640 641 static devclass_t hn_devclass; 642 643 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); 644 MODULE_VERSION(hn, 1); 645 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 646 647 #if __FreeBSD_version >= 1100099 648 static void 649 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 650 { 651 int i; 652 653 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 654 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 655 } 656 #endif 657 658 static int 659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 660 { 661 662 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 663 txd->chim_size == 0, ("invalid rndis sglist txd")); 664 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 665 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 666 } 667 668 static int 669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 670 { 671 struct hn_nvs_rndis rndis; 672 673 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 674 txd->chim_size > 0, ("invalid rndis chim txd")); 675 676 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 677 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 678 rndis.nvs_chim_idx = txd->chim_index; 679 rndis.nvs_chim_sz = txd->chim_size; 680 681 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 682 &rndis, sizeof(rndis), &txd->send_ctx)); 683 } 684 685 static __inline uint32_t 686 hn_chim_alloc(struct hn_softc *sc) 687 { 688 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 689 u_long *bmap = sc->hn_chim_bmap; 690
uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 691 692 for (i = 0; i < bmap_cnt; ++i) { 693 int idx; 694 695 idx = ffsl(~bmap[i]); 696 if (idx == 0) 697 continue; 698 699 --idx; /* ffsl is 1-based */ 700 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 701 ("invalid i %d and idx %d", i, idx)); 702 703 if (atomic_testandset_long(&bmap[i], idx)) 704 continue; 705 706 ret = i * LONG_BIT + idx; 707 break; 708 } 709 return (ret); 710 } 711 712 static __inline void 713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 714 { 715 u_long mask; 716 uint32_t idx; 717 718 idx = chim_idx / LONG_BIT; 719 KASSERT(idx < sc->hn_chim_bmap_cnt, 720 ("invalid chimney index 0x%x", chim_idx)); 721 722 mask = 1UL << (chim_idx % LONG_BIT); 723 KASSERT(sc->hn_chim_bmap[idx] & mask, 724 ("index bitmap 0x%lx, chimney index %u, " 725 "bitmap idx %d, bitmask 0x%lx", 726 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 727 728 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 729 } 730 731 #if defined(INET6) || defined(INET) 732 733 #define PULLUP_HDR(m, len) \ 734 do { \ 735 if (__predict_false((m)->m_len < (len))) { \ 736 (m) = m_pullup((m), (len)); \ 737 if ((m) == NULL) \ 738 return (NULL); \ 739 } \ 740 } while (0) 741 742 /* 743 * NOTE: If this function failed, the m_head would be freed. 744 */ 745 static __inline struct mbuf * 746 hn_tso_fixup(struct mbuf *m_head) 747 { 748 struct ether_vlan_header *evl; 749 struct tcphdr *th; 750 int ehlen; 751 752 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 753 754 PULLUP_HDR(m_head, sizeof(*evl)); 755 evl = mtod(m_head, struct ether_vlan_header *); 756 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 757 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 758 else 759 ehlen = ETHER_HDR_LEN; 760 m_head->m_pkthdr.l2hlen = ehlen; 761 762 #ifdef INET 763 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 764 struct ip *ip; 765 int iphlen; 766 767 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 768 ip = mtodo(m_head, ehlen); 769 iphlen = ip->ip_hl << 2; 770 m_head->m_pkthdr.l3hlen = iphlen; 771 772 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 773 th = mtodo(m_head, ehlen + iphlen); 774 775 ip->ip_len = 0; 776 ip->ip_sum = 0; 777 th->th_sum = in_pseudo(ip->ip_src.s_addr, 778 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 779 } 780 #endif 781 #if defined(INET6) && defined(INET) 782 else 783 #endif 784 #ifdef INET6 785 { 786 struct ip6_hdr *ip6; 787 788 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 789 ip6 = mtodo(m_head, ehlen); 790 if (ip6->ip6_nxt != IPPROTO_TCP) { 791 m_freem(m_head); 792 return (NULL); 793 } 794 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 795 796 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 797 th = mtodo(m_head, ehlen + sizeof(*ip6)); 798 799 ip6->ip6_plen = 0; 800 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 801 } 802 #endif 803 return (m_head); 804 } 805 806 /* 807 * NOTE: If this function failed, the m_head would be freed. 
808 */ 809 static __inline struct mbuf * 810 hn_set_hlen(struct mbuf *m_head) 811 { 812 const struct ether_vlan_header *evl; 813 int ehlen; 814 815 PULLUP_HDR(m_head, sizeof(*evl)); 816 evl = mtod(m_head, const struct ether_vlan_header *); 817 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 818 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 819 else 820 ehlen = ETHER_HDR_LEN; 821 m_head->m_pkthdr.l2hlen = ehlen; 822 823 #ifdef INET 824 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 825 const struct ip *ip; 826 int iphlen; 827 828 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 829 ip = mtodo(m_head, ehlen); 830 iphlen = ip->ip_hl << 2; 831 m_head->m_pkthdr.l3hlen = iphlen; 832 833 /* 834 * UDP checksum offload does not work in Azure if the 835 * following conditions are met: 836 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 837 * - IP_DF is not set in the IP hdr. 838 * 839 * Fall back to software checksum for these UDP datagrams. 840 */ 841 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 842 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 843 (ntohs(ip->ip_off) & IP_DF) == 0) { 844 uint16_t off = ehlen + iphlen; 845 846 counter_u64_add(hn_udpcs_fixup, 1); 847 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 848 *(uint16_t *)(m_head->m_data + off + 849 m_head->m_pkthdr.csum_data) = in_cksum_skip( 850 m_head, m_head->m_pkthdr.len, off); 851 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 852 } 853 } 854 #endif 855 #if defined(INET6) && defined(INET) 856 else 857 #endif 858 #ifdef INET6 859 { 860 const struct ip6_hdr *ip6; 861 862 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 863 ip6 = mtodo(m_head, ehlen); 864 if (ip6->ip6_nxt != IPPROTO_TCP) { 865 m_freem(m_head); 866 return (NULL); 867 } 868 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 869 } 870 #endif 871 return (m_head); 872 } 873 874 /* 875 * NOTE: If this function failed, the m_head would be freed. 876 */ 877 static __inline struct mbuf * 878 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 879 { 880 const struct tcphdr *th; 881 int ehlen, iphlen; 882 883 *tcpsyn = 0; 884 ehlen = m_head->m_pkthdr.l2hlen; 885 iphlen = m_head->m_pkthdr.l3hlen; 886 887 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 888 th = mtodo(m_head, ehlen + iphlen); 889 if (th->th_flags & TH_SYN) 890 *tcpsyn = 1; 891 return (m_head); 892 } 893 894 #undef PULLUP_HDR 895 896 #endif /* INET6 || INET */ 897 898 static int 899 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 900 { 901 int error = 0; 902 903 HN_LOCK_ASSERT(sc); 904 905 if (sc->hn_rx_filter != filter) { 906 error = hn_rndis_set_rxfilter(sc, filter); 907 if (!error) 908 sc->hn_rx_filter = filter; 909 } 910 return (error); 911 } 912 913 static int 914 hn_rxfilter_config(struct hn_softc *sc) 915 { 916 struct ifnet *ifp = sc->hn_ifp; 917 uint32_t filter; 918 919 HN_LOCK_ASSERT(sc); 920 921 /* 922 * If the non-transparent mode VF is activated, we don't know how 923 * its RX filter is configured, so stick the synthetic device in 924 * promiscuous mode.
925 */ 926 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 927 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 928 } else { 929 filter = NDIS_PACKET_TYPE_DIRECTED; 930 if (ifp->if_flags & IFF_BROADCAST) 931 filter |= NDIS_PACKET_TYPE_BROADCAST; 932 /* TODO: support multicast list */ 933 if ((ifp->if_flags & IFF_ALLMULTI) || 934 !TAILQ_EMPTY(&ifp->if_multiaddrs)) 935 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 936 } 937 return (hn_set_rxfilter(sc, filter)); 938 } 939 940 static void 941 hn_set_txagg(struct hn_softc *sc) 942 { 943 uint32_t size, pkts; 944 int i; 945 946 /* 947 * Setup aggregation size. 948 */ 949 if (sc->hn_agg_size < 0) 950 size = UINT32_MAX; 951 else 952 size = sc->hn_agg_size; 953 954 if (sc->hn_rndis_agg_size < size) 955 size = sc->hn_rndis_agg_size; 956 957 /* NOTE: We only aggregate packets using chimney sending buffers. */ 958 if (size > (uint32_t)sc->hn_chim_szmax) 959 size = sc->hn_chim_szmax; 960 961 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 962 /* Disable */ 963 size = 0; 964 pkts = 0; 965 goto done; 966 } 967 968 /* NOTE: Type of the per TX ring setting is 'int'. */ 969 if (size > INT_MAX) 970 size = INT_MAX; 971 972 /* 973 * Setup aggregation packet count. 974 */ 975 if (sc->hn_agg_pkts < 0) 976 pkts = UINT32_MAX; 977 else 978 pkts = sc->hn_agg_pkts; 979 980 if (sc->hn_rndis_agg_pkts < pkts) 981 pkts = sc->hn_rndis_agg_pkts; 982 983 if (pkts <= 1) { 984 /* Disable */ 985 size = 0; 986 pkts = 0; 987 goto done; 988 } 989 990 /* NOTE: Type of the per TX ring setting is 'short'. */ 991 if (pkts > SHRT_MAX) 992 pkts = SHRT_MAX; 993 994 done: 995 /* NOTE: Type of the per TX ring setting is 'short'. */ 996 if (sc->hn_rndis_agg_align > SHRT_MAX) { 997 /* Disable */ 998 size = 0; 999 pkts = 0; 1000 } 1001 1002 if (bootverbose) { 1003 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1004 size, pkts, sc->hn_rndis_agg_align); 1005 } 1006 1007 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1008 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1009 1010 mtx_lock(&txr->hn_tx_lock); 1011 txr->hn_agg_szmax = size; 1012 txr->hn_agg_pktmax = pkts; 1013 txr->hn_agg_align = sc->hn_rndis_agg_align; 1014 mtx_unlock(&txr->hn_tx_lock); 1015 } 1016 } 1017 1018 static int 1019 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1020 { 1021 1022 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1023 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1024 return txr->hn_txdesc_cnt; 1025 return hn_tx_swq_depth; 1026 } 1027 1028 #ifndef RSS 1029 static int 1030 hn_rss_reconfig(struct hn_softc *sc) 1031 { 1032 int error; 1033 1034 HN_LOCK_ASSERT(sc); 1035 1036 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1037 return (ENXIO); 1038 1039 /* 1040 * Disable RSS first. 1041 * 1042 * NOTE: 1043 * Direct reconfiguration by setting the UNCHG flags does 1044 * _not_ work properly. 1045 */ 1046 if (bootverbose) 1047 if_printf(sc->hn_ifp, "disable RSS\n"); 1048 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1049 if (error) { 1050 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1051 return (error); 1052 } 1053 1054 /* 1055 * Reenable the RSS w/ the updated RSS key or indirect 1056 * table. 
1057 */ 1058 if (bootverbose) 1059 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1060 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1061 if (error) { 1062 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1063 return (error); 1064 } 1065 return (0); 1066 } 1067 #endif /* !RSS */ 1068 1069 static void 1070 hn_rss_ind_fixup(struct hn_softc *sc) 1071 { 1072 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1073 int i, nchan; 1074 1075 nchan = sc->hn_rx_ring_inuse; 1076 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1077 1078 /* 1079 * Check indirect table to make sure that all channels in it 1080 * can be used. 1081 */ 1082 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1083 if (rss->rss_ind[i] >= nchan) { 1084 if_printf(sc->hn_ifp, 1085 "RSS indirect table %d fixup: %u -> %d\n", 1086 i, rss->rss_ind[i], nchan - 1); 1087 rss->rss_ind[i] = nchan - 1; 1088 } 1089 } 1090 } 1091 1092 static int 1093 hn_ifmedia_upd(struct ifnet *ifp __unused) 1094 { 1095 1096 return EOPNOTSUPP; 1097 } 1098 1099 static void 1100 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1101 { 1102 struct hn_softc *sc = ifp->if_softc; 1103 1104 ifmr->ifm_status = IFM_AVALID; 1105 ifmr->ifm_active = IFM_ETHER; 1106 1107 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1108 ifmr->ifm_active |= IFM_NONE; 1109 return; 1110 } 1111 ifmr->ifm_status |= IFM_ACTIVE; 1112 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1113 } 1114 1115 static void 1116 hn_rxvf_set_task(void *xarg, int pending __unused) 1117 { 1118 struct hn_rxvf_setarg *arg = xarg; 1119 1120 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1121 } 1122 1123 static void 1124 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1125 { 1126 struct hn_rx_ring *rxr; 1127 struct hn_rxvf_setarg arg; 1128 struct task task; 1129 int i; 1130 1131 HN_LOCK_ASSERT(sc); 1132 1133 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1134 1135 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1136 rxr = &sc->hn_rx_ring[i]; 1137 1138 if (i < sc->hn_rx_ring_inuse) { 1139 arg.rxr = rxr; 1140 arg.vf_ifp = vf_ifp; 1141 vmbus_chan_run_task(rxr->hn_chan, &task); 1142 } else { 1143 rxr->hn_rxvf_ifp = vf_ifp; 1144 } 1145 } 1146 } 1147 1148 static bool 1149 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1150 { 1151 const struct ifnet *hn_ifp; 1152 1153 hn_ifp = sc->hn_ifp; 1154 1155 if (ifp == hn_ifp) 1156 return (false); 1157 1158 if (ifp->if_alloctype != IFT_ETHER) 1159 return (false); 1160 1161 /* Ignore lagg/vlan interfaces */ 1162 if (strcmp(ifp->if_dname, "lagg") == 0 || 1163 strcmp(ifp->if_dname, "vlan") == 0) 1164 return (false); 1165 1166 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1167 return (false); 1168 1169 return (true); 1170 } 1171 1172 static void 1173 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1174 { 1175 struct ifnet *hn_ifp; 1176 1177 HN_LOCK(sc); 1178 1179 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1180 goto out; 1181 1182 if (!hn_ismyvf(sc, ifp)) 1183 goto out; 1184 hn_ifp = sc->hn_ifp; 1185 1186 if (rxvf) { 1187 if (sc->hn_flags & HN_FLAG_RXVF) 1188 goto out; 1189 1190 sc->hn_flags |= HN_FLAG_RXVF; 1191 hn_rxfilter_config(sc); 1192 } else { 1193 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1194 goto out; 1195 1196 sc->hn_flags &= ~HN_FLAG_RXVF; 1197 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1198 hn_rxfilter_config(sc); 1199 else 1200 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1201 } 1202 1203 hn_nvs_set_datapath(sc, 1204 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1205 1206 hn_rxvf_set(sc, rxvf ? 
ifp : NULL); 1207 1208 if (rxvf) { 1209 hn_vf_rss_fixup(sc, true); 1210 hn_suspend_mgmt(sc); 1211 sc->hn_link_flags &= 1212 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1213 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1214 } else { 1215 hn_vf_rss_restore(sc); 1216 hn_resume_mgmt(sc); 1217 } 1218 1219 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1220 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1221 1222 if (bootverbose) { 1223 if_printf(hn_ifp, "datapath is switched %s %s\n", 1224 rxvf ? "to" : "from", ifp->if_xname); 1225 } 1226 out: 1227 HN_UNLOCK(sc); 1228 } 1229 1230 static void 1231 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1232 { 1233 1234 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1235 return; 1236 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1237 } 1238 1239 static void 1240 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1241 { 1242 1243 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1244 } 1245 1246 static int 1247 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1248 { 1249 struct ifnet *ifp, *vf_ifp; 1250 uint64_t tmp; 1251 int error; 1252 1253 HN_LOCK_ASSERT(sc); 1254 ifp = sc->hn_ifp; 1255 vf_ifp = sc->hn_vf_ifp; 1256 1257 /* 1258 * Fix up requested capabilities w/ supported capabilities, 1259 * since the supported capabilities could have been changed. 1260 */ 1261 ifr->ifr_reqcap &= ifp->if_capabilities; 1262 /* Pass SIOCSIFCAP to VF. */ 1263 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1264 1265 /* 1266 * NOTE: 1267 * The error will be propagated to the callers, however, it 1268 * is _not_ useful here. 1269 */ 1270 1271 /* 1272 * Merge VF's enabled capabilities. 1273 */ 1274 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; 1275 1276 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); 1277 if (ifp->if_capenable & IFCAP_TXCSUM) 1278 ifp->if_hwassist |= tmp; 1279 else 1280 ifp->if_hwassist &= ~tmp; 1281 1282 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); 1283 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 1284 ifp->if_hwassist |= tmp; 1285 else 1286 ifp->if_hwassist &= ~tmp; 1287 1288 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; 1289 if (ifp->if_capenable & IFCAP_TSO4) 1290 ifp->if_hwassist |= tmp; 1291 else 1292 ifp->if_hwassist &= ~tmp; 1293 1294 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; 1295 if (ifp->if_capenable & IFCAP_TSO6) 1296 ifp->if_hwassist |= tmp; 1297 else 1298 ifp->if_hwassist &= ~tmp; 1299 1300 return (error); 1301 } 1302 1303 static int 1304 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1305 { 1306 struct ifnet *vf_ifp; 1307 struct ifreq ifr; 1308 1309 HN_LOCK_ASSERT(sc); 1310 vf_ifp = sc->hn_vf_ifp; 1311 1312 memset(&ifr, 0, sizeof(ifr)); 1313 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1314 ifr.ifr_flags = vf_ifp->if_flags & 0xffff; 1315 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; 1316 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); 1317 } 1318 1319 static void 1320 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1321 { 1322 struct ifnet *ifp = sc->hn_ifp; 1323 int allmulti = 0; 1324 1325 HN_LOCK_ASSERT(sc); 1326 1327 /* XXX vlan(4) style mcast addr maintenance */ 1328 if (!TAILQ_EMPTY(&ifp->if_multiaddrs)) 1329 allmulti = IFF_ALLMULTI; 1330 1331 /* Always set the VF's if_flags */ 1332 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; 1333 } 1334 1335 static void 1336 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) 1337 { 1338 struct rm_priotracker pt; 1339 struct ifnet *hn_ifp = NULL; 1340 struct mbuf *mn; 1341 1342 /* 1343 * XXX racy, if hn(4) 
ever detached. 1344 */ 1345 rm_rlock(&hn_vfmap_lock, &pt); 1346 if (vf_ifp->if_index < hn_vfmap_size) 1347 hn_ifp = hn_vfmap[vf_ifp->if_index]; 1348 rm_runlock(&hn_vfmap_lock, &pt); 1349 1350 if (hn_ifp != NULL) { 1351 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1352 /* 1353 * Allow tapping on the VF. 1354 */ 1355 ETHER_BPF_MTAP(vf_ifp, mn); 1356 1357 /* 1358 * Update VF stats. 1359 */ 1360 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { 1361 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1362 mn->m_pkthdr.len); 1363 } 1364 /* 1365 * XXX IFCOUNTER_IMCAST 1366 * This stat updating is kinda invasive, since it 1367 * requires two checks on the mbuf: the length check 1368 * and the ethernet header check. As of this writing, 1369 * all multicast packets go directly to hn(4), which 1370 * makes imcast stat updating in the VF an effort in vain. 1371 */ 1372 1373 /* 1374 * Fix up rcvif and increase hn(4)'s ipackets. 1375 */ 1376 mn->m_pkthdr.rcvif = hn_ifp; 1377 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1378 } 1379 /* 1380 * Go through hn(4)'s if_input. 1381 */ 1382 hn_ifp->if_input(hn_ifp, m); 1383 } else { 1384 /* 1385 * In the middle of the transition; free this 1386 * mbuf chain. 1387 */ 1388 while (m != NULL) { 1389 mn = m->m_nextpkt; 1390 m->m_nextpkt = NULL; 1391 m_freem(m); 1392 m = mn; 1393 } 1394 } 1395 } 1396 1397 static void 1398 hn_mtu_change_fixup(struct hn_softc *sc) 1399 { 1400 struct ifnet *ifp; 1401 1402 HN_LOCK_ASSERT(sc); 1403 ifp = sc->hn_ifp; 1404 1405 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1406 #if __FreeBSD_version >= 1100099 1407 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1408 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1409 #endif 1410 } 1411 1412 static uint32_t 1413 hn_rss_type_fromndis(uint32_t rss_hash) 1414 { 1415 uint32_t types = 0; 1416 1417 if (rss_hash & NDIS_HASH_IPV4) 1418 types |= RSS_TYPE_IPV4; 1419 if (rss_hash & NDIS_HASH_TCP_IPV4) 1420 types |= RSS_TYPE_TCP_IPV4; 1421 if (rss_hash & NDIS_HASH_IPV6) 1422 types |= RSS_TYPE_IPV6; 1423 if (rss_hash & NDIS_HASH_IPV6_EX) 1424 types |= RSS_TYPE_IPV6_EX; 1425 if (rss_hash & NDIS_HASH_TCP_IPV6) 1426 types |= RSS_TYPE_TCP_IPV6; 1427 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1428 types |= RSS_TYPE_TCP_IPV6_EX; 1429 return (types); 1430 } 1431 1432 static uint32_t 1433 hn_rss_type_tondis(uint32_t types) 1434 { 1435 uint32_t rss_hash = 0; 1436 1437 KASSERT((types & 1438 (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1439 ("UDP4, UDP6 and UDP6EX are not supported")); 1440 1441 if (types & RSS_TYPE_IPV4) 1442 rss_hash |= NDIS_HASH_IPV4; 1443 if (types & RSS_TYPE_TCP_IPV4) 1444 rss_hash |= NDIS_HASH_TCP_IPV4; 1445 if (types & RSS_TYPE_IPV6) 1446 rss_hash |= NDIS_HASH_IPV6; 1447 if (types & RSS_TYPE_IPV6_EX) 1448 rss_hash |= NDIS_HASH_IPV6_EX; 1449 if (types & RSS_TYPE_TCP_IPV6) 1450 rss_hash |= NDIS_HASH_TCP_IPV6; 1451 if (types & RSS_TYPE_TCP_IPV6_EX) 1452 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1453 return (rss_hash); 1454 } 1455 1456 static void 1457 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1458 { 1459 int i; 1460 1461 HN_LOCK_ASSERT(sc); 1462 1463 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1464 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1465 } 1466 1467 static void 1468 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1469 { 1470 struct ifnet *ifp, *vf_ifp; 1471 struct ifrsshash ifrh; 1472 struct ifrsskey ifrk; 1473 int error; 1474 uint32_t my_types, diff_types, mbuf_types = 0; 1475 1476 HN_LOCK_ASSERT(sc); 1477 KASSERT(sc->hn_flags &
HN_FLAG_SYNTH_ATTACHED, 1478 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1479 1480 if (sc->hn_rx_ring_inuse == 1) { 1481 /* No RSS on synthetic parts; done. */ 1482 return; 1483 } 1484 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1485 /* Synthetic parts do not support Toeplitz; done. */ 1486 return; 1487 } 1488 1489 ifp = sc->hn_ifp; 1490 vf_ifp = sc->hn_vf_ifp; 1491 1492 /* 1493 * Extract VF's RSS key. Only a 40-byte key for Toeplitz is 1494 * supported. 1495 */ 1496 memset(&ifrk, 0, sizeof(ifrk)); 1497 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); 1498 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); 1499 if (error) { 1500 if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n", 1501 vf_ifp->if_xname, error); 1502 goto done; 1503 } 1504 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1505 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1506 vf_ifp->if_xname, ifrk.ifrk_func); 1507 goto done; 1508 } 1509 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1510 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1511 vf_ifp->if_xname, ifrk.ifrk_keylen); 1512 goto done; 1513 } 1514 1515 /* 1516 * Extract VF's RSS hash. Only Toeplitz is supported. 1517 */ 1518 memset(&ifrh, 0, sizeof(ifrh)); 1519 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name)); 1520 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh); 1521 if (error) { 1522 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", 1523 vf_ifp->if_xname, error); 1524 goto done; 1525 } 1526 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1527 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1528 vf_ifp->if_xname, ifrh.ifrh_func); 1529 goto done; 1530 } 1531 1532 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1533 if ((ifrh.ifrh_types & my_types) == 0) { 1534 /* This disables RSS; ignore it then */ 1535 if_printf(ifp, "%s intersection of RSS types failed. " 1536 "VF %#x, mine %#x\n", vf_ifp->if_xname, 1537 ifrh.ifrh_types, my_types); 1538 goto done; 1539 } 1540 1541 diff_types = my_types ^ ifrh.ifrh_types; 1542 my_types &= ifrh.ifrh_types; 1543 mbuf_types = my_types; 1544 1545 /* 1546 * Detect RSS hash value/type conflicts. 1547 * 1548 * NOTE: 1549 * We don't disable the hash type, but stop delivering the hash 1550 * value/type through mbufs on the RX path. 1551 */ 1552 if ((my_types & RSS_TYPE_IPV4) && 1553 (diff_types & ifrh.ifrh_types & 1554 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1555 /* Conflict; disable IPV4 hash type/value delivery. */ 1556 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1557 mbuf_types &= ~RSS_TYPE_IPV4; 1558 } 1559 if ((my_types & RSS_TYPE_IPV6) && 1560 (diff_types & ifrh.ifrh_types & 1561 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1562 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1563 RSS_TYPE_IPV6_EX))) { 1564 /* Conflict; disable IPV6 hash type/value delivery. */ 1565 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1566 mbuf_types &= ~RSS_TYPE_IPV6; 1567 } 1568 if ((my_types & RSS_TYPE_IPV6_EX) && 1569 (diff_types & ifrh.ifrh_types & 1570 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1571 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1572 RSS_TYPE_IPV6))) { 1573 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1574 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1575 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1576 } 1577 if ((my_types & RSS_TYPE_TCP_IPV6) && 1578 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1579 /* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1580 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1581 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1582 } 1583 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1584 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1585 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1586 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1587 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1588 } 1589 if ((my_types & RSS_TYPE_UDP_IPV6) && 1590 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1591 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1592 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1593 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1594 } 1595 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1596 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1597 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1598 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1599 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1600 } 1601 1602 /* 1603 * Indirect table does not matter. 1604 */ 1605 1606 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1607 hn_rss_type_tondis(my_types); 1608 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1609 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1610 1611 if (reconf) { 1612 error = hn_rss_reconfig(sc); 1613 if (error) { 1614 /* XXX roll-back? */ 1615 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1616 /* XXX keep going. */ 1617 } 1618 } 1619 done: 1620 /* Hash deliverability for mbufs. */ 1621 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1622 } 1623 1624 static void 1625 hn_vf_rss_restore(struct hn_softc *sc) 1626 { 1627 1628 HN_LOCK_ASSERT(sc); 1629 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1630 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1631 1632 if (sc->hn_rx_ring_inuse == 1) 1633 goto done; 1634 1635 /* 1636 * Restore hash types. Key does _not_ matter. 1637 */ 1638 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1639 int error; 1640 1641 sc->hn_rss_hash = sc->hn_rss_hcap; 1642 error = hn_rss_reconfig(sc); 1643 if (error) { 1644 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1645 error); 1646 /* XXX keep going. */ 1647 } 1648 } 1649 done: 1650 /* Hash deliverability for mbufs. */ 1651 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1652 } 1653 1654 static void 1655 hn_xpnt_vf_setready(struct hn_softc *sc) 1656 { 1657 struct ifnet *ifp, *vf_ifp; 1658 struct ifreq ifr; 1659 1660 HN_LOCK_ASSERT(sc); 1661 ifp = sc->hn_ifp; 1662 vf_ifp = sc->hn_vf_ifp; 1663 1664 /* 1665 * Mark the VF ready. 1666 */ 1667 sc->hn_vf_rdytick = 0; 1668 1669 /* 1670 * Save information for restoration. 1671 */ 1672 sc->hn_saved_caps = ifp->if_capabilities; 1673 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1674 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1675 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1676 1677 /* 1678 * Intersect supported/enabled capabilities. 1679 * 1680 * NOTE: 1681 * if_hwassist is not changed here. 1682 */ 1683 ifp->if_capabilities &= vf_ifp->if_capabilities; 1684 ifp->if_capenable &= ifp->if_capabilities; 1685 1686 /* 1687 * Fix TSO settings. 1688 */ 1689 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1690 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1691 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1692 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1693 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1694 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1695 1696 /* 1697 * Change VF's enabled capabilities. 
1698 */ 1699 memset(&ifr, 0, sizeof(ifr)); 1700 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1701 ifr.ifr_reqcap = ifp->if_capenable; 1702 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1703 1704 if (ifp->if_mtu != ETHERMTU) { 1705 int error; 1706 1707 /* 1708 * Change VF's MTU. 1709 */ 1710 memset(&ifr, 0, sizeof(ifr)); 1711 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1712 ifr.ifr_mtu = ifp->if_mtu; 1713 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1714 if (error) { 1715 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1716 vf_ifp->if_xname, ifp->if_mtu); 1717 if (ifp->if_mtu > ETHERMTU) { 1718 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1719 1720 /* 1721 * XXX 1722 * No need to adjust the synthetic parts' MTU; 1723 * failure of the adjustment will cause us 1724 * infinite headache. 1725 */ 1726 ifp->if_mtu = ETHERMTU; 1727 hn_mtu_change_fixup(sc); 1728 } 1729 } 1730 } 1731 } 1732 1733 static bool 1734 hn_xpnt_vf_isready(struct hn_softc *sc) 1735 { 1736 1737 HN_LOCK_ASSERT(sc); 1738 1739 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1740 return (false); 1741 1742 if (sc->hn_vf_rdytick == 0) 1743 return (true); 1744 1745 if (sc->hn_vf_rdytick > ticks) 1746 return (false); 1747 1748 /* Mark VF as ready. */ 1749 hn_xpnt_vf_setready(sc); 1750 return (true); 1751 } 1752 1753 static void 1754 hn_xpnt_vf_setenable(struct hn_softc *sc) 1755 { 1756 int i; 1757 1758 HN_LOCK_ASSERT(sc); 1759 1760 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1761 rm_wlock(&sc->hn_vf_lock); 1762 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1763 rm_wunlock(&sc->hn_vf_lock); 1764 1765 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1766 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1767 } 1768 1769 static void 1770 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1771 { 1772 int i; 1773 1774 HN_LOCK_ASSERT(sc); 1775 1776 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1777 rm_wlock(&sc->hn_vf_lock); 1778 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1779 if (clear_vf) 1780 sc->hn_vf_ifp = NULL; 1781 rm_wunlock(&sc->hn_vf_lock); 1782 1783 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1784 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1785 } 1786 1787 static void 1788 hn_xpnt_vf_init(struct hn_softc *sc) 1789 { 1790 int error; 1791 1792 HN_LOCK_ASSERT(sc); 1793 1794 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1795 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1796 1797 if (bootverbose) { 1798 if_printf(sc->hn_ifp, "try bringing up %s\n", 1799 sc->hn_vf_ifp->if_xname); 1800 } 1801 1802 /* 1803 * Bring the VF up. 1804 */ 1805 hn_xpnt_vf_saveifflags(sc); 1806 sc->hn_vf_ifp->if_flags |= IFF_UP; 1807 error = hn_xpnt_vf_iocsetflags(sc); 1808 if (error) { 1809 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1810 sc->hn_vf_ifp->if_xname, error); 1811 return; 1812 } 1813 1814 /* 1815 * NOTE: 1816 * Datapath setting must happen _after_ bringing the VF up. 1817 */ 1818 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1819 1820 /* 1821 * NOTE: 1822 * Fixup RSS related bits _after_ the VF is brought up, since 1823 * many VFs generate RSS key during it's initialization. 1824 */ 1825 hn_vf_rss_fixup(sc, true); 1826 1827 /* Mark transparent mode VF as enabled. 
*/ 1828 hn_xpnt_vf_setenable(sc); 1829 } 1830 1831 static void 1832 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1833 { 1834 struct hn_softc *sc = xsc; 1835 1836 HN_LOCK(sc); 1837 1838 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1839 goto done; 1840 if (sc->hn_vf_ifp == NULL) 1841 goto done; 1842 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1843 goto done; 1844 1845 if (sc->hn_vf_rdytick != 0) { 1846 /* Mark VF as ready. */ 1847 hn_xpnt_vf_setready(sc); 1848 } 1849 1850 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1851 /* 1852 * Delayed VF initialization. 1853 */ 1854 if (bootverbose) { 1855 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1856 sc->hn_vf_ifp->if_xname); 1857 } 1858 hn_xpnt_vf_init(sc); 1859 } 1860 done: 1861 HN_UNLOCK(sc); 1862 } 1863 1864 static void 1865 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1866 { 1867 struct hn_softc *sc = xsc; 1868 1869 HN_LOCK(sc); 1870 1871 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1872 goto done; 1873 1874 if (!hn_ismyvf(sc, ifp)) 1875 goto done; 1876 1877 if (sc->hn_vf_ifp != NULL) { 1878 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1879 sc->hn_vf_ifp->if_xname); 1880 goto done; 1881 } 1882 1883 if (hn_xpnt_vf && ifp->if_start != NULL) { 1884 /* 1885 * ifnet.if_start is _not_ supported by transparent 1886 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1887 */ 1888 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1889 "in transparent VF mode.\n", ifp->if_xname); 1890 goto done; 1891 } 1892 1893 rm_wlock(&hn_vfmap_lock); 1894 1895 if (ifp->if_index >= hn_vfmap_size) { 1896 struct ifnet **newmap; 1897 int newsize; 1898 1899 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1900 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1901 M_WAITOK | M_ZERO); 1902 1903 memcpy(newmap, hn_vfmap, 1904 sizeof(struct ifnet *) * hn_vfmap_size); 1905 free(hn_vfmap, M_DEVBUF); 1906 hn_vfmap = newmap; 1907 hn_vfmap_size = newsize; 1908 } 1909 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1910 ("%s: ifindex %d was mapped to %s", 1911 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1912 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1913 1914 rm_wunlock(&hn_vfmap_lock); 1915 1916 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1917 rm_wlock(&sc->hn_vf_lock); 1918 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1919 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1920 sc->hn_vf_ifp = ifp; 1921 rm_wunlock(&sc->hn_vf_lock); 1922 1923 if (hn_xpnt_vf) { 1924 int wait_ticks; 1925 1926 /* 1927 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1928 * Save vf_ifp's current if_input for later restoration. 1929 */ 1930 sc->hn_vf_input = ifp->if_input; 1931 ifp->if_input = hn_xpnt_vf_input; 1932 1933 /* 1934 * Stop link status management; use the VF's. 1935 */ 1936 hn_suspend_mgmt(sc); 1937 1938 /* 1939 * Give the VF some time to complete its attach routine. 1940 */ 1941 wait_ticks = hn_xpnt_vf_attwait * hz; 1942 sc->hn_vf_rdytick = ticks + wait_ticks; 1943 1944 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1945 wait_ticks); 1946 } 1947 done: 1948 HN_UNLOCK(sc); 1949 } 1950 1951 static void 1952 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1953 { 1954 struct hn_softc *sc = xsc; 1955 1956 HN_LOCK(sc); 1957 1958 if (sc->hn_vf_ifp == NULL) 1959 goto done; 1960 1961 if (!hn_ismyvf(sc, ifp)) 1962 goto done; 1963 1964 if (hn_xpnt_vf) { 1965 /* 1966 * Make sure that the delayed initialization is not running.
1967 * 1968 * NOTE: 1969 * - This lock _must_ be released, since the hn_vf_init task 1970 * will try holding this lock. 1971 * - It is safe to release this lock here, since the 1972 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 1973 * 1974 * XXX racy, if hn(4) ever detached. 1975 */ 1976 HN_UNLOCK(sc); 1977 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 1978 HN_LOCK(sc); 1979 1980 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 1981 sc->hn_ifp->if_xname)); 1982 ifp->if_input = sc->hn_vf_input; 1983 sc->hn_vf_input = NULL; 1984 1985 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 1986 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 1987 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 1988 1989 if (sc->hn_vf_rdytick == 0) { 1990 /* 1991 * The VF was ready; restore some settings. 1992 */ 1993 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 1994 /* 1995 * NOTE: 1996 * There is _no_ need to fixup if_capenable and 1997 * if_hwassist, since the if_capabilities before 1998 * restoration was an intersection of the VF's 1999 * if_capabilites and the synthetic device's 2000 * if_capabilites. 2001 */ 2002 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2003 sc->hn_ifp->if_hw_tsomaxsegcount = 2004 sc->hn_saved_tsosegcnt; 2005 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2006 } 2007 2008 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2009 /* 2010 * Restore RSS settings. 2011 */ 2012 hn_vf_rss_restore(sc); 2013 2014 /* 2015 * Resume link status management, which was suspended 2016 * by hn_ifnet_attevent(). 2017 */ 2018 hn_resume_mgmt(sc); 2019 } 2020 } 2021 2022 /* Mark transparent mode VF as disabled. */ 2023 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2024 2025 rm_wlock(&hn_vfmap_lock); 2026 2027 KASSERT(ifp->if_index < hn_vfmap_size, 2028 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2029 if (hn_vfmap[ifp->if_index] != NULL) { 2030 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2031 ("%s: ifindex %d was mapped to %s", 2032 ifp->if_xname, ifp->if_index, 2033 hn_vfmap[ifp->if_index]->if_xname)); 2034 hn_vfmap[ifp->if_index] = NULL; 2035 } 2036 2037 rm_wunlock(&hn_vfmap_lock); 2038 done: 2039 HN_UNLOCK(sc); 2040 } 2041 2042 static void 2043 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2044 { 2045 struct hn_softc *sc = xsc; 2046 2047 if (sc->hn_vf_ifp == ifp) 2048 if_link_state_change(sc->hn_ifp, link_state); 2049 } 2050 2051 static int 2052 hn_probe(device_t dev) 2053 { 2054 2055 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2056 device_set_desc(dev, "Hyper-V Network Interface"); 2057 return BUS_PROBE_DEFAULT; 2058 } 2059 return ENXIO; 2060 } 2061 2062 static int 2063 hn_attach(device_t dev) 2064 { 2065 struct hn_softc *sc = device_get_softc(dev); 2066 struct sysctl_oid_list *child; 2067 struct sysctl_ctx_list *ctx; 2068 uint8_t eaddr[ETHER_ADDR_LEN]; 2069 struct ifnet *ifp = NULL; 2070 int error, ring_cnt, tx_ring_cnt; 2071 uint32_t mtu; 2072 2073 sc->hn_dev = dev; 2074 sc->hn_prichan = vmbus_get_channel(dev); 2075 HN_LOCK_INIT(sc); 2076 rm_init(&sc->hn_vf_lock, "hnvf"); 2077 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2078 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2079 2080 /* 2081 * Initialize these tunables once. 2082 */ 2083 sc->hn_agg_size = hn_tx_agg_size; 2084 sc->hn_agg_pkts = hn_tx_agg_pkts; 2085 2086 /* 2087 * Setup taskqueue for transmission. 
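 * Depending on hn_tx_taskq_mode, either create independent per-device TX
 * taskqueues here or reuse the global hn_tx_taskque array.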
2088 */ 2089 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2090 int i; 2091 2092 sc->hn_tx_taskqs = 2093 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2094 M_DEVBUF, M_WAITOK); 2095 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2096 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2097 M_WAITOK, taskqueue_thread_enqueue, 2098 &sc->hn_tx_taskqs[i]); 2099 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2100 "%s tx%d", device_get_nameunit(dev), i); 2101 } 2102 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2103 sc->hn_tx_taskqs = hn_tx_taskque; 2104 } 2105 2106 /* 2107 * Setup taskqueue for mangement tasks, e.g. link status. 2108 */ 2109 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2110 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2111 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2112 device_get_nameunit(dev)); 2113 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2114 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2115 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2116 hn_netchg_status_taskfunc, sc); 2117 2118 if (hn_xpnt_vf) { 2119 /* 2120 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2121 */ 2122 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2123 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2124 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2125 device_get_nameunit(dev)); 2126 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2127 hn_xpnt_vf_init_taskfunc, sc); 2128 } 2129 2130 /* 2131 * Allocate ifnet and setup its name earlier, so that if_printf 2132 * can be used by functions, which will be called after 2133 * ether_ifattach(). 2134 */ 2135 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2136 ifp->if_softc = sc; 2137 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2138 2139 /* 2140 * Initialize ifmedia earlier so that it can be unconditionally 2141 * destroyed, if error happened later on. 2142 */ 2143 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2144 2145 /* 2146 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2147 * to use (tx_ring_cnt). 2148 * 2149 * NOTE: 2150 * The # of RX rings to use is same as the # of channels to use. 2151 */ 2152 ring_cnt = hn_chan_cnt; 2153 if (ring_cnt <= 0) { 2154 /* Default */ 2155 ring_cnt = mp_ncpus; 2156 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2157 ring_cnt = HN_RING_CNT_DEF_MAX; 2158 } else if (ring_cnt > mp_ncpus) { 2159 ring_cnt = mp_ncpus; 2160 } 2161 #ifdef RSS 2162 if (ring_cnt > rss_getnumbuckets()) 2163 ring_cnt = rss_getnumbuckets(); 2164 #endif 2165 2166 tx_ring_cnt = hn_tx_ring_cnt; 2167 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2168 tx_ring_cnt = ring_cnt; 2169 #ifdef HN_IFSTART_SUPPORT 2170 if (hn_use_if_start) { 2171 /* ifnet.if_start only needs one TX ring. */ 2172 tx_ring_cnt = 1; 2173 } 2174 #endif 2175 2176 /* 2177 * Set the leader CPU for channels. 2178 */ 2179 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2180 2181 /* 2182 * Create enough TX/RX rings, even if only limited number of 2183 * channels can be allocated. 2184 */ 2185 error = hn_create_tx_data(sc, tx_ring_cnt); 2186 if (error) 2187 goto failed; 2188 error = hn_create_rx_data(sc, ring_cnt); 2189 if (error) 2190 goto failed; 2191 2192 /* 2193 * Create transaction context for NVS and RNDIS transactions. 
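 * The xact context is also installed right below as the orphan handler of the
 * primary channel, before the channel's revocation state is checked.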
2194 */ 2195 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2196 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2197 if (sc->hn_xact == NULL) { 2198 error = ENXIO; 2199 goto failed; 2200 } 2201 2202 /* 2203 * Install orphan handler for the revocation of this device's 2204 * primary channel. 2205 * 2206 * NOTE: 2207 * The processing order is critical here: 2208 * Install the orphan handler, _before_ testing whether this 2209 * device's primary channel has been revoked or not. 2210 */ 2211 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2212 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2213 error = ENXIO; 2214 goto failed; 2215 } 2216 2217 /* 2218 * Attach the synthetic parts, i.e. NVS and RNDIS. 2219 */ 2220 error = hn_synth_attach(sc, ETHERMTU); 2221 if (error) 2222 goto failed; 2223 2224 error = hn_rndis_get_eaddr(sc, eaddr); 2225 if (error) 2226 goto failed; 2227 2228 error = hn_rndis_get_mtu(sc, &mtu); 2229 if (error) 2230 mtu = ETHERMTU; 2231 else if (bootverbose) 2232 device_printf(dev, "RNDIS mtu %u\n", mtu); 2233 2234 #if __FreeBSD_version >= 1100099 2235 if (sc->hn_rx_ring_inuse > 1) { 2236 /* 2237 * Reduce TCP segment aggregation limit for multiple 2238 * RX rings to increase ACK timeliness. 2239 */ 2240 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2241 } 2242 #endif 2243 2244 /* 2245 * Fixup TX stuffs after synthetic parts are attached. 2246 */ 2247 hn_fixup_tx_data(sc); 2248 2249 ctx = device_get_sysctl_ctx(dev); 2250 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2251 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2252 &sc->hn_nvs_ver, 0, "NVS version"); 2253 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2254 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2255 hn_ndis_version_sysctl, "A", "NDIS version"); 2256 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2257 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2258 hn_caps_sysctl, "A", "capabilities"); 2259 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2260 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2261 hn_hwassist_sysctl, "A", "hwassist"); 2262 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2263 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2264 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2265 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2266 "max # of TSO segments"); 2267 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2268 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2269 "max size of TSO segment"); 2270 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2271 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2272 hn_rxfilter_sysctl, "A", "rxfilter"); 2273 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2274 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2275 hn_rss_hash_sysctl, "A", "RSS hash"); 2276 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2277 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2278 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2279 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2280 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2281 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2282 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2283 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2284 #ifndef RSS 2285 /* 2286 * Don't allow RSS key/indirect table changes, if RSS is defined. 
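 * (i.e. the rss_key and rss_ind sysctls below are only created when the
 * kernel RSS option is not compiled in.)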
2287 */ 2288 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2289 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2290 hn_rss_key_sysctl, "IU", "RSS key"); 2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2292 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2293 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2294 #endif 2295 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2296 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2297 "RNDIS offered packet transmission aggregation size limit"); 2298 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2299 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2300 "RNDIS offered packet transmission aggregation count limit"); 2301 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2302 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2303 "RNDIS packet transmission aggregation alignment"); 2304 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2305 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2306 hn_txagg_size_sysctl, "I", 2307 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2309 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2310 hn_txagg_pkts_sysctl, "I", 2311 "Packet transmission aggregation packets, " 2312 "0 -- disable, -1 -- auto"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2314 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2315 hn_polling_sysctl, "I", 2316 "Polling frequency: [100,1000000], 0 disable polling"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_vf_sysctl, "A", "Virtual Function's name"); 2320 if (!hn_xpnt_vf) { 2321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2322 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2323 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2324 } else { 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2326 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2327 hn_xpnt_vf_enabled_sysctl, "I", 2328 "Transparent VF enabled"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2330 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2331 hn_xpnt_vf_accbpf_sysctl, "I", 2332 "Accurate BPF for transparent VF"); 2333 } 2334 2335 /* 2336 * Setup the ifmedia, which has been initialized earlier. 2337 */ 2338 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2339 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2340 /* XXX ifmedia_set really should do this for us */ 2341 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2342 2343 /* 2344 * Setup the ifnet for this interface. 2345 */ 2346 2347 ifp->if_baudrate = IF_Gbps(10); 2348 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2349 ifp->if_ioctl = hn_ioctl; 2350 ifp->if_init = hn_init; 2351 #ifdef HN_IFSTART_SUPPORT 2352 if (hn_use_if_start) { 2353 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2354 2355 ifp->if_start = hn_start; 2356 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2357 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2358 IFQ_SET_READY(&ifp->if_snd); 2359 } else 2360 #endif 2361 { 2362 ifp->if_transmit = hn_transmit; 2363 ifp->if_qflush = hn_xmit_qflush; 2364 } 2365 2366 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2367 #ifdef foo 2368 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2369 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2370 #endif 2371 if (sc->hn_caps & HN_CAP_VLAN) { 2372 /* XXX not sure about VLAN_MTU. 
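 * IFCAP_VLAN_MTU advertises that full-sized frames carrying an 802.1Q tag can
 * be accepted; the host side is assumed to handle the extra 4 bytes.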
*/ 2373 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2374 } 2375 2376 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2377 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2378 ifp->if_capabilities |= IFCAP_TXCSUM; 2379 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2380 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2381 if (sc->hn_caps & HN_CAP_TSO4) { 2382 ifp->if_capabilities |= IFCAP_TSO4; 2383 ifp->if_hwassist |= CSUM_IP_TSO; 2384 } 2385 if (sc->hn_caps & HN_CAP_TSO6) { 2386 ifp->if_capabilities |= IFCAP_TSO6; 2387 ifp->if_hwassist |= CSUM_IP6_TSO; 2388 } 2389 2390 /* Enable all available capabilities by default. */ 2391 ifp->if_capenable = ifp->if_capabilities; 2392 2393 /* 2394 * Disable IPv6 TSO and TXCSUM by default, they still can 2395 * be enabled through SIOCSIFCAP. 2396 */ 2397 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2398 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2399 2400 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2401 /* 2402 * Lock hn_set_tso_maxsize() to simplify its 2403 * internal logic. 2404 */ 2405 HN_LOCK(sc); 2406 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2407 HN_UNLOCK(sc); 2408 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2409 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2410 } 2411 2412 ether_ifattach(ifp, eaddr); 2413 2414 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2415 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2416 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2417 } 2418 if (mtu < ETHERMTU) { 2419 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2420 ifp->if_mtu = mtu; 2421 } 2422 2423 /* Inform the upper layer about the long frame support. */ 2424 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2425 2426 /* 2427 * Kick off link status check. 2428 */ 2429 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2430 hn_update_link_status(sc); 2431 2432 if (!hn_xpnt_vf) { 2433 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2434 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2435 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2436 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2437 } else { 2438 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2439 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2440 } 2441 2442 /* 2443 * NOTE: 2444 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2445 * since interface's LLADDR is needed; interface LLADDR is not 2446 * available when ifnet_arrival event is triggered. 2447 */ 2448 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2449 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2450 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2451 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2452 2453 return (0); 2454 failed: 2455 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2456 hn_synth_detach(sc); 2457 hn_detach(dev); 2458 return (error); 2459 } 2460 2461 static int 2462 hn_detach(device_t dev) 2463 { 2464 struct hn_softc *sc = device_get_softc(dev); 2465 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2466 2467 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2468 /* 2469 * In case that the vmbus missed the orphan handler 2470 * installation. 
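 * Orphan the xact context manually, so that nothing is left waiting on the
 * revoked primary channel while detach continues.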
2471 */ 2472 vmbus_xact_ctx_orphan(sc->hn_xact); 2473 } 2474 2475 if (sc->hn_ifaddr_evthand != NULL) 2476 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2477 if (sc->hn_ifnet_evthand != NULL) 2478 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2479 if (sc->hn_ifnet_atthand != NULL) { 2480 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2481 sc->hn_ifnet_atthand); 2482 } 2483 if (sc->hn_ifnet_dethand != NULL) { 2484 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2485 sc->hn_ifnet_dethand); 2486 } 2487 if (sc->hn_ifnet_lnkhand != NULL) 2488 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2489 2490 vf_ifp = sc->hn_vf_ifp; 2491 __compiler_membar(); 2492 if (vf_ifp != NULL) 2493 hn_ifnet_detevent(sc, vf_ifp); 2494 2495 if (device_is_attached(dev)) { 2496 HN_LOCK(sc); 2497 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2498 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2499 hn_stop(sc, true); 2500 /* 2501 * NOTE: 2502 * hn_stop() only suspends data, so managment 2503 * stuffs have to be suspended manually here. 2504 */ 2505 hn_suspend_mgmt(sc); 2506 hn_synth_detach(sc); 2507 } 2508 HN_UNLOCK(sc); 2509 ether_ifdetach(ifp); 2510 } 2511 2512 ifmedia_removeall(&sc->hn_media); 2513 hn_destroy_rx_data(sc); 2514 hn_destroy_tx_data(sc); 2515 2516 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2517 int i; 2518 2519 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2520 taskqueue_free(sc->hn_tx_taskqs[i]); 2521 free(sc->hn_tx_taskqs, M_DEVBUF); 2522 } 2523 taskqueue_free(sc->hn_mgmt_taskq0); 2524 if (sc->hn_vf_taskq != NULL) 2525 taskqueue_free(sc->hn_vf_taskq); 2526 2527 if (sc->hn_xact != NULL) { 2528 /* 2529 * Uninstall the orphan handler _before_ the xact is 2530 * destructed. 2531 */ 2532 vmbus_chan_unset_orphan(sc->hn_prichan); 2533 vmbus_xact_ctx_destroy(sc->hn_xact); 2534 } 2535 2536 if_free(ifp); 2537 2538 HN_LOCK_DESTROY(sc); 2539 rm_destroy(&sc->hn_vf_lock); 2540 return (0); 2541 } 2542 2543 static int 2544 hn_shutdown(device_t dev) 2545 { 2546 2547 return (0); 2548 } 2549 2550 static void 2551 hn_link_status(struct hn_softc *sc) 2552 { 2553 uint32_t link_status; 2554 int error; 2555 2556 error = hn_rndis_get_linkstatus(sc, &link_status); 2557 if (error) { 2558 /* XXX what to do? */ 2559 return; 2560 } 2561 2562 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2563 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2564 else 2565 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2566 if_link_state_change(sc->hn_ifp, 2567 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2568 LINK_STATE_UP : LINK_STATE_DOWN); 2569 } 2570 2571 static void 2572 hn_link_taskfunc(void *xsc, int pending __unused) 2573 { 2574 struct hn_softc *sc = xsc; 2575 2576 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2577 return; 2578 hn_link_status(sc); 2579 } 2580 2581 static void 2582 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2583 { 2584 struct hn_softc *sc = xsc; 2585 2586 /* Prevent any link status checks from running. */ 2587 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2588 2589 /* 2590 * Fake up a [link down --> link up] state change; 5 seconds 2591 * delay is used, which closely simulates miibus reaction 2592 * upon link down event. 
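 * The link-up half of the change is reported later by
 * hn_netchg_status_taskfunc(), once the timeout task enqueued below fires.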
2593 */ 2594 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2595 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2596 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2597 &sc->hn_netchg_status, 5 * hz); 2598 } 2599 2600 static void 2601 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2602 { 2603 struct hn_softc *sc = xsc; 2604 2605 /* Re-allow link status checks. */ 2606 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2607 hn_link_status(sc); 2608 } 2609 2610 static void 2611 hn_update_link_status(struct hn_softc *sc) 2612 { 2613 2614 if (sc->hn_mgmt_taskq != NULL) 2615 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2616 } 2617 2618 static void 2619 hn_change_network(struct hn_softc *sc) 2620 { 2621 2622 if (sc->hn_mgmt_taskq != NULL) 2623 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2624 } 2625 2626 static __inline int 2627 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2628 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2629 { 2630 struct mbuf *m = *m_head; 2631 int error; 2632 2633 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2634 2635 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2636 m, segs, nsegs, BUS_DMA_NOWAIT); 2637 if (error == EFBIG) { 2638 struct mbuf *m_new; 2639 2640 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2641 if (m_new == NULL) 2642 return ENOBUFS; 2643 else 2644 *m_head = m = m_new; 2645 txr->hn_tx_collapsed++; 2646 2647 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2648 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2649 } 2650 if (!error) { 2651 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2652 BUS_DMASYNC_PREWRITE); 2653 txd->flags |= HN_TXD_FLAG_DMAMAP; 2654 } 2655 return error; 2656 } 2657 2658 static __inline int 2659 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2660 { 2661 2662 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2663 ("put an onlist txd %#x", txd->flags)); 2664 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2665 ("put an onagg txd %#x", txd->flags)); 2666 2667 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2668 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2669 return 0; 2670 2671 if (!STAILQ_EMPTY(&txd->agg_list)) { 2672 struct hn_txdesc *tmp_txd; 2673 2674 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2675 int freed; 2676 2677 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2678 ("resursive aggregation on aggregated txdesc")); 2679 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2680 ("not aggregated txdesc")); 2681 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2682 ("aggregated txdesc uses dmamap")); 2683 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2684 ("aggregated txdesc consumes " 2685 "chimney sending buffer")); 2686 KASSERT(tmp_txd->chim_size == 0, 2687 ("aggregated txdesc has non-zero " 2688 "chimney sending size")); 2689 2690 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2691 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2692 freed = hn_txdesc_put(txr, tmp_txd); 2693 KASSERT(freed, ("failed to free aggregated txdesc")); 2694 } 2695 } 2696 2697 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2698 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2699 ("chim txd uses dmamap")); 2700 hn_chim_free(txr->hn_sc, txd->chim_index); 2701 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2702 txd->chim_size = 0; 2703 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2704 bus_dmamap_sync(txr->hn_tx_data_dtag, 2705 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2706 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2707 txd->data_dmap); 2708 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2709 } 2710 2711 if (txd->m != NULL) { 2712 m_freem(txd->m); 2713 txd->m = NULL; 2714 } 2715 2716 txd->flags |= HN_TXD_FLAG_ONLIST; 2717 #ifndef HN_USE_TXDESC_BUFRING 2718 mtx_lock_spin(&txr->hn_txlist_spin); 2719 KASSERT(txr->hn_txdesc_avail >= 0 && 2720 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2721 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2722 txr->hn_txdesc_avail++; 2723 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2724 mtx_unlock_spin(&txr->hn_txlist_spin); 2725 #else /* HN_USE_TXDESC_BUFRING */ 2726 #ifdef HN_DEBUG 2727 atomic_add_int(&txr->hn_txdesc_avail, 1); 2728 #endif 2729 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2730 #endif /* !HN_USE_TXDESC_BUFRING */ 2731 2732 return 1; 2733 } 2734 2735 static __inline struct hn_txdesc * 2736 hn_txdesc_get(struct hn_tx_ring *txr) 2737 { 2738 struct hn_txdesc *txd; 2739 2740 #ifndef HN_USE_TXDESC_BUFRING 2741 mtx_lock_spin(&txr->hn_txlist_spin); 2742 txd = SLIST_FIRST(&txr->hn_txlist); 2743 if (txd != NULL) { 2744 KASSERT(txr->hn_txdesc_avail > 0, 2745 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2746 txr->hn_txdesc_avail--; 2747 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2748 } 2749 mtx_unlock_spin(&txr->hn_txlist_spin); 2750 #else 2751 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2752 #endif 2753 2754 if (txd != NULL) { 2755 #ifdef HN_USE_TXDESC_BUFRING 2756 #ifdef HN_DEBUG 2757 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2758 #endif 2759 #endif /* HN_USE_TXDESC_BUFRING */ 2760 KASSERT(txd->m == NULL && txd->refs == 0 && 2761 STAILQ_EMPTY(&txd->agg_list) && 2762 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2763 txd->chim_size == 0 && 2764 (txd->flags & HN_TXD_FLAG_ONLIST) && 2765 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2766 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2767 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2768 txd->refs = 1; 2769 } 2770 return txd; 2771 } 2772 2773 static __inline void 2774 hn_txdesc_hold(struct hn_txdesc *txd) 2775 { 2776 2777 /* 0->1 transition will never work */ 2778 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2779 atomic_add_int(&txd->refs, 1); 2780 } 2781 2782 static __inline void 2783 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2784 { 2785 2786 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2787 ("recursive aggregation on aggregating txdesc")); 2788 2789 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2790 ("already aggregated")); 2791 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2792 ("recursive aggregation on to-be-aggregated txdesc")); 2793 2794 txd->flags |= HN_TXD_FLAG_ONAGG; 2795 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2796 } 2797 2798 static bool 2799 hn_tx_ring_pending(struct hn_tx_ring *txr) 2800 { 2801 bool pending = false; 2802 2803 #ifndef HN_USE_TXDESC_BUFRING 2804 mtx_lock_spin(&txr->hn_txlist_spin); 2805 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2806 pending = true; 2807 mtx_unlock_spin(&txr->hn_txlist_spin); 2808 #else 2809 if (!buf_ring_full(txr->hn_txdesc_br)) 2810 pending = true; 2811 #endif 2812 return (pending); 2813 } 2814 2815 static __inline void 2816 hn_txeof(struct hn_tx_ring *txr) 2817 { 2818 txr->hn_has_txeof = 0; 2819 txr->hn_txeof(txr); 2820 } 2821 2822 static void 2823 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2824 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2825 { 2826 struct hn_txdesc *txd = sndc->hn_cbarg; 2827 struct 
hn_tx_ring *txr; 2828 2829 txr = txd->txr; 2830 KASSERT(txr->hn_chan == chan, 2831 ("channel mismatch, on chan%u, should be chan%u", 2832 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2833 2834 txr->hn_has_txeof = 1; 2835 hn_txdesc_put(txr, txd); 2836 2837 ++txr->hn_txdone_cnt; 2838 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2839 txr->hn_txdone_cnt = 0; 2840 if (txr->hn_oactive) 2841 hn_txeof(txr); 2842 } 2843 } 2844 2845 static void 2846 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2847 { 2848 #if defined(INET) || defined(INET6) 2849 tcp_lro_flush_all(&rxr->hn_lro); 2850 #endif 2851 2852 /* 2853 * NOTE: 2854 * 'txr' could be NULL, if multiple channels and 2855 * ifnet.if_start method are enabled. 2856 */ 2857 if (txr == NULL || !txr->hn_has_txeof) 2858 return; 2859 2860 txr->hn_txdone_cnt = 0; 2861 hn_txeof(txr); 2862 } 2863 2864 static __inline uint32_t 2865 hn_rndis_pktmsg_offset(uint32_t ofs) 2866 { 2867 2868 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2869 ("invalid RNDIS packet msg offset %u", ofs)); 2870 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2871 } 2872 2873 static __inline void * 2874 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2875 size_t pi_dlen, uint32_t pi_type) 2876 { 2877 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2878 struct rndis_pktinfo *pi; 2879 2880 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2881 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2882 2883 /* 2884 * Per-packet-info does not move; it only grows. 2885 * 2886 * NOTE: 2887 * rm_pktinfooffset in this phase counts from the beginning 2888 * of rndis_packet_msg. 2889 */ 2890 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2891 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2892 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2893 pkt->rm_pktinfolen); 2894 pkt->rm_pktinfolen += pi_size; 2895 2896 pi->rm_size = pi_size; 2897 pi->rm_type = pi_type; 2898 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2899 2900 return (pi->rm_data); 2901 } 2902 2903 static __inline int 2904 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2905 { 2906 struct hn_txdesc *txd; 2907 struct mbuf *m; 2908 int error, pkts; 2909 2910 txd = txr->hn_agg_txd; 2911 KASSERT(txd != NULL, ("no aggregate txdesc")); 2912 2913 /* 2914 * Since hn_txpkt() will reset this temporary stat, save 2915 * it now, so that oerrors can be updated properly, if 2916 * hn_txpkt() ever fails. 2917 */ 2918 pkts = txr->hn_stat_pkts; 2919 2920 /* 2921 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2922 * failure, save it for later freeing, if hn_txpkt() ever 2923 * fails. 2924 */ 2925 m = txd->m; 2926 error = hn_txpkt(ifp, txr, txd); 2927 if (__predict_false(error)) { 2928 /* txd is freed, but m is not. */ 2929 m_freem(m); 2930 2931 txr->hn_flush_failed++; 2932 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2933 } 2934 2935 /* Reset all aggregation states. 
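 * The aggregating txdesc has already been handed to hn_txpkt() above; the
 * next hn_try_txagg() call will start a fresh aggregation.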
*/ 2936 txr->hn_agg_txd = NULL; 2937 txr->hn_agg_szleft = 0; 2938 txr->hn_agg_pktleft = 0; 2939 txr->hn_agg_prevpkt = NULL; 2940 2941 return (error); 2942 } 2943 2944 static void * 2945 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2946 int pktsize) 2947 { 2948 void *chim; 2949 2950 if (txr->hn_agg_txd != NULL) { 2951 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2952 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2953 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2954 int olen; 2955 2956 /* 2957 * Update the previous RNDIS packet's total length, 2958 * it can be increased due to the mandatory alignment 2959 * padding for this RNDIS packet. And update the 2960 * aggregating txdesc's chimney sending buffer size 2961 * accordingly. 2962 * 2963 * XXX 2964 * Zero-out the padding, as required by the RNDIS spec. 2965 */ 2966 olen = pkt->rm_len; 2967 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2968 agg_txd->chim_size += pkt->rm_len - olen; 2969 2970 /* Link this txdesc to the parent. */ 2971 hn_txdesc_agg(agg_txd, txd); 2972 2973 chim = (uint8_t *)pkt + pkt->rm_len; 2974 /* Save the current packet for later fixup. */ 2975 txr->hn_agg_prevpkt = chim; 2976 2977 txr->hn_agg_pktleft--; 2978 txr->hn_agg_szleft -= pktsize; 2979 if (txr->hn_agg_szleft <= 2980 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 2981 /* 2982 * Probably can't aggregate more packets, 2983 * flush this aggregating txdesc proactively. 2984 */ 2985 txr->hn_agg_pktleft = 0; 2986 } 2987 /* Done! */ 2988 return (chim); 2989 } 2990 hn_flush_txagg(ifp, txr); 2991 } 2992 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 2993 2994 txr->hn_tx_chimney_tried++; 2995 txd->chim_index = hn_chim_alloc(txr->hn_sc); 2996 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 2997 return (NULL); 2998 txr->hn_tx_chimney++; 2999 3000 chim = txr->hn_sc->hn_chim + 3001 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3002 3003 if (txr->hn_agg_pktmax > 1 && 3004 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3005 txr->hn_agg_txd = txd; 3006 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3007 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3008 txr->hn_agg_prevpkt = chim; 3009 } 3010 return (chim); 3011 } 3012 3013 /* 3014 * NOTE: 3015 * If this function fails, then both txd and m_head0 will be freed. 
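 * hn_encap() builds the RNDIS packet message for the mbuf chain and either
 * copies the frame into a chimney sending buffer or loads it into the GPA
 * list for sglist transmission.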
3016 */ 3017 static int 3018 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3019 struct mbuf **m_head0) 3020 { 3021 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3022 int error, nsegs, i; 3023 struct mbuf *m_head = *m_head0; 3024 struct rndis_packet_msg *pkt; 3025 uint32_t *pi_data; 3026 void *chim = NULL; 3027 int pkt_hlen, pkt_size; 3028 3029 pkt = txd->rndis_pkt; 3030 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3031 if (pkt_size < txr->hn_chim_size) { 3032 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3033 if (chim != NULL) 3034 pkt = chim; 3035 } else { 3036 if (txr->hn_agg_txd != NULL) 3037 hn_flush_txagg(ifp, txr); 3038 } 3039 3040 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3041 pkt->rm_len = m_head->m_pkthdr.len; 3042 pkt->rm_dataoffset = 0; 3043 pkt->rm_datalen = m_head->m_pkthdr.len; 3044 pkt->rm_oobdataoffset = 0; 3045 pkt->rm_oobdatalen = 0; 3046 pkt->rm_oobdataelements = 0; 3047 pkt->rm_pktinfooffset = sizeof(*pkt); 3048 pkt->rm_pktinfolen = 0; 3049 pkt->rm_vchandle = 0; 3050 pkt->rm_reserved = 0; 3051 3052 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3053 /* 3054 * Set the hash value for this packet, so that the host could 3055 * dispatch the TX done event for this packet back to this TX 3056 * ring's channel. 3057 */ 3058 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3059 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3060 *pi_data = txr->hn_tx_idx; 3061 } 3062 3063 if (m_head->m_flags & M_VLANTAG) { 3064 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3065 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3066 *pi_data = NDIS_VLAN_INFO_MAKE( 3067 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3068 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3069 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3070 } 3071 3072 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3073 #if defined(INET6) || defined(INET) 3074 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3075 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3076 #ifdef INET 3077 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3078 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3079 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3080 m_head->m_pkthdr.tso_segsz); 3081 } 3082 #endif 3083 #if defined(INET6) && defined(INET) 3084 else 3085 #endif 3086 #ifdef INET6 3087 { 3088 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3089 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3090 m_head->m_pkthdr.tso_segsz); 3091 } 3092 #endif 3093 #endif /* INET6 || INET */ 3094 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3095 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3096 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3097 if (m_head->m_pkthdr.csum_flags & 3098 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3099 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3100 } else { 3101 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3102 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3103 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3104 } 3105 3106 if (m_head->m_pkthdr.csum_flags & 3107 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3108 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3109 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3110 } else if (m_head->m_pkthdr.csum_flags & 3111 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3112 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3113 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3114 } 3115 } 3116 3117 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3118 /* Fixup RNDIS packet message total length */ 3119 pkt->rm_len += pkt_hlen; 3120 /* Convert RNDIS packet message offsets */ 3121 pkt->rm_dataoffset = 
hn_rndis_pktmsg_offset(pkt_hlen); 3122 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3123 3124 /* 3125 * Fast path: Chimney sending. 3126 */ 3127 if (chim != NULL) { 3128 struct hn_txdesc *tgt_txd = txd; 3129 3130 if (txr->hn_agg_txd != NULL) { 3131 tgt_txd = txr->hn_agg_txd; 3132 #ifdef INVARIANTS 3133 *m_head0 = NULL; 3134 #endif 3135 } 3136 3137 KASSERT(pkt == chim, 3138 ("RNDIS pkt not in chimney sending buffer")); 3139 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3140 ("chimney sending buffer is not used")); 3141 tgt_txd->chim_size += pkt->rm_len; 3142 3143 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3144 ((uint8_t *)chim) + pkt_hlen); 3145 3146 txr->hn_gpa_cnt = 0; 3147 txr->hn_sendpkt = hn_txpkt_chim; 3148 goto done; 3149 } 3150 3151 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3152 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3153 ("chimney buffer is used")); 3154 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3155 3156 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3157 if (__predict_false(error)) { 3158 int freed; 3159 3160 /* 3161 * This mbuf is not linked w/ the txd yet, so free it now. 3162 */ 3163 m_freem(m_head); 3164 *m_head0 = NULL; 3165 3166 freed = hn_txdesc_put(txr, txd); 3167 KASSERT(freed != 0, 3168 ("fail to free txd upon txdma error")); 3169 3170 txr->hn_txdma_failed++; 3171 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3172 return error; 3173 } 3174 *m_head0 = m_head; 3175 3176 /* +1 RNDIS packet message */ 3177 txr->hn_gpa_cnt = nsegs + 1; 3178 3179 /* send packet with page buffer */ 3180 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3181 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3182 txr->hn_gpa[0].gpa_len = pkt_hlen; 3183 3184 /* 3185 * Fill the page buffers with mbuf info after the page 3186 * buffer for RNDIS packet message. 3187 */ 3188 for (i = 0; i < nsegs; ++i) { 3189 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3190 3191 gpa->gpa_page = atop(segs[i].ds_addr); 3192 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3193 gpa->gpa_len = segs[i].ds_len; 3194 } 3195 3196 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3197 txd->chim_size = 0; 3198 txr->hn_sendpkt = hn_txpkt_sglist; 3199 done: 3200 txd->m = m_head; 3201 3202 /* Set the completion routine */ 3203 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3204 3205 /* Update temporary stats for later use. */ 3206 txr->hn_stat_pkts++; 3207 txr->hn_stat_size += m_head->m_pkthdr.len; 3208 if (m_head->m_flags & M_MCAST) 3209 txr->hn_stat_mcasts++; 3210 3211 return 0; 3212 } 3213 3214 /* 3215 * NOTE: 3216 * If this function fails, then txd will be freed, but the mbuf 3217 * associated w/ the txd will _not_ be freed. 3218 */ 3219 static int 3220 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3221 { 3222 int error, send_failed = 0, has_bpf; 3223 3224 again: 3225 has_bpf = bpf_peers_present(ifp->if_bpf); 3226 if (has_bpf) { 3227 /* 3228 * Make sure that this txd and any aggregated txds are not 3229 * freed before ETHER_BPF_MTAP. 
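 * This is done by taking an extra reference here and dropping it with
 * hn_txdesc_put() once the taps are done.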
3230 */ 3231 hn_txdesc_hold(txd); 3232 } 3233 error = txr->hn_sendpkt(txr, txd); 3234 if (!error) { 3235 if (has_bpf) { 3236 const struct hn_txdesc *tmp_txd; 3237 3238 ETHER_BPF_MTAP(ifp, txd->m); 3239 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3240 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3241 } 3242 3243 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3244 #ifdef HN_IFSTART_SUPPORT 3245 if (!hn_use_if_start) 3246 #endif 3247 { 3248 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3249 txr->hn_stat_size); 3250 if (txr->hn_stat_mcasts != 0) { 3251 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3252 txr->hn_stat_mcasts); 3253 } 3254 } 3255 txr->hn_pkts += txr->hn_stat_pkts; 3256 txr->hn_sends++; 3257 } 3258 if (has_bpf) 3259 hn_txdesc_put(txr, txd); 3260 3261 if (__predict_false(error)) { 3262 int freed; 3263 3264 /* 3265 * This should "really rarely" happen. 3266 * 3267 * XXX Too many RX to be acked or too many sideband 3268 * commands to run? Ask netvsc_channel_rollup() 3269 * to kick start later. 3270 */ 3271 txr->hn_has_txeof = 1; 3272 if (!send_failed) { 3273 txr->hn_send_failed++; 3274 send_failed = 1; 3275 /* 3276 * Try sending again after set hn_has_txeof; 3277 * in case that we missed the last 3278 * netvsc_channel_rollup(). 3279 */ 3280 goto again; 3281 } 3282 if_printf(ifp, "send failed\n"); 3283 3284 /* 3285 * Caller will perform further processing on the 3286 * associated mbuf, so don't free it in hn_txdesc_put(); 3287 * only unload it from the DMA map in hn_txdesc_put(), 3288 * if it was loaded. 3289 */ 3290 txd->m = NULL; 3291 freed = hn_txdesc_put(txr, txd); 3292 KASSERT(freed != 0, 3293 ("fail to free txd upon send error")); 3294 3295 txr->hn_send_failed++; 3296 } 3297 3298 /* Reset temporary stats, after this sending is done. */ 3299 txr->hn_stat_size = 0; 3300 txr->hn_stat_pkts = 0; 3301 txr->hn_stat_mcasts = 0; 3302 3303 return (error); 3304 } 3305 3306 /* 3307 * Append the specified data to the indicated mbuf chain, 3308 * Extend the mbuf chain if the new data does not fit in 3309 * existing space. 3310 * 3311 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3312 * There should be an equivalent in the kernel mbuf code, 3313 * but there does not appear to be one yet. 3314 * 3315 * Differs from m_append() in that additional mbufs are 3316 * allocated with cluster size MJUMPAGESIZE, and filled 3317 * accordingly. 3318 * 3319 * Return 1 if able to complete the job; otherwise 0. 3320 */ 3321 static int 3322 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3323 { 3324 struct mbuf *m, *n; 3325 int remainder, space; 3326 3327 for (m = m0; m->m_next != NULL; m = m->m_next) 3328 ; 3329 remainder = len; 3330 space = M_TRAILINGSPACE(m); 3331 if (space > 0) { 3332 /* 3333 * Copy into available space. 3334 */ 3335 if (space > remainder) 3336 space = remainder; 3337 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3338 m->m_len += space; 3339 cp += space; 3340 remainder -= space; 3341 } 3342 while (remainder > 0) { 3343 /* 3344 * Allocate a new mbuf; could check space 3345 * and allocate a cluster instead. 
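 * (MJUMPAGESIZE clusters are used so that up to a page worth of data can be
 * appended per mbuf.)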
3346 */ 3347 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3348 if (n == NULL) 3349 break; 3350 n->m_len = min(MJUMPAGESIZE, remainder); 3351 bcopy(cp, mtod(n, caddr_t), n->m_len); 3352 cp += n->m_len; 3353 remainder -= n->m_len; 3354 m->m_next = n; 3355 m = n; 3356 } 3357 if (m0->m_flags & M_PKTHDR) 3358 m0->m_pkthdr.len += len - remainder; 3359 3360 return (remainder == 0); 3361 } 3362 3363 #if defined(INET) || defined(INET6) 3364 static __inline int 3365 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3366 { 3367 #if __FreeBSD_version >= 1100095 3368 if (hn_lro_mbufq_depth) { 3369 tcp_lro_queue_mbuf(lc, m); 3370 return 0; 3371 } 3372 #endif 3373 return tcp_lro_rx(lc, m, 0); 3374 } 3375 #endif 3376 3377 static int 3378 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 3379 const struct hn_rxinfo *info) 3380 { 3381 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3382 struct mbuf *m_new; 3383 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3384 int hash_type = M_HASHTYPE_NONE; 3385 3386 ifp = hn_ifp; 3387 if (rxr->hn_rxvf_ifp != NULL) { 3388 /* 3389 * Non-transparent mode VF; pretend this packet is from 3390 * the VF. 3391 */ 3392 ifp = rxr->hn_rxvf_ifp; 3393 is_vf = 1; 3394 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3395 /* Transparent mode VF. */ 3396 is_vf = 1; 3397 } 3398 3399 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3400 /* 3401 * NOTE: 3402 * See the NOTE of hn_rndis_init_fixat(). This 3403 * function can be reached, immediately after the 3404 * RNDIS is initialized but before the ifnet is 3405 * setup on the hn_attach() path; drop the unexpected 3406 * packets. 3407 */ 3408 return (0); 3409 } 3410 3411 if (__predict_false(dlen < ETHER_HDR_LEN)) { 3412 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3413 return (0); 3414 } 3415 3416 if (dlen <= MHLEN) { 3417 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3418 if (m_new == NULL) { 3419 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3420 return (0); 3421 } 3422 memcpy(mtod(m_new, void *), data, dlen); 3423 m_new->m_pkthdr.len = m_new->m_len = dlen; 3424 rxr->hn_small_pkts++; 3425 } else { 3426 /* 3427 * Get an mbuf with a cluster. For packets 2K or less, 3428 * get a standard 2K cluster. For anything larger, get a 3429 * 4K cluster. Any buffers larger than 4K can cause problems 3430 * if looped around to the Hyper-V TX channel, so avoid them. 
3431 */ 3432 size = MCLBYTES; 3433 if (dlen > MCLBYTES) { 3434 /* 4096 */ 3435 size = MJUMPAGESIZE; 3436 } 3437 3438 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3439 if (m_new == NULL) { 3440 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3441 return (0); 3442 } 3443 3444 hv_m_append(m_new, dlen, data); 3445 } 3446 m_new->m_pkthdr.rcvif = ifp; 3447 3448 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3449 do_csum = 0; 3450 3451 /* receive side checksum offload */ 3452 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 3453 /* IP csum offload */ 3454 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3455 m_new->m_pkthdr.csum_flags |= 3456 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3457 rxr->hn_csum_ip++; 3458 } 3459 3460 /* TCP/UDP csum offload */ 3461 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 3462 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3463 m_new->m_pkthdr.csum_flags |= 3464 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3465 m_new->m_pkthdr.csum_data = 0xffff; 3466 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 3467 rxr->hn_csum_tcp++; 3468 else 3469 rxr->hn_csum_udp++; 3470 } 3471 3472 /* 3473 * XXX 3474 * As of this write (Oct 28th, 2016), host side will turn 3475 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3476 * the do_lro setting here is actually _not_ accurate. We 3477 * depend on the RSS hash type check to reset do_lro. 3478 */ 3479 if ((info->csum_info & 3480 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3481 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3482 do_lro = 1; 3483 } else { 3484 const struct ether_header *eh; 3485 uint16_t etype; 3486 int hoff; 3487 3488 hoff = sizeof(*eh); 3489 /* Checked at the beginning of this function. */ 3490 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 3491 3492 eh = mtod(m_new, struct ether_header *); 3493 etype = ntohs(eh->ether_type); 3494 if (etype == ETHERTYPE_VLAN) { 3495 const struct ether_vlan_header *evl; 3496 3497 hoff = sizeof(*evl); 3498 if (m_new->m_len < hoff) 3499 goto skip; 3500 evl = mtod(m_new, struct ether_vlan_header *); 3501 etype = ntohs(evl->evl_proto); 3502 } 3503 3504 if (etype == ETHERTYPE_IP) { 3505 int pr; 3506 3507 pr = hn_check_iplen(m_new, hoff); 3508 if (pr == IPPROTO_TCP) { 3509 if (do_csum && 3510 (rxr->hn_trust_hcsum & 3511 HN_TRUST_HCSUM_TCP)) { 3512 rxr->hn_csum_trusted++; 3513 m_new->m_pkthdr.csum_flags |= 3514 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3515 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3516 m_new->m_pkthdr.csum_data = 0xffff; 3517 } 3518 do_lro = 1; 3519 } else if (pr == IPPROTO_UDP) { 3520 if (do_csum && 3521 (rxr->hn_trust_hcsum & 3522 HN_TRUST_HCSUM_UDP)) { 3523 rxr->hn_csum_trusted++; 3524 m_new->m_pkthdr.csum_flags |= 3525 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3526 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3527 m_new->m_pkthdr.csum_data = 0xffff; 3528 } 3529 } else if (pr != IPPROTO_DONE && do_csum && 3530 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3531 rxr->hn_csum_trusted++; 3532 m_new->m_pkthdr.csum_flags |= 3533 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3534 } 3535 } 3536 } 3537 skip: 3538 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 3539 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3540 NDIS_VLAN_INFO_ID(info->vlan_info), 3541 NDIS_VLAN_INFO_PRI(info->vlan_info), 3542 NDIS_VLAN_INFO_CFI(info->vlan_info)); 3543 m_new->m_flags |= M_VLANTAG; 3544 } 3545 3546 /* 3547 * If VF is activated (tranparent/non-transparent mode does not 3548 * matter here). 
3549 * 3550 * - Disable LRO 3551 * 3552 * hn(4) will only receive broadcast packets, multicast packets, 3553 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3554 * packet types. 3555 * 3556 * For non-transparent, we definitely _cannot_ enable LRO at 3557 * all, since the LRO flush will use hn(4) as the receiving 3558 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3559 */ 3560 if (is_vf) 3561 do_lro = 0; 3562 3563 /* 3564 * If VF is activated (tranparent/non-transparent mode does not 3565 * matter here), do _not_ mess with unsupported hash types or 3566 * functions. 3567 */ 3568 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 3569 rxr->hn_rss_pkts++; 3570 m_new->m_pkthdr.flowid = info->hash_value; 3571 if (!is_vf) 3572 hash_type = M_HASHTYPE_OPAQUE_HASH; 3573 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 3574 NDIS_HASH_FUNCTION_TOEPLITZ) { 3575 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK & 3576 rxr->hn_mbuf_hash); 3577 3578 /* 3579 * NOTE: 3580 * do_lro is resetted, if the hash types are not TCP 3581 * related. See the comment in the above csum_flags 3582 * setup section. 3583 */ 3584 switch (type) { 3585 case NDIS_HASH_IPV4: 3586 hash_type = M_HASHTYPE_RSS_IPV4; 3587 do_lro = 0; 3588 break; 3589 3590 case NDIS_HASH_TCP_IPV4: 3591 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3592 break; 3593 3594 case NDIS_HASH_IPV6: 3595 hash_type = M_HASHTYPE_RSS_IPV6; 3596 do_lro = 0; 3597 break; 3598 3599 case NDIS_HASH_IPV6_EX: 3600 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3601 do_lro = 0; 3602 break; 3603 3604 case NDIS_HASH_TCP_IPV6: 3605 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3606 break; 3607 3608 case NDIS_HASH_TCP_IPV6_EX: 3609 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3610 break; 3611 } 3612 } 3613 } else if (!is_vf) { 3614 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3615 hash_type = M_HASHTYPE_OPAQUE; 3616 } 3617 M_HASHTYPE_SET(m_new, hash_type); 3618 3619 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3620 if (hn_ifp != ifp) { 3621 const struct ether_header *eh; 3622 3623 /* 3624 * Non-transparent mode VF is activated. 3625 */ 3626 3627 /* 3628 * Allow tapping on hn(4). 3629 */ 3630 ETHER_BPF_MTAP(hn_ifp, m_new); 3631 3632 /* 3633 * Update hn(4)'s stats. 3634 */ 3635 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3636 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3637 /* Checked at the beginning of this function. */ 3638 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3639 eh = mtod(m_new, struct ether_header *); 3640 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3641 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3642 } 3643 rxr->hn_pkts++; 3644 3645 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3646 #if defined(INET) || defined(INET6) 3647 struct lro_ctrl *lro = &rxr->hn_lro; 3648 3649 if (lro->lro_cnt) { 3650 rxr->hn_lro_tried++; 3651 if (hn_lro_rx(lro, m_new) == 0) { 3652 /* DONE! 
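 * The LRO code has accepted the mbuf; it will be passed up to the stack
 * when the LRO context is flushed.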
*/ 3653 return 0; 3654 } 3655 } 3656 #endif 3657 } 3658 ifp->if_input(ifp, m_new); 3659 3660 return (0); 3661 } 3662 3663 static int 3664 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3665 { 3666 struct hn_softc *sc = ifp->if_softc; 3667 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3668 struct ifnet *vf_ifp; 3669 int mask, error = 0; 3670 struct ifrsskey *ifrk; 3671 struct ifrsshash *ifrh; 3672 uint32_t mtu; 3673 3674 switch (cmd) { 3675 case SIOCSIFMTU: 3676 if (ifr->ifr_mtu > HN_MTU_MAX) { 3677 error = EINVAL; 3678 break; 3679 } 3680 3681 HN_LOCK(sc); 3682 3683 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3684 HN_UNLOCK(sc); 3685 break; 3686 } 3687 3688 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3689 /* Can't change MTU */ 3690 HN_UNLOCK(sc); 3691 error = EOPNOTSUPP; 3692 break; 3693 } 3694 3695 if (ifp->if_mtu == ifr->ifr_mtu) { 3696 HN_UNLOCK(sc); 3697 break; 3698 } 3699 3700 if (hn_xpnt_vf_isready(sc)) { 3701 vf_ifp = sc->hn_vf_ifp; 3702 ifr_vf = *ifr; 3703 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3704 sizeof(ifr_vf.ifr_name)); 3705 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3706 (caddr_t)&ifr_vf); 3707 if (error) { 3708 HN_UNLOCK(sc); 3709 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3710 vf_ifp->if_xname, ifr->ifr_mtu, error); 3711 break; 3712 } 3713 } 3714 3715 /* 3716 * Suspend this interface before the synthetic parts 3717 * are ripped. 3718 */ 3719 hn_suspend(sc); 3720 3721 /* 3722 * Detach the synthetics parts, i.e. NVS and RNDIS. 3723 */ 3724 hn_synth_detach(sc); 3725 3726 /* 3727 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3728 * with the new MTU setting. 3729 */ 3730 error = hn_synth_attach(sc, ifr->ifr_mtu); 3731 if (error) { 3732 HN_UNLOCK(sc); 3733 break; 3734 } 3735 3736 error = hn_rndis_get_mtu(sc, &mtu); 3737 if (error) 3738 mtu = ifr->ifr_mtu; 3739 else if (bootverbose) 3740 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3741 3742 /* 3743 * Commit the requested MTU, after the synthetic parts 3744 * have been successfully attached. 3745 */ 3746 if (mtu >= ifr->ifr_mtu) { 3747 mtu = ifr->ifr_mtu; 3748 } else { 3749 if_printf(ifp, "fixup mtu %d -> %u\n", 3750 ifr->ifr_mtu, mtu); 3751 } 3752 ifp->if_mtu = mtu; 3753 3754 /* 3755 * Synthetic parts' reattach may change the chimney 3756 * sending size; update it. 3757 */ 3758 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3759 hn_set_chim_size(sc, sc->hn_chim_szmax); 3760 3761 /* 3762 * Make sure that various parameters based on MTU are 3763 * still valid, after the MTU change. 3764 */ 3765 hn_mtu_change_fixup(sc); 3766 3767 /* 3768 * All done! Resume the interface now. 3769 */ 3770 hn_resume(sc); 3771 3772 if ((sc->hn_flags & HN_FLAG_RXVF) || 3773 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3774 /* 3775 * Since we have reattached the NVS part, 3776 * change the datapath to VF again; in case 3777 * that it is lost, after the NVS was detached. 3778 */ 3779 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3780 } 3781 3782 HN_UNLOCK(sc); 3783 break; 3784 3785 case SIOCSIFFLAGS: 3786 HN_LOCK(sc); 3787 3788 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3789 HN_UNLOCK(sc); 3790 break; 3791 } 3792 3793 if (hn_xpnt_vf_isready(sc)) 3794 hn_xpnt_vf_saveifflags(sc); 3795 3796 if (ifp->if_flags & IFF_UP) { 3797 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3798 /* 3799 * Caller meight hold mutex, e.g. 3800 * bpf; use busy-wait for the RNDIS 3801 * reply. 
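 * (HN_NO_SLEEPING() makes the following hn_rxfilter_config() busy-wait for
 * the RNDIS reply instead of sleeping.)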
3802 */ 3803 HN_NO_SLEEPING(sc); 3804 hn_rxfilter_config(sc); 3805 HN_SLEEPING_OK(sc); 3806 3807 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3808 error = hn_xpnt_vf_iocsetflags(sc); 3809 } else { 3810 hn_init_locked(sc); 3811 } 3812 } else { 3813 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3814 hn_stop(sc, false); 3815 } 3816 sc->hn_if_flags = ifp->if_flags; 3817 3818 HN_UNLOCK(sc); 3819 break; 3820 3821 case SIOCSIFCAP: 3822 HN_LOCK(sc); 3823 3824 if (hn_xpnt_vf_isready(sc)) { 3825 ifr_vf = *ifr; 3826 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3827 sizeof(ifr_vf.ifr_name)); 3828 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3829 HN_UNLOCK(sc); 3830 break; 3831 } 3832 3833 /* 3834 * Fix up requested capabilities w/ supported capabilities, 3835 * since the supported capabilities could have been changed. 3836 */ 3837 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3838 ifp->if_capenable; 3839 3840 if (mask & IFCAP_TXCSUM) { 3841 ifp->if_capenable ^= IFCAP_TXCSUM; 3842 if (ifp->if_capenable & IFCAP_TXCSUM) 3843 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3844 else 3845 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3846 } 3847 if (mask & IFCAP_TXCSUM_IPV6) { 3848 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3849 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3850 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3851 else 3852 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3853 } 3854 3855 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3856 if (mask & IFCAP_RXCSUM) 3857 ifp->if_capenable ^= IFCAP_RXCSUM; 3858 #ifdef foo 3859 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3860 if (mask & IFCAP_RXCSUM_IPV6) 3861 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3862 #endif 3863 3864 if (mask & IFCAP_LRO) 3865 ifp->if_capenable ^= IFCAP_LRO; 3866 3867 if (mask & IFCAP_TSO4) { 3868 ifp->if_capenable ^= IFCAP_TSO4; 3869 if (ifp->if_capenable & IFCAP_TSO4) 3870 ifp->if_hwassist |= CSUM_IP_TSO; 3871 else 3872 ifp->if_hwassist &= ~CSUM_IP_TSO; 3873 } 3874 if (mask & IFCAP_TSO6) { 3875 ifp->if_capenable ^= IFCAP_TSO6; 3876 if (ifp->if_capenable & IFCAP_TSO6) 3877 ifp->if_hwassist |= CSUM_IP6_TSO; 3878 else 3879 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3880 } 3881 3882 HN_UNLOCK(sc); 3883 break; 3884 3885 case SIOCADDMULTI: 3886 case SIOCDELMULTI: 3887 HN_LOCK(sc); 3888 3889 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3890 HN_UNLOCK(sc); 3891 break; 3892 } 3893 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3894 /* 3895 * Multicast uses mutex; use busy-wait for 3896 * the RNDIS reply. 3897 */ 3898 HN_NO_SLEEPING(sc); 3899 hn_rxfilter_config(sc); 3900 HN_SLEEPING_OK(sc); 3901 } 3902 3903 /* XXX vlan(4) style mcast addr maintenance */ 3904 if (hn_xpnt_vf_isready(sc)) { 3905 int old_if_flags; 3906 3907 old_if_flags = sc->hn_vf_ifp->if_flags; 3908 hn_xpnt_vf_saveifflags(sc); 3909 3910 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3911 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3912 IFF_ALLMULTI)) 3913 error = hn_xpnt_vf_iocsetflags(sc); 3914 } 3915 3916 HN_UNLOCK(sc); 3917 break; 3918 3919 case SIOCSIFMEDIA: 3920 case SIOCGIFMEDIA: 3921 HN_LOCK(sc); 3922 if (hn_xpnt_vf_isready(sc)) { 3923 /* 3924 * SIOCGIFMEDIA expects ifmediareq, so don't 3925 * create and pass ifr_vf to the VF here; just 3926 * replace the ifr_name. 3927 */ 3928 vf_ifp = sc->hn_vf_ifp; 3929 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 3930 sizeof(ifr->ifr_name)); 3931 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 3932 /* Restore the ifr_name. 
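 * so that the caller still sees this interface's name in the returned ifreq.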
*/ 3933 strlcpy(ifr->ifr_name, ifp->if_xname, 3934 sizeof(ifr->ifr_name)); 3935 HN_UNLOCK(sc); 3936 break; 3937 } 3938 HN_UNLOCK(sc); 3939 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 3940 break; 3941 3942 case SIOCGIFRSSHASH: 3943 ifrh = (struct ifrsshash *)data; 3944 HN_LOCK(sc); 3945 if (sc->hn_rx_ring_inuse == 1) { 3946 HN_UNLOCK(sc); 3947 ifrh->ifrh_func = RSS_FUNC_NONE; 3948 ifrh->ifrh_types = 0; 3949 break; 3950 } 3951 3952 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 3953 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 3954 else 3955 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 3956 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 3957 HN_UNLOCK(sc); 3958 break; 3959 3960 case SIOCGIFRSSKEY: 3961 ifrk = (struct ifrsskey *)data; 3962 HN_LOCK(sc); 3963 if (sc->hn_rx_ring_inuse == 1) { 3964 HN_UNLOCK(sc); 3965 ifrk->ifrk_func = RSS_FUNC_NONE; 3966 ifrk->ifrk_keylen = 0; 3967 break; 3968 } 3969 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 3970 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 3971 else 3972 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 3973 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 3974 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 3975 NDIS_HASH_KEYSIZE_TOEPLITZ); 3976 HN_UNLOCK(sc); 3977 break; 3978 3979 default: 3980 error = ether_ioctl(ifp, cmd, data); 3981 break; 3982 } 3983 return (error); 3984 } 3985 3986 static void 3987 hn_stop(struct hn_softc *sc, bool detaching) 3988 { 3989 struct ifnet *ifp = sc->hn_ifp; 3990 int i; 3991 3992 HN_LOCK_ASSERT(sc); 3993 3994 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 3995 ("synthetic parts were not attached")); 3996 3997 /* Clear RUNNING bit ASAP. */ 3998 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 3999 4000 /* Disable polling. */ 4001 hn_polling(sc, 0); 4002 4003 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4004 KASSERT(sc->hn_vf_ifp != NULL, 4005 ("%s: VF is not attached", ifp->if_xname)); 4006 4007 /* Mark transparent mode VF as disabled. */ 4008 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4009 4010 /* 4011 * NOTE: 4012 * Datapath setting must happen _before_ bringing 4013 * the VF down. 4014 */ 4015 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4016 4017 /* 4018 * Bring the VF down. 4019 */ 4020 hn_xpnt_vf_saveifflags(sc); 4021 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4022 hn_xpnt_vf_iocsetflags(sc); 4023 } 4024 4025 /* Suspend data transfers. */ 4026 hn_suspend_data(sc); 4027 4028 /* Clear OACTIVE bit. */ 4029 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4030 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4031 sc->hn_tx_ring[i].hn_oactive = 0; 4032 4033 /* 4034 * If the non-transparent mode VF is active, make sure 4035 * that the RX filter still allows packet reception. 4036 */ 4037 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4038 hn_rxfilter_config(sc); 4039 } 4040 4041 static void 4042 hn_init_locked(struct hn_softc *sc) 4043 { 4044 struct ifnet *ifp = sc->hn_ifp; 4045 int i; 4046 4047 HN_LOCK_ASSERT(sc); 4048 4049 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4050 return; 4051 4052 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4053 return; 4054 4055 /* Configure RX filter */ 4056 hn_rxfilter_config(sc); 4057 4058 /* Clear OACTIVE bit. */ 4059 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4060 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4061 sc->hn_tx_ring[i].hn_oactive = 0; 4062 4063 /* Clear TX 'suspended' bit. */ 4064 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4065 4066 if (hn_xpnt_vf_isready(sc)) { 4067 /* Initialize transparent VF. 
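 * (hn_xpnt_vf_init() is expected to bring the VF up and switch the
 * data path over to it, i.e. the reverse of what hn_stop() does.)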
*/ 4068 hn_xpnt_vf_init(sc); 4069 } 4070 4071 /* Everything is ready; unleash! */ 4072 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4073 4074 /* Re-enable polling if requested. */ 4075 if (sc->hn_pollhz > 0) 4076 hn_polling(sc, sc->hn_pollhz); 4077 } 4078 4079 static void 4080 hn_init(void *xsc) 4081 { 4082 struct hn_softc *sc = xsc; 4083 4084 HN_LOCK(sc); 4085 hn_init_locked(sc); 4086 HN_UNLOCK(sc); 4087 } 4088 4089 #if __FreeBSD_version >= 1100099 4090 4091 static int 4092 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4093 { 4094 struct hn_softc *sc = arg1; 4095 unsigned int lenlim; 4096 int error; 4097 4098 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4099 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4100 if (error || req->newptr == NULL) 4101 return error; 4102 4103 HN_LOCK(sc); 4104 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4105 lenlim > TCP_LRO_LENGTH_MAX) { 4106 HN_UNLOCK(sc); 4107 return EINVAL; 4108 } 4109 hn_set_lro_lenlim(sc, lenlim); 4110 HN_UNLOCK(sc); 4111 4112 return 0; 4113 } 4114 4115 static int 4116 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4117 { 4118 struct hn_softc *sc = arg1; 4119 int ackcnt, error, i; 4120 4121 /* 4122 * lro_ackcnt_lim is append count limit, 4123 * +1 to turn it into aggregation limit. 4124 */ 4125 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4126 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4127 if (error || req->newptr == NULL) 4128 return error; 4129 4130 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4131 return EINVAL; 4132 4133 /* 4134 * Convert aggregation limit back to append 4135 * count limit. 4136 */ 4137 --ackcnt; 4138 HN_LOCK(sc); 4139 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4140 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4141 HN_UNLOCK(sc); 4142 return 0; 4143 } 4144 4145 #endif 4146 4147 static int 4148 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4149 { 4150 struct hn_softc *sc = arg1; 4151 int hcsum = arg2; 4152 int on, error, i; 4153 4154 on = 0; 4155 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4156 on = 1; 4157 4158 error = sysctl_handle_int(oidp, &on, 0, req); 4159 if (error || req->newptr == NULL) 4160 return error; 4161 4162 HN_LOCK(sc); 4163 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4164 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4165 4166 if (on) 4167 rxr->hn_trust_hcsum |= hcsum; 4168 else 4169 rxr->hn_trust_hcsum &= ~hcsum; 4170 } 4171 HN_UNLOCK(sc); 4172 return 0; 4173 } 4174 4175 static int 4176 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4177 { 4178 struct hn_softc *sc = arg1; 4179 int chim_size, error; 4180 4181 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4182 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4183 if (error || req->newptr == NULL) 4184 return error; 4185 4186 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4187 return EINVAL; 4188 4189 HN_LOCK(sc); 4190 hn_set_chim_size(sc, chim_size); 4191 HN_UNLOCK(sc); 4192 return 0; 4193 } 4194 4195 #if __FreeBSD_version < 1100095 4196 static int 4197 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4198 { 4199 struct hn_softc *sc = arg1; 4200 int ofs = arg2, i, error; 4201 struct hn_rx_ring *rxr; 4202 uint64_t stat; 4203 4204 stat = 0; 4205 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4206 rxr = &sc->hn_rx_ring[i]; 4207 stat += *((int *)((uint8_t *)rxr + ofs)); 4208 } 4209 4210 error = sysctl_handle_64(oidp, &stat, 0, req); 4211 if (error || req->newptr == NULL) 4212 return error; 4213 4214 /* Zero out this stat. 
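 * The written value itself is ignored; any write through this sysctl
 * simply resets the per-ring counters, e.g. a hypothetical
 * `sysctl dev.hn.0.lro_queued=0` (unit 0 assumed) clears the aggregate.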
*/ 4215 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4216 rxr = &sc->hn_rx_ring[i]; 4217 *((int *)((uint8_t *)rxr + ofs)) = 0; 4218 } 4219 return 0; 4220 } 4221 #else 4222 static int 4223 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4224 { 4225 struct hn_softc *sc = arg1; 4226 int ofs = arg2, i, error; 4227 struct hn_rx_ring *rxr; 4228 uint64_t stat; 4229 4230 stat = 0; 4231 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4232 rxr = &sc->hn_rx_ring[i]; 4233 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4234 } 4235 4236 error = sysctl_handle_64(oidp, &stat, 0, req); 4237 if (error || req->newptr == NULL) 4238 return error; 4239 4240 /* Zero out this stat. */ 4241 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4242 rxr = &sc->hn_rx_ring[i]; 4243 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4244 } 4245 return 0; 4246 } 4247 4248 #endif 4249 4250 static int 4251 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4252 { 4253 struct hn_softc *sc = arg1; 4254 int ofs = arg2, i, error; 4255 struct hn_rx_ring *rxr; 4256 u_long stat; 4257 4258 stat = 0; 4259 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4260 rxr = &sc->hn_rx_ring[i]; 4261 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4262 } 4263 4264 error = sysctl_handle_long(oidp, &stat, 0, req); 4265 if (error || req->newptr == NULL) 4266 return error; 4267 4268 /* Zero out this stat. */ 4269 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4270 rxr = &sc->hn_rx_ring[i]; 4271 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4272 } 4273 return 0; 4274 } 4275 4276 static int 4277 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4278 { 4279 struct hn_softc *sc = arg1; 4280 int ofs = arg2, i, error; 4281 struct hn_tx_ring *txr; 4282 u_long stat; 4283 4284 stat = 0; 4285 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4286 txr = &sc->hn_tx_ring[i]; 4287 stat += *((u_long *)((uint8_t *)txr + ofs)); 4288 } 4289 4290 error = sysctl_handle_long(oidp, &stat, 0, req); 4291 if (error || req->newptr == NULL) 4292 return error; 4293 4294 /* Zero out this stat. 
*/ 4295 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4296 txr = &sc->hn_tx_ring[i]; 4297 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4298 } 4299 return 0; 4300 } 4301 4302 static int 4303 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4304 { 4305 struct hn_softc *sc = arg1; 4306 int ofs = arg2, i, error, conf; 4307 struct hn_tx_ring *txr; 4308 4309 txr = &sc->hn_tx_ring[0]; 4310 conf = *((int *)((uint8_t *)txr + ofs)); 4311 4312 error = sysctl_handle_int(oidp, &conf, 0, req); 4313 if (error || req->newptr == NULL) 4314 return error; 4315 4316 HN_LOCK(sc); 4317 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4318 txr = &sc->hn_tx_ring[i]; 4319 *((int *)((uint8_t *)txr + ofs)) = conf; 4320 } 4321 HN_UNLOCK(sc); 4322 4323 return 0; 4324 } 4325 4326 static int 4327 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4328 { 4329 struct hn_softc *sc = arg1; 4330 int error, size; 4331 4332 size = sc->hn_agg_size; 4333 error = sysctl_handle_int(oidp, &size, 0, req); 4334 if (error || req->newptr == NULL) 4335 return (error); 4336 4337 HN_LOCK(sc); 4338 sc->hn_agg_size = size; 4339 hn_set_txagg(sc); 4340 HN_UNLOCK(sc); 4341 4342 return (0); 4343 } 4344 4345 static int 4346 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4347 { 4348 struct hn_softc *sc = arg1; 4349 int error, pkts; 4350 4351 pkts = sc->hn_agg_pkts; 4352 error = sysctl_handle_int(oidp, &pkts, 0, req); 4353 if (error || req->newptr == NULL) 4354 return (error); 4355 4356 HN_LOCK(sc); 4357 sc->hn_agg_pkts = pkts; 4358 hn_set_txagg(sc); 4359 HN_UNLOCK(sc); 4360 4361 return (0); 4362 } 4363 4364 static int 4365 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4366 { 4367 struct hn_softc *sc = arg1; 4368 int pkts; 4369 4370 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4371 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4372 } 4373 4374 static int 4375 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4376 { 4377 struct hn_softc *sc = arg1; 4378 int align; 4379 4380 align = sc->hn_tx_ring[0].hn_agg_align; 4381 return (sysctl_handle_int(oidp, &align, 0, req)); 4382 } 4383 4384 static void 4385 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4386 { 4387 if (pollhz == 0) 4388 vmbus_chan_poll_disable(chan); 4389 else 4390 vmbus_chan_poll_enable(chan, pollhz); 4391 } 4392 4393 static void 4394 hn_polling(struct hn_softc *sc, u_int pollhz) 4395 { 4396 int nsubch = sc->hn_rx_ring_inuse - 1; 4397 4398 HN_LOCK_ASSERT(sc); 4399 4400 if (nsubch > 0) { 4401 struct vmbus_channel **subch; 4402 int i; 4403 4404 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4405 for (i = 0; i < nsubch; ++i) 4406 hn_chan_polling(subch[i], pollhz); 4407 vmbus_subchan_rel(subch, nsubch); 4408 } 4409 hn_chan_polling(sc->hn_prichan, pollhz); 4410 } 4411 4412 static int 4413 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4414 { 4415 struct hn_softc *sc = arg1; 4416 int pollhz, error; 4417 4418 pollhz = sc->hn_pollhz; 4419 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4420 if (error || req->newptr == NULL) 4421 return (error); 4422 4423 if (pollhz != 0 && 4424 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4425 return (EINVAL); 4426 4427 HN_LOCK(sc); 4428 if (sc->hn_pollhz != pollhz) { 4429 sc->hn_pollhz = pollhz; 4430 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4431 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4432 hn_polling(sc, sc->hn_pollhz); 4433 } 4434 HN_UNLOCK(sc); 4435 4436 return (0); 4437 } 4438 4439 static int 4440 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4441 { 4442 struct hn_softc *sc = arg1; 4443 char verstr[16]; 4444 4445 snprintf(verstr, sizeof(verstr), "%u.%u", 4446 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4447 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4448 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4449 } 4450 4451 static int 4452 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4453 { 4454 struct hn_softc *sc = arg1; 4455 char caps_str[128]; 4456 uint32_t caps; 4457 4458 HN_LOCK(sc); 4459 caps = sc->hn_caps; 4460 HN_UNLOCK(sc); 4461 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4462 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4463 } 4464 4465 static int 4466 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4467 { 4468 struct hn_softc *sc = arg1; 4469 char assist_str[128]; 4470 uint32_t hwassist; 4471 4472 HN_LOCK(sc); 4473 hwassist = sc->hn_ifp->if_hwassist; 4474 HN_UNLOCK(sc); 4475 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4476 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4477 } 4478 4479 static int 4480 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4481 { 4482 struct hn_softc *sc = arg1; 4483 char filter_str[128]; 4484 uint32_t filter; 4485 4486 HN_LOCK(sc); 4487 filter = sc->hn_rx_filter; 4488 HN_UNLOCK(sc); 4489 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4490 NDIS_PACKET_TYPES); 4491 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4492 } 4493 4494 #ifndef RSS 4495 4496 static int 4497 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4498 { 4499 struct hn_softc *sc = arg1; 4500 int error; 4501 4502 HN_LOCK(sc); 4503 4504 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4505 if (error || req->newptr == NULL) 4506 goto back; 4507 4508 if ((sc->hn_flags & HN_FLAG_RXVF) || 4509 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4510 /* 4511 * RSS key is synchronized w/ VF's, don't allow users 4512 * to change it. 4513 */ 4514 error = EBUSY; 4515 goto back; 4516 } 4517 4518 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4519 if (error) 4520 goto back; 4521 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4522 4523 if (sc->hn_rx_ring_inuse > 1) { 4524 error = hn_rss_reconfig(sc); 4525 } else { 4526 /* Not RSS capable, at least for now; just save the RSS key. */ 4527 error = 0; 4528 } 4529 back: 4530 HN_UNLOCK(sc); 4531 return (error); 4532 } 4533 4534 static int 4535 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4536 { 4537 struct hn_softc *sc = arg1; 4538 int error; 4539 4540 HN_LOCK(sc); 4541 4542 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4543 if (error || req->newptr == NULL) 4544 goto back; 4545 4546 /* 4547 * Don't allow RSS indirect table change, if this interface is not 4548 * RSS capable currently. 
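 * A single RX ring in use means RSS is effectively disabled, so the
 * request is rejected with EOPNOTSUPP below.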
4549 */ 4550 if (sc->hn_rx_ring_inuse == 1) { 4551 error = EOPNOTSUPP; 4552 goto back; 4553 } 4554 4555 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4556 if (error) 4557 goto back; 4558 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4559 4560 hn_rss_ind_fixup(sc); 4561 error = hn_rss_reconfig(sc); 4562 back: 4563 HN_UNLOCK(sc); 4564 return (error); 4565 } 4566 4567 #endif /* !RSS */ 4568 4569 static int 4570 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4571 { 4572 struct hn_softc *sc = arg1; 4573 char hash_str[128]; 4574 uint32_t hash; 4575 4576 HN_LOCK(sc); 4577 hash = sc->hn_rss_hash; 4578 HN_UNLOCK(sc); 4579 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4580 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4581 } 4582 4583 static int 4584 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4585 { 4586 struct hn_softc *sc = arg1; 4587 char hash_str[128]; 4588 uint32_t hash; 4589 4590 HN_LOCK(sc); 4591 hash = sc->hn_rss_hcap; 4592 HN_UNLOCK(sc); 4593 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4594 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4595 } 4596 4597 static int 4598 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4599 { 4600 struct hn_softc *sc = arg1; 4601 char hash_str[128]; 4602 uint32_t hash; 4603 4604 HN_LOCK(sc); 4605 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4606 HN_UNLOCK(sc); 4607 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4608 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4609 } 4610 4611 static int 4612 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4613 { 4614 struct hn_softc *sc = arg1; 4615 char vf_name[IFNAMSIZ + 1]; 4616 struct ifnet *vf_ifp; 4617 4618 HN_LOCK(sc); 4619 vf_name[0] = '\0'; 4620 vf_ifp = sc->hn_vf_ifp; 4621 if (vf_ifp != NULL) 4622 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4623 HN_UNLOCK(sc); 4624 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4625 } 4626 4627 static int 4628 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4629 { 4630 struct hn_softc *sc = arg1; 4631 char vf_name[IFNAMSIZ + 1]; 4632 struct ifnet *vf_ifp; 4633 4634 HN_LOCK(sc); 4635 vf_name[0] = '\0'; 4636 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4637 if (vf_ifp != NULL) 4638 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4639 HN_UNLOCK(sc); 4640 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4641 } 4642 4643 static int 4644 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4645 { 4646 struct rm_priotracker pt; 4647 struct sbuf *sb; 4648 int error, i; 4649 bool first; 4650 4651 error = sysctl_wire_old_buffer(req, 0); 4652 if (error != 0) 4653 return (error); 4654 4655 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4656 if (sb == NULL) 4657 return (ENOMEM); 4658 4659 rm_rlock(&hn_vfmap_lock, &pt); 4660 4661 first = true; 4662 for (i = 0; i < hn_vfmap_size; ++i) { 4663 struct ifnet *ifp; 4664 4665 if (hn_vfmap[i] == NULL) 4666 continue; 4667 4668 ifp = ifnet_byindex(i); 4669 if (ifp != NULL) { 4670 if (first) 4671 sbuf_printf(sb, "%s", ifp->if_xname); 4672 else 4673 sbuf_printf(sb, " %s", ifp->if_xname); 4674 first = false; 4675 } 4676 } 4677 4678 rm_runlock(&hn_vfmap_lock, &pt); 4679 4680 error = sbuf_finish(sb); 4681 sbuf_delete(sb); 4682 return (error); 4683 } 4684 4685 static int 4686 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4687 { 4688 struct rm_priotracker pt; 4689 struct sbuf *sb; 4690 int error, i; 4691 bool first; 4692 4693 error = sysctl_wire_old_buffer(req, 0); 4694 if (error != 0) 4695 return (error); 4696 4697 sb = 
sbuf_new_for_sysctl(NULL, NULL, 128, req); 4698 if (sb == NULL) 4699 return (ENOMEM); 4700 4701 rm_rlock(&hn_vfmap_lock, &pt); 4702 4703 first = true; 4704 for (i = 0; i < hn_vfmap_size; ++i) { 4705 struct ifnet *ifp, *hn_ifp; 4706 4707 hn_ifp = hn_vfmap[i]; 4708 if (hn_ifp == NULL) 4709 continue; 4710 4711 ifp = ifnet_byindex(i); 4712 if (ifp != NULL) { 4713 if (first) { 4714 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4715 hn_ifp->if_xname); 4716 } else { 4717 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4718 hn_ifp->if_xname); 4719 } 4720 first = false; 4721 } 4722 } 4723 4724 rm_runlock(&hn_vfmap_lock, &pt); 4725 4726 error = sbuf_finish(sb); 4727 sbuf_delete(sb); 4728 return (error); 4729 } 4730 4731 static int 4732 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4733 { 4734 struct hn_softc *sc = arg1; 4735 int error, onoff = 0; 4736 4737 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4738 onoff = 1; 4739 error = sysctl_handle_int(oidp, &onoff, 0, req); 4740 if (error || req->newptr == NULL) 4741 return (error); 4742 4743 HN_LOCK(sc); 4744 /* NOTE: hn_vf_lock for hn_transmit() */ 4745 rm_wlock(&sc->hn_vf_lock); 4746 if (onoff) 4747 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4748 else 4749 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4750 rm_wunlock(&sc->hn_vf_lock); 4751 HN_UNLOCK(sc); 4752 4753 return (0); 4754 } 4755 4756 static int 4757 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4758 { 4759 struct hn_softc *sc = arg1; 4760 int enabled = 0; 4761 4762 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4763 enabled = 1; 4764 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4765 } 4766 4767 static int 4768 hn_check_iplen(const struct mbuf *m, int hoff) 4769 { 4770 const struct ip *ip; 4771 int len, iphlen, iplen; 4772 const struct tcphdr *th; 4773 int thoff; /* TCP data offset */ 4774 4775 len = hoff + sizeof(struct ip); 4776 4777 /* The packet must be at least the size of an IP header. */ 4778 if (m->m_pkthdr.len < len) 4779 return IPPROTO_DONE; 4780 4781 /* The fixed IP header must reside completely in the first mbuf. */ 4782 if (m->m_len < len) 4783 return IPPROTO_DONE; 4784 4785 ip = mtodo(m, hoff); 4786 4787 /* Bound check the packet's stated IP header length. */ 4788 iphlen = ip->ip_hl << 2; 4789 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4790 return IPPROTO_DONE; 4791 4792 /* The full IP header must reside completely in the one mbuf. */ 4793 if (m->m_len < hoff + iphlen) 4794 return IPPROTO_DONE; 4795 4796 iplen = ntohs(ip->ip_len); 4797 4798 /* 4799 * Check that the amount of data in the buffers is as 4800 * at least much as the IP header would have us expect. 4801 */ 4802 if (m->m_pkthdr.len < hoff + iplen) 4803 return IPPROTO_DONE; 4804 4805 /* 4806 * Ignore IP fragments. 4807 */ 4808 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4809 return IPPROTO_DONE; 4810 4811 /* 4812 * The TCP/IP or UDP/IP header must be entirely contained within 4813 * the first fragment of a packet. 
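 * On success the IP protocol number is returned, so callers can tell
 * well-formed TCP/UDP frames apart; anything truncated, fragmented or
 * otherwise suspicious yields IPPROTO_DONE.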
4814 */ 4815 switch (ip->ip_p) { 4816 case IPPROTO_TCP: 4817 if (iplen < iphlen + sizeof(struct tcphdr)) 4818 return IPPROTO_DONE; 4819 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4820 return IPPROTO_DONE; 4821 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4822 thoff = th->th_off << 2; 4823 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4824 return IPPROTO_DONE; 4825 if (m->m_len < hoff + iphlen + thoff) 4826 return IPPROTO_DONE; 4827 break; 4828 case IPPROTO_UDP: 4829 if (iplen < iphlen + sizeof(struct udphdr)) 4830 return IPPROTO_DONE; 4831 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4832 return IPPROTO_DONE; 4833 break; 4834 default: 4835 if (iplen < iphlen) 4836 return IPPROTO_DONE; 4837 break; 4838 } 4839 return ip->ip_p; 4840 } 4841 4842 static int 4843 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4844 { 4845 struct sysctl_oid_list *child; 4846 struct sysctl_ctx_list *ctx; 4847 device_t dev = sc->hn_dev; 4848 #if defined(INET) || defined(INET6) 4849 #if __FreeBSD_version >= 1100095 4850 int lroent_cnt; 4851 #endif 4852 #endif 4853 int i; 4854 4855 /* 4856 * Create RXBUF for reception. 4857 * 4858 * NOTE: 4859 * - It is shared by all channels. 4860 * - A large enough buffer is allocated, certain version of NVSes 4861 * may further limit the usable space. 4862 */ 4863 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4864 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4865 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4866 if (sc->hn_rxbuf == NULL) { 4867 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4868 return (ENOMEM); 4869 } 4870 4871 sc->hn_rx_ring_cnt = ring_cnt; 4872 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4873 4874 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4875 M_DEVBUF, M_WAITOK | M_ZERO); 4876 4877 #if defined(INET) || defined(INET6) 4878 #if __FreeBSD_version >= 1100095 4879 lroent_cnt = hn_lro_entry_count; 4880 if (lroent_cnt < TCP_LRO_ENTRIES) 4881 lroent_cnt = TCP_LRO_ENTRIES; 4882 if (bootverbose) 4883 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4884 #endif 4885 #endif /* INET || INET6 */ 4886 4887 ctx = device_get_sysctl_ctx(dev); 4888 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4889 4890 /* Create dev.hn.UNIT.rx sysctl tree */ 4891 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4892 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4893 4894 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4895 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4896 4897 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4898 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 4899 &rxr->hn_br_dma, BUS_DMA_WAITOK); 4900 if (rxr->hn_br == NULL) { 4901 device_printf(dev, "allocate bufring failed\n"); 4902 return (ENOMEM); 4903 } 4904 4905 if (hn_trust_hosttcp) 4906 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4907 if (hn_trust_hostudp) 4908 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4909 if (hn_trust_hostip) 4910 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4911 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 4912 rxr->hn_ifp = sc->hn_ifp; 4913 if (i < sc->hn_tx_ring_cnt) 4914 rxr->hn_txr = &sc->hn_tx_ring[i]; 4915 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 4916 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 4917 rxr->hn_rx_idx = i; 4918 rxr->hn_rxbuf = sc->hn_rxbuf; 4919 4920 /* 4921 * Initialize LRO. 
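 * Per-ring LRO state; the length and ACK-count limits set here can be
 * tuned later through the lro_length_lim and lro_ackcnt_lim sysctls.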
*/ 4923 #if defined(INET) || defined(INET6) 4924 #if __FreeBSD_version >= 1100095 4925 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 4926 hn_lro_mbufq_depth); 4927 #else 4928 tcp_lro_init(&rxr->hn_lro); 4929 rxr->hn_lro.ifp = sc->hn_ifp; 4930 #endif 4931 #if __FreeBSD_version >= 1100099 4932 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 4933 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 4934 #endif 4935 #endif /* INET || INET6 */ 4936 4937 if (sc->hn_rx_sysctl_tree != NULL) { 4938 char name[16]; 4939 4940 /* 4941 * Create per RX ring sysctl tree: 4942 * dev.hn.UNIT.rx.RINGID 4943 */ 4944 snprintf(name, sizeof(name), "%d", i); 4945 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 4946 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 4947 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4948 4949 if (rxr->hn_rx_sysctl_tree != NULL) { 4950 SYSCTL_ADD_ULONG(ctx, 4951 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4952 OID_AUTO, "packets", CTLFLAG_RW, 4953 &rxr->hn_pkts, "# of packets received"); 4954 SYSCTL_ADD_ULONG(ctx, 4955 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4956 OID_AUTO, "rss_pkts", CTLFLAG_RW, 4957 &rxr->hn_rss_pkts, 4958 "# of packets w/ RSS info received"); 4959 SYSCTL_ADD_INT(ctx, 4960 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4961 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 4962 &rxr->hn_pktbuf_len, 0, 4963 "Temporary channel packet buffer length"); 4964 } 4965 } 4966 } 4967 4968 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 4969 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4970 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 4971 #if __FreeBSD_version < 1100095 4972 hn_rx_stat_int_sysctl, 4973 #else 4974 hn_rx_stat_u64_sysctl, 4975 #endif 4976 "LU", "LRO queued"); 4977 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 4978 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4979 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 4980 #if __FreeBSD_version < 1100095 4981 hn_rx_stat_int_sysctl, 4982 #else 4983 hn_rx_stat_u64_sysctl, 4984 #endif 4985 "LU", "LRO flushed"); 4986 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 4987 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4988 __offsetof(struct hn_rx_ring, hn_lro_tried), 4989 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 4990 #if __FreeBSD_version >= 1100099 4991 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 4992 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4993 hn_lro_lenlim_sysctl, "IU", 4994 "Max # of data bytes to be aggregated by LRO"); 4995 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 4996 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4997 hn_lro_ackcnt_sysctl, "I", 4998 "Max # of ACKs to be aggregated by LRO"); 4999 #endif 5000 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5001 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5002 hn_trust_hcsum_sysctl, "I", 5003 "Trust TCP segment verification on host side, " 5004 "when csum info is missing"); 5005 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5006 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5007 hn_trust_hcsum_sysctl, "I", 5008 "Trust UDP datagram verification on host side, " 5009 "when csum info is missing"); 5010 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5011 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5012 hn_trust_hcsum_sysctl, "I", 5013 "Trust IP packet verification on host side, " 5014 "when csum info is missing"); 5015 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5016 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5017
__offsetof(struct hn_rx_ring, hn_csum_ip), 5018 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5019 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5020 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5021 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5022 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5023 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5024 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5025 __offsetof(struct hn_rx_ring, hn_csum_udp), 5026 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5027 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5028 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5029 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5030 hn_rx_stat_ulong_sysctl, "LU", 5031 "# of packets that we trust host's csum verification"); 5032 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5033 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5034 __offsetof(struct hn_rx_ring, hn_small_pkts), 5035 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5036 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5037 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5038 __offsetof(struct hn_rx_ring, hn_ack_failed), 5039 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5040 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5041 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5042 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5043 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5044 5045 return (0); 5046 } 5047 5048 static void 5049 hn_destroy_rx_data(struct hn_softc *sc) 5050 { 5051 int i; 5052 5053 if (sc->hn_rxbuf != NULL) { 5054 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5055 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5056 else 5057 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5058 sc->hn_rxbuf = NULL; 5059 } 5060 5061 if (sc->hn_rx_ring_cnt == 0) 5062 return; 5063 5064 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5065 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5066 5067 if (rxr->hn_br == NULL) 5068 continue; 5069 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5070 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5071 } else { 5072 device_printf(sc->hn_dev, 5073 "%dth channel bufring is referenced", i); 5074 } 5075 rxr->hn_br = NULL; 5076 5077 #if defined(INET) || defined(INET6) 5078 tcp_lro_free(&rxr->hn_lro); 5079 #endif 5080 free(rxr->hn_pktbuf, M_DEVBUF); 5081 } 5082 free(sc->hn_rx_ring, M_DEVBUF); 5083 sc->hn_rx_ring = NULL; 5084 5085 sc->hn_rx_ring_cnt = 0; 5086 sc->hn_rx_ring_inuse = 0; 5087 } 5088 5089 static int 5090 hn_tx_ring_create(struct hn_softc *sc, int id) 5091 { 5092 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5093 device_t dev = sc->hn_dev; 5094 bus_dma_tag_t parent_dtag; 5095 int error, i; 5096 5097 txr->hn_sc = sc; 5098 txr->hn_tx_idx = id; 5099 5100 #ifndef HN_USE_TXDESC_BUFRING 5101 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5102 #endif 5103 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5104 5105 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5106 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5107 M_DEVBUF, M_WAITOK | M_ZERO); 5108 #ifndef HN_USE_TXDESC_BUFRING 5109 SLIST_INIT(&txr->hn_txlist); 5110 #else 5111 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5112 M_WAITOK, &txr->hn_tx_lock); 5113 #endif 5114 5115 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5116 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5117 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5118 } else { 5119 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5120 } 5121 5122 #ifdef HN_IFSTART_SUPPORT 5123 if (hn_use_if_start) { 5124 txr->hn_txeof = hn_start_txeof; 5125 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5126 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5127 } else 5128 #endif 5129 { 5130 int br_depth; 5131 5132 txr->hn_txeof = hn_xmit_txeof; 5133 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5134 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5135 5136 br_depth = hn_get_txswq_depth(txr); 5137 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5138 M_WAITOK, &txr->hn_tx_lock); 5139 } 5140 5141 txr->hn_direct_tx_size = hn_direct_tx_size; 5142 5143 /* 5144 * Always schedule transmission instead of trying to do direct 5145 * transmission. This one gives the best performance so far. 5146 */ 5147 txr->hn_sched_tx = 1; 5148 5149 parent_dtag = bus_get_dma_tag(dev); 5150 5151 /* DMA tag for RNDIS packet messages. */ 5152 error = bus_dma_tag_create(parent_dtag, /* parent */ 5153 HN_RNDIS_PKT_ALIGN, /* alignment */ 5154 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5155 BUS_SPACE_MAXADDR, /* lowaddr */ 5156 BUS_SPACE_MAXADDR, /* highaddr */ 5157 NULL, NULL, /* filter, filterarg */ 5158 HN_RNDIS_PKT_LEN, /* maxsize */ 5159 1, /* nsegments */ 5160 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5161 0, /* flags */ 5162 NULL, /* lockfunc */ 5163 NULL, /* lockfuncarg */ 5164 &txr->hn_tx_rndis_dtag); 5165 if (error) { 5166 device_printf(dev, "failed to create rndis dmatag\n"); 5167 return error; 5168 } 5169 5170 /* DMA tag for data. */ 5171 error = bus_dma_tag_create(parent_dtag, /* parent */ 5172 1, /* alignment */ 5173 HN_TX_DATA_BOUNDARY, /* boundary */ 5174 BUS_SPACE_MAXADDR, /* lowaddr */ 5175 BUS_SPACE_MAXADDR, /* highaddr */ 5176 NULL, NULL, /* filter, filterarg */ 5177 HN_TX_DATA_MAXSIZE, /* maxsize */ 5178 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5179 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5180 0, /* flags */ 5181 NULL, /* lockfunc */ 5182 NULL, /* lockfuncarg */ 5183 &txr->hn_tx_data_dtag); 5184 if (error) { 5185 device_printf(dev, "failed to create data dmatag\n"); 5186 return error; 5187 } 5188 5189 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5190 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5191 5192 txd->txr = txr; 5193 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5194 STAILQ_INIT(&txd->agg_list); 5195 5196 /* 5197 * Allocate and load RNDIS packet message. 5198 */ 5199 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5200 (void **)&txd->rndis_pkt, 5201 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5202 &txd->rndis_pkt_dmap); 5203 if (error) { 5204 device_printf(dev, 5205 "failed to allocate rndis_packet_msg, %d\n", i); 5206 return error; 5207 } 5208 5209 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5210 txd->rndis_pkt_dmap, 5211 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5212 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5213 BUS_DMA_NOWAIT); 5214 if (error) { 5215 device_printf(dev, 5216 "failed to load rndis_packet_msg, %d\n", i); 5217 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5218 txd->rndis_pkt, txd->rndis_pkt_dmap); 5219 return error; 5220 } 5221 5222 /* DMA map for TX data. 
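 * One map per txdesc; the mbuf chain is loaded into it when the
 * packet is encapsulated and unloaded after the send completes.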
*/ 5223 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5224 &txd->data_dmap); 5225 if (error) { 5226 device_printf(dev, 5227 "failed to allocate tx data dmamap\n"); 5228 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5229 txd->rndis_pkt_dmap); 5230 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5231 txd->rndis_pkt, txd->rndis_pkt_dmap); 5232 return error; 5233 } 5234 5235 /* All set, put it to list */ 5236 txd->flags |= HN_TXD_FLAG_ONLIST; 5237 #ifndef HN_USE_TXDESC_BUFRING 5238 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5239 #else 5240 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5241 #endif 5242 } 5243 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5244 5245 if (sc->hn_tx_sysctl_tree != NULL) { 5246 struct sysctl_oid_list *child; 5247 struct sysctl_ctx_list *ctx; 5248 char name[16]; 5249 5250 /* 5251 * Create per TX ring sysctl tree: 5252 * dev.hn.UNIT.tx.RINGID 5253 */ 5254 ctx = device_get_sysctl_ctx(dev); 5255 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5256 5257 snprintf(name, sizeof(name), "%d", id); 5258 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5259 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5260 5261 if (txr->hn_tx_sysctl_tree != NULL) { 5262 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5263 5264 #ifdef HN_DEBUG 5265 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5266 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5267 "# of available TX descs"); 5268 #endif 5269 #ifdef HN_IFSTART_SUPPORT 5270 if (!hn_use_if_start) 5271 #endif 5272 { 5273 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5274 CTLFLAG_RD, &txr->hn_oactive, 0, 5275 "over active"); 5276 } 5277 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5278 CTLFLAG_RW, &txr->hn_pkts, 5279 "# of packets transmitted"); 5280 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5281 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 5282 } 5283 } 5284 5285 return 0; 5286 } 5287 5288 static void 5289 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5290 { 5291 struct hn_tx_ring *txr = txd->txr; 5292 5293 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5294 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5295 5296 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5297 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5298 txd->rndis_pkt_dmap); 5299 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5300 } 5301 5302 static void 5303 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5304 { 5305 5306 KASSERT(txd->refs == 0 || txd->refs == 1, 5307 ("invalid txd refs %d", txd->refs)); 5308 5309 /* Aggregated txds will be freed by their aggregating txd. */ 5310 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5311 int freed; 5312 5313 freed = hn_txdesc_put(txr, txd); 5314 KASSERT(freed, ("can't free txdesc")); 5315 } 5316 } 5317 5318 static void 5319 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5320 { 5321 int i; 5322 5323 if (txr->hn_txdesc == NULL) 5324 return; 5325 5326 /* 5327 * NOTE: 5328 * Because the freeing of aggregated txds will be deferred 5329 * to the aggregating txd, two passes are used here: 5330 * - The first pass GCes any pending txds. This GC is necessary, 5331 * since if the channels are revoked, hypervisor will not 5332 * deliver send-done for all pending txds. 5333 * - The second pass frees the busdma stuffs, i.e. after all txds 5334 * were freed. 
5335 */ 5336 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5337 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5338 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5339 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5340 5341 if (txr->hn_tx_data_dtag != NULL) 5342 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5343 if (txr->hn_tx_rndis_dtag != NULL) 5344 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5345 5346 #ifdef HN_USE_TXDESC_BUFRING 5347 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5348 #endif 5349 5350 free(txr->hn_txdesc, M_DEVBUF); 5351 txr->hn_txdesc = NULL; 5352 5353 if (txr->hn_mbuf_br != NULL) 5354 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5355 5356 #ifndef HN_USE_TXDESC_BUFRING 5357 mtx_destroy(&txr->hn_txlist_spin); 5358 #endif 5359 mtx_destroy(&txr->hn_tx_lock); 5360 } 5361 5362 static int 5363 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5364 { 5365 struct sysctl_oid_list *child; 5366 struct sysctl_ctx_list *ctx; 5367 int i; 5368 5369 /* 5370 * Create TXBUF for chimney sending. 5371 * 5372 * NOTE: It is shared by all channels. 5373 */ 5374 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5375 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5376 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5377 if (sc->hn_chim == NULL) { 5378 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5379 return (ENOMEM); 5380 } 5381 5382 sc->hn_tx_ring_cnt = ring_cnt; 5383 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5384 5385 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5386 M_DEVBUF, M_WAITOK | M_ZERO); 5387 5388 ctx = device_get_sysctl_ctx(sc->hn_dev); 5389 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5390 5391 /* Create dev.hn.UNIT.tx sysctl tree */ 5392 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5393 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5394 5395 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5396 int error; 5397 5398 error = hn_tx_ring_create(sc, i); 5399 if (error) 5400 return error; 5401 } 5402 5403 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5404 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5405 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5406 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5407 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5408 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5409 __offsetof(struct hn_tx_ring, hn_send_failed), 5410 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5411 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5412 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5413 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5414 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5415 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5416 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5417 __offsetof(struct hn_tx_ring, hn_flush_failed), 5418 hn_tx_stat_ulong_sysctl, "LU", 5419 "# of packet transmission aggregation flush failure"); 5420 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5421 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5422 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5423 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5424 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5425 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5426 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5427 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5428 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5429 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5430 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5431 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5432 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5433 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5434 "# of total TX descs"); 5435 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5436 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5437 "Chimney send packet size upper boundary"); 5438 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5439 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5440 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5441 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5442 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5443 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5444 hn_tx_conf_int_sysctl, "I", 5445 "Size of the packet for direct transmission"); 5446 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5447 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5448 __offsetof(struct hn_tx_ring, hn_sched_tx), 5449 hn_tx_conf_int_sysctl, "I", 5450 "Always schedule transmission " 5451 "instead of doing direct transmission"); 5452 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5453 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5454 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5455 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5456 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5457 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5458 "Applied packet transmission aggregation size"); 5459 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5460 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5461 hn_txagg_pktmax_sysctl, "I", 5462 "Applied packet transmission aggregation packets"); 5463 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5464 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5465 hn_txagg_align_sysctl, "I", 5466 "Applied packet transmission aggregation alignment"); 5467 5468 return 0; 5469 } 5470 5471 static void 5472 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5473 { 5474 int i; 5475 5476 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5477 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5478 } 5479 5480 static void 5481 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5482 { 5483 struct ifnet *ifp = sc->hn_ifp; 5484 u_int hw_tsomax; 5485 int tso_minlen; 5486 5487 HN_LOCK_ASSERT(sc); 5488 5489 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5490 return; 5491 5492 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5493 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5494 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5495 5496 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5497 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5498 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5499 5500 if (tso_maxlen < tso_minlen) 5501 tso_maxlen = tso_minlen; 5502 else if (tso_maxlen > IP_MAXPACKET) 5503 tso_maxlen = IP_MAXPACKET; 5504 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5505 tso_maxlen = sc->hn_ndis_tso_szmax; 5506 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5507 5508 if (hn_xpnt_vf_isready(sc)) { 5509 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5510 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5511 } 5512 ifp->if_hw_tsomax = hw_tsomax; 5513 if (bootverbose) 5514 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5515 } 5516 5517 static void 5518 hn_fixup_tx_data(struct hn_softc *sc) 5519 { 5520 uint64_t csum_assist; 5521 int i; 5522 5523 hn_set_chim_size(sc, sc->hn_chim_szmax); 5524 if (hn_tx_chimney_size > 0 && 5525 hn_tx_chimney_size < sc->hn_chim_szmax) 5526 hn_set_chim_size(sc, 
hn_tx_chimney_size); 5527 5528 csum_assist = 0; 5529 if (sc->hn_caps & HN_CAP_IPCS) 5530 csum_assist |= CSUM_IP; 5531 if (sc->hn_caps & HN_CAP_TCP4CS) 5532 csum_assist |= CSUM_IP_TCP; 5533 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5534 csum_assist |= CSUM_IP_UDP; 5535 if (sc->hn_caps & HN_CAP_TCP6CS) 5536 csum_assist |= CSUM_IP6_TCP; 5537 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5538 csum_assist |= CSUM_IP6_UDP; 5539 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5540 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5541 5542 if (sc->hn_caps & HN_CAP_HASHVAL) { 5543 /* 5544 * Support HASHVAL pktinfo on TX path. 5545 */ 5546 if (bootverbose) 5547 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5548 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5549 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5550 } 5551 } 5552 5553 static void 5554 hn_destroy_tx_data(struct hn_softc *sc) 5555 { 5556 int i; 5557 5558 if (sc->hn_chim != NULL) { 5559 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5560 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5561 } else { 5562 device_printf(sc->hn_dev, 5563 "chimney sending buffer is referenced"); 5564 } 5565 sc->hn_chim = NULL; 5566 } 5567 5568 if (sc->hn_tx_ring_cnt == 0) 5569 return; 5570 5571 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5572 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5573 5574 free(sc->hn_tx_ring, M_DEVBUF); 5575 sc->hn_tx_ring = NULL; 5576 5577 sc->hn_tx_ring_cnt = 0; 5578 sc->hn_tx_ring_inuse = 0; 5579 } 5580 5581 #ifdef HN_IFSTART_SUPPORT 5582 5583 static void 5584 hn_start_taskfunc(void *xtxr, int pending __unused) 5585 { 5586 struct hn_tx_ring *txr = xtxr; 5587 5588 mtx_lock(&txr->hn_tx_lock); 5589 hn_start_locked(txr, 0); 5590 mtx_unlock(&txr->hn_tx_lock); 5591 } 5592 5593 static int 5594 hn_start_locked(struct hn_tx_ring *txr, int len) 5595 { 5596 struct hn_softc *sc = txr->hn_sc; 5597 struct ifnet *ifp = sc->hn_ifp; 5598 int sched = 0; 5599 5600 KASSERT(hn_use_if_start, 5601 ("hn_start_locked is called, when if_start is disabled")); 5602 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5603 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5604 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5605 5606 if (__predict_false(txr->hn_suspended)) 5607 return (0); 5608 5609 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5610 IFF_DRV_RUNNING) 5611 return (0); 5612 5613 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5614 struct hn_txdesc *txd; 5615 struct mbuf *m_head; 5616 int error; 5617 5618 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5619 if (m_head == NULL) 5620 break; 5621 5622 if (len > 0 && m_head->m_pkthdr.len > len) { 5623 /* 5624 * This sending could be time consuming; let callers 5625 * dispatch this packet sending (and sending of any 5626 * following up packets) to tx taskqueue. 
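 * ("len" is hn_direct_tx_size on the direct-transmission path;
 * returning a non-zero sched tells the caller to hand the rest of
 * the queue to the TX taskqueue.)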
5627 */ 5628 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5629 sched = 1; 5630 break; 5631 } 5632 5633 #if defined(INET6) || defined(INET) 5634 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5635 m_head = hn_tso_fixup(m_head); 5636 if (__predict_false(m_head == NULL)) { 5637 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5638 continue; 5639 } 5640 } else if (m_head->m_pkthdr.csum_flags & 5641 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5642 m_head = hn_set_hlen(m_head); 5643 if (__predict_false(m_head == NULL)) { 5644 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5645 continue; 5646 } 5647 } 5648 #endif 5649 5650 txd = hn_txdesc_get(txr); 5651 if (txd == NULL) { 5652 txr->hn_no_txdescs++; 5653 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5654 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5655 break; 5656 } 5657 5658 error = hn_encap(ifp, txr, txd, &m_head); 5659 if (error) { 5660 /* Both txd and m_head are freed */ 5661 KASSERT(txr->hn_agg_txd == NULL, 5662 ("encap failed w/ pending aggregating txdesc")); 5663 continue; 5664 } 5665 5666 if (txr->hn_agg_pktleft == 0) { 5667 if (txr->hn_agg_txd != NULL) { 5668 KASSERT(m_head == NULL, 5669 ("pending mbuf for aggregating txdesc")); 5670 error = hn_flush_txagg(ifp, txr); 5671 if (__predict_false(error)) { 5672 atomic_set_int(&ifp->if_drv_flags, 5673 IFF_DRV_OACTIVE); 5674 break; 5675 } 5676 } else { 5677 KASSERT(m_head != NULL, ("mbuf was freed")); 5678 error = hn_txpkt(ifp, txr, txd); 5679 if (__predict_false(error)) { 5680 /* txd is freed, but m_head is not */ 5681 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5682 atomic_set_int(&ifp->if_drv_flags, 5683 IFF_DRV_OACTIVE); 5684 break; 5685 } 5686 } 5687 } 5688 #ifdef INVARIANTS 5689 else { 5690 KASSERT(txr->hn_agg_txd != NULL, 5691 ("no aggregating txdesc")); 5692 KASSERT(m_head == NULL, 5693 ("pending mbuf for aggregating txdesc")); 5694 } 5695 #endif 5696 } 5697 5698 /* Flush pending aggerated transmission. */ 5699 if (txr->hn_agg_txd != NULL) 5700 hn_flush_txagg(ifp, txr); 5701 return (sched); 5702 } 5703 5704 static void 5705 hn_start(struct ifnet *ifp) 5706 { 5707 struct hn_softc *sc = ifp->if_softc; 5708 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5709 5710 if (txr->hn_sched_tx) 5711 goto do_sched; 5712 5713 if (mtx_trylock(&txr->hn_tx_lock)) { 5714 int sched; 5715 5716 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5717 mtx_unlock(&txr->hn_tx_lock); 5718 if (!sched) 5719 return; 5720 } 5721 do_sched: 5722 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5723 } 5724 5725 static void 5726 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5727 { 5728 struct hn_tx_ring *txr = xtxr; 5729 5730 mtx_lock(&txr->hn_tx_lock); 5731 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5732 hn_start_locked(txr, 0); 5733 mtx_unlock(&txr->hn_tx_lock); 5734 } 5735 5736 static void 5737 hn_start_txeof(struct hn_tx_ring *txr) 5738 { 5739 struct hn_softc *sc = txr->hn_sc; 5740 struct ifnet *ifp = sc->hn_ifp; 5741 5742 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5743 5744 if (txr->hn_sched_tx) 5745 goto do_sched; 5746 5747 if (mtx_trylock(&txr->hn_tx_lock)) { 5748 int sched; 5749 5750 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5751 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5752 mtx_unlock(&txr->hn_tx_lock); 5753 if (sched) { 5754 taskqueue_enqueue(txr->hn_tx_taskq, 5755 &txr->hn_tx_task); 5756 } 5757 } else { 5758 do_sched: 5759 /* 5760 * Release the OACTIVE earlier, with the hope, that 5761 * others could catch up. 
The task will clear the 5762 * flag again with the hn_tx_lock to avoid possible 5763 * races. 5764 */ 5765 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5766 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5767 } 5768 } 5769 5770 #endif /* HN_IFSTART_SUPPORT */ 5771 5772 static int 5773 hn_xmit(struct hn_tx_ring *txr, int len) 5774 { 5775 struct hn_softc *sc = txr->hn_sc; 5776 struct ifnet *ifp = sc->hn_ifp; 5777 struct mbuf *m_head; 5778 int sched = 0; 5779 5780 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5781 #ifdef HN_IFSTART_SUPPORT 5782 KASSERT(hn_use_if_start == 0, 5783 ("hn_xmit is called, when if_start is enabled")); 5784 #endif 5785 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5786 5787 if (__predict_false(txr->hn_suspended)) 5788 return (0); 5789 5790 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5791 return (0); 5792 5793 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5794 struct hn_txdesc *txd; 5795 int error; 5796 5797 if (len > 0 && m_head->m_pkthdr.len > len) { 5798 /* 5799 * This sending could be time consuming; let callers 5800 * dispatch this packet sending (and sending of any 5801 * following up packets) to tx taskqueue. 5802 */ 5803 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5804 sched = 1; 5805 break; 5806 } 5807 5808 txd = hn_txdesc_get(txr); 5809 if (txd == NULL) { 5810 txr->hn_no_txdescs++; 5811 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5812 txr->hn_oactive = 1; 5813 break; 5814 } 5815 5816 error = hn_encap(ifp, txr, txd, &m_head); 5817 if (error) { 5818 /* Both txd and m_head are freed; discard */ 5819 KASSERT(txr->hn_agg_txd == NULL, 5820 ("encap failed w/ pending aggregating txdesc")); 5821 drbr_advance(ifp, txr->hn_mbuf_br); 5822 continue; 5823 } 5824 5825 if (txr->hn_agg_pktleft == 0) { 5826 if (txr->hn_agg_txd != NULL) { 5827 KASSERT(m_head == NULL, 5828 ("pending mbuf for aggregating txdesc")); 5829 error = hn_flush_txagg(ifp, txr); 5830 if (__predict_false(error)) { 5831 txr->hn_oactive = 1; 5832 break; 5833 } 5834 } else { 5835 KASSERT(m_head != NULL, ("mbuf was freed")); 5836 error = hn_txpkt(ifp, txr, txd); 5837 if (__predict_false(error)) { 5838 /* txd is freed, but m_head is not */ 5839 drbr_putback(ifp, txr->hn_mbuf_br, 5840 m_head); 5841 txr->hn_oactive = 1; 5842 break; 5843 } 5844 } 5845 } 5846 #ifdef INVARIANTS 5847 else { 5848 KASSERT(txr->hn_agg_txd != NULL, 5849 ("no aggregating txdesc")); 5850 KASSERT(m_head == NULL, 5851 ("pending mbuf for aggregating txdesc")); 5852 } 5853 #endif 5854 5855 /* Sent */ 5856 drbr_advance(ifp, txr->hn_mbuf_br); 5857 } 5858 5859 /* Flush pending aggerated transmission. */ 5860 if (txr->hn_agg_txd != NULL) 5861 hn_flush_txagg(ifp, txr); 5862 return (sched); 5863 } 5864 5865 static int 5866 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5867 { 5868 struct hn_softc *sc = ifp->if_softc; 5869 struct hn_tx_ring *txr; 5870 int error, idx = 0; 5871 5872 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5873 struct rm_priotracker pt; 5874 5875 rm_rlock(&sc->hn_vf_lock, &pt); 5876 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5877 struct mbuf *m_bpf = NULL; 5878 int obytes, omcast; 5879 5880 obytes = m->m_pkthdr.len; 5881 if (m->m_flags & M_MCAST) 5882 omcast = 1; 5883 5884 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5885 if (bpf_peers_present(ifp->if_bpf)) { 5886 m_bpf = m_copypacket(m, M_NOWAIT); 5887 if (m_bpf == NULL) { 5888 /* 5889 * Failed to grab a shallow 5890 * copy; tap now. 
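 * (If the copy had succeeded, the packet would instead be tapped
 * only after the VF transmit succeeds; see the m_bpf handling
 * below.)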
5891 */ 5892 ETHER_BPF_MTAP(ifp, m); 5893 } 5894 } 5895 } else { 5896 ETHER_BPF_MTAP(ifp, m); 5897 } 5898 5899 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 5900 rm_runlock(&sc->hn_vf_lock, &pt); 5901 5902 if (m_bpf != NULL) { 5903 if (!error) 5904 ETHER_BPF_MTAP(ifp, m_bpf); 5905 m_freem(m_bpf); 5906 } 5907 5908 if (error == ENOBUFS) { 5909 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5910 } else if (error) { 5911 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5912 } else { 5913 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 5914 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 5915 if (omcast) { 5916 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 5917 omcast); 5918 } 5919 } 5920 return (error); 5921 } 5922 rm_runlock(&sc->hn_vf_lock, &pt); 5923 } 5924 5925 #if defined(INET6) || defined(INET) 5926 /* 5927 * Perform TSO packet header fixup or get l2/l3 header length now, 5928 * since packet headers should be cache-hot. 5929 */ 5930 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 5931 m = hn_tso_fixup(m); 5932 if (__predict_false(m == NULL)) { 5933 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5934 return EIO; 5935 } 5936 } else if (m->m_pkthdr.csum_flags & 5937 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5938 m = hn_set_hlen(m); 5939 if (__predict_false(m == NULL)) { 5940 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5941 return EIO; 5942 } 5943 } 5944 #endif 5945 5946 /* 5947 * Select the TX ring based on flowid 5948 */ 5949 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 5950 #ifdef RSS 5951 uint32_t bid; 5952 5953 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 5954 &bid) == 0) 5955 idx = bid % sc->hn_tx_ring_inuse; 5956 else 5957 #endif 5958 { 5959 #if defined(INET6) || defined(INET) 5960 int tcpsyn = 0; 5961 5962 if (m->m_pkthdr.len < 128 && 5963 (m->m_pkthdr.csum_flags & 5964 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 5965 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 5966 m = hn_check_tcpsyn(m, &tcpsyn); 5967 if (__predict_false(m == NULL)) { 5968 if_inc_counter(ifp, 5969 IFCOUNTER_OERRORS, 1); 5970 return (EIO); 5971 } 5972 } 5973 #else 5974 const int tcpsyn = 0; 5975 #endif 5976 if (tcpsyn) 5977 idx = 0; 5978 else 5979 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 5980 } 5981 } 5982 txr = &sc->hn_tx_ring[idx]; 5983 5984 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 5985 if (error) { 5986 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5987 return error; 5988 } 5989 5990 if (txr->hn_oactive) 5991 return 0; 5992 5993 if (txr->hn_sched_tx) 5994 goto do_sched; 5995 5996 if (mtx_trylock(&txr->hn_tx_lock)) { 5997 int sched; 5998 5999 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6000 mtx_unlock(&txr->hn_tx_lock); 6001 if (!sched) 6002 return 0; 6003 } 6004 do_sched: 6005 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6006 return 0; 6007 } 6008 6009 static void 6010 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6011 { 6012 struct mbuf *m; 6013 6014 mtx_lock(&txr->hn_tx_lock); 6015 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6016 m_freem(m); 6017 mtx_unlock(&txr->hn_tx_lock); 6018 } 6019 6020 static void 6021 hn_xmit_qflush(struct ifnet *ifp) 6022 { 6023 struct hn_softc *sc = ifp->if_softc; 6024 struct rm_priotracker pt; 6025 int i; 6026 6027 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6028 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6029 if_qflush(ifp); 6030 6031 rm_rlock(&sc->hn_vf_lock, &pt); 6032 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6033 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6034 rm_runlock(&sc->hn_vf_lock, &pt); 6035 } 6036 6037 static void 6038 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6039 { 6040 6041 if (txr->hn_sched_tx) 6042 goto do_sched; 6043 6044 if (mtx_trylock(&txr->hn_tx_lock)) { 6045 int sched; 6046 6047 txr->hn_oactive = 0; 6048 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6049 mtx_unlock(&txr->hn_tx_lock); 6050 if (sched) { 6051 taskqueue_enqueue(txr->hn_tx_taskq, 6052 &txr->hn_tx_task); 6053 } 6054 } else { 6055 do_sched: 6056 /* 6057 * Release the oactive earlier, in the hope that 6058 * others could catch up. The task will clear the 6059 * oactive again with the hn_tx_lock to avoid possible 6060 * races. 6061 */ 6062 txr->hn_oactive = 0; 6063 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6064 } 6065 } 6066 6067 static void 6068 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6069 { 6070 struct hn_tx_ring *txr = xtxr; 6071 6072 mtx_lock(&txr->hn_tx_lock); 6073 hn_xmit(txr, 0); 6074 mtx_unlock(&txr->hn_tx_lock); 6075 } 6076 6077 static void 6078 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6079 { 6080 struct hn_tx_ring *txr = xtxr; 6081 6082 mtx_lock(&txr->hn_tx_lock); 6083 txr->hn_oactive = 0; 6084 hn_xmit(txr, 0); 6085 mtx_unlock(&txr->hn_tx_lock); 6086 } 6087 6088 static int 6089 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6090 { 6091 struct vmbus_chan_br cbr; 6092 struct hn_rx_ring *rxr; 6093 struct hn_tx_ring *txr = NULL; 6094 int idx, error; 6095 6096 idx = vmbus_chan_subidx(chan); 6097 6098 /* 6099 * Link this channel to RX/TX ring. 6100 */ 6101 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6102 ("invalid channel index %d, should be >= 0 && < %d", 6103 idx, sc->hn_rx_ring_inuse)); 6104 rxr = &sc->hn_rx_ring[idx]; 6105 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6106 ("RX ring %d already attached", idx)); 6107 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6108 rxr->hn_chan = chan; 6109 6110 if (bootverbose) { 6111 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6112 idx, vmbus_chan_id(chan)); 6113 } 6114 6115 if (idx < sc->hn_tx_ring_inuse) { 6116 txr = &sc->hn_tx_ring[idx]; 6117 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6118 ("TX ring %d already attached", idx)); 6119 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6120 6121 txr->hn_chan = chan; 6122 if (bootverbose) { 6123 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6124 idx, vmbus_chan_id(chan)); 6125 } 6126 } 6127 6128 /* Bind this channel to a proper CPU. */ 6129 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6130 6131 /* 6132 * Open this channel 6133 */ 6134 cbr.cbr = rxr->hn_br; 6135 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6136 cbr.cbr_txsz = HN_TXBR_SIZE; 6137 cbr.cbr_rxsz = HN_RXBR_SIZE; 6138 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6139 if (error) { 6140 if (error == EISCONN) { 6141 if_printf(sc->hn_ifp, "bufring is connected after " 6142 "chan%u open failure\n", vmbus_chan_id(chan)); 6143 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6144 } else { 6145 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6146 vmbus_chan_id(chan), error); 6147 } 6148 } 6149 return (error); 6150 } 6151 6152 static void 6153 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6154 { 6155 struct hn_rx_ring *rxr; 6156 int idx, error; 6157 6158 idx = vmbus_chan_subidx(chan); 6159 6160 /* 6161 * Unlink this channel from the RX/TX ring.
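* Only the ATTACHED flags are cleared here; the RX ring keeps its bufring reference if the close below reports EISCONN.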
6162 */ 6163 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6164 ("invalid channel index %d, should > 0 && < %d", 6165 idx, sc->hn_rx_ring_inuse)); 6166 rxr = &sc->hn_rx_ring[idx]; 6167 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6168 ("RX ring %d is not attached", idx)); 6169 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6170 6171 if (idx < sc->hn_tx_ring_inuse) { 6172 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6173 6174 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6175 ("TX ring %d is not attached attached", idx)); 6176 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6177 } 6178 6179 /* 6180 * Close this channel. 6181 * 6182 * NOTE: 6183 * Channel closing does _not_ destroy the target channel. 6184 */ 6185 error = vmbus_chan_close_direct(chan); 6186 if (error == EISCONN) { 6187 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6188 "after being closed\n", vmbus_chan_id(chan)); 6189 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6190 } else if (error) { 6191 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6192 vmbus_chan_id(chan), error); 6193 } 6194 } 6195 6196 static int 6197 hn_attach_subchans(struct hn_softc *sc) 6198 { 6199 struct vmbus_channel **subchans; 6200 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6201 int i, error = 0; 6202 6203 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6204 6205 /* Attach the sub-channels. */ 6206 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6207 for (i = 0; i < subchan_cnt; ++i) { 6208 int error1; 6209 6210 error1 = hn_chan_attach(sc, subchans[i]); 6211 if (error1) { 6212 error = error1; 6213 /* Move on; all channels will be detached later. */ 6214 } 6215 } 6216 vmbus_subchan_rel(subchans, subchan_cnt); 6217 6218 if (error) { 6219 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6220 } else { 6221 if (bootverbose) { 6222 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6223 subchan_cnt); 6224 } 6225 } 6226 return (error); 6227 } 6228 6229 static void 6230 hn_detach_allchans(struct hn_softc *sc) 6231 { 6232 struct vmbus_channel **subchans; 6233 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6234 int i; 6235 6236 if (subchan_cnt == 0) 6237 goto back; 6238 6239 /* Detach the sub-channels. */ 6240 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6241 for (i = 0; i < subchan_cnt; ++i) 6242 hn_chan_detach(sc, subchans[i]); 6243 vmbus_subchan_rel(subchans, subchan_cnt); 6244 6245 back: 6246 /* 6247 * Detach the primary channel, _after_ all sub-channels 6248 * are detached. 6249 */ 6250 hn_chan_detach(sc, sc->hn_prichan); 6251 6252 /* Wait for sub-channels to be destroyed, if any. */ 6253 vmbus_subchan_drain(sc->hn_prichan); 6254 6255 #ifdef INVARIANTS 6256 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6257 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6258 HN_RX_FLAG_ATTACHED) == 0, 6259 ("%dth RX ring is still attached", i)); 6260 } 6261 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6262 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6263 HN_TX_FLAG_ATTACHED) == 0, 6264 ("%dth TX ring is still attached", i)); 6265 } 6266 #endif 6267 } 6268 6269 static int 6270 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6271 { 6272 struct vmbus_channel **subchans; 6273 int nchan, rxr_cnt, error; 6274 6275 nchan = *nsubch + 1; 6276 if (nchan == 1) { 6277 /* 6278 * Multiple RX/TX rings are not requested. 6279 */ 6280 *nsubch = 0; 6281 return (0); 6282 } 6283 6284 /* 6285 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6286 * table entries. 
6287 */ 6288 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6289 if (error) { 6290 /* No RSS; this is benign. */ 6291 *nsubch = 0; 6292 return (0); 6293 } 6294 if (bootverbose) { 6295 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6296 rxr_cnt, nchan); 6297 } 6298 6299 if (nchan > rxr_cnt) 6300 nchan = rxr_cnt; 6301 if (nchan == 1) { 6302 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6303 *nsubch = 0; 6304 return (0); 6305 } 6306 6307 /* 6308 * Allocate sub-channels from NVS. 6309 */ 6310 *nsubch = nchan - 1; 6311 error = hn_nvs_alloc_subchans(sc, nsubch); 6312 if (error || *nsubch == 0) { 6313 /* Failed to allocate sub-channels. */ 6314 *nsubch = 0; 6315 return (0); 6316 } 6317 6318 /* 6319 * Wait for all sub-channels to become ready before moving on. 6320 */ 6321 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6322 vmbus_subchan_rel(subchans, *nsubch); 6323 return (0); 6324 } 6325 6326 static bool 6327 hn_synth_attachable(const struct hn_softc *sc) 6328 { 6329 int i; 6330 6331 if (sc->hn_flags & HN_FLAG_ERRORS) 6332 return (false); 6333 6334 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6335 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6336 6337 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6338 return (false); 6339 } 6340 return (true); 6341 } 6342 6343 /* 6344 * Make sure that the RX filter is zero after the successful 6345 * RNDIS initialization. 6346 * 6347 * NOTE: 6348 * Under certain conditions on certain versions of Hyper-V, 6349 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6350 * after the successful RNDIS initialization, which breaks 6351 * the assumption of any following code (well, it breaks the 6352 * RNDIS API contract actually). Clear the RNDIS rxfilter 6353 * explicitly, drain packets sneaking through, and drain the 6354 * interrupt taskqueues scheduled due to the stealth packets. 6355 */ 6356 static void 6357 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6358 { 6359 6360 hn_disable_rx(sc); 6361 hn_drain_rxtx(sc, nchan); 6362 } 6363 6364 static int 6365 hn_synth_attach(struct hn_softc *sc, int mtu) 6366 { 6367 #define ATTACHED_NVS 0x0002 6368 #define ATTACHED_RNDIS 0x0004 6369 6370 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6371 int error, nsubch, nchan = 1, i, rndis_inited; 6372 uint32_t old_caps, attached = 0; 6373 6374 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6375 ("synthetic parts were attached")); 6376 6377 if (!hn_synth_attachable(sc)) 6378 return (ENXIO); 6379 6380 /* Save capabilities for later verification. */ 6381 old_caps = sc->hn_caps; 6382 sc->hn_caps = 0; 6383 6384 /* Clear RSS stuffs. */ 6385 sc->hn_rss_ind_size = 0; 6386 sc->hn_rss_hash = 0; 6387 sc->hn_rss_hcap = 0; 6388 6389 /* 6390 * Attach the primary channel _before_ attaching NVS and RNDIS. 6391 */ 6392 error = hn_chan_attach(sc, sc->hn_prichan); 6393 if (error) 6394 goto failed; 6395 6396 /* 6397 * Attach NVS. 6398 */ 6399 error = hn_nvs_attach(sc, mtu); 6400 if (error) 6401 goto failed; 6402 attached |= ATTACHED_NVS; 6403 6404 /* 6405 * Attach RNDIS _after_ NVS is attached. 6406 */ 6407 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6408 if (rndis_inited) 6409 attached |= ATTACHED_RNDIS; 6410 if (error) 6411 goto failed; 6412 6413 /* 6414 * Make sure capabilities are not changed. 
6415 */ 6416 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6417 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6418 old_caps, sc->hn_caps); 6419 error = ENXIO; 6420 goto failed; 6421 } 6422 6423 /* 6424 * Allocate sub-channels for multi-TX/RX rings. 6425 * 6426 * NOTE: 6427 * The # of RX rings that can be used is equivalent to the # of 6428 * channels to be requested. 6429 */ 6430 nsubch = sc->hn_rx_ring_cnt - 1; 6431 error = hn_synth_alloc_subchans(sc, &nsubch); 6432 if (error) 6433 goto failed; 6434 /* NOTE: _Full_ synthetic parts detach is required now. */ 6435 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6436 6437 /* 6438 * Set the # of TX/RX rings that could be used according to 6439 * the # of channels that NVS offered. 6440 */ 6441 nchan = nsubch + 1; 6442 hn_set_ring_inuse(sc, nchan); 6443 if (nchan == 1) { 6444 /* Only the primary channel can be used; done */ 6445 goto back; 6446 } 6447 6448 /* 6449 * Attach the sub-channels. 6450 * 6451 * NOTE: hn_set_ring_inuse() _must_ have been called. 6452 */ 6453 error = hn_attach_subchans(sc); 6454 if (error) 6455 goto failed; 6456 6457 /* 6458 * Configure RSS key and indirect table _after_ all sub-channels 6459 * are attached. 6460 */ 6461 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6462 /* 6463 * RSS key is not set yet; set it to the default RSS key. 6464 */ 6465 if (bootverbose) 6466 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6467 #ifdef RSS 6468 rss_getkey(rss->rss_key); 6469 #else 6470 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6471 #endif 6472 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6473 } 6474 6475 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6476 /* 6477 * RSS indirect table is not set yet; set it up in round- 6478 * robin fashion. 6479 */ 6480 if (bootverbose) { 6481 if_printf(sc->hn_ifp, "setup default RSS indirect " 6482 "table\n"); 6483 } 6484 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6485 uint32_t subidx; 6486 6487 #ifdef RSS 6488 subidx = rss_get_indirection_to_bucket(i); 6489 #else 6490 subidx = i; 6491 #endif 6492 rss->rss_ind[i] = subidx % nchan; 6493 } 6494 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6495 } else { 6496 /* 6497 * # of usable channels may be changed, so we have to 6498 * make sure that all entries in RSS indirect table 6499 * are valid. 6500 * 6501 * NOTE: hn_set_ring_inuse() _must_ have been called. 6502 */ 6503 hn_rss_ind_fixup(sc); 6504 } 6505 6506 sc->hn_rss_hash = sc->hn_rss_hcap; 6507 if ((sc->hn_flags & HN_FLAG_RXVF) || 6508 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6509 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6510 hn_vf_rss_fixup(sc, false); 6511 } 6512 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6513 if (error) 6514 goto failed; 6515 back: 6516 /* 6517 * Fixup transmission aggregation setup. 6518 */ 6519 hn_set_txagg(sc); 6520 hn_rndis_init_fixat(sc, nchan); 6521 return (0); 6522 6523 failed: 6524 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6525 hn_rndis_init_fixat(sc, nchan); 6526 hn_synth_detach(sc); 6527 } else { 6528 if (attached & ATTACHED_RNDIS) { 6529 hn_rndis_init_fixat(sc, nchan); 6530 hn_rndis_detach(sc); 6531 } 6532 if (attached & ATTACHED_NVS) 6533 hn_nvs_detach(sc); 6534 hn_chan_detach(sc, sc->hn_prichan); 6535 /* Restore old capabilities. */ 6536 sc->hn_caps = old_caps; 6537 } 6538 return (error); 6539 6540 #undef ATTACHED_RNDIS 6541 #undef ATTACHED_NVS 6542 } 6543 6544 /* 6545 * NOTE: 6546 * The interface must have been suspended though hn_suspend(), before 6547 * this function get called. 
6548 */ 6549 static void 6550 hn_synth_detach(struct hn_softc *sc) 6551 { 6552 6553 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6554 ("synthetic parts were not attached")); 6555 6556 /* Detach the RNDIS first. */ 6557 hn_rndis_detach(sc); 6558 6559 /* Detach NVS. */ 6560 hn_nvs_detach(sc); 6561 6562 /* Detach all of the channels. */ 6563 hn_detach_allchans(sc); 6564 6565 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6566 } 6567 6568 static void 6569 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6570 { 6571 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6572 ("invalid ring count %d", ring_cnt)); 6573 6574 if (sc->hn_tx_ring_cnt > ring_cnt) 6575 sc->hn_tx_ring_inuse = ring_cnt; 6576 else 6577 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6578 sc->hn_rx_ring_inuse = ring_cnt; 6579 6580 #ifdef RSS 6581 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6582 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6583 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6584 rss_getnumbuckets()); 6585 } 6586 #endif 6587 6588 if (bootverbose) { 6589 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6590 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6591 } 6592 } 6593 6594 static void 6595 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6596 { 6597 6598 /* 6599 * NOTE: 6600 * The TX bufring will not be drained by the hypervisor, 6601 * if the primary channel is revoked. 6602 */ 6603 while (!vmbus_chan_rx_empty(chan) || 6604 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6605 !vmbus_chan_tx_empty(chan))) 6606 pause("waitch", 1); 6607 vmbus_chan_intr_drain(chan); 6608 } 6609 6610 static void 6611 hn_disable_rx(struct hn_softc *sc) 6612 { 6613 6614 /* 6615 * Disable RX by clearing RX filter forcefully. 6616 */ 6617 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6618 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6619 6620 /* 6621 * Give RNDIS enough time to flush all pending data packets. 6622 */ 6623 pause("waitrx", (200 * hz) / 1000); 6624 } 6625 6626 /* 6627 * NOTE: 6628 * RX/TX _must_ have been suspended/disabled, before this function 6629 * is called. 6630 */ 6631 static void 6632 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6633 { 6634 struct vmbus_channel **subch = NULL; 6635 int nsubch; 6636 6637 /* 6638 * Drain RX/TX bufrings and interrupts. 6639 */ 6640 nsubch = nchan - 1; 6641 if (nsubch > 0) 6642 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6643 6644 if (subch != NULL) { 6645 int i; 6646 6647 for (i = 0; i < nsubch; ++i) 6648 hn_chan_drain(sc, subch[i]); 6649 } 6650 hn_chan_drain(sc, sc->hn_prichan); 6651 6652 if (subch != NULL) 6653 vmbus_subchan_rel(subch, nsubch); 6654 } 6655 6656 static void 6657 hn_suspend_data(struct hn_softc *sc) 6658 { 6659 struct hn_tx_ring *txr; 6660 int i; 6661 6662 HN_LOCK_ASSERT(sc); 6663 6664 /* 6665 * Suspend TX. 6666 */ 6667 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6668 txr = &sc->hn_tx_ring[i]; 6669 6670 mtx_lock(&txr->hn_tx_lock); 6671 txr->hn_suspended = 1; 6672 mtx_unlock(&txr->hn_tx_lock); 6673 /* No one is able send more packets now. */ 6674 6675 /* 6676 * Wait for all pending sends to finish. 6677 * 6678 * NOTE: 6679 * We will _not_ receive all pending send-done, if the 6680 * primary channel is revoked. 6681 */ 6682 while (hn_tx_ring_pending(txr) && 6683 !vmbus_chan_is_revoked(sc->hn_prichan)) 6684 pause("hnwtx", 1 /* 1 tick */); 6685 } 6686 6687 /* 6688 * Disable RX. 6689 */ 6690 hn_disable_rx(sc); 6691 6692 /* 6693 * Drain RX/TX. 
6694 */ 6695 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6696 6697 /* 6698 * Drain any pending TX tasks. 6699 * 6700 * NOTE: 6701 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6702 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6703 */ 6704 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6705 txr = &sc->hn_tx_ring[i]; 6706 6707 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6708 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6709 } 6710 } 6711 6712 static void 6713 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6714 { 6715 6716 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6717 } 6718 6719 static void 6720 hn_suspend_mgmt(struct hn_softc *sc) 6721 { 6722 struct task task; 6723 6724 HN_LOCK_ASSERT(sc); 6725 6726 /* 6727 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6728 * through hn_mgmt_taskq. 6729 */ 6730 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6731 vmbus_chan_run_task(sc->hn_prichan, &task); 6732 6733 /* 6734 * Make sure that all pending management tasks are completed. 6735 */ 6736 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6737 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6738 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6739 } 6740 6741 static void 6742 hn_suspend(struct hn_softc *sc) 6743 { 6744 6745 /* Disable polling. */ 6746 hn_polling(sc, 0); 6747 6748 /* 6749 * If the non-transparent mode VF is activated, the synthetic 6750 * device is receiving packets, so the data path of the 6751 * synthetic device must be suspended. 6752 */ 6753 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6754 (sc->hn_flags & HN_FLAG_RXVF)) 6755 hn_suspend_data(sc); 6756 hn_suspend_mgmt(sc); 6757 } 6758 6759 static void 6760 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6761 { 6762 int i; 6763 6764 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6765 ("invalid TX ring count %d", tx_ring_cnt)); 6766 6767 for (i = 0; i < tx_ring_cnt; ++i) { 6768 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6769 6770 mtx_lock(&txr->hn_tx_lock); 6771 txr->hn_suspended = 0; 6772 mtx_unlock(&txr->hn_tx_lock); 6773 } 6774 } 6775 6776 static void 6777 hn_resume_data(struct hn_softc *sc) 6778 { 6779 int i; 6780 6781 HN_LOCK_ASSERT(sc); 6782 6783 /* 6784 * Re-enable RX. 6785 */ 6786 hn_rxfilter_config(sc); 6787 6788 /* 6789 * Make sure to clear suspend status on "all" TX rings, 6790 * since hn_tx_ring_inuse can be changed after 6791 * hn_suspend_data(). 6792 */ 6793 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6794 6795 #ifdef HN_IFSTART_SUPPORT 6796 if (!hn_use_if_start) 6797 #endif 6798 { 6799 /* 6800 * Flush unused drbrs, since hn_tx_ring_inuse may be 6801 * reduced. 6802 */ 6803 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6804 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6805 } 6806 6807 /* 6808 * Kick start TX. 6809 */ 6810 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6811 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6812 6813 /* 6814 * Use txeof task, so that any pending oactive can be 6815 * cleared properly. 6816 */ 6817 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6818 } 6819 } 6820 6821 static void 6822 hn_resume_mgmt(struct hn_softc *sc) 6823 { 6824 6825 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6826 6827 /* 6828 * Kick off network change detection, if it was pending. 6829 * If no network change was pending, start link status 6830 * checks, which is more lightweight than network change 6831 * detection. 
6832 */ 6833 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6834 hn_change_network(sc); 6835 else 6836 hn_update_link_status(sc); 6837 } 6838 6839 static void 6840 hn_resume(struct hn_softc *sc) 6841 { 6842 6843 /* 6844 * If the non-transparent mode VF is activated, the synthetic 6845 * device have to receive packets, so the data path of the 6846 * synthetic device must be resumed. 6847 */ 6848 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6849 (sc->hn_flags & HN_FLAG_RXVF)) 6850 hn_resume_data(sc); 6851 6852 /* 6853 * Don't resume link status change if VF is attached/activated. 6854 * - In the non-transparent VF mode, the synthetic device marks 6855 * link down until the VF is deactivated; i.e. VF is down. 6856 * - In transparent VF mode, VF's media status is used until 6857 * the VF is detached. 6858 */ 6859 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 6860 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 6861 hn_resume_mgmt(sc); 6862 6863 /* 6864 * Re-enable polling if this interface is running and 6865 * the polling is requested. 6866 */ 6867 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 6868 hn_polling(sc, sc->hn_pollhz); 6869 } 6870 6871 static void 6872 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 6873 { 6874 const struct rndis_status_msg *msg; 6875 int ofs; 6876 6877 if (dlen < sizeof(*msg)) { 6878 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 6879 return; 6880 } 6881 msg = data; 6882 6883 switch (msg->rm_status) { 6884 case RNDIS_STATUS_MEDIA_CONNECT: 6885 case RNDIS_STATUS_MEDIA_DISCONNECT: 6886 hn_update_link_status(sc); 6887 break; 6888 6889 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 6890 case RNDIS_STATUS_LINK_SPEED_CHANGE: 6891 /* Not really useful; ignore. */ 6892 break; 6893 6894 case RNDIS_STATUS_NETWORK_CHANGE: 6895 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 6896 if (dlen < ofs + msg->rm_stbuflen || 6897 msg->rm_stbuflen < sizeof(uint32_t)) { 6898 if_printf(sc->hn_ifp, "network changed\n"); 6899 } else { 6900 uint32_t change; 6901 6902 memcpy(&change, ((const uint8_t *)msg) + ofs, 6903 sizeof(change)); 6904 if_printf(sc->hn_ifp, "network changed, change %u\n", 6905 change); 6906 } 6907 hn_change_network(sc); 6908 break; 6909 6910 default: 6911 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 6912 msg->rm_status); 6913 break; 6914 } 6915 } 6916 6917 static int 6918 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 6919 { 6920 const struct rndis_pktinfo *pi = info_data; 6921 uint32_t mask = 0; 6922 6923 while (info_dlen != 0) { 6924 const void *data; 6925 uint32_t dlen; 6926 6927 if (__predict_false(info_dlen < sizeof(*pi))) 6928 return (EINVAL); 6929 if (__predict_false(info_dlen < pi->rm_size)) 6930 return (EINVAL); 6931 info_dlen -= pi->rm_size; 6932 6933 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 6934 return (EINVAL); 6935 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 6936 return (EINVAL); 6937 dlen = pi->rm_size - pi->rm_pktinfooffset; 6938 data = pi->rm_data; 6939 6940 switch (pi->rm_type) { 6941 case NDIS_PKTINFO_TYPE_VLAN: 6942 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 6943 return (EINVAL); 6944 info->vlan_info = *((const uint32_t *)data); 6945 mask |= HN_RXINFO_VLAN; 6946 break; 6947 6948 case NDIS_PKTINFO_TYPE_CSUM: 6949 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 6950 return (EINVAL); 6951 info->csum_info = *((const uint32_t *)data); 6952 mask |= HN_RXINFO_CSUM; 6953 break; 6954 6955 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 6956 if 
(__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 6957 return (EINVAL); 6958 info->hash_value = *((const uint32_t *)data); 6959 mask |= HN_RXINFO_HASHVAL; 6960 break; 6961 6962 case HN_NDIS_PKTINFO_TYPE_HASHINF: 6963 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 6964 return (EINVAL); 6965 info->hash_info = *((const uint32_t *)data); 6966 mask |= HN_RXINFO_HASHINF; 6967 break; 6968 6969 default: 6970 goto next; 6971 } 6972 6973 if (mask == HN_RXINFO_ALL) { 6974 /* All found; done */ 6975 break; 6976 } 6977 next: 6978 pi = (const struct rndis_pktinfo *) 6979 ((const uint8_t *)pi + pi->rm_size); 6980 } 6981 6982 /* 6983 * Final fixup. 6984 * - If there is no hash value, invalidate the hash info. 6985 */ 6986 if ((mask & HN_RXINFO_HASHVAL) == 0) 6987 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 6988 return (0); 6989 } 6990 6991 static __inline bool 6992 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 6993 { 6994 6995 if (off < check_off) { 6996 if (__predict_true(off + len <= check_off)) 6997 return (false); 6998 } else if (off > check_off) { 6999 if (__predict_true(check_off + check_len <= off)) 7000 return (false); 7001 } 7002 return (true); 7003 } 7004 7005 static void 7006 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7007 { 7008 const struct rndis_packet_msg *pkt; 7009 struct hn_rxinfo info; 7010 int data_off, pktinfo_off, data_len, pktinfo_len; 7011 7012 /* 7013 * Check length. 7014 */ 7015 if (__predict_false(dlen < sizeof(*pkt))) { 7016 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7017 return; 7018 } 7019 pkt = data; 7020 7021 if (__predict_false(dlen < pkt->rm_len)) { 7022 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7023 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7024 return; 7025 } 7026 if (__predict_false(pkt->rm_len < 7027 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7028 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7029 "msglen %u, data %u, oob %u, pktinfo %u\n", 7030 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7031 pkt->rm_pktinfolen); 7032 return; 7033 } 7034 if (__predict_false(pkt->rm_datalen == 0)) { 7035 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7036 return; 7037 } 7038 7039 /* 7040 * Check offests. 7041 */ 7042 #define IS_OFFSET_INVALID(ofs) \ 7043 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7044 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7045 7046 /* XXX Hyper-V does not meet data offset alignment requirement */ 7047 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7048 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7049 "data offset %u\n", pkt->rm_dataoffset); 7050 return; 7051 } 7052 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7053 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7054 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7055 "oob offset %u\n", pkt->rm_oobdataoffset); 7056 return; 7057 } 7058 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7059 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7060 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7061 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7062 return; 7063 } 7064 7065 #undef IS_OFFSET_INVALID 7066 7067 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7068 data_len = pkt->rm_datalen; 7069 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7070 pktinfo_len = pkt->rm_pktinfolen; 7071 7072 /* 7073 * Check OOB coverage. 
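* OOB data is not consumed by this driver; it is only validated so that it cannot overlap the data or per-packet-info regions.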
7074 */ 7075 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7076 int oob_off, oob_len; 7077 7078 if_printf(rxr->hn_ifp, "got oobdata\n"); 7079 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7080 oob_len = pkt->rm_oobdatalen; 7081 7082 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7083 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7084 "oob overflow, msglen %u, oob abs %d len %d\n", 7085 pkt->rm_len, oob_off, oob_len); 7086 return; 7087 } 7088 7089 /* 7090 * Check against data. 7091 */ 7092 if (hn_rndis_check_overlap(oob_off, oob_len, 7093 data_off, data_len)) { 7094 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7095 "oob overlaps data, oob abs %d len %d, " 7096 "data abs %d len %d\n", 7097 oob_off, oob_len, data_off, data_len); 7098 return; 7099 } 7100 7101 /* 7102 * Check against pktinfo. 7103 */ 7104 if (pktinfo_len != 0 && 7105 hn_rndis_check_overlap(oob_off, oob_len, 7106 pktinfo_off, pktinfo_len)) { 7107 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7108 "oob overlaps pktinfo, oob abs %d len %d, " 7109 "pktinfo abs %d len %d\n", 7110 oob_off, oob_len, pktinfo_off, pktinfo_len); 7111 return; 7112 } 7113 } 7114 7115 /* 7116 * Check per-packet-info coverage and find useful per-packet-info. 7117 */ 7118 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 7119 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 7120 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 7121 if (__predict_true(pktinfo_len != 0)) { 7122 bool overlap; 7123 int error; 7124 7125 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7126 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7127 "pktinfo overflow, msglen %u, " 7128 "pktinfo abs %d len %d\n", 7129 pkt->rm_len, pktinfo_off, pktinfo_len); 7130 return; 7131 } 7132 7133 /* 7134 * Check packet info coverage. 7135 */ 7136 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7137 data_off, data_len); 7138 if (__predict_false(overlap)) { 7139 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7140 "pktinfo overlap data, pktinfo abs %d len %d, " 7141 "data abs %d len %d\n", 7142 pktinfo_off, pktinfo_len, data_off, data_len); 7143 return; 7144 } 7145 7146 /* 7147 * Find useful per-packet-info. 7148 */ 7149 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7150 pktinfo_len, &info); 7151 if (__predict_false(error)) { 7152 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7153 "pktinfo\n"); 7154 return; 7155 } 7156 } 7157 7158 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7159 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7160 "data overflow, msglen %u, data abs %d len %d\n", 7161 pkt->rm_len, data_off, data_len); 7162 return; 7163 } 7164 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 7165 } 7166 7167 static __inline void 7168 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7169 { 7170 const struct rndis_msghdr *hdr; 7171 7172 if (__predict_false(dlen < sizeof(*hdr))) { 7173 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7174 return; 7175 } 7176 hdr = data; 7177 7178 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7179 /* Hot data path. */ 7180 hn_rndis_rx_data(rxr, data, dlen); 7181 /* Done! 
*/ 7182 return; 7183 } 7184 7185 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7186 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7187 else 7188 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7189 } 7190 7191 static void 7192 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7193 { 7194 const struct hn_nvs_hdr *hdr; 7195 7196 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7197 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7198 return; 7199 } 7200 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7201 7202 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7203 /* Useless; ignore */ 7204 return; 7205 } 7206 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7207 } 7208 7209 static void 7210 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7211 const struct vmbus_chanpkt_hdr *pkt) 7212 { 7213 struct hn_nvs_sendctx *sndc; 7214 7215 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7216 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7217 VMBUS_CHANPKT_DATALEN(pkt)); 7218 /* 7219 * NOTE: 7220 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7221 * its callback. 7222 */ 7223 } 7224 7225 static void 7226 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7227 const struct vmbus_chanpkt_hdr *pkthdr) 7228 { 7229 const struct vmbus_chanpkt_rxbuf *pkt; 7230 const struct hn_nvs_hdr *nvs_hdr; 7231 int count, i, hlen; 7232 7233 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7234 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7235 return; 7236 } 7237 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7238 7239 /* Make sure that this is a RNDIS message. */ 7240 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7241 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7242 nvs_hdr->nvs_type); 7243 return; 7244 } 7245 7246 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7247 if (__predict_false(hlen < sizeof(*pkt))) { 7248 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7249 return; 7250 } 7251 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7252 7253 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7254 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7255 pkt->cp_rxbuf_id); 7256 return; 7257 } 7258 7259 count = pkt->cp_rxbuf_cnt; 7260 if (__predict_false(hlen < 7261 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7262 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7263 return; 7264 } 7265 7266 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7267 for (i = 0; i < count; ++i) { 7268 int ofs, len; 7269 7270 ofs = pkt->cp_rxbuf[i].rb_ofs; 7271 len = pkt->cp_rxbuf[i].rb_len; 7272 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7273 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7274 "ofs %d, len %d\n", i, ofs, len); 7275 continue; 7276 } 7277 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7278 } 7279 7280 /* 7281 * Ack the consumed RXBUF associated w/ this channel packet, 7282 * so that this RXBUF can be recycled by the hypervisor. 
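* The ack is sent as a completion packet carrying the original transaction id; see hn_nvs_ack_rxbuf() below.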
7283 */ 7284 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7285 } 7286 7287 static void 7288 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7289 uint64_t tid) 7290 { 7291 struct hn_nvs_rndis_ack ack; 7292 int retries, error; 7293 7294 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7295 ack.nvs_status = HN_NVS_STATUS_OK; 7296 7297 retries = 0; 7298 again: 7299 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7300 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7301 if (__predict_false(error == EAGAIN)) { 7302 /* 7303 * NOTE: 7304 * This should _not_ happen in real world, since the 7305 * consumption of the TX bufring from the TX path is 7306 * controlled. 7307 */ 7308 if (rxr->hn_ack_failed == 0) 7309 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7310 rxr->hn_ack_failed++; 7311 retries++; 7312 if (retries < 10) { 7313 DELAY(100); 7314 goto again; 7315 } 7316 /* RXBUF leaks! */ 7317 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7318 } 7319 } 7320 7321 static void 7322 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7323 { 7324 struct hn_rx_ring *rxr = xrxr; 7325 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7326 7327 for (;;) { 7328 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7329 int error, pktlen; 7330 7331 pktlen = rxr->hn_pktbuf_len; 7332 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7333 if (__predict_false(error == ENOBUFS)) { 7334 void *nbuf; 7335 int nlen; 7336 7337 /* 7338 * Expand channel packet buffer. 7339 * 7340 * XXX 7341 * Use M_WAITOK here, since allocation failure 7342 * is fatal. 7343 */ 7344 nlen = rxr->hn_pktbuf_len * 2; 7345 while (nlen < pktlen) 7346 nlen *= 2; 7347 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7348 7349 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7350 rxr->hn_pktbuf_len, nlen); 7351 7352 free(rxr->hn_pktbuf, M_DEVBUF); 7353 rxr->hn_pktbuf = nbuf; 7354 rxr->hn_pktbuf_len = nlen; 7355 /* Retry! */ 7356 continue; 7357 } else if (__predict_false(error == EAGAIN)) { 7358 /* No more channel packets; done! */ 7359 break; 7360 } 7361 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7362 7363 switch (pkt->cph_type) { 7364 case VMBUS_CHANPKT_TYPE_COMP: 7365 hn_nvs_handle_comp(sc, chan, pkt); 7366 break; 7367 7368 case VMBUS_CHANPKT_TYPE_RXBUF: 7369 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7370 break; 7371 7372 case VMBUS_CHANPKT_TYPE_INBAND: 7373 hn_nvs_handle_notify(sc, pkt); 7374 break; 7375 7376 default: 7377 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7378 pkt->cph_type); 7379 break; 7380 } 7381 } 7382 hn_chan_rollup(rxr, rxr->hn_txr); 7383 } 7384 7385 static void 7386 hn_sysinit(void *arg __unused) 7387 { 7388 int i; 7389 7390 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7391 7392 #ifdef HN_IFSTART_SUPPORT 7393 /* 7394 * Don't use ifnet.if_start if transparent VF mode is requested; 7395 * mainly due to the IFF_DRV_OACTIVE flag. 7396 */ 7397 if (hn_xpnt_vf && hn_use_if_start) { 7398 hn_use_if_start = 0; 7399 printf("hn: tranparent VF mode, if_transmit will be used, " 7400 "instead of if_start\n"); 7401 } 7402 #endif 7403 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7404 printf("hn: invalid transparent VF attach routing " 7405 "wait timeout %d, reset to %d\n", 7406 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7407 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7408 } 7409 7410 /* 7411 * Initialize VF map. 
7412 */ 7413 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7414 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7415 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7416 M_WAITOK | M_ZERO); 7417 7418 /* 7419 * Fix the # of TX taskqueues. 7420 */ 7421 if (hn_tx_taskq_cnt <= 0) 7422 hn_tx_taskq_cnt = 1; 7423 else if (hn_tx_taskq_cnt > mp_ncpus) 7424 hn_tx_taskq_cnt = mp_ncpus; 7425 7426 /* 7427 * Fix the TX taskqueue mode. 7428 */ 7429 switch (hn_tx_taskq_mode) { 7430 case HN_TX_TASKQ_M_INDEP: 7431 case HN_TX_TASKQ_M_GLOBAL: 7432 case HN_TX_TASKQ_M_EVTTQ: 7433 break; 7434 default: 7435 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7436 break; 7437 } 7438 7439 if (vm_guest != VM_GUEST_HV) 7440 return; 7441 7442 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7443 return; 7444 7445 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7446 M_DEVBUF, M_WAITOK); 7447 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7448 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7449 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7450 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7451 "hn tx%d", i); 7452 } 7453 } 7454 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7455 7456 static void 7457 hn_sysuninit(void *arg __unused) 7458 { 7459 7460 if (hn_tx_taskque != NULL) { 7461 int i; 7462 7463 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7464 taskqueue_free(hn_tx_taskque[i]); 7465 free(hn_tx_taskque, M_DEVBUF); 7466 } 7467 7468 if (hn_vfmap != NULL) 7469 free(hn_vfmap, M_DEVBUF); 7470 rm_destroy(&hn_vfmap_lock); 7471 7472 counter_u64_free(hn_udpcs_fixup); 7473 } 7474 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7475