/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rmlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>
#include <sys/epoch.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough.
 */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
		/* Relinquish cpu to avoid deadlock */	\
		sched_relinquish(curthread);		\
		DELAY(1000);				\
	}						\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

#define HN_NDIS_PKTINFO_SUBALLOC	0x01
#define HN_NDIS_PKTINFO_1ST_FRAG	0x02
#define HN_NDIS_PKTINFO_LAST_FRAG	0x04

struct packet_info_id {
	uint8_t				ver;
	uint8_t				flag;
	uint16_t			pkt_id;
};

#define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)


struct hn_rxinfo {
	const uint32_t			*vlan_info;
	const uint32_t			*csum_info;
	const uint32_t			*hash_info;
	const uint32_t			*hash_value;
	const struct packet_info_id	*pktinfo_id;
};

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_PKTINFO_ID		0x0010
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL |		\
	 HN_RXINFO_PKTINFO_ID)

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagrams verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packets verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;	/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock	hn_vfmap_lock;
static int		hn_vfmap_size;
static struct ifnet	**hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
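
/*
 * Chimney (send-buffer) slot management.
 *
 * A packet can either be mapped as a GPA list and sent via
 * hn_txpkt_sglist() above, or copied into a slot of the NVS chimney
 * sending buffer and submitted by index via hn_txpkt_chim().  The
 * helpers below track slot ownership in the sc->hn_chim_bmap bitmap;
 * allocation and free are lock-free, relying on atomic test-and-set
 * and atomic clear of the bitmap words.
 */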
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function fails, the m_head is freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
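
/*
 * hn_set_hlen() records the L2/L3 header lengths in the mbuf packet
 * header for later RNDIS encapsulation, and applies the Azure UDP
 * checksum workaround tied to hn_udpcs_fixup_mtu above: large non-DF
 * UDP/IPv4 datagrams have their checksum computed in software and the
 * CSUM_IP_UDP offload flag cleared before being handed to the host.
 */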
/*
 * NOTE: If this function fails, the m_head is freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, the m_head is freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
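
/*
 * Derive the RNDIS packet filter from the interface state: promiscuous
 * when IFF_PROMISC is set or when a non-transparent mode VF owns the
 * data path (HN_FLAG_RXVF), otherwise directed + broadcast, with
 * all-multicast standing in until a real multicast list is programmed.
 */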
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
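
/*
 * hn_rss_ind_fixup() is used when the number of usable RX rings can
 * shrink: any RSS indirect table entry that points past the last
 * in-use channel is clamped to the last valid ring, so the host is
 * never asked to steer packets to a channel that is not serviced.
 */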
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
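
/*
 * A VF candidate is matched to this synthetic device by MAC address:
 * it must be a plain Ethernet ifnet (not lagg/vlan) whose link-level
 * address equals hn(4)'s.  Hyper-V gives the synthetic NIC and its
 * accelerated (SR-IOV) counterpart the same MAC, which is what makes
 * this heuristic work.
 */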
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	/*
	 * During detach events ifp->if_addr might be NULL.
	 * Make sure the bcmp() below doesn't panic on that:
	 */
	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}

static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
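
/*
 * Transparent-VF ioctl helpers: hn(4) remains the ifnet the stack
 * talks to, and forwards capability (SIOCSIFCAP) and flag
 * (SIOCSIFFLAGS) changes to the VF so both sides stay in sync.
 */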
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}

static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this write,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}

static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
		types |= RSS_TYPE_UDP_IPV4;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	if (types & RSS_TYPE_UDP_IPV4)
		rss_hash |= NDIS_HASH_UDP_IPV4_X;
	return (rss_hash);
}

static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
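
/*
 * hn_vf_rss_fixup() makes the synthetic RSS configuration follow the
 * VF: the VF's 40-byte Toeplitz key is adopted as hn(4)'s RSS key and
 * the set of hash types delivered through mbufs is reduced to what
 * both the VF and the synthetic parts agree on; the intent appears to
 * be that a flow hashes identically on either data path.
 */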
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type confliction.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
		 */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */

	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}

static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}

static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}

static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}

static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
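
/*
 * Bring-up order for the transparent VF matters: mirror hn(4)'s
 * if_flags onto the VF and set it IFF_UP first, then switch the NVS
 * data path to the VF, fix up RSS from the VF's settings, and only
 * then mark the VF enabled for hn_transmit()/RX redirection.
 */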
*/ 1867 hn_xpnt_vf_setenable(sc); 1868 } 1869
1870 static void 1871 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1872 { 1873 struct hn_softc *sc = xsc; 1874 1875 HN_LOCK(sc); 1876 1877 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1878 goto done; 1879 if (sc->hn_vf_ifp == NULL) 1880 goto done; 1881 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1882 goto done; 1883 1884 if (sc->hn_vf_rdytick != 0) { 1885 /* Mark VF as ready. */ 1886 hn_xpnt_vf_setready(sc); 1887 } 1888 1889 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1890 /* 1891 * Delayed VF initialization. 1892 */ 1893 if (bootverbose) { 1894 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1895 sc->hn_vf_ifp->if_xname); 1896 } 1897 hn_xpnt_vf_init(sc); 1898 } 1899 done: 1900 HN_UNLOCK(sc); 1901 } 1902
1903 static void 1904 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1905 { 1906 struct hn_softc *sc = xsc; 1907 1908 HN_LOCK(sc); 1909 1910 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1911 goto done; 1912 1913 if (!hn_ismyvf(sc, ifp)) 1914 goto done; 1915 1916 if (sc->hn_vf_ifp != NULL) { 1917 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1918 sc->hn_vf_ifp->if_xname); 1919 goto done; 1920 } 1921 1922 if (hn_xpnt_vf && ifp->if_start != NULL) { 1923 /* 1924 * ifnet.if_start is _not_ supported by transparent 1925 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1926 */ 1927 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1928 "in transparent VF mode.\n", ifp->if_xname); 1929 goto done; 1930 } 1931
1932 rm_wlock(&hn_vfmap_lock); 1933 1934 if (ifp->if_index >= hn_vfmap_size) { 1935 struct ifnet **newmap; 1936 int newsize; 1937 1938 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1939 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1940 M_WAITOK | M_ZERO); 1941 1942 memcpy(newmap, hn_vfmap, 1943 sizeof(struct ifnet *) * hn_vfmap_size); 1944 free(hn_vfmap, M_DEVBUF); 1945 hn_vfmap = newmap; 1946 hn_vfmap_size = newsize; 1947 } 1948 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1949 ("%s: ifindex %d was mapped to %s", 1950 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1951 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1952 1953 rm_wunlock(&hn_vfmap_lock); 1954
1955 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1956 rm_wlock(&sc->hn_vf_lock); 1957 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1958 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1959 sc->hn_vf_ifp = ifp; 1960 rm_wunlock(&sc->hn_vf_lock); 1961 1962 if (hn_xpnt_vf) { 1963 int wait_ticks; 1964 1965 /* 1966 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1967 * Save vf_ifp's current if_input for later restoration. 1968 */ 1969 sc->hn_vf_input = ifp->if_input; 1970 ifp->if_input = hn_xpnt_vf_input; 1971 1972 /* 1973 * Stop link status management; use the VF's. 1974 */ 1975 hn_suspend_mgmt(sc); 1976 1977 /* 1978 * Give the VF some time to complete its attach routine. 1979 */ 1980 wait_ticks = hn_xpnt_vf_attwait * hz; 1981 sc->hn_vf_rdytick = ticks + wait_ticks; 1982 1983 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1984 wait_ticks); 1985 } 1986 done: 1987 HN_UNLOCK(sc); 1988 } 1989
1990 static void 1991 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1992 { 1993 struct hn_softc *sc = xsc; 1994 1995 HN_LOCK(sc); 1996 1997 if (sc->hn_vf_ifp == NULL) 1998 goto done; 1999 2000 if (!hn_ismyvf(sc, ifp)) 2001 goto done; 2002 2003 if (hn_xpnt_vf) { 2004 /* 2005 * Make sure that the delayed initialization is not running.
2006 * 2007 * NOTE: 2008 * - This lock _must_ be released, since the hn_vf_init task 2009 * will try holding this lock. 2010 * - It is safe to release this lock here, since the 2011 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2012 * 2013 * XXX racy, if hn(4) ever detached. 2014 */ 2015 HN_UNLOCK(sc); 2016 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2017 HN_LOCK(sc); 2018 2019 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2020 sc->hn_ifp->if_xname)); 2021 ifp->if_input = sc->hn_vf_input; 2022 sc->hn_vf_input = NULL; 2023 2024 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2025 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2026 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2027 2028 if (sc->hn_vf_rdytick == 0) { 2029 /* 2030 * The VF was ready; restore some settings. 2031 */ 2032 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2033 /* 2034 * NOTE: 2035 * There is _no_ need to fixup if_capenable and 2036 * if_hwassist, since the if_capabilities before 2037 * restoration was an intersection of the VF's 2038 * if_capabilites and the synthetic device's 2039 * if_capabilites. 2040 */ 2041 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2042 sc->hn_ifp->if_hw_tsomaxsegcount = 2043 sc->hn_saved_tsosegcnt; 2044 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2045 } 2046 2047 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2048 /* 2049 * Restore RSS settings. 2050 */ 2051 hn_vf_rss_restore(sc); 2052 2053 /* 2054 * Resume link status management, which was suspended 2055 * by hn_ifnet_attevent(). 2056 */ 2057 hn_resume_mgmt(sc); 2058 } 2059 } 2060 2061 /* Mark transparent mode VF as disabled. */ 2062 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2063 2064 rm_wlock(&hn_vfmap_lock); 2065 2066 KASSERT(ifp->if_index < hn_vfmap_size, 2067 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2068 if (hn_vfmap[ifp->if_index] != NULL) { 2069 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2070 ("%s: ifindex %d was mapped to %s", 2071 ifp->if_xname, ifp->if_index, 2072 hn_vfmap[ifp->if_index]->if_xname)); 2073 hn_vfmap[ifp->if_index] = NULL; 2074 } 2075 2076 rm_wunlock(&hn_vfmap_lock); 2077 done: 2078 HN_UNLOCK(sc); 2079 } 2080 2081 static void 2082 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2083 { 2084 struct hn_softc *sc = xsc; 2085 2086 if (sc->hn_vf_ifp == ifp) 2087 if_link_state_change(sc->hn_ifp, link_state); 2088 } 2089 2090 static int 2091 hn_probe(device_t dev) 2092 { 2093 2094 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2095 device_set_desc(dev, "Hyper-V Network Interface"); 2096 return BUS_PROBE_DEFAULT; 2097 } 2098 return ENXIO; 2099 } 2100 2101 static int 2102 hn_attach(device_t dev) 2103 { 2104 struct hn_softc *sc = device_get_softc(dev); 2105 struct sysctl_oid_list *child; 2106 struct sysctl_ctx_list *ctx; 2107 uint8_t eaddr[ETHER_ADDR_LEN]; 2108 struct ifnet *ifp = NULL; 2109 int error, ring_cnt, tx_ring_cnt; 2110 uint32_t mtu; 2111 2112 sc->hn_dev = dev; 2113 sc->hn_prichan = vmbus_get_channel(dev); 2114 HN_LOCK_INIT(sc); 2115 rm_init(&sc->hn_vf_lock, "hnvf"); 2116 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2117 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2118 2119 /* 2120 * Initialize these tunables once. 2121 */ 2122 sc->hn_agg_size = hn_tx_agg_size; 2123 sc->hn_agg_pkts = hn_tx_agg_pkts; 2124 2125 /* 2126 * Setup taskqueue for transmission. 
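Depending on hn_tx_taskq_mode, either per-device taskqueues are created below or the global hn_tx_taskque array is reused.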
2127 */ 2128 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2129 int i; 2130 2131 sc->hn_tx_taskqs = 2132 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2133 M_DEVBUF, M_WAITOK); 2134 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2135 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2136 M_WAITOK, taskqueue_thread_enqueue, 2137 &sc->hn_tx_taskqs[i]); 2138 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2139 "%s tx%d", device_get_nameunit(dev), i); 2140 } 2141 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2142 sc->hn_tx_taskqs = hn_tx_taskque; 2143 } 2144 2145 /* 2146 * Setup taskqueue for mangement tasks, e.g. link status. 2147 */ 2148 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2149 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2150 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2151 device_get_nameunit(dev)); 2152 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2153 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2154 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2155 hn_netchg_status_taskfunc, sc); 2156 2157 if (hn_xpnt_vf) { 2158 /* 2159 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2160 */ 2161 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2162 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2163 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2164 device_get_nameunit(dev)); 2165 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2166 hn_xpnt_vf_init_taskfunc, sc); 2167 } 2168 2169 /* 2170 * Allocate ifnet and setup its name earlier, so that if_printf 2171 * can be used by functions, which will be called after 2172 * ether_ifattach(). 2173 */ 2174 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2175 ifp->if_softc = sc; 2176 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2177 2178 /* 2179 * Initialize ifmedia earlier so that it can be unconditionally 2180 * destroyed, if error happened later on. 2181 */ 2182 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2183 2184 /* 2185 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2186 * to use (tx_ring_cnt). 2187 * 2188 * NOTE: 2189 * The # of RX rings to use is same as the # of channels to use. 2190 */ 2191 ring_cnt = hn_chan_cnt; 2192 if (ring_cnt <= 0) { 2193 /* Default */ 2194 ring_cnt = mp_ncpus; 2195 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2196 ring_cnt = HN_RING_CNT_DEF_MAX; 2197 } else if (ring_cnt > mp_ncpus) { 2198 ring_cnt = mp_ncpus; 2199 } 2200 #ifdef RSS 2201 if (ring_cnt > rss_getnumbuckets()) 2202 ring_cnt = rss_getnumbuckets(); 2203 #endif 2204 2205 tx_ring_cnt = hn_tx_ring_cnt; 2206 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2207 tx_ring_cnt = ring_cnt; 2208 #ifdef HN_IFSTART_SUPPORT 2209 if (hn_use_if_start) { 2210 /* ifnet.if_start only needs one TX ring. */ 2211 tx_ring_cnt = 1; 2212 } 2213 #endif 2214 2215 /* 2216 * Set the leader CPU for channels. 2217 */ 2218 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2219 2220 /* 2221 * Create enough TX/RX rings, even if only limited number of 2222 * channels can be allocated. 2223 */ 2224 error = hn_create_tx_data(sc, tx_ring_cnt); 2225 if (error) 2226 goto failed; 2227 error = hn_create_rx_data(sc, ring_cnt); 2228 if (error) 2229 goto failed; 2230 2231 /* 2232 * Create transaction context for NVS and RNDIS transactions. 
2233 */ 2234 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2235 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2236 if (sc->hn_xact == NULL) { 2237 error = ENXIO; 2238 goto failed; 2239 } 2240 2241 /* 2242 * Install orphan handler for the revocation of this device's 2243 * primary channel. 2244 * 2245 * NOTE: 2246 * The processing order is critical here: 2247 * Install the orphan handler, _before_ testing whether this 2248 * device's primary channel has been revoked or not. 2249 */ 2250 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2251 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2252 error = ENXIO; 2253 goto failed; 2254 } 2255 2256 /* 2257 * Attach the synthetic parts, i.e. NVS and RNDIS. 2258 */ 2259 error = hn_synth_attach(sc, ETHERMTU); 2260 if (error) 2261 goto failed; 2262 2263 error = hn_rndis_get_eaddr(sc, eaddr); 2264 if (error) 2265 goto failed; 2266 2267 error = hn_rndis_get_mtu(sc, &mtu); 2268 if (error) 2269 mtu = ETHERMTU; 2270 else if (bootverbose) 2271 device_printf(dev, "RNDIS mtu %u\n", mtu); 2272 2273 #if __FreeBSD_version >= 1100099 2274 if (sc->hn_rx_ring_inuse > 1) { 2275 /* 2276 * Reduce TCP segment aggregation limit for multiple 2277 * RX rings to increase ACK timeliness. 2278 */ 2279 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2280 } 2281 #endif 2282 2283 /* 2284 * Fixup TX/RX stuffs after synthetic parts are attached. 2285 */ 2286 hn_fixup_tx_data(sc); 2287 hn_fixup_rx_data(sc); 2288 2289 ctx = device_get_sysctl_ctx(dev); 2290 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2291 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2292 &sc->hn_nvs_ver, 0, "NVS version"); 2293 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2294 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2295 hn_ndis_version_sysctl, "A", "NDIS version"); 2296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2297 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2298 hn_caps_sysctl, "A", "capabilities"); 2299 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2300 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2301 hn_hwassist_sysctl, "A", "hwassist"); 2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2303 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2304 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2305 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2306 "max # of TSO segments"); 2307 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2308 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2309 "max size of TSO segment"); 2310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2311 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2312 hn_rxfilter_sysctl, "A", "rxfilter"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2314 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2315 hn_rss_hash_sysctl, "A", "RSS hash"); 2316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2317 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2318 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2319 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2320 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2321 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2322 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2323 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2324 #ifndef RSS 2325 /* 2326 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2327 */ 2328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2329 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2330 hn_rss_key_sysctl, "IU", "RSS key"); 2331 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2332 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2333 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2334 #endif 2335 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2336 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2337 "RNDIS offered packet transmission aggregation size limit"); 2338 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2339 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2340 "RNDIS offered packet transmission aggregation count limit"); 2341 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2342 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2343 "RNDIS packet transmission aggregation alignment"); 2344 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2345 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2346 hn_txagg_size_sysctl, "I", 2347 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2348 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2349 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2350 hn_txagg_pkts_sysctl, "I", 2351 "Packet transmission aggregation packets, " 2352 "0 -- disable, -1 -- auto"); 2353 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2354 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2355 hn_polling_sysctl, "I", 2356 "Polling frequency: [100,1000000], 0 disable polling"); 2357 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2358 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2359 hn_vf_sysctl, "A", "Virtual Function's name"); 2360 if (!hn_xpnt_vf) { 2361 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2362 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2363 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2364 } else { 2365 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2366 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2367 hn_xpnt_vf_enabled_sysctl, "I", 2368 "Transparent VF enabled"); 2369 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2370 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2371 hn_xpnt_vf_accbpf_sysctl, "I", 2372 "Accurate BPF for transparent VF"); 2373 } 2374 2375 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2376 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2377 "switch to rsc"); 2378 2379 /* 2380 * Setup the ifmedia, which has been initialized earlier. 2381 */ 2382 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2383 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2384 /* XXX ifmedia_set really should do this for us */ 2385 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2386 2387 /* 2388 * Setup the ifnet for this interface. 2389 */ 2390 2391 ifp->if_baudrate = IF_Gbps(10); 2392 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2393 ifp->if_ioctl = hn_ioctl; 2394 ifp->if_init = hn_init; 2395 #ifdef HN_IFSTART_SUPPORT 2396 if (hn_use_if_start) { 2397 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2398 2399 ifp->if_start = hn_start; 2400 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2401 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2402 IFQ_SET_READY(&ifp->if_snd); 2403 } else 2404 #endif 2405 { 2406 ifp->if_transmit = hn_transmit; 2407 ifp->if_qflush = hn_xmit_qflush; 2408 } 2409 2410 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2411 #ifdef foo 2412 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2413 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2414 #endif 2415 if (sc->hn_caps & HN_CAP_VLAN) { 2416 /* XXX not sure about VLAN_MTU. */ 2417 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2418 } 2419 2420 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2421 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2422 ifp->if_capabilities |= IFCAP_TXCSUM; 2423 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2424 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2425 if (sc->hn_caps & HN_CAP_TSO4) { 2426 ifp->if_capabilities |= IFCAP_TSO4; 2427 ifp->if_hwassist |= CSUM_IP_TSO; 2428 } 2429 if (sc->hn_caps & HN_CAP_TSO6) { 2430 ifp->if_capabilities |= IFCAP_TSO6; 2431 ifp->if_hwassist |= CSUM_IP6_TSO; 2432 } 2433 2434 /* Enable all available capabilities by default. */ 2435 ifp->if_capenable = ifp->if_capabilities; 2436 2437 /* 2438 * Disable IPv6 TSO and TXCSUM by default, they still can 2439 * be enabled through SIOCSIFCAP. 2440 */ 2441 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2442 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2443 2444 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2445 /* 2446 * Lock hn_set_tso_maxsize() to simplify its 2447 * internal logic. 2448 */ 2449 HN_LOCK(sc); 2450 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2451 HN_UNLOCK(sc); 2452 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2453 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2454 } 2455 2456 ether_ifattach(ifp, eaddr); 2457 2458 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2459 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2460 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2461 } 2462 if (mtu < ETHERMTU) { 2463 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2464 ifp->if_mtu = mtu; 2465 } 2466 2467 /* Inform the upper layer about the long frame support. */ 2468 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2469 2470 /* 2471 * Kick off link status check. 2472 */ 2473 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2474 hn_update_link_status(sc); 2475 2476 if (!hn_xpnt_vf) { 2477 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2478 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2479 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2480 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2481 } else { 2482 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2483 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2484 } 2485 2486 /* 2487 * NOTE: 2488 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2489 * since interface's LLADDR is needed; interface LLADDR is not 2490 * available when ifnet_arrival event is triggered. 2491 */ 2492 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2493 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2494 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2495 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2496 2497 return (0); 2498 failed: 2499 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2500 hn_synth_detach(sc); 2501 hn_detach(dev); 2502 return (error); 2503 } 2504 2505 static int 2506 hn_detach(device_t dev) 2507 { 2508 struct hn_softc *sc = device_get_softc(dev); 2509 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2510 2511 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2512 /* 2513 * In case that the vmbus missed the orphan handler 2514 * installation. 
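Orphan the transaction context here as well, so that pending transactions are not left waiting on the revoked channel.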
2515 */ 2516 vmbus_xact_ctx_orphan(sc->hn_xact); 2517 } 2518 2519 if (sc->hn_ifaddr_evthand != NULL) 2520 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2521 if (sc->hn_ifnet_evthand != NULL) 2522 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2523 if (sc->hn_ifnet_atthand != NULL) { 2524 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2525 sc->hn_ifnet_atthand); 2526 } 2527 if (sc->hn_ifnet_dethand != NULL) { 2528 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2529 sc->hn_ifnet_dethand); 2530 } 2531 if (sc->hn_ifnet_lnkhand != NULL) 2532 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2533 2534 vf_ifp = sc->hn_vf_ifp; 2535 __compiler_membar(); 2536 if (vf_ifp != NULL) 2537 hn_ifnet_detevent(sc, vf_ifp); 2538 2539 if (device_is_attached(dev)) { 2540 HN_LOCK(sc); 2541 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2542 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2543 hn_stop(sc, true); 2544 /* 2545 * NOTE: 2546 * hn_stop() only suspends data, so managment 2547 * stuffs have to be suspended manually here. 2548 */ 2549 hn_suspend_mgmt(sc); 2550 hn_synth_detach(sc); 2551 } 2552 HN_UNLOCK(sc); 2553 ether_ifdetach(ifp); 2554 } 2555 2556 ifmedia_removeall(&sc->hn_media); 2557 hn_destroy_rx_data(sc); 2558 hn_destroy_tx_data(sc); 2559 2560 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2561 int i; 2562 2563 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2564 taskqueue_free(sc->hn_tx_taskqs[i]); 2565 free(sc->hn_tx_taskqs, M_DEVBUF); 2566 } 2567 taskqueue_free(sc->hn_mgmt_taskq0); 2568 if (sc->hn_vf_taskq != NULL) 2569 taskqueue_free(sc->hn_vf_taskq); 2570 2571 if (sc->hn_xact != NULL) { 2572 /* 2573 * Uninstall the orphan handler _before_ the xact is 2574 * destructed. 2575 */ 2576 vmbus_chan_unset_orphan(sc->hn_prichan); 2577 vmbus_xact_ctx_destroy(sc->hn_xact); 2578 } 2579 2580 if_free(ifp); 2581 2582 HN_LOCK_DESTROY(sc); 2583 rm_destroy(&sc->hn_vf_lock); 2584 return (0); 2585 } 2586 2587 static int 2588 hn_shutdown(device_t dev) 2589 { 2590 2591 return (0); 2592 } 2593 2594 static void 2595 hn_link_status(struct hn_softc *sc) 2596 { 2597 uint32_t link_status; 2598 int error; 2599 2600 error = hn_rndis_get_linkstatus(sc, &link_status); 2601 if (error) { 2602 /* XXX what to do? */ 2603 return; 2604 } 2605 2606 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2607 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2608 else 2609 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2610 if_link_state_change(sc->hn_ifp, 2611 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2612 LINK_STATE_UP : LINK_STATE_DOWN); 2613 } 2614 2615 static void 2616 hn_link_taskfunc(void *xsc, int pending __unused) 2617 { 2618 struct hn_softc *sc = xsc; 2619 2620 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2621 return; 2622 hn_link_status(sc); 2623 } 2624 2625 static void 2626 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2627 { 2628 struct hn_softc *sc = xsc; 2629 2630 /* Prevent any link status checks from running. */ 2631 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2632 2633 /* 2634 * Fake up a [link down --> link up] state change; 5 seconds 2635 * delay is used, which closely simulates miibus reaction 2636 * upon link down event. 
2637 */ 2638 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2639 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2640 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2641 &sc->hn_netchg_status, 5 * hz); 2642 } 2643 2644 static void 2645 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2646 { 2647 struct hn_softc *sc = xsc; 2648 2649 /* Re-allow link status checks. */ 2650 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2651 hn_link_status(sc); 2652 } 2653 2654 static void 2655 hn_update_link_status(struct hn_softc *sc) 2656 { 2657 2658 if (sc->hn_mgmt_taskq != NULL) 2659 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2660 } 2661 2662 static void 2663 hn_change_network(struct hn_softc *sc) 2664 { 2665 2666 if (sc->hn_mgmt_taskq != NULL) 2667 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2668 } 2669 2670 static __inline int 2671 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2672 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2673 { 2674 struct mbuf *m = *m_head; 2675 int error; 2676 2677 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2678 2679 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2680 m, segs, nsegs, BUS_DMA_NOWAIT); 2681 if (error == EFBIG) { 2682 struct mbuf *m_new; 2683 2684 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2685 if (m_new == NULL) 2686 return ENOBUFS; 2687 else 2688 *m_head = m = m_new; 2689 txr->hn_tx_collapsed++; 2690 2691 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2692 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2693 } 2694 if (!error) { 2695 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2696 BUS_DMASYNC_PREWRITE); 2697 txd->flags |= HN_TXD_FLAG_DMAMAP; 2698 } 2699 return error; 2700 } 2701 2702 static __inline int 2703 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2704 { 2705 2706 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2707 ("put an onlist txd %#x", txd->flags)); 2708 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2709 ("put an onagg txd %#x", txd->flags)); 2710 2711 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2712 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2713 return 0; 2714 2715 if (!STAILQ_EMPTY(&txd->agg_list)) { 2716 struct hn_txdesc *tmp_txd; 2717 2718 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2719 int freed __diagused; 2720 2721 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2722 ("resursive aggregation on aggregated txdesc")); 2723 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2724 ("not aggregated txdesc")); 2725 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2726 ("aggregated txdesc uses dmamap")); 2727 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2728 ("aggregated txdesc consumes " 2729 "chimney sending buffer")); 2730 KASSERT(tmp_txd->chim_size == 0, 2731 ("aggregated txdesc has non-zero " 2732 "chimney sending size")); 2733 2734 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2735 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2736 freed = hn_txdesc_put(txr, tmp_txd); 2737 KASSERT(freed, ("failed to free aggregated txdesc")); 2738 } 2739 } 2740 2741 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2742 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2743 ("chim txd uses dmamap")); 2744 hn_chim_free(txr->hn_sc, txd->chim_index); 2745 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2746 txd->chim_size = 0; 2747 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2748 bus_dmamap_sync(txr->hn_tx_data_dtag, 2749 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2750 
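/* The DMA map was synced for completion above; unload it before the txdesc is recycled onto the free list below. */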
bus_dmamap_unload(txr->hn_tx_data_dtag, 2751 txd->data_dmap); 2752 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2753 } 2754 2755 if (txd->m != NULL) { 2756 m_freem(txd->m); 2757 txd->m = NULL; 2758 } 2759 2760 txd->flags |= HN_TXD_FLAG_ONLIST; 2761 #ifndef HN_USE_TXDESC_BUFRING 2762 mtx_lock_spin(&txr->hn_txlist_spin); 2763 KASSERT(txr->hn_txdesc_avail >= 0 && 2764 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2765 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2766 txr->hn_txdesc_avail++; 2767 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2768 mtx_unlock_spin(&txr->hn_txlist_spin); 2769 #else /* HN_USE_TXDESC_BUFRING */ 2770 #ifdef HN_DEBUG 2771 atomic_add_int(&txr->hn_txdesc_avail, 1); 2772 #endif 2773 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2774 #endif /* !HN_USE_TXDESC_BUFRING */ 2775 2776 return 1; 2777 } 2778 2779 static __inline struct hn_txdesc * 2780 hn_txdesc_get(struct hn_tx_ring *txr) 2781 { 2782 struct hn_txdesc *txd; 2783 2784 #ifndef HN_USE_TXDESC_BUFRING 2785 mtx_lock_spin(&txr->hn_txlist_spin); 2786 txd = SLIST_FIRST(&txr->hn_txlist); 2787 if (txd != NULL) { 2788 KASSERT(txr->hn_txdesc_avail > 0, 2789 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2790 txr->hn_txdesc_avail--; 2791 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2792 } 2793 mtx_unlock_spin(&txr->hn_txlist_spin); 2794 #else 2795 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2796 #endif 2797 2798 if (txd != NULL) { 2799 #ifdef HN_USE_TXDESC_BUFRING 2800 #ifdef HN_DEBUG 2801 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2802 #endif 2803 #endif /* HN_USE_TXDESC_BUFRING */ 2804 KASSERT(txd->m == NULL && txd->refs == 0 && 2805 STAILQ_EMPTY(&txd->agg_list) && 2806 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2807 txd->chim_size == 0 && 2808 (txd->flags & HN_TXD_FLAG_ONLIST) && 2809 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2810 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2811 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2812 txd->refs = 1; 2813 } 2814 return txd; 2815 } 2816 2817 static __inline void 2818 hn_txdesc_hold(struct hn_txdesc *txd) 2819 { 2820 2821 /* 0->1 transition will never work */ 2822 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2823 atomic_add_int(&txd->refs, 1); 2824 } 2825 2826 static __inline void 2827 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2828 { 2829 2830 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2831 ("recursive aggregation on aggregating txdesc")); 2832 2833 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2834 ("already aggregated")); 2835 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2836 ("recursive aggregation on to-be-aggregated txdesc")); 2837 2838 txd->flags |= HN_TXD_FLAG_ONAGG; 2839 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2840 } 2841 2842 static bool 2843 hn_tx_ring_pending(struct hn_tx_ring *txr) 2844 { 2845 bool pending = false; 2846 2847 #ifndef HN_USE_TXDESC_BUFRING 2848 mtx_lock_spin(&txr->hn_txlist_spin); 2849 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2850 pending = true; 2851 mtx_unlock_spin(&txr->hn_txlist_spin); 2852 #else 2853 if (!buf_ring_full(txr->hn_txdesc_br)) 2854 pending = true; 2855 #endif 2856 return (pending); 2857 } 2858 2859 static __inline void 2860 hn_txeof(struct hn_tx_ring *txr) 2861 { 2862 txr->hn_has_txeof = 0; 2863 txr->hn_txeof(txr); 2864 } 2865 2866 static void 2867 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2868 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2869 { 2870 struct hn_txdesc *txd = sndc->hn_cbarg; 2871 struct 
hn_tx_ring *txr; 2872 2873 txr = txd->txr; 2874 KASSERT(txr->hn_chan == chan, 2875 ("channel mismatch, on chan%u, should be chan%u", 2876 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2877 2878 txr->hn_has_txeof = 1; 2879 hn_txdesc_put(txr, txd); 2880 2881 ++txr->hn_txdone_cnt; 2882 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2883 txr->hn_txdone_cnt = 0; 2884 if (txr->hn_oactive) 2885 hn_txeof(txr); 2886 } 2887 } 2888 2889 static void 2890 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2891 { 2892 #if defined(INET) || defined(INET6) 2893 struct epoch_tracker et; 2894 2895 NET_EPOCH_ENTER(et); 2896 tcp_lro_flush_all(&rxr->hn_lro); 2897 NET_EPOCH_EXIT(et); 2898 #endif 2899 2900 /* 2901 * NOTE: 2902 * 'txr' could be NULL, if multiple channels and 2903 * ifnet.if_start method are enabled. 2904 */ 2905 if (txr == NULL || !txr->hn_has_txeof) 2906 return; 2907 2908 txr->hn_txdone_cnt = 0; 2909 hn_txeof(txr); 2910 } 2911 2912 static __inline uint32_t 2913 hn_rndis_pktmsg_offset(uint32_t ofs) 2914 { 2915 2916 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2917 ("invalid RNDIS packet msg offset %u", ofs)); 2918 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2919 } 2920 2921 static __inline void * 2922 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2923 size_t pi_dlen, uint32_t pi_type) 2924 { 2925 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2926 struct rndis_pktinfo *pi; 2927 2928 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2929 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2930 2931 /* 2932 * Per-packet-info does not move; it only grows. 2933 * 2934 * NOTE: 2935 * rm_pktinfooffset in this phase counts from the beginning 2936 * of rndis_packet_msg. 2937 */ 2938 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2939 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2940 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2941 pkt->rm_pktinfolen); 2942 pkt->rm_pktinfolen += pi_size; 2943 2944 pi->rm_size = pi_size; 2945 pi->rm_type = pi_type; 2946 pi->rm_internal = 0; 2947 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2948 2949 return (pi->rm_data); 2950 } 2951 2952 static __inline int 2953 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2954 { 2955 struct hn_txdesc *txd; 2956 struct mbuf *m; 2957 int error, pkts; 2958 2959 txd = txr->hn_agg_txd; 2960 KASSERT(txd != NULL, ("no aggregate txdesc")); 2961 2962 /* 2963 * Since hn_txpkt() will reset this temporary stat, save 2964 * it now, so that oerrors can be updated properly, if 2965 * hn_txpkt() ever fails. 2966 */ 2967 pkts = txr->hn_stat_pkts; 2968 2969 /* 2970 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2971 * failure, save it for later freeing, if hn_txpkt() ever 2972 * fails. 2973 */ 2974 m = txd->m; 2975 error = hn_txpkt(ifp, txr, txd); 2976 if (__predict_false(error)) { 2977 /* txd is freed, but m is not. */ 2978 m_freem(m); 2979 2980 txr->hn_flush_failed++; 2981 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2982 } 2983 2984 /* Reset all aggregation states. 
*/ 2985 txr->hn_agg_txd = NULL; 2986 txr->hn_agg_szleft = 0; 2987 txr->hn_agg_pktleft = 0; 2988 txr->hn_agg_prevpkt = NULL; 2989 2990 return (error); 2991 } 2992 2993 static void * 2994 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2995 int pktsize) 2996 { 2997 void *chim; 2998 2999 if (txr->hn_agg_txd != NULL) { 3000 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 3001 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 3002 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 3003 int olen; 3004 3005 /* 3006 * Update the previous RNDIS packet's total length, 3007 * it can be increased due to the mandatory alignment 3008 * padding for this RNDIS packet. And update the 3009 * aggregating txdesc's chimney sending buffer size 3010 * accordingly. 3011 * 3012 * XXX 3013 * Zero-out the padding, as required by the RNDIS spec. 3014 */ 3015 olen = pkt->rm_len; 3016 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3017 agg_txd->chim_size += pkt->rm_len - olen; 3018 3019 /* Link this txdesc to the parent. */ 3020 hn_txdesc_agg(agg_txd, txd); 3021 3022 chim = (uint8_t *)pkt + pkt->rm_len; 3023 /* Save the current packet for later fixup. */ 3024 txr->hn_agg_prevpkt = chim; 3025 3026 txr->hn_agg_pktleft--; 3027 txr->hn_agg_szleft -= pktsize; 3028 if (txr->hn_agg_szleft <= 3029 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3030 /* 3031 * Probably can't aggregate more packets, 3032 * flush this aggregating txdesc proactively. 3033 */ 3034 txr->hn_agg_pktleft = 0; 3035 } 3036 /* Done! */ 3037 return (chim); 3038 } 3039 hn_flush_txagg(ifp, txr); 3040 } 3041 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3042 3043 txr->hn_tx_chimney_tried++; 3044 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3045 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3046 return (NULL); 3047 txr->hn_tx_chimney++; 3048 3049 chim = txr->hn_sc->hn_chim + 3050 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3051 3052 if (txr->hn_agg_pktmax > 1 && 3053 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3054 txr->hn_agg_txd = txd; 3055 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3056 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3057 txr->hn_agg_prevpkt = chim; 3058 } 3059 return (chim); 3060 } 3061 3062 /* 3063 * NOTE: 3064 * If this function fails, then both txd and m_head0 will be freed. 3065 */ 3066 static int 3067 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3068 struct mbuf **m_head0) 3069 { 3070 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3071 int error, nsegs, i; 3072 struct mbuf *m_head = *m_head0; 3073 struct rndis_packet_msg *pkt; 3074 uint32_t *pi_data; 3075 void *chim = NULL; 3076 int pkt_hlen, pkt_size; 3077 3078 pkt = txd->rndis_pkt; 3079 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3080 if (pkt_size < txr->hn_chim_size) { 3081 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3082 if (chim != NULL) 3083 pkt = chim; 3084 } else { 3085 if (txr->hn_agg_txd != NULL) 3086 hn_flush_txagg(ifp, txr); 3087 } 3088 3089 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3090 pkt->rm_len = m_head->m_pkthdr.len; 3091 pkt->rm_dataoffset = 0; 3092 pkt->rm_datalen = m_head->m_pkthdr.len; 3093 pkt->rm_oobdataoffset = 0; 3094 pkt->rm_oobdatalen = 0; 3095 pkt->rm_oobdataelements = 0; 3096 pkt->rm_pktinfooffset = sizeof(*pkt); 3097 pkt->rm_pktinfolen = 0; 3098 pkt->rm_vchandle = 0; 3099 pkt->rm_reserved = 0; 3100 3101 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3102 /* 3103 * Set the hash value for this packet. 
3104 */ 3105 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3106 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3107 3108 if (M_HASHTYPE_ISHASH(m_head)) 3109 /* 3110 * The flowid field contains the hash value that the host 3111 * set on the RX queue, if this is an IP forwarding packet. 3112 * Send the same hash value back so the host can process the 3113 * transmit on the CPU on which the packet was received. 3114 */ 3115 *pi_data = m_head->m_pkthdr.flowid; 3116 else 3117 /* 3118 * Otherwise just put the tx queue index. 3119 */ 3120 *pi_data = txr->hn_tx_idx; 3121 } 3122
3123 if (m_head->m_flags & M_VLANTAG) { 3124 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3125 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3126 *pi_data = NDIS_VLAN_INFO_MAKE( 3127 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3128 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3129 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3130 } 3131
3132 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3133 #if defined(INET6) || defined(INET) 3134 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3135 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3136 #ifdef INET 3137 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3138 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3139 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3140 m_head->m_pkthdr.tso_segsz); 3141 } 3142 #endif 3143 #if defined(INET6) && defined(INET) 3144 else 3145 #endif 3146 #ifdef INET6 3147 { 3148 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3149 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3150 m_head->m_pkthdr.tso_segsz); 3151 } 3152 #endif 3153 #endif /* INET6 || INET */ 3154 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3155 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3156 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3157 if (m_head->m_pkthdr.csum_flags & 3158 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3159 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3160 } else { 3161 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3162 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3163 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3164 } 3165 3166 if (m_head->m_pkthdr.csum_flags & 3167 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3168 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3169 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3170 } else if (m_head->m_pkthdr.csum_flags & 3171 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3172 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3173 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3174 } 3175 } 3176
3177 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3178 /* Fixup RNDIS packet message total length */ 3179 pkt->rm_len += pkt_hlen; 3180 /* Convert RNDIS packet message offsets */ 3181 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3182 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3183 3184 /* 3185 * Fast path: Chimney sending.
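The RNDIS message was built directly in the chimney sending buffer, so the payload is copied in below and the packet is sent without a scatter/gather page list (hn_gpa_cnt stays zero).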
3186 */ 3187 if (chim != NULL) { 3188 struct hn_txdesc *tgt_txd = txd; 3189 3190 if (txr->hn_agg_txd != NULL) { 3191 tgt_txd = txr->hn_agg_txd; 3192 #ifdef INVARIANTS 3193 *m_head0 = NULL; 3194 #endif 3195 } 3196 3197 KASSERT(pkt == chim, 3198 ("RNDIS pkt not in chimney sending buffer")); 3199 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3200 ("chimney sending buffer is not used")); 3201 tgt_txd->chim_size += pkt->rm_len; 3202 3203 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3204 ((uint8_t *)chim) + pkt_hlen); 3205 3206 txr->hn_gpa_cnt = 0; 3207 txr->hn_sendpkt = hn_txpkt_chim; 3208 goto done; 3209 } 3210 3211 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3212 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3213 ("chimney buffer is used")); 3214 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3215 3216 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3217 if (__predict_false(error)) { 3218 int freed __diagused; 3219 3220 /* 3221 * This mbuf is not linked w/ the txd yet, so free it now. 3222 */ 3223 m_freem(m_head); 3224 *m_head0 = NULL; 3225 3226 freed = hn_txdesc_put(txr, txd); 3227 KASSERT(freed != 0, 3228 ("fail to free txd upon txdma error")); 3229 3230 txr->hn_txdma_failed++; 3231 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3232 return error; 3233 } 3234 *m_head0 = m_head; 3235 3236 /* +1 RNDIS packet message */ 3237 txr->hn_gpa_cnt = nsegs + 1; 3238 3239 /* send packet with page buffer */ 3240 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3241 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3242 txr->hn_gpa[0].gpa_len = pkt_hlen; 3243 3244 /* 3245 * Fill the page buffers with mbuf info after the page 3246 * buffer for RNDIS packet message. 3247 */ 3248 for (i = 0; i < nsegs; ++i) { 3249 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3250 3251 gpa->gpa_page = atop(segs[i].ds_addr); 3252 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3253 gpa->gpa_len = segs[i].ds_len; 3254 } 3255 3256 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3257 txd->chim_size = 0; 3258 txr->hn_sendpkt = hn_txpkt_sglist; 3259 done: 3260 txd->m = m_head; 3261 3262 /* Set the completion routine */ 3263 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3264 3265 /* Update temporary stats for later use. */ 3266 txr->hn_stat_pkts++; 3267 txr->hn_stat_size += m_head->m_pkthdr.len; 3268 if (m_head->m_flags & M_MCAST) 3269 txr->hn_stat_mcasts++; 3270 3271 return 0; 3272 } 3273 3274 /* 3275 * NOTE: 3276 * If this function fails, then txd will be freed, but the mbuf 3277 * associated w/ the txd will _not_ be freed. 3278 */ 3279 static int 3280 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3281 { 3282 int error, send_failed = 0, has_bpf; 3283 3284 again: 3285 has_bpf = bpf_peers_present(ifp->if_bpf); 3286 if (has_bpf) { 3287 /* 3288 * Make sure that this txd and any aggregated txds are not 3289 * freed before ETHER_BPF_MTAP. 
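hn_txdesc_hold() takes an extra reference here; it is dropped again through hn_txdesc_put() once the taps have run.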
3290 */ 3291 hn_txdesc_hold(txd); 3292 } 3293 error = txr->hn_sendpkt(txr, txd); 3294 if (!error) { 3295 if (has_bpf) { 3296 const struct hn_txdesc *tmp_txd; 3297 3298 ETHER_BPF_MTAP(ifp, txd->m); 3299 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3300 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3301 } 3302 3303 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3304 #ifdef HN_IFSTART_SUPPORT 3305 if (!hn_use_if_start) 3306 #endif 3307 { 3308 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3309 txr->hn_stat_size); 3310 if (txr->hn_stat_mcasts != 0) { 3311 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3312 txr->hn_stat_mcasts); 3313 } 3314 } 3315 txr->hn_pkts += txr->hn_stat_pkts; 3316 txr->hn_sends++; 3317 } 3318 if (has_bpf) 3319 hn_txdesc_put(txr, txd); 3320 3321 if (__predict_false(error)) { 3322 int freed __diagused; 3323 3324 /* 3325 * This should "really rarely" happen. 3326 * 3327 * XXX Too many RX to be acked or too many sideband 3328 * commands to run? Ask netvsc_channel_rollup() 3329 * to kick start later. 3330 */ 3331 txr->hn_has_txeof = 1; 3332 if (!send_failed) { 3333 txr->hn_send_failed++; 3334 send_failed = 1; 3335 /* 3336 * Try sending again after set hn_has_txeof; 3337 * in case that we missed the last 3338 * netvsc_channel_rollup(). 3339 */ 3340 goto again; 3341 } 3342 if_printf(ifp, "send failed\n"); 3343 3344 /* 3345 * Caller will perform further processing on the 3346 * associated mbuf, so don't free it in hn_txdesc_put(); 3347 * only unload it from the DMA map in hn_txdesc_put(), 3348 * if it was loaded. 3349 */ 3350 txd->m = NULL; 3351 freed = hn_txdesc_put(txr, txd); 3352 KASSERT(freed != 0, 3353 ("fail to free txd upon send error")); 3354 3355 txr->hn_send_failed++; 3356 } 3357 3358 /* Reset temporary stats, after this sending is done. */ 3359 txr->hn_stat_size = 0; 3360 txr->hn_stat_pkts = 0; 3361 txr->hn_stat_mcasts = 0; 3362 3363 return (error); 3364 } 3365 3366 /* 3367 * Append the specified data to the indicated mbuf chain, 3368 * Extend the mbuf chain if the new data does not fit in 3369 * existing space. 3370 * 3371 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3372 * There should be an equivalent in the kernel mbuf code, 3373 * but there does not appear to be one yet. 3374 * 3375 * Differs from m_append() in that additional mbufs are 3376 * allocated with cluster size MJUMPAGESIZE, and filled 3377 * accordingly. 3378 * 3379 * Return the last mbuf in the chain or NULL if failed to 3380 * allocate new mbuf. 3381 */ 3382 static struct mbuf * 3383 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3384 { 3385 struct mbuf *m, *n; 3386 int remainder, space; 3387 3388 for (m = m0; m->m_next != NULL; m = m->m_next) 3389 ; 3390 remainder = len; 3391 space = M_TRAILINGSPACE(m); 3392 if (space > 0) { 3393 /* 3394 * Copy into available space. 3395 */ 3396 if (space > remainder) 3397 space = remainder; 3398 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3399 m->m_len += space; 3400 cp += space; 3401 remainder -= space; 3402 } 3403 while (remainder > 0) { 3404 /* 3405 * Allocate a new mbuf; could check space 3406 * and allocate a cluster instead. 
3407 */ 3408 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3409 if (n == NULL) 3410 return NULL; 3411 n->m_len = min(MJUMPAGESIZE, remainder); 3412 bcopy(cp, mtod(n, caddr_t), n->m_len); 3413 cp += n->m_len; 3414 remainder -= n->m_len; 3415 m->m_next = n; 3416 m = n; 3417 } 3418 3419 return m; 3420 } 3421 3422 #if defined(INET) || defined(INET6) 3423 static __inline int 3424 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3425 { 3426 #if __FreeBSD_version >= 1100095 3427 if (hn_lro_mbufq_depth) { 3428 tcp_lro_queue_mbuf(lc, m); 3429 return 0; 3430 } 3431 #endif 3432 return tcp_lro_rx(lc, m, 0); 3433 } 3434 #endif 3435 3436 static int 3437 hn_rxpkt(struct hn_rx_ring *rxr) 3438 { 3439 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3440 struct mbuf *m_new, *n; 3441 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3442 int hash_type = M_HASHTYPE_NONE; 3443 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3444 int i; 3445 3446 ifp = hn_ifp; 3447 if (rxr->hn_rxvf_ifp != NULL) { 3448 /* 3449 * Non-transparent mode VF; pretend this packet is from 3450 * the VF. 3451 */ 3452 ifp = rxr->hn_rxvf_ifp; 3453 is_vf = 1; 3454 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3455 /* Transparent mode VF. */ 3456 is_vf = 1; 3457 } 3458 3459 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3460 /* 3461 * NOTE: 3462 * See the NOTE of hn_rndis_init_fixat(). This 3463 * function can be reached, immediately after the 3464 * RNDIS is initialized but before the ifnet is 3465 * setup on the hn_attach() path; drop the unexpected 3466 * packets. 3467 */ 3468 return (0); 3469 } 3470 3471 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3472 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3473 return (0); 3474 } 3475 3476 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3477 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3478 if (m_new == NULL) { 3479 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3480 return (0); 3481 } 3482 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3483 rxr->rsc.frag_len[0]); 3484 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3485 } else { 3486 /* 3487 * Get an mbuf with a cluster. For packets 2K or less, 3488 * get a standard 2K cluster. For anything larger, get a 3489 * 4K cluster. Any buffers larger than 4K can cause problems 3490 * if looped around to the Hyper-V TX channel, so avoid them. 
3491 */ 3492 size = MCLBYTES; 3493 if (rxr->rsc.pktlen > MCLBYTES) { 3494 /* 4096 */ 3495 size = MJUMPAGESIZE; 3496 } 3497 3498 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3499 if (m_new == NULL) { 3500 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3501 return (0); 3502 } 3503 3504 n = m_new; 3505 for (i = 0; i < rxr->rsc.cnt; i++) { 3506 n = hv_m_append(n, rxr->rsc.frag_len[i], 3507 rxr->rsc.frag_data[i]); 3508 if (n == NULL) { 3509 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3510 return (0); 3511 } else { 3512 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3513 } 3514 } 3515 } 3516 if (rxr->rsc.pktlen <= MHLEN) 3517 rxr->hn_small_pkts++; 3518 3519 m_new->m_pkthdr.rcvif = ifp; 3520 3521 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3522 do_csum = 0; 3523 3524 /* receive side checksum offload */ 3525 if (rxr->rsc.csum_info != NULL) { 3526 /* IP csum offload */ 3527 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3528 m_new->m_pkthdr.csum_flags |= 3529 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3530 rxr->hn_csum_ip++; 3531 } 3532 3533 /* TCP/UDP csum offload */ 3534 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3535 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3536 m_new->m_pkthdr.csum_flags |= 3537 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3538 m_new->m_pkthdr.csum_data = 0xffff; 3539 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3540 rxr->hn_csum_tcp++; 3541 else 3542 rxr->hn_csum_udp++; 3543 } 3544 3545 /* 3546 * XXX 3547 * As of this write (Oct 28th, 2016), host side will turn 3548 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3549 * the do_lro setting here is actually _not_ accurate. We 3550 * depend on the RSS hash type check to reset do_lro. 3551 */ 3552 if ((*(rxr->rsc.csum_info) & 3553 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3554 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3555 do_lro = 1; 3556 } else { 3557 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3558 if (l3proto == ETHERTYPE_IP) { 3559 if (l4proto == IPPROTO_TCP) { 3560 if (do_csum && 3561 (rxr->hn_trust_hcsum & 3562 HN_TRUST_HCSUM_TCP)) { 3563 rxr->hn_csum_trusted++; 3564 m_new->m_pkthdr.csum_flags |= 3565 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3566 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3567 m_new->m_pkthdr.csum_data = 0xffff; 3568 } 3569 do_lro = 1; 3570 } else if (l4proto == IPPROTO_UDP) { 3571 if (do_csum && 3572 (rxr->hn_trust_hcsum & 3573 HN_TRUST_HCSUM_UDP)) { 3574 rxr->hn_csum_trusted++; 3575 m_new->m_pkthdr.csum_flags |= 3576 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3577 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3578 m_new->m_pkthdr.csum_data = 0xffff; 3579 } 3580 } else if (l4proto != IPPROTO_DONE && do_csum && 3581 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3582 rxr->hn_csum_trusted++; 3583 m_new->m_pkthdr.csum_flags |= 3584 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3585 } 3586 } 3587 } 3588 3589 if (rxr->rsc.vlan_info != NULL) { 3590 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3591 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3592 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3593 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3594 m_new->m_flags |= M_VLANTAG; 3595 } 3596 3597 /* 3598 * If VF is activated (tranparent/non-transparent mode does not 3599 * matter here). 3600 * 3601 * - Disable LRO 3602 * 3603 * hn(4) will only receive broadcast packets, multicast packets, 3604 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3605 * packet types. 
3606 * 3607 * For non-transparent, we definitely _cannot_ enable LRO at 3608 * all, since the LRO flush will use hn(4) as the receiving 3609 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3610 */ 3611 if (is_vf) 3612 do_lro = 0; 3613 3614 /* 3615 * If VF is activated (tranparent/non-transparent mode does not 3616 * matter here), do _not_ mess with unsupported hash types or 3617 * functions. 3618 */ 3619 if (rxr->rsc.hash_info != NULL) { 3620 rxr->hn_rss_pkts++; 3621 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3622 if (!is_vf) 3623 hash_type = M_HASHTYPE_OPAQUE_HASH; 3624 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3625 NDIS_HASH_FUNCTION_TOEPLITZ) { 3626 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3627 rxr->hn_mbuf_hash); 3628 3629 /* 3630 * NOTE: 3631 * do_lro is resetted, if the hash types are not TCP 3632 * related. See the comment in the above csum_flags 3633 * setup section. 3634 */ 3635 switch (type) { 3636 case NDIS_HASH_IPV4: 3637 hash_type = M_HASHTYPE_RSS_IPV4; 3638 do_lro = 0; 3639 break; 3640 3641 case NDIS_HASH_TCP_IPV4: 3642 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3643 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3644 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3645 3646 if (is_vf) 3647 def_htype = M_HASHTYPE_NONE; 3648 3649 /* 3650 * UDP 4-tuple hash is delivered as 3651 * TCP 4-tuple hash. 3652 */ 3653 if (l3proto == ETHERTYPE_MAX) { 3654 hn_rxpkt_proto(m_new, 3655 &l3proto, &l4proto); 3656 } 3657 if (l3proto == ETHERTYPE_IP) { 3658 if (l4proto == IPPROTO_UDP && 3659 (rxr->hn_mbuf_hash & 3660 NDIS_HASH_UDP_IPV4_X)) { 3661 hash_type = 3662 M_HASHTYPE_RSS_UDP_IPV4; 3663 do_lro = 0; 3664 } else if (l4proto != 3665 IPPROTO_TCP) { 3666 hash_type = def_htype; 3667 do_lro = 0; 3668 } 3669 } else { 3670 hash_type = def_htype; 3671 do_lro = 0; 3672 } 3673 } 3674 break; 3675 3676 case NDIS_HASH_IPV6: 3677 hash_type = M_HASHTYPE_RSS_IPV6; 3678 do_lro = 0; 3679 break; 3680 3681 case NDIS_HASH_IPV6_EX: 3682 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3683 do_lro = 0; 3684 break; 3685 3686 case NDIS_HASH_TCP_IPV6: 3687 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3688 break; 3689 3690 case NDIS_HASH_TCP_IPV6_EX: 3691 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3692 break; 3693 } 3694 } 3695 } else if (!is_vf) { 3696 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3697 hash_type = M_HASHTYPE_OPAQUE; 3698 } 3699 M_HASHTYPE_SET(m_new, hash_type); 3700 3701 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3702 if (hn_ifp != ifp) { 3703 const struct ether_header *eh; 3704 3705 /* 3706 * Non-transparent mode VF is activated. 3707 */ 3708 3709 /* 3710 * Allow tapping on hn(4). 3711 */ 3712 ETHER_BPF_MTAP(hn_ifp, m_new); 3713 3714 /* 3715 * Update hn(4)'s stats. 3716 */ 3717 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3718 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3719 /* Checked at the beginning of this function. */ 3720 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3721 eh = mtod(m_new, struct ether_header *); 3722 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3723 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3724 } 3725 rxr->hn_pkts++; 3726 3727 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3728 #if defined(INET) || defined(INET6) 3729 struct lro_ctrl *lro = &rxr->hn_lro; 3730 3731 if (lro->lro_cnt) { 3732 rxr->hn_lro_tried++; 3733 if (hn_lro_rx(lro, m_new) == 0) { 3734 /* DONE! 
*/ 3735 return 0; 3736 } 3737 } 3738 #endif 3739 } 3740 ifp->if_input(ifp, m_new); 3741 3742 return (0); 3743 } 3744 3745 static int 3746 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3747 { 3748 struct hn_softc *sc = ifp->if_softc; 3749 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3750 struct ifnet *vf_ifp; 3751 int mask, error = 0; 3752 struct ifrsskey *ifrk; 3753 struct ifrsshash *ifrh; 3754 uint32_t mtu; 3755 3756 switch (cmd) { 3757 case SIOCSIFMTU: 3758 if (ifr->ifr_mtu > HN_MTU_MAX) { 3759 error = EINVAL; 3760 break; 3761 } 3762 3763 HN_LOCK(sc); 3764 3765 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3766 HN_UNLOCK(sc); 3767 break; 3768 } 3769 3770 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3771 /* Can't change MTU */ 3772 HN_UNLOCK(sc); 3773 error = EOPNOTSUPP; 3774 break; 3775 } 3776 3777 if (ifp->if_mtu == ifr->ifr_mtu) { 3778 HN_UNLOCK(sc); 3779 break; 3780 } 3781 3782 if (hn_xpnt_vf_isready(sc)) { 3783 vf_ifp = sc->hn_vf_ifp; 3784 ifr_vf = *ifr; 3785 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3786 sizeof(ifr_vf.ifr_name)); 3787 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3788 (caddr_t)&ifr_vf); 3789 if (error) { 3790 HN_UNLOCK(sc); 3791 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3792 vf_ifp->if_xname, ifr->ifr_mtu, error); 3793 break; 3794 } 3795 } 3796 3797 /* 3798 * Suspend this interface before the synthetic parts 3799 * are ripped. 3800 */ 3801 hn_suspend(sc); 3802 3803 /* 3804 * Detach the synthetics parts, i.e. NVS and RNDIS. 3805 */ 3806 hn_synth_detach(sc); 3807 3808 /* 3809 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3810 * with the new MTU setting. 3811 */ 3812 error = hn_synth_attach(sc, ifr->ifr_mtu); 3813 if (error) { 3814 HN_UNLOCK(sc); 3815 break; 3816 } 3817 3818 error = hn_rndis_get_mtu(sc, &mtu); 3819 if (error) 3820 mtu = ifr->ifr_mtu; 3821 else if (bootverbose) 3822 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3823 3824 /* 3825 * Commit the requested MTU, after the synthetic parts 3826 * have been successfully attached. 3827 */ 3828 if (mtu >= ifr->ifr_mtu) { 3829 mtu = ifr->ifr_mtu; 3830 } else { 3831 if_printf(ifp, "fixup mtu %d -> %u\n", 3832 ifr->ifr_mtu, mtu); 3833 } 3834 ifp->if_mtu = mtu; 3835 3836 /* 3837 * Synthetic parts' reattach may change the chimney 3838 * sending size; update it. 3839 */ 3840 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3841 hn_set_chim_size(sc, sc->hn_chim_szmax); 3842 3843 /* 3844 * Make sure that various parameters based on MTU are 3845 * still valid, after the MTU change. 3846 */ 3847 hn_mtu_change_fixup(sc); 3848 3849 /* 3850 * All done! Resume the interface now. 3851 */ 3852 hn_resume(sc); 3853 3854 if ((sc->hn_flags & HN_FLAG_RXVF) || 3855 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3856 /* 3857 * Since we have reattached the NVS part, 3858 * change the datapath to VF again; in case 3859 * that it is lost, after the NVS was detached. 3860 */ 3861 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3862 } 3863 3864 HN_UNLOCK(sc); 3865 break; 3866 3867 case SIOCSIFFLAGS: 3868 HN_LOCK(sc); 3869 3870 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3871 HN_UNLOCK(sc); 3872 break; 3873 } 3874 3875 if (hn_xpnt_vf_isready(sc)) 3876 hn_xpnt_vf_saveifflags(sc); 3877 3878 if (ifp->if_flags & IFF_UP) { 3879 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3880 /* 3881 * Caller meight hold mutex, e.g. 3882 * bpf; use busy-wait for the RNDIS 3883 * reply. 
3884 */ 3885 HN_NO_SLEEPING(sc); 3886 hn_rxfilter_config(sc); 3887 HN_SLEEPING_OK(sc); 3888 3889 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3890 error = hn_xpnt_vf_iocsetflags(sc); 3891 } else { 3892 hn_init_locked(sc); 3893 } 3894 } else { 3895 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3896 hn_stop(sc, false); 3897 } 3898 sc->hn_if_flags = ifp->if_flags; 3899 3900 HN_UNLOCK(sc); 3901 break; 3902 3903 case SIOCSIFCAP: 3904 HN_LOCK(sc); 3905 3906 if (hn_xpnt_vf_isready(sc)) { 3907 ifr_vf = *ifr; 3908 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3909 sizeof(ifr_vf.ifr_name)); 3910 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3911 HN_UNLOCK(sc); 3912 break; 3913 } 3914 3915 /* 3916 * Fix up requested capabilities w/ supported capabilities, 3917 * since the supported capabilities could have been changed. 3918 */ 3919 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3920 ifp->if_capenable; 3921 3922 if (mask & IFCAP_TXCSUM) { 3923 ifp->if_capenable ^= IFCAP_TXCSUM; 3924 if (ifp->if_capenable & IFCAP_TXCSUM) 3925 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3926 else 3927 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3928 } 3929 if (mask & IFCAP_TXCSUM_IPV6) { 3930 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3931 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3932 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3933 else 3934 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3935 } 3936 3937 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3938 if (mask & IFCAP_RXCSUM) 3939 ifp->if_capenable ^= IFCAP_RXCSUM; 3940 #ifdef foo 3941 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3942 if (mask & IFCAP_RXCSUM_IPV6) 3943 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3944 #endif 3945 3946 if (mask & IFCAP_LRO) 3947 ifp->if_capenable ^= IFCAP_LRO; 3948 3949 if (mask & IFCAP_TSO4) { 3950 ifp->if_capenable ^= IFCAP_TSO4; 3951 if (ifp->if_capenable & IFCAP_TSO4) 3952 ifp->if_hwassist |= CSUM_IP_TSO; 3953 else 3954 ifp->if_hwassist &= ~CSUM_IP_TSO; 3955 } 3956 if (mask & IFCAP_TSO6) { 3957 ifp->if_capenable ^= IFCAP_TSO6; 3958 if (ifp->if_capenable & IFCAP_TSO6) 3959 ifp->if_hwassist |= CSUM_IP6_TSO; 3960 else 3961 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3962 } 3963 3964 HN_UNLOCK(sc); 3965 break; 3966 3967 case SIOCADDMULTI: 3968 case SIOCDELMULTI: 3969 HN_LOCK(sc); 3970 3971 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3972 HN_UNLOCK(sc); 3973 break; 3974 } 3975 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3976 /* 3977 * Multicast uses mutex; use busy-wait for 3978 * the RNDIS reply. 3979 */ 3980 HN_NO_SLEEPING(sc); 3981 hn_rxfilter_config(sc); 3982 HN_SLEEPING_OK(sc); 3983 } 3984 3985 /* XXX vlan(4) style mcast addr maintenance */ 3986 if (hn_xpnt_vf_isready(sc)) { 3987 int old_if_flags; 3988 3989 old_if_flags = sc->hn_vf_ifp->if_flags; 3990 hn_xpnt_vf_saveifflags(sc); 3991 3992 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3993 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3994 IFF_ALLMULTI)) 3995 error = hn_xpnt_vf_iocsetflags(sc); 3996 } 3997 3998 HN_UNLOCK(sc); 3999 break; 4000 4001 case SIOCSIFMEDIA: 4002 case SIOCGIFMEDIA: 4003 HN_LOCK(sc); 4004 if (hn_xpnt_vf_isready(sc)) { 4005 /* 4006 * SIOCGIFMEDIA expects ifmediareq, so don't 4007 * create and pass ifr_vf to the VF here; just 4008 * replace the ifr_name. 4009 */ 4010 vf_ifp = sc->hn_vf_ifp; 4011 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4012 sizeof(ifr->ifr_name)); 4013 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4014 /* Restore the ifr_name. 
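 * It was pointed at the VF's name above so the VF's ioctl path
 * would accept the request; put hn(4)'s own name back before
 * returning to the caller.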
*/ 4015 strlcpy(ifr->ifr_name, ifp->if_xname, 4016 sizeof(ifr->ifr_name)); 4017 HN_UNLOCK(sc); 4018 break; 4019 } 4020 HN_UNLOCK(sc); 4021 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4022 break; 4023 4024 case SIOCGIFRSSHASH: 4025 ifrh = (struct ifrsshash *)data; 4026 HN_LOCK(sc); 4027 if (sc->hn_rx_ring_inuse == 1) { 4028 HN_UNLOCK(sc); 4029 ifrh->ifrh_func = RSS_FUNC_NONE; 4030 ifrh->ifrh_types = 0; 4031 break; 4032 } 4033 4034 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4035 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4036 else 4037 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4038 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4039 HN_UNLOCK(sc); 4040 break; 4041 4042 case SIOCGIFRSSKEY: 4043 ifrk = (struct ifrsskey *)data; 4044 HN_LOCK(sc); 4045 if (sc->hn_rx_ring_inuse == 1) { 4046 HN_UNLOCK(sc); 4047 ifrk->ifrk_func = RSS_FUNC_NONE; 4048 ifrk->ifrk_keylen = 0; 4049 break; 4050 } 4051 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4052 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4053 else 4054 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4055 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4056 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4057 NDIS_HASH_KEYSIZE_TOEPLITZ); 4058 HN_UNLOCK(sc); 4059 break; 4060 4061 default: 4062 error = ether_ioctl(ifp, cmd, data); 4063 break; 4064 } 4065 return (error); 4066 } 4067 4068 static void 4069 hn_stop(struct hn_softc *sc, bool detaching) 4070 { 4071 struct ifnet *ifp = sc->hn_ifp; 4072 int i; 4073 4074 HN_LOCK_ASSERT(sc); 4075 4076 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4077 ("synthetic parts were not attached")); 4078 4079 /* Clear RUNNING bit ASAP. */ 4080 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4081 4082 /* Disable polling. */ 4083 hn_polling(sc, 0); 4084 4085 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4086 KASSERT(sc->hn_vf_ifp != NULL, 4087 ("%s: VF is not attached", ifp->if_xname)); 4088 4089 /* Mark transparent mode VF as disabled. */ 4090 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4091 4092 /* 4093 * NOTE: 4094 * Datapath setting must happen _before_ bringing 4095 * the VF down. 4096 */ 4097 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4098 4099 /* 4100 * Bring the VF down. 4101 */ 4102 hn_xpnt_vf_saveifflags(sc); 4103 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4104 hn_xpnt_vf_iocsetflags(sc); 4105 } 4106 4107 /* Suspend data transfers. */ 4108 hn_suspend_data(sc); 4109 4110 /* Clear OACTIVE bit. */ 4111 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4112 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4113 sc->hn_tx_ring[i].hn_oactive = 0; 4114 4115 /* 4116 * If the non-transparent mode VF is active, make sure 4117 * that the RX filter still allows packet reception. 4118 */ 4119 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4120 hn_rxfilter_config(sc); 4121 } 4122 4123 static void 4124 hn_init_locked(struct hn_softc *sc) 4125 { 4126 struct ifnet *ifp = sc->hn_ifp; 4127 int i; 4128 4129 HN_LOCK_ASSERT(sc); 4130 4131 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4132 return; 4133 4134 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4135 return; 4136 4137 /* Configure RX filter */ 4138 hn_rxfilter_config(sc); 4139 4140 /* Clear OACTIVE bit. */ 4141 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4142 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4143 sc->hn_tx_ring[i].hn_oactive = 0; 4144 4145 /* Clear TX 'suspended' bit. */ 4146 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4147 4148 if (hn_xpnt_vf_isready(sc)) { 4149 /* Initialize transparent VF. 
*/ 4150 hn_xpnt_vf_init(sc); 4151 } 4152 4153 /* Everything is ready; unleash! */ 4154 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4155 4156 /* Re-enable polling if requested. */ 4157 if (sc->hn_pollhz > 0) 4158 hn_polling(sc, sc->hn_pollhz); 4159 } 4160 4161 static void 4162 hn_init(void *xsc) 4163 { 4164 struct hn_softc *sc = xsc; 4165 4166 HN_LOCK(sc); 4167 hn_init_locked(sc); 4168 HN_UNLOCK(sc); 4169 } 4170 4171 #if __FreeBSD_version >= 1100099 4172 4173 static int 4174 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4175 { 4176 struct hn_softc *sc = arg1; 4177 unsigned int lenlim; 4178 int error; 4179 4180 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4181 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4182 if (error || req->newptr == NULL) 4183 return error; 4184 4185 HN_LOCK(sc); 4186 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4187 lenlim > TCP_LRO_LENGTH_MAX) { 4188 HN_UNLOCK(sc); 4189 return EINVAL; 4190 } 4191 hn_set_lro_lenlim(sc, lenlim); 4192 HN_UNLOCK(sc); 4193 4194 return 0; 4195 } 4196 4197 static int 4198 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4199 { 4200 struct hn_softc *sc = arg1; 4201 int ackcnt, error, i; 4202 4203 /* 4204 * lro_ackcnt_lim is append count limit, 4205 * +1 to turn it into aggregation limit. 4206 */ 4207 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4208 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4209 if (error || req->newptr == NULL) 4210 return error; 4211 4212 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4213 return EINVAL; 4214 4215 /* 4216 * Convert aggregation limit back to append 4217 * count limit. 4218 */ 4219 --ackcnt; 4220 HN_LOCK(sc); 4221 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4222 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4223 HN_UNLOCK(sc); 4224 return 0; 4225 } 4226 4227 #endif 4228 4229 static int 4230 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4231 { 4232 struct hn_softc *sc = arg1; 4233 int hcsum = arg2; 4234 int on, error, i; 4235 4236 on = 0; 4237 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4238 on = 1; 4239 4240 error = sysctl_handle_int(oidp, &on, 0, req); 4241 if (error || req->newptr == NULL) 4242 return error; 4243 4244 HN_LOCK(sc); 4245 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4246 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4247 4248 if (on) 4249 rxr->hn_trust_hcsum |= hcsum; 4250 else 4251 rxr->hn_trust_hcsum &= ~hcsum; 4252 } 4253 HN_UNLOCK(sc); 4254 return 0; 4255 } 4256 4257 static int 4258 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4259 { 4260 struct hn_softc *sc = arg1; 4261 int chim_size, error; 4262 4263 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4264 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4265 if (error || req->newptr == NULL) 4266 return error; 4267 4268 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4269 return EINVAL; 4270 4271 HN_LOCK(sc); 4272 hn_set_chim_size(sc, chim_size); 4273 HN_UNLOCK(sc); 4274 return 0; 4275 } 4276 4277 #if __FreeBSD_version < 1100095 4278 static int 4279 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4280 { 4281 struct hn_softc *sc = arg1; 4282 int ofs = arg2, i, error; 4283 struct hn_rx_ring *rxr; 4284 uint64_t stat; 4285 4286 stat = 0; 4287 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4288 rxr = &sc->hn_rx_ring[i]; 4289 stat += *((int *)((uint8_t *)rxr + ofs)); 4290 } 4291 4292 error = sysctl_handle_64(oidp, &stat, 0, req); 4293 if (error || req->newptr == NULL) 4294 return error; 4295 4296 /* Zero out this stat. 
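 * Reading the sysctl node aggregates this counter across all RX
 * rings; any write clears it on every ring, e.g.
 * "sysctl dev.hn.0.lro_queued=0" (the node names are attached in
 * hn_create_rx_data(); hn0 is used here only as an example).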
*/ 4297 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4298 rxr = &sc->hn_rx_ring[i]; 4299 *((int *)((uint8_t *)rxr + ofs)) = 0; 4300 } 4301 return 0; 4302 } 4303 #else 4304 static int 4305 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4306 { 4307 struct hn_softc *sc = arg1; 4308 int ofs = arg2, i, error; 4309 struct hn_rx_ring *rxr; 4310 uint64_t stat; 4311 4312 stat = 0; 4313 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4314 rxr = &sc->hn_rx_ring[i]; 4315 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4316 } 4317 4318 error = sysctl_handle_64(oidp, &stat, 0, req); 4319 if (error || req->newptr == NULL) 4320 return error; 4321 4322 /* Zero out this stat. */ 4323 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4324 rxr = &sc->hn_rx_ring[i]; 4325 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4326 } 4327 return 0; 4328 } 4329 4330 #endif 4331 4332 static int 4333 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4334 { 4335 struct hn_softc *sc = arg1; 4336 int ofs = arg2, i, error; 4337 struct hn_rx_ring *rxr; 4338 u_long stat; 4339 4340 stat = 0; 4341 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4342 rxr = &sc->hn_rx_ring[i]; 4343 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4344 } 4345 4346 error = sysctl_handle_long(oidp, &stat, 0, req); 4347 if (error || req->newptr == NULL) 4348 return error; 4349 4350 /* Zero out this stat. */ 4351 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4352 rxr = &sc->hn_rx_ring[i]; 4353 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4354 } 4355 return 0; 4356 } 4357 4358 static int 4359 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4360 { 4361 struct hn_softc *sc = arg1; 4362 int ofs = arg2, i, error; 4363 struct hn_tx_ring *txr; 4364 u_long stat; 4365 4366 stat = 0; 4367 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4368 txr = &sc->hn_tx_ring[i]; 4369 stat += *((u_long *)((uint8_t *)txr + ofs)); 4370 } 4371 4372 error = sysctl_handle_long(oidp, &stat, 0, req); 4373 if (error || req->newptr == NULL) 4374 return error; 4375 4376 /* Zero out this stat. 
*/ 4377 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4378 txr = &sc->hn_tx_ring[i]; 4379 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4380 } 4381 return 0; 4382 } 4383 4384 static int 4385 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4386 { 4387 struct hn_softc *sc = arg1; 4388 int ofs = arg2, i, error, conf; 4389 struct hn_tx_ring *txr; 4390 4391 txr = &sc->hn_tx_ring[0]; 4392 conf = *((int *)((uint8_t *)txr + ofs)); 4393 4394 error = sysctl_handle_int(oidp, &conf, 0, req); 4395 if (error || req->newptr == NULL) 4396 return error; 4397 4398 HN_LOCK(sc); 4399 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4400 txr = &sc->hn_tx_ring[i]; 4401 *((int *)((uint8_t *)txr + ofs)) = conf; 4402 } 4403 HN_UNLOCK(sc); 4404 4405 return 0; 4406 } 4407 4408 static int 4409 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4410 { 4411 struct hn_softc *sc = arg1; 4412 int error, size; 4413 4414 size = sc->hn_agg_size; 4415 error = sysctl_handle_int(oidp, &size, 0, req); 4416 if (error || req->newptr == NULL) 4417 return (error); 4418 4419 HN_LOCK(sc); 4420 sc->hn_agg_size = size; 4421 hn_set_txagg(sc); 4422 HN_UNLOCK(sc); 4423 4424 return (0); 4425 } 4426 4427 static int 4428 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4429 { 4430 struct hn_softc *sc = arg1; 4431 int error, pkts; 4432 4433 pkts = sc->hn_agg_pkts; 4434 error = sysctl_handle_int(oidp, &pkts, 0, req); 4435 if (error || req->newptr == NULL) 4436 return (error); 4437 4438 HN_LOCK(sc); 4439 sc->hn_agg_pkts = pkts; 4440 hn_set_txagg(sc); 4441 HN_UNLOCK(sc); 4442 4443 return (0); 4444 } 4445 4446 static int 4447 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4448 { 4449 struct hn_softc *sc = arg1; 4450 int pkts; 4451 4452 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4453 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4454 } 4455 4456 static int 4457 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4458 { 4459 struct hn_softc *sc = arg1; 4460 int align; 4461 4462 align = sc->hn_tx_ring[0].hn_agg_align; 4463 return (sysctl_handle_int(oidp, &align, 0, req)); 4464 } 4465 4466 static void 4467 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4468 { 4469 if (pollhz == 0) 4470 vmbus_chan_poll_disable(chan); 4471 else 4472 vmbus_chan_poll_enable(chan, pollhz); 4473 } 4474 4475 static void 4476 hn_polling(struct hn_softc *sc, u_int pollhz) 4477 { 4478 int nsubch = sc->hn_rx_ring_inuse - 1; 4479 4480 HN_LOCK_ASSERT(sc); 4481 4482 if (nsubch > 0) { 4483 struct vmbus_channel **subch; 4484 int i; 4485 4486 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4487 for (i = 0; i < nsubch; ++i) 4488 hn_chan_polling(subch[i], pollhz); 4489 vmbus_subchan_rel(subch, nsubch); 4490 } 4491 hn_chan_polling(sc->hn_prichan, pollhz); 4492 } 4493 4494 static int 4495 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4496 { 4497 struct hn_softc *sc = arg1; 4498 int pollhz, error; 4499 4500 pollhz = sc->hn_pollhz; 4501 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4502 if (error || req->newptr == NULL) 4503 return (error); 4504 4505 if (pollhz != 0 && 4506 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4507 return (EINVAL); 4508 4509 HN_LOCK(sc); 4510 if (sc->hn_pollhz != pollhz) { 4511 sc->hn_pollhz = pollhz; 4512 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4513 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4514 hn_polling(sc, sc->hn_pollhz); 4515 } 4516 HN_UNLOCK(sc); 4517 4518 return (0); 4519 } 4520 4521 static int 4522 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4523 { 4524 struct hn_softc *sc = arg1; 4525 char verstr[16]; 4526 4527 snprintf(verstr, sizeof(verstr), "%u.%u", 4528 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4529 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4530 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4531 } 4532 4533 static int 4534 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4535 { 4536 struct hn_softc *sc = arg1; 4537 char caps_str[128]; 4538 uint32_t caps; 4539 4540 HN_LOCK(sc); 4541 caps = sc->hn_caps; 4542 HN_UNLOCK(sc); 4543 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4544 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4545 } 4546 4547 static int 4548 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4549 { 4550 struct hn_softc *sc = arg1; 4551 char assist_str[128]; 4552 uint32_t hwassist; 4553 4554 HN_LOCK(sc); 4555 hwassist = sc->hn_ifp->if_hwassist; 4556 HN_UNLOCK(sc); 4557 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4558 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4559 } 4560 4561 static int 4562 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4563 { 4564 struct hn_softc *sc = arg1; 4565 char filter_str[128]; 4566 uint32_t filter; 4567 4568 HN_LOCK(sc); 4569 filter = sc->hn_rx_filter; 4570 HN_UNLOCK(sc); 4571 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4572 NDIS_PACKET_TYPES); 4573 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4574 } 4575 4576 static int 4577 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4578 { 4579 struct hn_softc *sc = arg1; 4580 uint32_t mtu; 4581 int error; 4582 HN_LOCK(sc); 4583 error = hn_rndis_get_mtu(sc, &mtu); 4584 if (error) { 4585 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4586 goto back; 4587 } 4588 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4589 if (error || req->newptr == NULL) 4590 goto back; 4591 4592 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4593 if (error) 4594 goto back; 4595 error = hn_rndis_reconf_offload(sc, mtu); 4596 back: 4597 HN_UNLOCK(sc); 4598 return (error); 4599 } 4600 #ifndef RSS 4601 4602 static int 4603 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4604 { 4605 struct hn_softc *sc = arg1; 4606 int error; 4607 4608 HN_LOCK(sc); 4609 4610 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4611 if (error || req->newptr == NULL) 4612 goto back; 4613 4614 if ((sc->hn_flags & HN_FLAG_RXVF) || 4615 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4616 /* 4617 * RSS key is synchronized w/ VF's, don't allow users 4618 * to change it. 4619 */ 4620 error = EBUSY; 4621 goto back; 4622 } 4623 4624 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4625 if (error) 4626 goto back; 4627 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4628 4629 if (sc->hn_rx_ring_inuse > 1) { 4630 error = hn_rss_reconfig(sc); 4631 } else { 4632 /* Not RSS capable, at least for now; just save the RSS key. */ 4633 error = 0; 4634 } 4635 back: 4636 HN_UNLOCK(sc); 4637 return (error); 4638 } 4639 4640 static int 4641 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4642 { 4643 struct hn_softc *sc = arg1; 4644 int error; 4645 4646 HN_LOCK(sc); 4647 4648 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4649 if (error || req->newptr == NULL) 4650 goto back; 4651 4652 /* 4653 * Don't allow RSS indirect table change, if this interface is not 4654 * RSS capable currently. 
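 * With a single RX ring no RSS indirection table is in effect,
 * so a table written here could never be applied.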
4655 */ 4656 if (sc->hn_rx_ring_inuse == 1) { 4657 error = EOPNOTSUPP; 4658 goto back; 4659 } 4660 4661 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4662 if (error) 4663 goto back; 4664 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4665 4666 hn_rss_ind_fixup(sc); 4667 error = hn_rss_reconfig(sc); 4668 back: 4669 HN_UNLOCK(sc); 4670 return (error); 4671 } 4672 4673 #endif /* !RSS */ 4674 4675 static int 4676 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4677 { 4678 struct hn_softc *sc = arg1; 4679 char hash_str[128]; 4680 uint32_t hash; 4681 4682 HN_LOCK(sc); 4683 hash = sc->hn_rss_hash; 4684 HN_UNLOCK(sc); 4685 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4686 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4687 } 4688 4689 static int 4690 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4691 { 4692 struct hn_softc *sc = arg1; 4693 char hash_str[128]; 4694 uint32_t hash; 4695 4696 HN_LOCK(sc); 4697 hash = sc->hn_rss_hcap; 4698 HN_UNLOCK(sc); 4699 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4700 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4701 } 4702 4703 static int 4704 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4705 { 4706 struct hn_softc *sc = arg1; 4707 char hash_str[128]; 4708 uint32_t hash; 4709 4710 HN_LOCK(sc); 4711 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4712 HN_UNLOCK(sc); 4713 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4714 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4715 } 4716 4717 static int 4718 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4719 { 4720 struct hn_softc *sc = arg1; 4721 char vf_name[IFNAMSIZ + 1]; 4722 struct ifnet *vf_ifp; 4723 4724 HN_LOCK(sc); 4725 vf_name[0] = '\0'; 4726 vf_ifp = sc->hn_vf_ifp; 4727 if (vf_ifp != NULL) 4728 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4729 HN_UNLOCK(sc); 4730 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4731 } 4732 4733 static int 4734 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4735 { 4736 struct hn_softc *sc = arg1; 4737 char vf_name[IFNAMSIZ + 1]; 4738 struct ifnet *vf_ifp; 4739 4740 HN_LOCK(sc); 4741 vf_name[0] = '\0'; 4742 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4743 if (vf_ifp != NULL) 4744 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4745 HN_UNLOCK(sc); 4746 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4747 } 4748 4749 static int 4750 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4751 { 4752 struct rm_priotracker pt; 4753 struct sbuf *sb; 4754 int error, i; 4755 bool first; 4756 4757 error = sysctl_wire_old_buffer(req, 0); 4758 if (error != 0) 4759 return (error); 4760 4761 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4762 if (sb == NULL) 4763 return (ENOMEM); 4764 4765 rm_rlock(&hn_vfmap_lock, &pt); 4766 4767 first = true; 4768 for (i = 0; i < hn_vfmap_size; ++i) { 4769 struct epoch_tracker et; 4770 struct ifnet *ifp; 4771 4772 if (hn_vfmap[i] == NULL) 4773 continue; 4774 4775 NET_EPOCH_ENTER(et); 4776 ifp = ifnet_byindex(i); 4777 if (ifp != NULL) { 4778 if (first) 4779 sbuf_printf(sb, "%s", ifp->if_xname); 4780 else 4781 sbuf_printf(sb, " %s", ifp->if_xname); 4782 first = false; 4783 } 4784 NET_EPOCH_EXIT(et); 4785 } 4786 4787 rm_runlock(&hn_vfmap_lock, &pt); 4788 4789 error = sbuf_finish(sb); 4790 sbuf_delete(sb); 4791 return (error); 4792 } 4793 4794 static int 4795 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4796 { 4797 struct rm_priotracker pt; 4798 struct sbuf *sb; 4799 int error, i; 4800 bool first; 4801 4802 error = 
sysctl_wire_old_buffer(req, 0);
4803 if (error != 0)
4804 return (error);
4805
4806 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4807 if (sb == NULL)
4808 return (ENOMEM);
4809
4810 rm_rlock(&hn_vfmap_lock, &pt);
4811
4812 first = true;
4813 for (i = 0; i < hn_vfmap_size; ++i) {
4814 struct epoch_tracker et;
4815 struct ifnet *ifp, *hn_ifp;
4816
4817 hn_ifp = hn_vfmap[i];
4818 if (hn_ifp == NULL)
4819 continue;
4820
4821 NET_EPOCH_ENTER(et);
4822 ifp = ifnet_byindex(i);
4823 if (ifp != NULL) {
4824 if (first) {
4825 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4826 hn_ifp->if_xname);
4827 } else {
4828 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4829 hn_ifp->if_xname);
4830 }
4831 first = false;
4832 }
4833 NET_EPOCH_EXIT(et);
4834 }
4835
4836 rm_runlock(&hn_vfmap_lock, &pt);
4837
4838 error = sbuf_finish(sb);
4839 sbuf_delete(sb);
4840 return (error);
4841 }
4842
4843 static int
4844 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4845 {
4846 struct hn_softc *sc = arg1;
4847 int error, onoff = 0;
4848
4849 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4850 onoff = 1;
4851 error = sysctl_handle_int(oidp, &onoff, 0, req);
4852 if (error || req->newptr == NULL)
4853 return (error);
4854
4855 HN_LOCK(sc);
4856 /* NOTE: hn_vf_lock for hn_transmit() */
4857 rm_wlock(&sc->hn_vf_lock);
4858 if (onoff)
4859 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4860 else
4861 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4862 rm_wunlock(&sc->hn_vf_lock);
4863 HN_UNLOCK(sc);
4864
4865 return (0);
4866 }
4867
4868 static int
4869 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4870 {
4871 struct hn_softc *sc = arg1;
4872 int enabled = 0;
4873
4874 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4875 enabled = 1;
4876 return (sysctl_handle_int(oidp, &enabled, 0, req));
4877 }
4878
4879 static int
4880 hn_check_iplen(const struct mbuf *m, int hoff)
4881 {
4882 const struct ip *ip;
4883 int len, iphlen, iplen;
4884 const struct tcphdr *th;
4885 int thoff; /* TCP data offset */
4886
4887 len = hoff + sizeof(struct ip);
4888
4889 /* The packet must be at least the size of an IP header. */
4890 if (m->m_pkthdr.len < len)
4891 return IPPROTO_DONE;
4892
4893 /* The fixed IP header must reside completely in the first mbuf. */
4894 if (m->m_len < len)
4895 return IPPROTO_DONE;
4896
4897 ip = mtodo(m, hoff);
4898
4899 /* Bound check the packet's stated IP header length. */
4900 iphlen = ip->ip_hl << 2;
4901 if (iphlen < sizeof(struct ip)) /* minimum header length */
4902 return IPPROTO_DONE;
4903
4904 /* The full IP header must reside completely in the one mbuf. */
4905 if (m->m_len < hoff + iphlen)
4906 return IPPROTO_DONE;
4907
4908 iplen = ntohs(ip->ip_len);
4909
4910 /*
4911 * Check that the amount of data in the buffers is at
4912 * least as much as the IP header would have us expect.
4913 */
4914 if (m->m_pkthdr.len < hoff + iplen)
4915 return IPPROTO_DONE;
4916
4917 /*
4918 * Ignore IP fragments.
4919 */
4920 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4921 return IPPROTO_DONE;
4922
4923 /*
4924 * The TCP/IP or UDP/IP header must be entirely contained within
4925 * the first fragment of a packet.
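 * The checks below only look at the leading mbuf (m_len); if the
 * transport header does not fit there, IPPROTO_DONE is returned
 * and the packet is not treated as TCP or UDP.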
4926 */ 4927 switch (ip->ip_p) { 4928 case IPPROTO_TCP: 4929 if (iplen < iphlen + sizeof(struct tcphdr)) 4930 return IPPROTO_DONE; 4931 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4932 return IPPROTO_DONE; 4933 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4934 thoff = th->th_off << 2; 4935 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4936 return IPPROTO_DONE; 4937 if (m->m_len < hoff + iphlen + thoff) 4938 return IPPROTO_DONE; 4939 break; 4940 case IPPROTO_UDP: 4941 if (iplen < iphlen + sizeof(struct udphdr)) 4942 return IPPROTO_DONE; 4943 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4944 return IPPROTO_DONE; 4945 break; 4946 default: 4947 if (iplen < iphlen) 4948 return IPPROTO_DONE; 4949 break; 4950 } 4951 return ip->ip_p; 4952 } 4953 4954 static void 4955 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4956 { 4957 const struct ether_header *eh; 4958 uint16_t etype; 4959 int hoff; 4960 4961 hoff = sizeof(*eh); 4962 /* Checked at the beginning of this function. */ 4963 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4964 4965 eh = mtod(m_new, const struct ether_header *); 4966 etype = ntohs(eh->ether_type); 4967 if (etype == ETHERTYPE_VLAN) { 4968 const struct ether_vlan_header *evl; 4969 4970 hoff = sizeof(*evl); 4971 if (m_new->m_len < hoff) 4972 return; 4973 evl = mtod(m_new, const struct ether_vlan_header *); 4974 etype = ntohs(evl->evl_proto); 4975 } 4976 *l3proto = etype; 4977 4978 if (etype == ETHERTYPE_IP) 4979 *l4proto = hn_check_iplen(m_new, hoff); 4980 else 4981 *l4proto = IPPROTO_DONE; 4982 } 4983 4984 static int 4985 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4986 { 4987 struct sysctl_oid_list *child; 4988 struct sysctl_ctx_list *ctx; 4989 device_t dev = sc->hn_dev; 4990 #if defined(INET) || defined(INET6) 4991 #if __FreeBSD_version >= 1100095 4992 int lroent_cnt; 4993 #endif 4994 #endif 4995 int i; 4996 4997 /* 4998 * Create RXBUF for reception. 4999 * 5000 * NOTE: 5001 * - It is shared by all channels. 5002 * - A large enough buffer is allocated, certain version of NVSes 5003 * may further limit the usable space. 
5004 */ 5005 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5006 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 5007 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5008 if (sc->hn_rxbuf == NULL) { 5009 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 5010 return (ENOMEM); 5011 } 5012 5013 sc->hn_rx_ring_cnt = ring_cnt; 5014 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 5015 5016 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 5017 M_DEVBUF, M_WAITOK | M_ZERO); 5018 5019 #if defined(INET) || defined(INET6) 5020 #if __FreeBSD_version >= 1100095 5021 lroent_cnt = hn_lro_entry_count; 5022 if (lroent_cnt < TCP_LRO_ENTRIES) 5023 lroent_cnt = TCP_LRO_ENTRIES; 5024 if (bootverbose) 5025 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 5026 #endif 5027 #endif /* INET || INET6 */ 5028 5029 ctx = device_get_sysctl_ctx(dev); 5030 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 5031 5032 /* Create dev.hn.UNIT.rx sysctl tree */ 5033 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 5034 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5035 5036 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5037 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5038 5039 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5040 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5041 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5042 if (rxr->hn_br == NULL) { 5043 device_printf(dev, "allocate bufring failed\n"); 5044 return (ENOMEM); 5045 } 5046 5047 if (hn_trust_hosttcp) 5048 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5049 if (hn_trust_hostudp) 5050 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5051 if (hn_trust_hostip) 5052 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5053 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5054 rxr->hn_ifp = sc->hn_ifp; 5055 if (i < sc->hn_tx_ring_cnt) 5056 rxr->hn_txr = &sc->hn_tx_ring[i]; 5057 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5058 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5059 rxr->hn_rx_idx = i; 5060 rxr->hn_rxbuf = sc->hn_rxbuf; 5061 5062 /* 5063 * Initialize LRO. 
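 * On FreeBSD >= 1100095 tcp_lro_init_args() sizes the LRO entry
 * table and mbuf queue up front; older kernels fall back to
 * tcp_lro_init() with its built-in defaults.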
5064 */ 5065 #if defined(INET) || defined(INET6) 5066 #if __FreeBSD_version >= 1100095 5067 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5068 hn_lro_mbufq_depth); 5069 #else 5070 tcp_lro_init(&rxr->hn_lro); 5071 rxr->hn_lro.ifp = sc->hn_ifp; 5072 #endif 5073 #if __FreeBSD_version >= 1100099 5074 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5075 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5076 #endif 5077 #endif /* INET || INET6 */ 5078 5079 if (sc->hn_rx_sysctl_tree != NULL) { 5080 char name[16]; 5081 5082 /* 5083 * Create per RX ring sysctl tree: 5084 * dev.hn.UNIT.rx.RINGID 5085 */ 5086 snprintf(name, sizeof(name), "%d", i); 5087 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5088 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5089 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5090 5091 if (rxr->hn_rx_sysctl_tree != NULL) { 5092 SYSCTL_ADD_ULONG(ctx, 5093 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5094 OID_AUTO, "packets", 5095 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5096 "# of packets received"); 5097 SYSCTL_ADD_ULONG(ctx, 5098 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5099 OID_AUTO, "rss_pkts", 5100 CTLFLAG_RW | CTLFLAG_STATS, 5101 &rxr->hn_rss_pkts, 5102 "# of packets w/ RSS info received"); 5103 SYSCTL_ADD_ULONG(ctx, 5104 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5105 OID_AUTO, "rsc_pkts", 5106 CTLFLAG_RW | CTLFLAG_STATS, 5107 &rxr->hn_rsc_pkts, 5108 "# of RSC packets received"); 5109 SYSCTL_ADD_ULONG(ctx, 5110 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5111 OID_AUTO, "rsc_drop", 5112 CTLFLAG_RW | CTLFLAG_STATS, 5113 &rxr->hn_rsc_drop, 5114 "# of RSC fragments dropped"); 5115 SYSCTL_ADD_INT(ctx, 5116 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5117 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5118 &rxr->hn_pktbuf_len, 0, 5119 "Temporary channel packet buffer length"); 5120 } 5121 } 5122 } 5123 5124 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5125 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5126 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5127 #if __FreeBSD_version < 1100095 5128 hn_rx_stat_int_sysctl, 5129 #else 5130 hn_rx_stat_u64_sysctl, 5131 #endif 5132 "LU", "LRO queued"); 5133 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5134 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5135 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5136 #if __FreeBSD_version < 1100095 5137 hn_rx_stat_int_sysctl, 5138 #else 5139 hn_rx_stat_u64_sysctl, 5140 #endif 5141 "LU", "LRO flushed"); 5142 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5143 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5144 __offsetof(struct hn_rx_ring, hn_lro_tried), 5145 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5146 #if __FreeBSD_version >= 1100099 5147 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5148 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5149 hn_lro_lenlim_sysctl, "IU", 5150 "Max # of data bytes to be aggregated by LRO"); 5151 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5152 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5153 hn_lro_ackcnt_sysctl, "I", 5154 "Max # of ACKs to be aggregated by LRO"); 5155 #endif 5156 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5157 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5158 hn_trust_hcsum_sysctl, "I", 5159 "Trust tcp segment verification on host side, " 5160 "when csum info is missing"); 5161 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5162 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5163 
hn_trust_hcsum_sysctl, "I", 5164 "Trust udp datagram verification on host side, " 5165 "when csum info is missing"); 5166 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5167 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5168 hn_trust_hcsum_sysctl, "I", 5169 "Trust ip packet verification on host side, " 5170 "when csum info is missing"); 5171 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5172 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5173 __offsetof(struct hn_rx_ring, hn_csum_ip), 5174 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5175 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5176 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5177 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5178 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5179 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5180 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5181 __offsetof(struct hn_rx_ring, hn_csum_udp), 5182 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5183 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5184 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5185 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5186 hn_rx_stat_ulong_sysctl, "LU", 5187 "# of packets that we trust host's csum verification"); 5188 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5189 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5190 __offsetof(struct hn_rx_ring, hn_small_pkts), 5191 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5192 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5193 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5194 __offsetof(struct hn_rx_ring, hn_ack_failed), 5195 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5196 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5197 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5198 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5199 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5200 5201 return (0); 5202 } 5203 5204 static void 5205 hn_destroy_rx_data(struct hn_softc *sc) 5206 { 5207 int i; 5208 5209 if (sc->hn_rxbuf != NULL) { 5210 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5211 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5212 else 5213 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5214 sc->hn_rxbuf = NULL; 5215 } 5216 5217 if (sc->hn_rx_ring_cnt == 0) 5218 return; 5219 5220 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5221 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5222 5223 if (rxr->hn_br == NULL) 5224 continue; 5225 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5226 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5227 } else { 5228 device_printf(sc->hn_dev, 5229 "%dth channel bufring is referenced", i); 5230 } 5231 rxr->hn_br = NULL; 5232 5233 #if defined(INET) || defined(INET6) 5234 tcp_lro_free(&rxr->hn_lro); 5235 #endif 5236 free(rxr->hn_pktbuf, M_DEVBUF); 5237 } 5238 free(sc->hn_rx_ring, M_DEVBUF); 5239 sc->hn_rx_ring = NULL; 5240 5241 sc->hn_rx_ring_cnt = 0; 5242 sc->hn_rx_ring_inuse = 0; 5243 } 5244 5245 static int 5246 hn_tx_ring_create(struct hn_softc *sc, int id) 5247 { 5248 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5249 device_t dev = sc->hn_dev; 5250 bus_dma_tag_t parent_dtag; 5251 int error, i; 5252 5253 txr->hn_sc = sc; 5254 txr->hn_tx_idx = id; 5255 5256 #ifndef HN_USE_TXDESC_BUFRING 5257 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5258 #endif 5259 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 
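	/*
	 * Pre-allocate this ring's pool of TX descriptors; each
	 * descriptor carries its own RNDIS packet message buffer and
	 * DMA maps, which are set up further below.
	 */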
5260 5261 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5262 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5263 M_DEVBUF, M_WAITOK | M_ZERO); 5264 #ifndef HN_USE_TXDESC_BUFRING 5265 SLIST_INIT(&txr->hn_txlist); 5266 #else 5267 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5268 M_WAITOK, &txr->hn_tx_lock); 5269 #endif 5270 5271 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5272 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5273 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5274 } else { 5275 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5276 } 5277 5278 #ifdef HN_IFSTART_SUPPORT 5279 if (hn_use_if_start) { 5280 txr->hn_txeof = hn_start_txeof; 5281 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5282 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5283 } else 5284 #endif 5285 { 5286 int br_depth; 5287 5288 txr->hn_txeof = hn_xmit_txeof; 5289 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5290 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5291 5292 br_depth = hn_get_txswq_depth(txr); 5293 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5294 M_WAITOK, &txr->hn_tx_lock); 5295 } 5296 5297 txr->hn_direct_tx_size = hn_direct_tx_size; 5298 5299 /* 5300 * Always schedule transmission instead of trying to do direct 5301 * transmission. This one gives the best performance so far. 5302 */ 5303 txr->hn_sched_tx = 1; 5304 5305 parent_dtag = bus_get_dma_tag(dev); 5306 5307 /* DMA tag for RNDIS packet messages. */ 5308 error = bus_dma_tag_create(parent_dtag, /* parent */ 5309 HN_RNDIS_PKT_ALIGN, /* alignment */ 5310 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5311 BUS_SPACE_MAXADDR, /* lowaddr */ 5312 BUS_SPACE_MAXADDR, /* highaddr */ 5313 NULL, NULL, /* filter, filterarg */ 5314 HN_RNDIS_PKT_LEN, /* maxsize */ 5315 1, /* nsegments */ 5316 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5317 0, /* flags */ 5318 NULL, /* lockfunc */ 5319 NULL, /* lockfuncarg */ 5320 &txr->hn_tx_rndis_dtag); 5321 if (error) { 5322 device_printf(dev, "failed to create rndis dmatag\n"); 5323 return error; 5324 } 5325 5326 /* DMA tag for data. */ 5327 error = bus_dma_tag_create(parent_dtag, /* parent */ 5328 1, /* alignment */ 5329 HN_TX_DATA_BOUNDARY, /* boundary */ 5330 BUS_SPACE_MAXADDR, /* lowaddr */ 5331 BUS_SPACE_MAXADDR, /* highaddr */ 5332 NULL, NULL, /* filter, filterarg */ 5333 HN_TX_DATA_MAXSIZE, /* maxsize */ 5334 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5335 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5336 0, /* flags */ 5337 NULL, /* lockfunc */ 5338 NULL, /* lockfuncarg */ 5339 &txr->hn_tx_data_dtag); 5340 if (error) { 5341 device_printf(dev, "failed to create data dmatag\n"); 5342 return error; 5343 } 5344 5345 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5346 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5347 5348 txd->txr = txr; 5349 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5350 STAILQ_INIT(&txd->agg_list); 5351 5352 /* 5353 * Allocate and load RNDIS packet message. 
5354 */ 5355 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5356 (void **)&txd->rndis_pkt, 5357 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5358 &txd->rndis_pkt_dmap); 5359 if (error) { 5360 device_printf(dev, 5361 "failed to allocate rndis_packet_msg, %d\n", i); 5362 return error; 5363 } 5364 5365 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5366 txd->rndis_pkt_dmap, 5367 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5368 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5369 BUS_DMA_NOWAIT); 5370 if (error) { 5371 device_printf(dev, 5372 "failed to load rndis_packet_msg, %d\n", i); 5373 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5374 txd->rndis_pkt, txd->rndis_pkt_dmap); 5375 return error; 5376 } 5377 5378 /* DMA map for TX data. */ 5379 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5380 &txd->data_dmap); 5381 if (error) { 5382 device_printf(dev, 5383 "failed to allocate tx data dmamap\n"); 5384 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5385 txd->rndis_pkt_dmap); 5386 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5387 txd->rndis_pkt, txd->rndis_pkt_dmap); 5388 return error; 5389 } 5390 5391 /* All set, put it to list */ 5392 txd->flags |= HN_TXD_FLAG_ONLIST; 5393 #ifndef HN_USE_TXDESC_BUFRING 5394 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5395 #else 5396 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5397 #endif 5398 } 5399 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5400 5401 if (sc->hn_tx_sysctl_tree != NULL) { 5402 struct sysctl_oid_list *child; 5403 struct sysctl_ctx_list *ctx; 5404 char name[16]; 5405 5406 /* 5407 * Create per TX ring sysctl tree: 5408 * dev.hn.UNIT.tx.RINGID 5409 */ 5410 ctx = device_get_sysctl_ctx(dev); 5411 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5412 5413 snprintf(name, sizeof(name), "%d", id); 5414 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5415 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5416 5417 if (txr->hn_tx_sysctl_tree != NULL) { 5418 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5419 5420 #ifdef HN_DEBUG 5421 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5422 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5423 "# of available TX descs"); 5424 #endif 5425 #ifdef HN_IFSTART_SUPPORT 5426 if (!hn_use_if_start) 5427 #endif 5428 { 5429 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5430 CTLFLAG_RD, &txr->hn_oactive, 0, 5431 "over active"); 5432 } 5433 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5434 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5435 "# of packets transmitted"); 5436 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5437 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5438 "# of sends"); 5439 } 5440 } 5441 5442 return 0; 5443 } 5444 5445 static void 5446 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5447 { 5448 struct hn_tx_ring *txr = txd->txr; 5449 5450 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5451 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5452 5453 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5454 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5455 txd->rndis_pkt_dmap); 5456 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5457 } 5458 5459 static void 5460 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5461 { 5462 5463 KASSERT(txd->refs == 0 || txd->refs == 1, 5464 ("invalid txd refs %d", txd->refs)); 5465 5466 /* Aggregated txds will be freed by their aggregating txd. 
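 * so any txd still marked HN_TXD_FLAG_ONAGG is skipped here and
 * released when its aggregating txd is put.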
*/ 5467 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5468 int freed __diagused; 5469 5470 freed = hn_txdesc_put(txr, txd); 5471 KASSERT(freed, ("can't free txdesc")); 5472 } 5473 } 5474 5475 static void 5476 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5477 { 5478 int i; 5479 5480 if (txr->hn_txdesc == NULL) 5481 return; 5482 5483 /* 5484 * NOTE: 5485 * Because the freeing of aggregated txds will be deferred 5486 * to the aggregating txd, two passes are used here: 5487 * - The first pass GCes any pending txds. This GC is necessary, 5488 * since if the channels are revoked, hypervisor will not 5489 * deliver send-done for all pending txds. 5490 * - The second pass frees the busdma stuffs, i.e. after all txds 5491 * were freed. 5492 */ 5493 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5494 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5495 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5496 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5497 5498 if (txr->hn_tx_data_dtag != NULL) 5499 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5500 if (txr->hn_tx_rndis_dtag != NULL) 5501 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5502 5503 #ifdef HN_USE_TXDESC_BUFRING 5504 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5505 #endif 5506 5507 free(txr->hn_txdesc, M_DEVBUF); 5508 txr->hn_txdesc = NULL; 5509 5510 if (txr->hn_mbuf_br != NULL) 5511 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5512 5513 #ifndef HN_USE_TXDESC_BUFRING 5514 mtx_destroy(&txr->hn_txlist_spin); 5515 #endif 5516 mtx_destroy(&txr->hn_tx_lock); 5517 } 5518 5519 static int 5520 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5521 { 5522 struct sysctl_oid_list *child; 5523 struct sysctl_ctx_list *ctx; 5524 int i; 5525 5526 /* 5527 * Create TXBUF for chimney sending. 5528 * 5529 * NOTE: It is shared by all channels. 
5530 */ 5531 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5532 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5533 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5534 if (sc->hn_chim == NULL) { 5535 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5536 return (ENOMEM); 5537 } 5538 5539 sc->hn_tx_ring_cnt = ring_cnt; 5540 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5541 5542 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5543 M_DEVBUF, M_WAITOK | M_ZERO); 5544 5545 ctx = device_get_sysctl_ctx(sc->hn_dev); 5546 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5547 5548 /* Create dev.hn.UNIT.tx sysctl tree */ 5549 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5550 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5551 5552 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5553 int error; 5554 5555 error = hn_tx_ring_create(sc, i); 5556 if (error) 5557 return error; 5558 } 5559 5560 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5561 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5562 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5563 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5564 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5565 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5566 __offsetof(struct hn_tx_ring, hn_send_failed), 5567 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5568 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5569 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5570 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5571 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5572 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5573 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5574 __offsetof(struct hn_tx_ring, hn_flush_failed), 5575 hn_tx_stat_ulong_sysctl, "LU", 5576 "# of packet transmission aggregation flush failure"); 5577 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5578 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5579 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5580 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5581 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5582 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5583 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5584 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5585 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5586 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5587 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5588 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5589 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5590 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5591 "# of total TX descs"); 5592 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5593 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5594 "Chimney send packet size upper boundary"); 5595 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5596 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5597 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5598 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5599 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5600 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5601 hn_tx_conf_int_sysctl, "I", 5602 "Size of the packet for direct transmission"); 5603 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5604 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5605 __offsetof(struct hn_tx_ring, 
hn_sched_tx), 5606 hn_tx_conf_int_sysctl, "I", 5607 "Always schedule transmission " 5608 "instead of doing direct transmission"); 5609 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5610 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5611 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5612 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5613 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5614 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5615 "Applied packet transmission aggregation size"); 5616 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5617 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5618 hn_txagg_pktmax_sysctl, "I", 5619 "Applied packet transmission aggregation packets"); 5620 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5621 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5622 hn_txagg_align_sysctl, "I", 5623 "Applied packet transmission aggregation alignment"); 5624 5625 return 0; 5626 } 5627 5628 static void 5629 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5630 { 5631 int i; 5632 5633 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5634 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5635 } 5636 5637 static void 5638 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5639 { 5640 struct ifnet *ifp = sc->hn_ifp; 5641 u_int hw_tsomax; 5642 int tso_minlen; 5643 5644 HN_LOCK_ASSERT(sc); 5645 5646 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5647 return; 5648 5649 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5650 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5651 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5652 5653 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5654 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5655 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5656 5657 if (tso_maxlen < tso_minlen) 5658 tso_maxlen = tso_minlen; 5659 else if (tso_maxlen > IP_MAXPACKET) 5660 tso_maxlen = IP_MAXPACKET; 5661 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5662 tso_maxlen = sc->hn_ndis_tso_szmax; 5663 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5664 5665 if (hn_xpnt_vf_isready(sc)) { 5666 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5667 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5668 } 5669 ifp->if_hw_tsomax = hw_tsomax; 5670 if (bootverbose) 5671 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5672 } 5673 5674 static void 5675 hn_fixup_tx_data(struct hn_softc *sc) 5676 { 5677 uint64_t csum_assist; 5678 int i; 5679 5680 hn_set_chim_size(sc, sc->hn_chim_szmax); 5681 if (hn_tx_chimney_size > 0 && 5682 hn_tx_chimney_size < sc->hn_chim_szmax) 5683 hn_set_chim_size(sc, hn_tx_chimney_size); 5684 5685 csum_assist = 0; 5686 if (sc->hn_caps & HN_CAP_IPCS) 5687 csum_assist |= CSUM_IP; 5688 if (sc->hn_caps & HN_CAP_TCP4CS) 5689 csum_assist |= CSUM_IP_TCP; 5690 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5691 csum_assist |= CSUM_IP_UDP; 5692 if (sc->hn_caps & HN_CAP_TCP6CS) 5693 csum_assist |= CSUM_IP6_TCP; 5694 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5695 csum_assist |= CSUM_IP6_UDP; 5696 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5697 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5698 5699 if (sc->hn_caps & HN_CAP_HASHVAL) { 5700 /* 5701 * Support HASHVAL pktinfo on TX path. 
5702 */ 5703 if (bootverbose) 5704 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5705 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5706 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5707 } 5708 } 5709 5710 static void 5711 hn_fixup_rx_data(struct hn_softc *sc) 5712 { 5713 5714 if (sc->hn_caps & HN_CAP_UDPHASH) { 5715 int i; 5716 5717 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5718 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5719 } 5720 } 5721 5722 static void 5723 hn_destroy_tx_data(struct hn_softc *sc) 5724 { 5725 int i; 5726 5727 if (sc->hn_chim != NULL) { 5728 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5729 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5730 } else { 5731 device_printf(sc->hn_dev, 5732 "chimney sending buffer is referenced"); 5733 } 5734 sc->hn_chim = NULL; 5735 } 5736 5737 if (sc->hn_tx_ring_cnt == 0) 5738 return; 5739 5740 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5741 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5742 5743 free(sc->hn_tx_ring, M_DEVBUF); 5744 sc->hn_tx_ring = NULL; 5745 5746 sc->hn_tx_ring_cnt = 0; 5747 sc->hn_tx_ring_inuse = 0; 5748 } 5749 5750 #ifdef HN_IFSTART_SUPPORT 5751 5752 static void 5753 hn_start_taskfunc(void *xtxr, int pending __unused) 5754 { 5755 struct hn_tx_ring *txr = xtxr; 5756 5757 mtx_lock(&txr->hn_tx_lock); 5758 hn_start_locked(txr, 0); 5759 mtx_unlock(&txr->hn_tx_lock); 5760 } 5761 5762 static int 5763 hn_start_locked(struct hn_tx_ring *txr, int len) 5764 { 5765 struct hn_softc *sc = txr->hn_sc; 5766 struct ifnet *ifp = sc->hn_ifp; 5767 int sched = 0; 5768 5769 KASSERT(hn_use_if_start, 5770 ("hn_start_locked is called, when if_start is disabled")); 5771 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5772 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5773 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5774 5775 if (__predict_false(txr->hn_suspended)) 5776 return (0); 5777 5778 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5779 IFF_DRV_RUNNING) 5780 return (0); 5781 5782 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5783 struct hn_txdesc *txd; 5784 struct mbuf *m_head; 5785 int error; 5786 5787 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5788 if (m_head == NULL) 5789 break; 5790 5791 if (len > 0 && m_head->m_pkthdr.len > len) { 5792 /* 5793 * This sending could be time consuming; let callers 5794 * dispatch this packet sending (and sending of any 5795 * following up packets) to tx taskqueue. 
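 * hn_start() and hn_start_txeof() pass hn_direct_tx_size as the
 * cutoff, so only small packets are transmitted directly from the
 * caller's context; larger ones are deferred to the TX taskqueue.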
*/
5797 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5798 sched = 1;
5799 break;
5800 }
5801
5802 #if defined(INET6) || defined(INET)
5803 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5804 m_head = hn_tso_fixup(m_head);
5805 if (__predict_false(m_head == NULL)) {
5806 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5807 continue;
5808 }
5809 } else if (m_head->m_pkthdr.csum_flags &
5810 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5811 m_head = hn_set_hlen(m_head);
5812 if (__predict_false(m_head == NULL)) {
5813 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5814 continue;
5815 }
5816 }
5817 #endif
5818
5819 txd = hn_txdesc_get(txr);
5820 if (txd == NULL) {
5821 txr->hn_no_txdescs++;
5822 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5823 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5824 break;
5825 }
5826
5827 error = hn_encap(ifp, txr, txd, &m_head);
5828 if (error) {
5829 /* Both txd and m_head are freed */
5830 KASSERT(txr->hn_agg_txd == NULL,
5831 ("encap failed w/ pending aggregating txdesc"));
5832 continue;
5833 }
5834
5835 if (txr->hn_agg_pktleft == 0) {
5836 if (txr->hn_agg_txd != NULL) {
5837 KASSERT(m_head == NULL,
5838 ("pending mbuf for aggregating txdesc"));
5839 error = hn_flush_txagg(ifp, txr);
5840 if (__predict_false(error)) {
5841 atomic_set_int(&ifp->if_drv_flags,
5842 IFF_DRV_OACTIVE);
5843 break;
5844 }
5845 } else {
5846 KASSERT(m_head != NULL, ("mbuf was freed"));
5847 error = hn_txpkt(ifp, txr, txd);
5848 if (__predict_false(error)) {
5849 /* txd is freed, but m_head is not */
5850 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5851 atomic_set_int(&ifp->if_drv_flags,
5852 IFF_DRV_OACTIVE);
5853 break;
5854 }
5855 }
5856 }
5857 #ifdef INVARIANTS
5858 else {
5859 KASSERT(txr->hn_agg_txd != NULL,
5860 ("no aggregating txdesc"));
5861 KASSERT(m_head == NULL,
5862 ("pending mbuf for aggregating txdesc"));
5863 }
5864 #endif
5865 }
5866
5867 /* Flush pending aggregated transmission. */
5868 if (txr->hn_agg_txd != NULL)
5869 hn_flush_txagg(ifp, txr);
5870 return (sched);
5871 }
5872
5873 static void
5874 hn_start(struct ifnet *ifp)
5875 {
5876 struct hn_softc *sc = ifp->if_softc;
5877 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5878
5879 if (txr->hn_sched_tx)
5880 goto do_sched;
5881
5882 if (mtx_trylock(&txr->hn_tx_lock)) {
5883 int sched;
5884
5885 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5886 mtx_unlock(&txr->hn_tx_lock);
5887 if (!sched)
5888 return;
5889 }
5890 do_sched:
5891 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5892 }
5893
5894 static void
5895 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5896 {
5897 struct hn_tx_ring *txr = xtxr;
5898
5899 mtx_lock(&txr->hn_tx_lock);
5900 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5901 hn_start_locked(txr, 0);
5902 mtx_unlock(&txr->hn_tx_lock);
5903 }
5904
5905 static void
5906 hn_start_txeof(struct hn_tx_ring *txr)
5907 {
5908 struct hn_softc *sc = txr->hn_sc;
5909 struct ifnet *ifp = sc->hn_ifp;
5910
5911 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5912
5913 if (txr->hn_sched_tx)
5914 goto do_sched;
5915
5916 if (mtx_trylock(&txr->hn_tx_lock)) {
5917 int sched;
5918
5919 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5920 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5921 mtx_unlock(&txr->hn_tx_lock);
5922 if (sched) {
5923 taskqueue_enqueue(txr->hn_tx_taskq,
5924 &txr->hn_tx_task);
5925 }
5926 } else {
5927 do_sched:
5928 /*
5929 * Release the OACTIVE earlier, in the hope that
5930 * others can catch up. The task will clear the
5931 * flag again, with hn_tx_lock held, to avoid possible
5932 * races.
5933 */
5934 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5935 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5936 }
5937 }
5938
5939 #endif /* HN_IFSTART_SUPPORT */
5940
5941 static int
5942 hn_xmit(struct hn_tx_ring *txr, int len)
5943 {
5944 struct hn_softc *sc = txr->hn_sc;
5945 struct ifnet *ifp = sc->hn_ifp;
5946 struct mbuf *m_head;
5947 int sched = 0;
5948
5949 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5950 #ifdef HN_IFSTART_SUPPORT
5951 KASSERT(hn_use_if_start == 0,
5952 ("hn_xmit is called, when if_start is enabled"));
5953 #endif
5954 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5955
5956 if (__predict_false(txr->hn_suspended))
5957 return (0);
5958
5959 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5960 return (0);
5961
5962 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5963 struct hn_txdesc *txd;
5964 int error;
5965
5966 if (len > 0 && m_head->m_pkthdr.len > len) {
5967 /*
5968 * This sending could be time consuming; let callers
5969 * dispatch this packet sending (and sending of any
5970 * follow-up packets) to tx taskqueue.
5971 */
5972 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5973 sched = 1;
5974 break;
5975 }
5976
5977 txd = hn_txdesc_get(txr);
5978 if (txd == NULL) {
5979 txr->hn_no_txdescs++;
5980 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5981 txr->hn_oactive = 1;
5982 break;
5983 }
5984
5985 error = hn_encap(ifp, txr, txd, &m_head);
5986 if (error) {
5987 /* Both txd and m_head are freed; discard */
5988 KASSERT(txr->hn_agg_txd == NULL,
5989 ("encap failed w/ pending aggregating txdesc"));
5990 drbr_advance(ifp, txr->hn_mbuf_br);
5991 continue;
5992 }
5993
5994 if (txr->hn_agg_pktleft == 0) {
5995 if (txr->hn_agg_txd != NULL) {
5996 KASSERT(m_head == NULL,
5997 ("pending mbuf for aggregating txdesc"));
5998 error = hn_flush_txagg(ifp, txr);
5999 if (__predict_false(error)) {
6000 txr->hn_oactive = 1;
6001 break;
6002 }
6003 } else {
6004 KASSERT(m_head != NULL, ("mbuf was freed"));
6005 error = hn_txpkt(ifp, txr, txd);
6006 if (__predict_false(error)) {
6007 /* txd is freed, but m_head is not */
6008 drbr_putback(ifp, txr->hn_mbuf_br,
6009 m_head);
6010 txr->hn_oactive = 1;
6011 break;
6012 }
6013 }
6014 }
6015 #ifdef INVARIANTS
6016 else {
6017 KASSERT(txr->hn_agg_txd != NULL,
6018 ("no aggregating txdesc"));
6019 KASSERT(m_head == NULL,
6020 ("pending mbuf for aggregating txdesc"));
6021 }
6022 #endif
6023
6024 /* Sent */
6025 drbr_advance(ifp, txr->hn_mbuf_br);
6026 }
6027
6028 /* Flush pending aggregated transmission. */
6029 if (txr->hn_agg_txd != NULL)
6030 hn_flush_txagg(ifp, txr);
6031 return (sched);
6032 }
6033
6034 static int
6035 hn_transmit(struct ifnet *ifp, struct mbuf *m)
6036 {
6037 struct hn_softc *sc = ifp->if_softc;
6038 struct hn_tx_ring *txr;
6039 int error, idx = 0;
6040
6041 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6042 struct rm_priotracker pt;
6043
6044 rm_rlock(&sc->hn_vf_lock, &pt);
6045 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6046 struct mbuf *m_bpf = NULL;
6047 int obytes, omcast;
6048
6049 obytes = m->m_pkthdr.len;
6050 omcast = (m->m_flags & M_MCAST) != 0;
6051
6052 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6053 if (bpf_peers_present(ifp->if_bpf)) {
6054 m_bpf = m_copypacket(m, M_NOWAIT);
6055 if (m_bpf == NULL) {
6056 /*
6057 * Failed to grab a shallow
6058 * copy; tap now.
6059 */ 6060 ETHER_BPF_MTAP(ifp, m); 6061 } 6062 } 6063 } else { 6064 ETHER_BPF_MTAP(ifp, m); 6065 } 6066 6067 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6068 rm_runlock(&sc->hn_vf_lock, &pt); 6069 6070 if (m_bpf != NULL) { 6071 if (!error) 6072 ETHER_BPF_MTAP(ifp, m_bpf); 6073 m_freem(m_bpf); 6074 } 6075 6076 if (error == ENOBUFS) { 6077 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6078 } else if (error) { 6079 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6080 } else { 6081 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6082 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6083 if (omcast) { 6084 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6085 omcast); 6086 } 6087 } 6088 return (error); 6089 } 6090 rm_runlock(&sc->hn_vf_lock, &pt); 6091 } 6092 6093 #if defined(INET6) || defined(INET) 6094 /* 6095 * Perform TSO packet header fixup or get l2/l3 header length now, 6096 * since packet headers should be cache-hot. 6097 */ 6098 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6099 m = hn_tso_fixup(m); 6100 if (__predict_false(m == NULL)) { 6101 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6102 return EIO; 6103 } 6104 } else if (m->m_pkthdr.csum_flags & 6105 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6106 m = hn_set_hlen(m); 6107 if (__predict_false(m == NULL)) { 6108 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6109 return EIO; 6110 } 6111 } 6112 #endif 6113 6114 /* 6115 * Select the TX ring based on flowid 6116 */ 6117 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6118 #ifdef RSS 6119 uint32_t bid; 6120 6121 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6122 &bid) == 0) 6123 idx = bid % sc->hn_tx_ring_inuse; 6124 else 6125 #endif 6126 { 6127 #if defined(INET6) || defined(INET) 6128 int tcpsyn = 0; 6129 6130 if (m->m_pkthdr.len < 128 && 6131 (m->m_pkthdr.csum_flags & 6132 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6133 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6134 m = hn_check_tcpsyn(m, &tcpsyn); 6135 if (__predict_false(m == NULL)) { 6136 if_inc_counter(ifp, 6137 IFCOUNTER_OERRORS, 1); 6138 return (EIO); 6139 } 6140 } 6141 #else 6142 const int tcpsyn = 0; 6143 #endif 6144 if (tcpsyn) 6145 idx = 0; 6146 else 6147 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6148 } 6149 } 6150 txr = &sc->hn_tx_ring[idx]; 6151 6152 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6153 if (error) { 6154 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6155 return error; 6156 } 6157 6158 if (txr->hn_oactive) 6159 return 0; 6160 6161 if (txr->hn_sched_tx) 6162 goto do_sched; 6163 6164 if (mtx_trylock(&txr->hn_tx_lock)) { 6165 int sched; 6166 6167 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6168 mtx_unlock(&txr->hn_tx_lock); 6169 if (!sched) 6170 return 0; 6171 } 6172 do_sched: 6173 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6174 return 0; 6175 } 6176 6177 static void 6178 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6179 { 6180 struct mbuf *m; 6181 6182 mtx_lock(&txr->hn_tx_lock); 6183 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6184 m_freem(m); 6185 mtx_unlock(&txr->hn_tx_lock); 6186 } 6187 6188 static void 6189 hn_xmit_qflush(struct ifnet *ifp) 6190 { 6191 struct hn_softc *sc = ifp->if_softc; 6192 struct rm_priotracker pt; 6193 int i; 6194 6195 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6196 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6197 if_qflush(ifp); 6198 6199 rm_rlock(&sc->hn_vf_lock, &pt); 6200 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6201 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6202 rm_runlock(&sc->hn_vf_lock, &pt); 6203 } 6204 6205 static void 6206 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6207 { 6208 6209 if (txr->hn_sched_tx) 6210 goto do_sched; 6211 6212 if (mtx_trylock(&txr->hn_tx_lock)) { 6213 int sched; 6214 6215 txr->hn_oactive = 0; 6216 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6217 mtx_unlock(&txr->hn_tx_lock); 6218 if (sched) { 6219 taskqueue_enqueue(txr->hn_tx_taskq, 6220 &txr->hn_tx_task); 6221 } 6222 } else { 6223 do_sched: 6224 /* 6225 * Release the oactive earlier, with the hope, that 6226 * others could catch up. The task will clear the 6227 * oactive again with the hn_tx_lock to avoid possible 6228 * races. 6229 */ 6230 txr->hn_oactive = 0; 6231 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6232 } 6233 } 6234 6235 static void 6236 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6237 { 6238 struct hn_tx_ring *txr = xtxr; 6239 6240 mtx_lock(&txr->hn_tx_lock); 6241 hn_xmit(txr, 0); 6242 mtx_unlock(&txr->hn_tx_lock); 6243 } 6244 6245 static void 6246 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6247 { 6248 struct hn_tx_ring *txr = xtxr; 6249 6250 mtx_lock(&txr->hn_tx_lock); 6251 txr->hn_oactive = 0; 6252 hn_xmit(txr, 0); 6253 mtx_unlock(&txr->hn_tx_lock); 6254 } 6255 6256 static int 6257 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6258 { 6259 struct vmbus_chan_br cbr; 6260 struct hn_rx_ring *rxr; 6261 struct hn_tx_ring *txr = NULL; 6262 int idx, error; 6263 6264 idx = vmbus_chan_subidx(chan); 6265 6266 /* 6267 * Link this channel to RX/TX ring. 6268 */ 6269 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6270 ("invalid channel index %d, should be >= 0 && < %d", 6271 idx, sc->hn_rx_ring_inuse)); 6272 rxr = &sc->hn_rx_ring[idx]; 6273 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6274 ("RX ring %d already attached", idx)); 6275 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6276 rxr->hn_chan = chan; 6277 6278 if (bootverbose) { 6279 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6280 idx, vmbus_chan_id(chan)); 6281 } 6282 6283 if (idx < sc->hn_tx_ring_inuse) { 6284 txr = &sc->hn_tx_ring[idx]; 6285 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6286 ("TX ring %d already attached", idx)); 6287 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6288 6289 txr->hn_chan = chan; 6290 if (bootverbose) { 6291 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6292 idx, vmbus_chan_id(chan)); 6293 } 6294 } 6295 6296 /* Bind this channel to a proper CPU. */ 6297 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6298 6299 /* 6300 * Open this channel 6301 */ 6302 cbr.cbr = rxr->hn_br; 6303 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6304 cbr.cbr_txsz = HN_TXBR_SIZE; 6305 cbr.cbr_rxsz = HN_RXBR_SIZE; 6306 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6307 if (error) { 6308 if (error == EISCONN) { 6309 if_printf(sc->hn_ifp, "bufring is connected after " 6310 "chan%u open failure\n", vmbus_chan_id(chan)); 6311 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6312 } else { 6313 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6314 vmbus_chan_id(chan), error); 6315 } 6316 } 6317 return (error); 6318 } 6319 6320 static void 6321 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6322 { 6323 struct hn_rx_ring *rxr; 6324 int idx, error; 6325 6326 idx = vmbus_chan_subidx(chan); 6327 6328 /* 6329 * Unlink this channel from the RX/TX ring.
6330 */ 6331 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6332 ("invalid channel index %d, should be >= 0 && < %d", 6333 idx, sc->hn_rx_ring_inuse)); 6334 rxr = &sc->hn_rx_ring[idx]; 6335 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6336 ("RX ring %d is not attached", idx)); 6337 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6338 6339 if (idx < sc->hn_tx_ring_inuse) { 6340 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6341 6342 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6343 ("TX ring %d is not attached", idx)); 6344 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6345 } 6346 6347 /* 6348 * Close this channel. 6349 * 6350 * NOTE: 6351 * Channel closing does _not_ destroy the target channel. 6352 */ 6353 error = vmbus_chan_close_direct(chan); 6354 if (error == EISCONN) { 6355 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6356 "after being closed\n", vmbus_chan_id(chan)); 6357 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6358 } else if (error) { 6359 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6360 vmbus_chan_id(chan), error); 6361 } 6362 } 6363 6364 static int 6365 hn_attach_subchans(struct hn_softc *sc) 6366 { 6367 struct vmbus_channel **subchans; 6368 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6369 int i, error = 0; 6370 6371 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6372 6373 /* Attach the sub-channels. */ 6374 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6375 for (i = 0; i < subchan_cnt; ++i) { 6376 int error1; 6377 6378 error1 = hn_chan_attach(sc, subchans[i]); 6379 if (error1) { 6380 error = error1; 6381 /* Move on; all channels will be detached later. */ 6382 } 6383 } 6384 vmbus_subchan_rel(subchans, subchan_cnt); 6385 6386 if (error) { 6387 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6388 } else { 6389 if (bootverbose) { 6390 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6391 subchan_cnt); 6392 } 6393 } 6394 return (error); 6395 } 6396 6397 static void 6398 hn_detach_allchans(struct hn_softc *sc) 6399 { 6400 struct vmbus_channel **subchans; 6401 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6402 int i; 6403 6404 if (subchan_cnt == 0) 6405 goto back; 6406 6407 /* Detach the sub-channels. */ 6408 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6409 for (i = 0; i < subchan_cnt; ++i) 6410 hn_chan_detach(sc, subchans[i]); 6411 vmbus_subchan_rel(subchans, subchan_cnt); 6412 6413 back: 6414 /* 6415 * Detach the primary channel, _after_ all sub-channels 6416 * are detached. 6417 */ 6418 hn_chan_detach(sc, sc->hn_prichan); 6419 6420 /* Wait for sub-channels to be destroyed, if any. */ 6421 vmbus_subchan_drain(sc->hn_prichan); 6422 6423 #ifdef INVARIANTS 6424 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6425 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6426 HN_RX_FLAG_ATTACHED) == 0, 6427 ("%dth RX ring is still attached", i)); 6428 } 6429 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6430 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6431 HN_TX_FLAG_ATTACHED) == 0, 6432 ("%dth TX ring is still attached", i)); 6433 } 6434 #endif 6435 } 6436 6437 static int 6438 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6439 { 6440 struct vmbus_channel **subchans; 6441 int nchan, rxr_cnt, error; 6442 6443 nchan = *nsubch + 1; 6444 if (nchan == 1) { 6445 /* 6446 * Multiple RX/TX rings are not requested. 6447 */ 6448 *nsubch = 0; 6449 return (0); 6450 } 6451 6452 /* 6453 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6454 * table entries.
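 * If the query fails, fall back to a single channel instead of
 * failing the whole attach.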
6455 */ 6456 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6457 if (error) { 6458 /* No RSS; this is benign. */ 6459 *nsubch = 0; 6460 return (0); 6461 } 6462 if (bootverbose) { 6463 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6464 rxr_cnt, nchan); 6465 } 6466 6467 if (nchan > rxr_cnt) 6468 nchan = rxr_cnt; 6469 if (nchan == 1) { 6470 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6471 *nsubch = 0; 6472 return (0); 6473 } 6474 6475 /* 6476 * Allocate sub-channels from NVS. 6477 */ 6478 *nsubch = nchan - 1; 6479 error = hn_nvs_alloc_subchans(sc, nsubch); 6480 if (error || *nsubch == 0) { 6481 /* Failed to allocate sub-channels. */ 6482 *nsubch = 0; 6483 return (0); 6484 } 6485 6486 /* 6487 * Wait for all sub-channels to become ready before moving on. 6488 */ 6489 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6490 vmbus_subchan_rel(subchans, *nsubch); 6491 return (0); 6492 } 6493 6494 static bool 6495 hn_synth_attachable(const struct hn_softc *sc) 6496 { 6497 int i; 6498 6499 if (sc->hn_flags & HN_FLAG_ERRORS) 6500 return (false); 6501 6502 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6503 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6504 6505 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6506 return (false); 6507 } 6508 return (true); 6509 } 6510 6511 /* 6512 * Make sure that the RX filter is zero after the successful 6513 * RNDIS initialization. 6514 * 6515 * NOTE: 6516 * Under certain conditions on certain versions of Hyper-V, 6517 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6518 * after the successful RNDIS initialization, which breaks 6519 * the assumption of any following code (well, it breaks the 6520 * RNDIS API contract actually). Clear the RNDIS rxfilter 6521 * explicitly, drain packets sneaking through, and drain the 6522 * interrupt taskqueues scheduled due to the stealth packets. 6523 */ 6524 static void 6525 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6526 { 6527 6528 hn_disable_rx(sc); 6529 hn_drain_rxtx(sc, nchan); 6530 } 6531 6532 static int 6533 hn_synth_attach(struct hn_softc *sc, int mtu) 6534 { 6535 #define ATTACHED_NVS 0x0002 6536 #define ATTACHED_RNDIS 0x0004 6537 6538 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6539 int error, nsubch, nchan = 1, i, rndis_inited; 6540 uint32_t old_caps, attached = 0; 6541 6542 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6543 ("synthetic parts were attached")); 6544 6545 if (!hn_synth_attachable(sc)) 6546 return (ENXIO); 6547 6548 /* Save capabilities for later verification. */ 6549 old_caps = sc->hn_caps; 6550 sc->hn_caps = 0; 6551 6552 /* Clear RSS stuffs. */ 6553 sc->hn_rss_ind_size = 0; 6554 sc->hn_rss_hash = 0; 6555 sc->hn_rss_hcap = 0; 6556 6557 /* 6558 * Attach the primary channel _before_ attaching NVS and RNDIS. 6559 */ 6560 error = hn_chan_attach(sc, sc->hn_prichan); 6561 if (error) 6562 goto failed; 6563 6564 /* 6565 * Attach NVS. 6566 */ 6567 error = hn_nvs_attach(sc, mtu); 6568 if (error) 6569 goto failed; 6570 attached |= ATTACHED_NVS; 6571 6572 /* 6573 * Attach RNDIS _after_ NVS is attached. 6574 */ 6575 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6576 if (rndis_inited) 6577 attached |= ATTACHED_RNDIS; 6578 if (error) 6579 goto failed; 6580 6581 /* 6582 * Make sure capabilities are not changed. 
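 * The check is skipped on the initial attach; on later re-attaches
 * a capabilities mismatch aborts with ENXIO.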
6583 */ 6584 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6585 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6586 old_caps, sc->hn_caps); 6587 error = ENXIO; 6588 goto failed; 6589 } 6590 6591 /* 6592 * Allocate sub-channels for multi-TX/RX rings. 6593 * 6594 * NOTE: 6595 * The # of RX rings that can be used is equivalent to the # of 6596 * channels to be requested. 6597 */ 6598 nsubch = sc->hn_rx_ring_cnt - 1; 6599 error = hn_synth_alloc_subchans(sc, &nsubch); 6600 if (error) 6601 goto failed; 6602 /* NOTE: _Full_ synthetic parts detach is required now. */ 6603 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6604 6605 /* 6606 * Set the # of TX/RX rings that could be used according to 6607 * the # of channels that NVS offered. 6608 */ 6609 nchan = nsubch + 1; 6610 hn_set_ring_inuse(sc, nchan); 6611 if (nchan == 1) { 6612 /* Only the primary channel can be used; done */ 6613 goto back; 6614 } 6615 6616 /* 6617 * Attach the sub-channels. 6618 * 6619 * NOTE: hn_set_ring_inuse() _must_ have been called. 6620 */ 6621 error = hn_attach_subchans(sc); 6622 if (error) 6623 goto failed; 6624 6625 /* 6626 * Configure RSS key and indirect table _after_ all sub-channels 6627 * are attached. 6628 */ 6629 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6630 /* 6631 * RSS key is not set yet; set it to the default RSS key. 6632 */ 6633 if (bootverbose) 6634 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6635 #ifdef RSS 6636 rss_getkey(rss->rss_key); 6637 #else 6638 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6639 #endif 6640 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6641 } 6642 6643 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6644 /* 6645 * RSS indirect table is not set yet; set it up in round- 6646 * robin fashion. 6647 */ 6648 if (bootverbose) { 6649 if_printf(sc->hn_ifp, "setup default RSS indirect " 6650 "table\n"); 6651 } 6652 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6653 uint32_t subidx; 6654 6655 #ifdef RSS 6656 subidx = rss_get_indirection_to_bucket(i); 6657 #else 6658 subidx = i; 6659 #endif 6660 rss->rss_ind[i] = subidx % nchan; 6661 } 6662 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6663 } else { 6664 /* 6665 * # of usable channels may be changed, so we have to 6666 * make sure that all entries in RSS indirect table 6667 * are valid. 6668 * 6669 * NOTE: hn_set_ring_inuse() _must_ have been called. 6670 */ 6671 hn_rss_ind_fixup(sc); 6672 } 6673 6674 sc->hn_rss_hash = sc->hn_rss_hcap; 6675 if ((sc->hn_flags & HN_FLAG_RXVF) || 6676 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6677 /* NOTE: Don't reconfigure RSS; it will be done immediately below. */ 6678 hn_vf_rss_fixup(sc, false); 6679 } 6680 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6681 if (error) 6682 goto failed; 6683 back: 6684 /* 6685 * Fixup transmission aggregation setup. 6686 */ 6687 hn_set_txagg(sc); 6688 hn_rndis_init_fixat(sc, nchan); 6689 return (0); 6690 6691 failed: 6692 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6693 hn_rndis_init_fixat(sc, nchan); 6694 hn_synth_detach(sc); 6695 } else { 6696 if (attached & ATTACHED_RNDIS) { 6697 hn_rndis_init_fixat(sc, nchan); 6698 hn_rndis_detach(sc); 6699 } 6700 if (attached & ATTACHED_NVS) 6701 hn_nvs_detach(sc); 6702 hn_chan_detach(sc, sc->hn_prichan); 6703 /* Restore old capabilities. */ 6704 sc->hn_caps = old_caps; 6705 } 6706 return (error); 6707 6708 #undef ATTACHED_RNDIS 6709 #undef ATTACHED_NVS 6710 } 6711 6712 /* 6713 * NOTE: 6714 * The interface must have been suspended through hn_suspend(), before 6715 * this function gets called.
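 * This tears down RNDIS, NVS and all channels; on WIN10 and newer
 * hosts it also disconnects the RXBUF and chimney GPADLs from the
 * primary channel.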
6716 */ 6717 static void 6718 hn_synth_detach(struct hn_softc *sc) 6719 { 6720 6721 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6722 ("synthetic parts were not attached")); 6723 6724 /* Detach the RNDIS first. */ 6725 hn_rndis_detach(sc); 6726 6727 /* Detach NVS. */ 6728 hn_nvs_detach(sc); 6729 6730 /* Detach all of the channels. */ 6731 hn_detach_allchans(sc); 6732 6733 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6734 /* 6735 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6736 */ 6737 int error; 6738 6739 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6740 sc->hn_rxbuf_gpadl); 6741 if (error) { 6742 if_printf(sc->hn_ifp, 6743 "rxbuf gpadl disconn failed: %d\n", error); 6744 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6745 } 6746 sc->hn_rxbuf_gpadl = 0; 6747 } 6748 6749 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6750 /* 6751 * Host is post-Win2016, disconnect chimney sending buffer from 6752 * primary channel here. 6753 */ 6754 int error; 6755 6756 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6757 sc->hn_chim_gpadl); 6758 if (error) { 6759 if_printf(sc->hn_ifp, 6760 "chim gpadl disconn failed: %d\n", error); 6761 sc->hn_flags |= HN_FLAG_CHIM_REF; 6762 } 6763 sc->hn_chim_gpadl = 0; 6764 } 6765 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6766 } 6767 6768 static void 6769 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6770 { 6771 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6772 ("invalid ring count %d", ring_cnt)); 6773 6774 if (sc->hn_tx_ring_cnt > ring_cnt) 6775 sc->hn_tx_ring_inuse = ring_cnt; 6776 else 6777 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6778 sc->hn_rx_ring_inuse = ring_cnt; 6779 6780 #ifdef RSS 6781 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6782 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6783 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6784 rss_getnumbuckets()); 6785 } 6786 #endif 6787 6788 if (bootverbose) { 6789 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6790 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6791 } 6792 } 6793 6794 static void 6795 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6796 { 6797 6798 /* 6799 * NOTE: 6800 * The TX bufring will not be drained by the hypervisor, 6801 * if the primary channel is revoked. 6802 */ 6803 while (!vmbus_chan_rx_empty(chan) || 6804 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6805 !vmbus_chan_tx_empty(chan))) 6806 pause("waitch", 1); 6807 vmbus_chan_intr_drain(chan); 6808 } 6809 6810 static void 6811 hn_disable_rx(struct hn_softc *sc) 6812 { 6813 6814 /* 6815 * Disable RX by clearing RX filter forcefully. 6816 */ 6817 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6818 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6819 6820 /* 6821 * Give RNDIS enough time to flush all pending data packets. 6822 */ 6823 pause("waitrx", (200 * hz) / 1000); 6824 } 6825 6826 /* 6827 * NOTE: 6828 * RX/TX _must_ have been suspended/disabled, before this function 6829 * is called. 6830 */ 6831 static void 6832 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6833 { 6834 struct vmbus_channel **subch = NULL; 6835 int nsubch; 6836 6837 /* 6838 * Drain RX/TX bufrings and interrupts. 
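 * Sub-channels are drained before the primary channel.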
6839 */ 6840 nsubch = nchan - 1; 6841 if (nsubch > 0) 6842 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6843 6844 if (subch != NULL) { 6845 int i; 6846 6847 for (i = 0; i < nsubch; ++i) 6848 hn_chan_drain(sc, subch[i]); 6849 } 6850 hn_chan_drain(sc, sc->hn_prichan); 6851 6852 if (subch != NULL) 6853 vmbus_subchan_rel(subch, nsubch); 6854 } 6855 6856 static void 6857 hn_suspend_data(struct hn_softc *sc) 6858 { 6859 struct hn_tx_ring *txr; 6860 int i; 6861 6862 HN_LOCK_ASSERT(sc); 6863 6864 /* 6865 * Suspend TX. 6866 */ 6867 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6868 txr = &sc->hn_tx_ring[i]; 6869 6870 mtx_lock(&txr->hn_tx_lock); 6871 txr->hn_suspended = 1; 6872 mtx_unlock(&txr->hn_tx_lock); 6873 /* No one is able to send more packets now. */ 6874 6875 /* 6876 * Wait for all pending sends to finish. 6877 * 6878 * NOTE: 6879 * We will _not_ receive all pending send-done, if the 6880 * primary channel is revoked. 6881 */ 6882 while (hn_tx_ring_pending(txr) && 6883 !vmbus_chan_is_revoked(sc->hn_prichan)) 6884 pause("hnwtx", 1 /* 1 tick */); 6885 } 6886 6887 /* 6888 * Disable RX. 6889 */ 6890 hn_disable_rx(sc); 6891 6892 /* 6893 * Drain RX/TX. 6894 */ 6895 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6896 6897 /* 6898 * Drain any pending TX tasks. 6899 * 6900 * NOTE: 6901 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6902 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6903 */ 6904 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6905 txr = &sc->hn_tx_ring[i]; 6906 6907 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6908 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6909 } 6910 } 6911 6912 static void 6913 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6914 { 6915 6916 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6917 } 6918 6919 static void 6920 hn_suspend_mgmt(struct hn_softc *sc) 6921 { 6922 struct task task; 6923 6924 HN_LOCK_ASSERT(sc); 6925 6926 /* 6927 * Make sure that hn_mgmt_taskq0 can no longer be accessed 6928 * through hn_mgmt_taskq. 6929 */ 6930 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6931 vmbus_chan_run_task(sc->hn_prichan, &task); 6932 6933 /* 6934 * Make sure that all pending management tasks are completed. 6935 */ 6936 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6937 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6938 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6939 } 6940 6941 static void 6942 hn_suspend(struct hn_softc *sc) 6943 { 6944 6945 /* Disable polling. */ 6946 hn_polling(sc, 0); 6947 6948 /* 6949 * If the non-transparent mode VF is activated, the synthetic 6950 * device is receiving packets, so the data path of the 6951 * synthetic device must be suspended. 6952 */ 6953 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6954 (sc->hn_flags & HN_FLAG_RXVF)) 6955 hn_suspend_data(sc); 6956 hn_suspend_mgmt(sc); 6957 } 6958 6959 static void 6960 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6961 { 6962 int i; 6963 6964 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6965 ("invalid TX ring count %d", tx_ring_cnt)); 6966 6967 for (i = 0; i < tx_ring_cnt; ++i) { 6968 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6969 6970 mtx_lock(&txr->hn_tx_lock); 6971 txr->hn_suspended = 0; 6972 mtx_unlock(&txr->hn_tx_lock); 6973 } 6974 } 6975 6976 static void 6977 hn_resume_data(struct hn_softc *sc) 6978 { 6979 int i; 6980 6981 HN_LOCK_ASSERT(sc); 6982 6983 /* 6984 * Re-enable RX.
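 * hn_rxfilter_config() reprograms the RNDIS RX filter according to
 * the current interface state.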
6985 */ 6986 hn_rxfilter_config(sc); 6987 6988 /* 6989 * Make sure to clear suspend status on "all" TX rings, 6990 * since hn_tx_ring_inuse can be changed after 6991 * hn_suspend_data(). 6992 */ 6993 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6994 6995 #ifdef HN_IFSTART_SUPPORT 6996 if (!hn_use_if_start) 6997 #endif 6998 { 6999 /* 7000 * Flush unused drbrs, since hn_tx_ring_inuse may be 7001 * reduced. 7002 */ 7003 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 7004 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 7005 } 7006 7007 /* 7008 * Kick start TX. 7009 */ 7010 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 7011 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 7012 7013 /* 7014 * Use txeof task, so that any pending oactive can be 7015 * cleared properly. 7016 */ 7017 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 7018 } 7019 } 7020 7021 static void 7022 hn_resume_mgmt(struct hn_softc *sc) 7023 { 7024 7025 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 7026 7027 /* 7028 * Kick off network change detection, if it was pending. 7029 * If no network change was pending, start link status 7030 * checks, which are more lightweight than network change 7031 * detection. 7032 */ 7033 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 7034 hn_change_network(sc); 7035 else 7036 hn_update_link_status(sc); 7037 } 7038 7039 static void 7040 hn_resume(struct hn_softc *sc) 7041 { 7042 7043 /* 7044 * If the non-transparent mode VF is activated, the synthetic 7045 * device has to receive packets, so the data path of the 7046 * synthetic device must be resumed. 7047 */ 7048 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7049 (sc->hn_flags & HN_FLAG_RXVF)) 7050 hn_resume_data(sc); 7051 7052 /* 7053 * Don't resume link status change if VF is attached/activated. 7054 * - In the non-transparent VF mode, the synthetic device marks 7055 * link down until the VF is deactivated; i.e. VF is down. 7056 * - In transparent VF mode, VF's media status is used until 7057 * the VF is detached. 7058 */ 7059 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7060 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7061 hn_resume_mgmt(sc); 7062 7063 /* 7064 * Re-enable polling if this interface is running and 7065 * the polling is requested. 7066 */ 7067 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7068 hn_polling(sc, sc->hn_pollhz); 7069 } 7070 7071 static void 7072 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7073 { 7074 const struct rndis_status_msg *msg; 7075 int ofs; 7076 7077 if (dlen < sizeof(*msg)) { 7078 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7079 return; 7080 } 7081 msg = data; 7082 7083 switch (msg->rm_status) { 7084 case RNDIS_STATUS_MEDIA_CONNECT: 7085 case RNDIS_STATUS_MEDIA_DISCONNECT: 7086 hn_update_link_status(sc); 7087 break; 7088 7089 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7090 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7091 /* Not really useful; ignore.
*/ 7092 break; 7093 7094 case RNDIS_STATUS_NETWORK_CHANGE: 7095 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7096 if (dlen < ofs + msg->rm_stbuflen || 7097 msg->rm_stbuflen < sizeof(uint32_t)) { 7098 if_printf(sc->hn_ifp, "network changed\n"); 7099 } else { 7100 uint32_t change; 7101 7102 memcpy(&change, ((const uint8_t *)msg) + ofs, 7103 sizeof(change)); 7104 if_printf(sc->hn_ifp, "network changed, change %u\n", 7105 change); 7106 } 7107 hn_change_network(sc); 7108 break; 7109 7110 default: 7111 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7112 msg->rm_status); 7113 break; 7114 } 7115 } 7116 7117 static int 7118 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7119 { 7120 const struct rndis_pktinfo *pi = info_data; 7121 uint32_t mask = 0; 7122 7123 while (info_dlen != 0) { 7124 const void *data; 7125 uint32_t dlen; 7126 7127 if (__predict_false(info_dlen < sizeof(*pi))) 7128 return (EINVAL); 7129 if (__predict_false(info_dlen < pi->rm_size)) 7130 return (EINVAL); 7131 info_dlen -= pi->rm_size; 7132 7133 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7134 return (EINVAL); 7135 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7136 return (EINVAL); 7137 dlen = pi->rm_size - pi->rm_pktinfooffset; 7138 data = pi->rm_data; 7139 7140 if (pi->rm_internal == 1) { 7141 switch (pi->rm_type) { 7142 case NDIS_PKTINFO_IT_PKTINFO_ID: 7143 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7144 return (EINVAL); 7145 info->pktinfo_id = 7146 (const struct packet_info_id *)data; 7147 mask |= HN_RXINFO_PKTINFO_ID; 7148 break; 7149 7150 default: 7151 goto next; 7152 } 7153 } else { 7154 switch (pi->rm_type) { 7155 case NDIS_PKTINFO_TYPE_VLAN: 7156 if (__predict_false(dlen 7157 < NDIS_VLAN_INFO_SIZE)) 7158 return (EINVAL); 7159 info->vlan_info = (const uint32_t *)data; 7160 mask |= HN_RXINFO_VLAN; 7161 break; 7162 7163 case NDIS_PKTINFO_TYPE_CSUM: 7164 if (__predict_false(dlen 7165 < NDIS_RXCSUM_INFO_SIZE)) 7166 return (EINVAL); 7167 info->csum_info = (const uint32_t *)data; 7168 mask |= HN_RXINFO_CSUM; 7169 break; 7170 7171 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7172 if (__predict_false(dlen 7173 < HN_NDIS_HASH_VALUE_SIZE)) 7174 return (EINVAL); 7175 info->hash_value = (const uint32_t *)data; 7176 mask |= HN_RXINFO_HASHVAL; 7177 break; 7178 7179 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7180 if (__predict_false(dlen 7181 < HN_NDIS_HASH_INFO_SIZE)) 7182 return (EINVAL); 7183 info->hash_info = (const uint32_t *)data; 7184 mask |= HN_RXINFO_HASHINF; 7185 break; 7186 7187 default: 7188 goto next; 7189 } 7190 } 7191 7192 if (mask == HN_RXINFO_ALL) { 7193 /* All found; done */ 7194 break; 7195 } 7196 next: 7197 pi = (const struct rndis_pktinfo *) 7198 ((const uint8_t *)pi + pi->rm_size); 7199 } 7200 7201 /* 7202 * Final fixup. 7203 * - If there is no hash value, invalidate the hash info. 
7204 */ 7205 if ((mask & HN_RXINFO_HASHVAL) == 0) 7206 info->hash_info = NULL; 7207 return (0); 7208 } 7209 7210 static __inline bool 7211 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7212 { 7213 7214 if (off < check_off) { 7215 if (__predict_true(off + len <= check_off)) 7216 return (false); 7217 } else if (off > check_off) { 7218 if (__predict_true(check_off + check_len <= off)) 7219 return (false); 7220 } 7221 return (true); 7222 } 7223 7224 static __inline void 7225 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7226 uint32_t len, struct hn_rxinfo *info) 7227 { 7228 uint32_t cnt = rxr->rsc.cnt; 7229 7230 if (cnt) { 7231 rxr->rsc.pktlen += len; 7232 } else { 7233 rxr->rsc.vlan_info = info->vlan_info; 7234 rxr->rsc.csum_info = info->csum_info; 7235 rxr->rsc.hash_info = info->hash_info; 7236 rxr->rsc.hash_value = info->hash_value; 7237 rxr->rsc.pktlen = len; 7238 } 7239 7240 rxr->rsc.frag_data[cnt] = data; 7241 rxr->rsc.frag_len[cnt] = len; 7242 rxr->rsc.cnt++; 7243 } 7244 7245 static void 7246 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7247 { 7248 const struct rndis_packet_msg *pkt; 7249 struct hn_rxinfo info; 7250 int data_off, pktinfo_off, data_len, pktinfo_len; 7251 bool rsc_more = false; 7252 7253 /* 7254 * Check length. 7255 */ 7256 if (__predict_false(dlen < sizeof(*pkt))) { 7257 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7258 return; 7259 } 7260 pkt = data; 7261 7262 if (__predict_false(dlen < pkt->rm_len)) { 7263 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7264 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7265 return; 7266 } 7267 if (__predict_false(pkt->rm_len < 7268 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7269 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7270 "msglen %u, data %u, oob %u, pktinfo %u\n", 7271 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7272 pkt->rm_pktinfolen); 7273 return; 7274 } 7275 if (__predict_false(pkt->rm_datalen == 0)) { 7276 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7277 return; 7278 } 7279 7280 /* 7281 * Check offsets. 7282 */ 7283 #define IS_OFFSET_INVALID(ofs) \ 7284 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7285 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7286 7287 /* XXX Hyper-V does not meet data offset alignment requirement */ 7288 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7289 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7290 "data offset %u\n", pkt->rm_dataoffset); 7291 return; 7292 } 7293 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7294 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7295 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7296 "oob offset %u\n", pkt->rm_oobdataoffset); 7297 return; 7298 } 7299 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7300 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7301 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7302 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7303 return; 7304 } 7305 7306 #undef IS_OFFSET_INVALID 7307 7308 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7309 data_len = pkt->rm_datalen; 7310 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7311 pktinfo_len = pkt->rm_pktinfolen; 7312 7313 /* 7314 * Check OOB coverage.
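 * The OOB region, if present, must fit within the message and must
 * not overlap the data or pktinfo regions.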
7315 */ 7316 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7317 int oob_off, oob_len; 7318 7319 if_printf(rxr->hn_ifp, "got oobdata\n"); 7320 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7321 oob_len = pkt->rm_oobdatalen; 7322 7323 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7324 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7325 "oob overflow, msglen %u, oob abs %d len %d\n", 7326 pkt->rm_len, oob_off, oob_len); 7327 return; 7328 } 7329 7330 /* 7331 * Check against data. 7332 */ 7333 if (hn_rndis_check_overlap(oob_off, oob_len, 7334 data_off, data_len)) { 7335 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7336 "oob overlaps data, oob abs %d len %d, " 7337 "data abs %d len %d\n", 7338 oob_off, oob_len, data_off, data_len); 7339 return; 7340 } 7341 7342 /* 7343 * Check against pktinfo. 7344 */ 7345 if (pktinfo_len != 0 && 7346 hn_rndis_check_overlap(oob_off, oob_len, 7347 pktinfo_off, pktinfo_len)) { 7348 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7349 "oob overlaps pktinfo, oob abs %d len %d, " 7350 "pktinfo abs %d len %d\n", 7351 oob_off, oob_len, pktinfo_off, pktinfo_len); 7352 return; 7353 } 7354 } 7355 7356 /* 7357 * Check per-packet-info coverage and find useful per-packet-info. 7358 */ 7359 info.vlan_info = NULL; 7360 info.csum_info = NULL; 7361 info.hash_info = NULL; 7362 info.pktinfo_id = NULL; 7363 7364 if (__predict_true(pktinfo_len != 0)) { 7365 bool overlap; 7366 int error; 7367 7368 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7369 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7370 "pktinfo overflow, msglen %u, " 7371 "pktinfo abs %d len %d\n", 7372 pkt->rm_len, pktinfo_off, pktinfo_len); 7373 return; 7374 } 7375 7376 /* 7377 * Check packet info coverage. 7378 */ 7379 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7380 data_off, data_len); 7381 if (__predict_false(overlap)) { 7382 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7383 "pktinfo overlap data, pktinfo abs %d len %d, " 7384 "data abs %d len %d\n", 7385 pktinfo_off, pktinfo_len, data_off, data_len); 7386 return; 7387 } 7388 7389 /* 7390 * Find useful per-packet-info. 
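 * hn_rndis_rxinfo() extracts the VLAN, checksum, hash and
 * pktinfo-id records.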
7391 */ 7392 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7393 pktinfo_len, &info); 7394 if (__predict_false(error)) { 7395 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7396 "pktinfo\n"); 7397 return; 7398 } 7399 } 7400 7401 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7402 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7403 "data overflow, msglen %u, data abs %d len %d\n", 7404 pkt->rm_len, data_off, data_len); 7405 return; 7406 } 7407 7408 /* Identify RSC fragments, drop invalid packets */ 7409 if ((info.pktinfo_id != NULL) && 7410 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7411 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7412 rxr->rsc.cnt = 0; 7413 rxr->hn_rsc_pkts++; 7414 } else if (rxr->rsc.cnt == 0) 7415 goto drop; 7416 7417 rsc_more = true; 7418 7419 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7420 rsc_more = false; 7421 7422 if (rsc_more && rxr->rsc.is_last) 7423 goto drop; 7424 } else { 7425 rxr->rsc.cnt = 0; 7426 } 7427 7428 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7429 goto drop; 7430 7431 /* Store data in per rx ring structure */ 7432 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7433 data_len, &info); 7434 7435 if (rsc_more) 7436 return; 7437 7438 hn_rxpkt(rxr); 7439 rxr->rsc.cnt = 0; 7440 return; 7441 drop: 7442 rxr->hn_rsc_drop++; 7443 return; 7444 } 7445 7446 static __inline void 7447 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7448 { 7449 const struct rndis_msghdr *hdr; 7450 7451 if (__predict_false(dlen < sizeof(*hdr))) { 7452 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7453 return; 7454 } 7455 hdr = data; 7456 7457 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7458 /* Hot data path. */ 7459 hn_rndis_rx_data(rxr, data, dlen); 7460 /* Done! */ 7461 return; 7462 } 7463 7464 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7465 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7466 else 7467 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7468 } 7469 7470 static void 7471 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7472 { 7473 const struct hn_nvs_hdr *hdr; 7474 7475 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7476 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7477 return; 7478 } 7479 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7480 7481 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7482 /* Useless; ignore */ 7483 return; 7484 } 7485 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7486 } 7487 7488 static void 7489 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7490 const struct vmbus_chanpkt_hdr *pkt) 7491 { 7492 struct hn_nvs_sendctx *sndc; 7493 7494 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7495 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7496 VMBUS_CHANPKT_DATALEN(pkt)); 7497 /* 7498 * NOTE: 7499 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7500 * its callback. 
7501 */ 7502 } 7503 7504 static void 7505 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7506 const struct vmbus_chanpkt_hdr *pkthdr) 7507 { 7508 struct epoch_tracker et; 7509 const struct vmbus_chanpkt_rxbuf *pkt; 7510 const struct hn_nvs_hdr *nvs_hdr; 7511 int count, i, hlen; 7512 7513 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7514 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7515 return; 7516 } 7517 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7518 7519 /* Make sure that this is a RNDIS message. */ 7520 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7521 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7522 nvs_hdr->nvs_type); 7523 return; 7524 } 7525 7526 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7527 if (__predict_false(hlen < sizeof(*pkt))) { 7528 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7529 return; 7530 } 7531 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7532 7533 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7534 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7535 pkt->cp_rxbuf_id); 7536 return; 7537 } 7538 7539 count = pkt->cp_rxbuf_cnt; 7540 if (__predict_false(hlen < 7541 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7542 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7543 return; 7544 } 7545 7546 NET_EPOCH_ENTER(et); 7547 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7548 for (i = 0; i < count; ++i) { 7549 int ofs, len; 7550 7551 ofs = pkt->cp_rxbuf[i].rb_ofs; 7552 len = pkt->cp_rxbuf[i].rb_len; 7553 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7554 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7555 "ofs %d, len %d\n", i, ofs, len); 7556 continue; 7557 } 7558 7559 rxr->rsc.is_last = (i == (count - 1)); 7560 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7561 } 7562 NET_EPOCH_EXIT(et); 7563 7564 /* 7565 * Ack the consumed RXBUF associated w/ this channel packet, 7566 * so that this RXBUF can be recycled by the hypervisor. 7567 */ 7568 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7569 } 7570 7571 static void 7572 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7573 uint64_t tid) 7574 { 7575 struct hn_nvs_rndis_ack ack; 7576 int retries, error; 7577 7578 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7579 ack.nvs_status = HN_NVS_STATUS_OK; 7580 7581 retries = 0; 7582 again: 7583 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7584 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7585 if (__predict_false(error == EAGAIN)) { 7586 /* 7587 * NOTE: 7588 * This should _not_ happen in real world, since the 7589 * consumption of the TX bufring from the TX path is 7590 * controlled. 7591 */ 7592 if (rxr->hn_ack_failed == 0) 7593 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7594 rxr->hn_ack_failed++; 7595 retries++; 7596 if (retries < 10) { 7597 DELAY(100); 7598 goto again; 7599 } 7600 /* RXBUF leaks! */ 7601 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7602 } 7603 } 7604 7605 static void 7606 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7607 { 7608 struct hn_rx_ring *rxr = xrxr; 7609 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7610 7611 for (;;) { 7612 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7613 int error, pktlen; 7614 7615 pktlen = rxr->hn_pktbuf_len; 7616 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7617 if (__predict_false(error == ENOBUFS)) { 7618 void *nbuf; 7619 int nlen; 7620 7621 /* 7622 * Expand channel packet buffer. 
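 * The buffer is doubled until it is large enough for the pending
 * packet.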
7623 * 7624 * XXX 7625 * Use M_WAITOK here, since allocation failure 7626 * is fatal. 7627 */ 7628 nlen = rxr->hn_pktbuf_len * 2; 7629 while (nlen < pktlen) 7630 nlen *= 2; 7631 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7632 7633 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7634 rxr->hn_pktbuf_len, nlen); 7635 7636 free(rxr->hn_pktbuf, M_DEVBUF); 7637 rxr->hn_pktbuf = nbuf; 7638 rxr->hn_pktbuf_len = nlen; 7639 /* Retry! */ 7640 continue; 7641 } else if (__predict_false(error == EAGAIN)) { 7642 /* No more channel packets; done! */ 7643 break; 7644 } 7645 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7646 7647 switch (pkt->cph_type) { 7648 case VMBUS_CHANPKT_TYPE_COMP: 7649 hn_nvs_handle_comp(sc, chan, pkt); 7650 break; 7651 7652 case VMBUS_CHANPKT_TYPE_RXBUF: 7653 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7654 break; 7655 7656 case VMBUS_CHANPKT_TYPE_INBAND: 7657 hn_nvs_handle_notify(sc, pkt); 7658 break; 7659 7660 default: 7661 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7662 pkt->cph_type); 7663 break; 7664 } 7665 } 7666 hn_chan_rollup(rxr, rxr->hn_txr); 7667 } 7668 7669 static void 7670 hn_sysinit(void *arg __unused) 7671 { 7672 int i; 7673 7674 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7675 7676 #ifdef HN_IFSTART_SUPPORT 7677 /* 7678 * Don't use ifnet.if_start if transparent VF mode is requested; 7679 * mainly due to the IFF_DRV_OACTIVE flag. 7680 */ 7681 if (hn_xpnt_vf && hn_use_if_start) { 7682 hn_use_if_start = 0; 7683 printf("hn: transparent VF mode, if_transmit will be used " 7684 "instead of if_start\n"); 7685 } 7686 #endif 7687 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7688 printf("hn: invalid transparent VF attach routine " 7689 "wait timeout %d, reset to %d\n", 7690 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7691 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7692 } 7693 7694 /* 7695 * Initialize VF map. 7696 */ 7697 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7698 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7699 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7700 M_WAITOK | M_ZERO); 7701 7702 /* 7703 * Fix the # of TX taskqueues. 7704 */ 7705 if (hn_tx_taskq_cnt <= 0) 7706 hn_tx_taskq_cnt = 1; 7707 else if (hn_tx_taskq_cnt > mp_ncpus) 7708 hn_tx_taskq_cnt = mp_ncpus; 7709 7710 /* 7711 * Fix the TX taskqueue mode.
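 * Unknown modes fall back to the independent (per-ring) mode.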
7712 */ 7713 switch (hn_tx_taskq_mode) { 7714 case HN_TX_TASKQ_M_INDEP: 7715 case HN_TX_TASKQ_M_GLOBAL: 7716 case HN_TX_TASKQ_M_EVTTQ: 7717 break; 7718 default: 7719 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7720 break; 7721 } 7722 7723 if (vm_guest != VM_GUEST_HV) 7724 return; 7725 7726 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7727 return; 7728 7729 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7730 M_DEVBUF, M_WAITOK); 7731 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7732 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7733 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7734 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7735 "hn tx%d", i); 7736 } 7737 } 7738 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7739 7740 static void 7741 hn_sysuninit(void *arg __unused) 7742 { 7743 7744 if (hn_tx_taskque != NULL) { 7745 int i; 7746 7747 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7748 taskqueue_free(hn_tx_taskque[i]); 7749 free(hn_tx_taskque, M_DEVBUF); 7750 } 7751 7752 if (hn_vfmap != NULL) 7753 free(hn_vfmap, M_DEVBUF); 7754 rm_destroy(&hn_vfmap_lock); 7755 7756 counter_u64_free(hn_udpcs_fixup); 7757 } 7758 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7759