1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <machine/atomic.h> 89 #include <machine/in_cksum.h> 90 91 #include <net/bpf.h> 92 #include <net/ethernet.h> 93 #include <net/if.h> 94 #include <net/if_dl.h> 95 #include <net/if_media.h> 96 #include <net/if_types.h> 97 #include <net/if_var.h> 98 #include <net/rndis.h> 99 #ifdef RSS 100 #include <net/rss_config.h> 101 #endif 102 103 #include <netinet/in_systm.h> 104 #include <netinet/in.h> 105 #include <netinet/ip.h> 106 #include <netinet/ip6.h> 107 #include <netinet/tcp.h> 108 #include <netinet/tcp_lro.h> 109 #include <netinet/udp.h> 110 111 #include <dev/hyperv/include/hyperv.h> 112 #include <dev/hyperv/include/hyperv_busdma.h> 113 #include <dev/hyperv/include/vmbus.h> 114 #include <dev/hyperv/include/vmbus_xact.h> 115 116 #include <dev/hyperv/netvsc/ndis.h> 117 #include <dev/hyperv/netvsc/if_hnreg.h> 118 #include <dev/hyperv/netvsc/if_hnvar.h> 119 #include <dev/hyperv/netvsc/hn_nvs.h> 120 #include <dev/hyperv/netvsc/hn_rndis.h> 121 122 #include "vmbus_if.h" 123 124 #define HN_IFSTART_SUPPORT 125 126 #define HN_RING_CNT_DEF_MAX 8 127 128 #define HN_VFMAP_SIZE_DEF 8 129 130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 131 132 /* YYY should get it from the underlying channel */ 133 #define HN_TX_DESC_CNT 512 134 135 #define HN_RNDIS_PKT_LEN \ 136 (sizeof(struct rndis_packet_msg) + \ 137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 143 144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 147 /* -1 for RNDIS packet message */ 148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 149 150 #define HN_DIRECT_TX_SIZE_DEF 128 151 152 #define HN_EARLY_TXEOF_THRESH 8 153 154 #define HN_PKTBUF_LEN_DEF (16 * 1024) 155 156 #define HN_LROENT_CNT_DEF 128 157 158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 160 /* YYY 2*MTU is a bit rough, but should be good enough. 
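 * hn_mtu_change_fixup() raises each RX ring's lro_length_lim to at least
 * this value after an MTU change.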
*/ 161 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 162 163 #define HN_LRO_ACKCNT_DEF 1 164 165 #define HN_LOCK_INIT(sc) \ 166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 169 #define HN_LOCK(sc) \ 170 do { \ 171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 172 /* Relinquish cpu to avoid deadlock */ \ 173 sched_relinquish(curthread); \ 174 DELAY(1000); \ 175 } \ 176 } while (0) 177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 178 179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 181 #define HN_CSUM_IP_HWASSIST(sc) \ 182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 183 #define HN_CSUM_IP6_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 185 186 #define HN_PKTSIZE_MIN(align) \ 187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 188 HN_RNDIS_PKT_LEN, (align)) 189 #define HN_PKTSIZE(m, align) \ 190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 191 192 #ifdef RSS 193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 194 #else 195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 196 #endif 197 198 struct hn_txdesc { 199 #ifndef HN_USE_TXDESC_BUFRING 200 SLIST_ENTRY(hn_txdesc) link; 201 #endif 202 STAILQ_ENTRY(hn_txdesc) agg_link; 203 204 /* Aggregated txdescs, in sending order. */ 205 STAILQ_HEAD(, hn_txdesc) agg_list; 206 207 /* The oldest packet, if transmission aggregation happens. */ 208 struct mbuf *m; 209 struct hn_tx_ring *txr; 210 int refs; 211 uint32_t flags; /* HN_TXD_FLAG_ */ 212 struct hn_nvs_sendctx send_ctx; 213 uint32_t chim_index; 214 int chim_size; 215 216 bus_dmamap_t data_dmap; 217 218 bus_addr_t rndis_pkt_paddr; 219 struct rndis_packet_msg *rndis_pkt; 220 bus_dmamap_t rndis_pkt_dmap; 221 }; 222 223 #define HN_TXD_FLAG_ONLIST 0x0001 224 #define HN_TXD_FLAG_DMAMAP 0x0002 225 #define HN_TXD_FLAG_ONAGG 0x0004 226 227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 230 231 struct packet_info_id { 232 uint8_t ver; 233 uint8_t flag; 234 uint16_t pkt_id; 235 }; 236 237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 238 239 240 struct hn_rxinfo { 241 const uint32_t *vlan_info; 242 const uint32_t *csum_info; 243 const uint32_t *hash_info; 244 const uint32_t *hash_value; 245 const struct packet_info_id *pktinfo_id; 246 }; 247 248 struct hn_rxvf_setarg { 249 struct hn_rx_ring *rxr; 250 struct ifnet *vf_ifp; 251 }; 252 253 #define HN_RXINFO_VLAN 0x0001 254 #define HN_RXINFO_CSUM 0x0002 255 #define HN_RXINFO_HASHINF 0x0004 256 #define HN_RXINFO_HASHVAL 0x0008 257 #define HN_RXINFO_PKTINFO_ID 0x0010 258 #define HN_RXINFO_ALL \ 259 (HN_RXINFO_VLAN | \ 260 HN_RXINFO_CSUM | \ 261 HN_RXINFO_HASHINF | \ 262 HN_RXINFO_HASHVAL | \ 263 HN_RXINFO_PKTINFO_ID) 264 265 static int hn_probe(device_t); 266 static int hn_attach(device_t); 267 static int hn_detach(device_t); 268 static int hn_shutdown(device_t); 269 static void hn_chan_callback(struct vmbus_channel *, 270 void *); 271 272 static void hn_init(void *); 273 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 274 #ifdef HN_IFSTART_SUPPORT 275 static void hn_start(struct ifnet *); 276 #endif 277 static int hn_transmit(struct ifnet *, struct mbuf *); 278 static void hn_xmit_qflush(struct ifnet *); 279 static int hn_ifmedia_upd(struct 
ifnet *); 280 static void hn_ifmedia_sts(struct ifnet *, 281 struct ifmediareq *); 282 283 static void hn_ifnet_event(void *, struct ifnet *, int); 284 static void hn_ifaddr_event(void *, struct ifnet *); 285 static void hn_ifnet_attevent(void *, struct ifnet *); 286 static void hn_ifnet_detevent(void *, struct ifnet *); 287 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 288 289 static bool hn_ismyvf(const struct hn_softc *, 290 const struct ifnet *); 291 static void hn_rxvf_change(struct hn_softc *, 292 struct ifnet *, bool); 293 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 294 static void hn_rxvf_set_task(void *, int); 295 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 298 struct ifreq *); 299 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 300 static bool hn_xpnt_vf_isready(struct hn_softc *); 301 static void hn_xpnt_vf_setready(struct hn_softc *); 302 static void hn_xpnt_vf_init_taskfunc(void *, int); 303 static void hn_xpnt_vf_init(struct hn_softc *); 304 static void hn_xpnt_vf_setenable(struct hn_softc *); 305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 306 static void hn_vf_rss_fixup(struct hn_softc *, bool); 307 static void hn_vf_rss_restore(struct hn_softc *); 308 309 static int hn_rndis_rxinfo(const void *, int, 310 struct hn_rxinfo *); 311 static void hn_rndis_rx_data(struct hn_rx_ring *, 312 const void *, int); 313 static void hn_rndis_rx_status(struct hn_softc *, 314 const void *, int); 315 static void hn_rndis_init_fixat(struct hn_softc *, int); 316 317 static void hn_nvs_handle_notify(struct hn_softc *, 318 const struct vmbus_chanpkt_hdr *); 319 static void hn_nvs_handle_comp(struct hn_softc *, 320 struct vmbus_channel *, 321 const struct vmbus_chanpkt_hdr *); 322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 323 struct vmbus_channel *, 324 const struct vmbus_chanpkt_hdr *); 325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 326 struct vmbus_channel *, uint64_t); 327 328 #if __FreeBSD_version >= 1100099 329 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 331 #endif 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 #if __FreeBSD_version < 1100095 335 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 336 #else 337 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 338 #endif 339 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 346 #ifndef RSS 347 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 349 #endif 350 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 357 static int 
hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 361 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 362 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 363 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 364 365 static void hn_stop(struct hn_softc *, bool); 366 static void hn_init_locked(struct hn_softc *); 367 static int hn_chan_attach(struct hn_softc *, 368 struct vmbus_channel *); 369 static void hn_chan_detach(struct hn_softc *, 370 struct vmbus_channel *); 371 static int hn_attach_subchans(struct hn_softc *); 372 static void hn_detach_allchans(struct hn_softc *); 373 static void hn_chan_rollup(struct hn_rx_ring *, 374 struct hn_tx_ring *); 375 static void hn_set_ring_inuse(struct hn_softc *, int); 376 static int hn_synth_attach(struct hn_softc *, int); 377 static void hn_synth_detach(struct hn_softc *); 378 static int hn_synth_alloc_subchans(struct hn_softc *, 379 int *); 380 static bool hn_synth_attachable(const struct hn_softc *); 381 static void hn_suspend(struct hn_softc *); 382 static void hn_suspend_data(struct hn_softc *); 383 static void hn_suspend_mgmt(struct hn_softc *); 384 static void hn_resume(struct hn_softc *); 385 static void hn_resume_data(struct hn_softc *); 386 static void hn_resume_mgmt(struct hn_softc *); 387 static void hn_suspend_mgmt_taskfunc(void *, int); 388 static void hn_chan_drain(struct hn_softc *, 389 struct vmbus_channel *); 390 static void hn_disable_rx(struct hn_softc *); 391 static void hn_drain_rxtx(struct hn_softc *, int); 392 static void hn_polling(struct hn_softc *, u_int); 393 static void hn_chan_polling(struct vmbus_channel *, u_int); 394 static void hn_mtu_change_fixup(struct hn_softc *); 395 396 static void hn_update_link_status(struct hn_softc *); 397 static void hn_change_network(struct hn_softc *); 398 static void hn_link_taskfunc(void *, int); 399 static void hn_netchg_init_taskfunc(void *, int); 400 static void hn_netchg_status_taskfunc(void *, int); 401 static void hn_link_status(struct hn_softc *); 402 403 static int hn_create_rx_data(struct hn_softc *, int); 404 static void hn_destroy_rx_data(struct hn_softc *); 405 static int hn_check_iplen(const struct mbuf *, int); 406 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 407 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 408 static int hn_rxfilter_config(struct hn_softc *); 409 static int hn_rss_reconfig(struct hn_softc *); 410 static void hn_rss_ind_fixup(struct hn_softc *); 411 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 412 static int hn_rxpkt(struct hn_rx_ring *); 413 static uint32_t hn_rss_type_fromndis(uint32_t); 414 static uint32_t hn_rss_type_tondis(uint32_t); 415 416 static int hn_tx_ring_create(struct hn_softc *, int); 417 static void hn_tx_ring_destroy(struct hn_tx_ring *); 418 static int hn_create_tx_data(struct hn_softc *, int); 419 static void hn_fixup_tx_data(struct hn_softc *); 420 static void hn_fixup_rx_data(struct hn_softc *); 421 static void hn_destroy_tx_data(struct hn_softc *); 422 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 423 static void hn_txdesc_gc(struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 426 struct hn_txdesc *, struct mbuf **); 427 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 428 struct hn_txdesc *); 429 static void hn_set_chim_size(struct hn_softc *, int); 430 
static void hn_set_tso_maxsize(struct hn_softc *, int, int); 431 static bool hn_tx_ring_pending(struct hn_tx_ring *); 432 static void hn_tx_ring_qflush(struct hn_tx_ring *); 433 static void hn_resume_tx(struct hn_softc *, int); 434 static void hn_set_txagg(struct hn_softc *); 435 static void *hn_try_txagg(struct ifnet *, 436 struct hn_tx_ring *, struct hn_txdesc *, 437 int); 438 static int hn_get_txswq_depth(const struct hn_tx_ring *); 439 static void hn_txpkt_done(struct hn_nvs_sendctx *, 440 struct hn_softc *, struct vmbus_channel *, 441 const void *, int); 442 static int hn_txpkt_sglist(struct hn_tx_ring *, 443 struct hn_txdesc *); 444 static int hn_txpkt_chim(struct hn_tx_ring *, 445 struct hn_txdesc *); 446 static int hn_xmit(struct hn_tx_ring *, int); 447 static void hn_xmit_taskfunc(void *, int); 448 static void hn_xmit_txeof(struct hn_tx_ring *); 449 static void hn_xmit_txeof_taskfunc(void *, int); 450 #ifdef HN_IFSTART_SUPPORT 451 static int hn_start_locked(struct hn_tx_ring *, int); 452 static void hn_start_taskfunc(void *, int); 453 static void hn_start_txeof(struct hn_tx_ring *); 454 static void hn_start_txeof_taskfunc(void *, int); 455 #endif 456 457 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 458 459 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 460 "Hyper-V network interface"); 461 462 /* Trust tcp segment verification on host side. */ 463 static int hn_trust_hosttcp = 1; 464 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 465 &hn_trust_hosttcp, 0, 466 "Trust tcp segment verification on host side, " 467 "when csum info is missing (global setting)"); 468 469 /* Trust udp datagrams verification on host side. */ 470 static int hn_trust_hostudp = 1; 471 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 472 &hn_trust_hostudp, 0, 473 "Trust udp datagram verification on host side, " 474 "when csum info is missing (global setting)"); 475 476 /* Trust ip packets verification on host side. */ 477 static int hn_trust_hostip = 1; 478 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 479 &hn_trust_hostip, 0, 480 "Trust ip packet verification on host side, " 481 "when csum info is missing (global setting)"); 482 483 /* 484 * Offload UDP/IPv4 checksum. 485 */ 486 static int hn_enable_udp4cs = 1; 487 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 488 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 489 490 /* 491 * Offload UDP/IPv6 checksum. 492 */ 493 static int hn_enable_udp6cs = 1; 494 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 495 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 496 497 /* Stats. */ 498 static counter_u64_t hn_udpcs_fixup; 499 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 500 &hn_udpcs_fixup, "# of UDP checksum fixup"); 501 502 /* 503 * See hn_set_hlen(). 504 * 505 * This value is for Azure. For Hyper-V, set this above 506 * 65536 to disable UDP datagram checksum fixup. 
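 * (A single datagram can never exceed IP_MAXPACKET (65535) bytes, so a
 * threshold above 65536 makes the length check in hn_set_hlen() always
 * false.)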
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn,
    OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 712 713 for (i = 0; i < bmap_cnt; ++i) { 714 int idx; 715 716 idx = ffsl(~bmap[i]); 717 if (idx == 0) 718 continue; 719 720 --idx; /* ffsl is 1-based */ 721 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 722 ("invalid i %d and idx %d", i, idx)); 723 724 if (atomic_testandset_long(&bmap[i], idx)) 725 continue; 726 727 ret = i * LONG_BIT + idx; 728 break; 729 } 730 return (ret); 731 } 732 733 static __inline void 734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 735 { 736 u_long mask; 737 uint32_t idx; 738 739 idx = chim_idx / LONG_BIT; 740 KASSERT(idx < sc->hn_chim_bmap_cnt, 741 ("invalid chimney index 0x%x", chim_idx)); 742 743 mask = 1UL << (chim_idx % LONG_BIT); 744 KASSERT(sc->hn_chim_bmap[idx] & mask, 745 ("index bitmap 0x%lx, chimney index %u, " 746 "bitmap idx %d, bitmask 0x%lx", 747 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 748 749 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 750 } 751 752 #if defined(INET6) || defined(INET) 753 754 #define PULLUP_HDR(m, len) \ 755 do { \ 756 if (__predict_false((m)->m_len < (len))) { \ 757 (m) = m_pullup((m), (len)); \ 758 if ((m) == NULL) \ 759 return (NULL); \ 760 } \ 761 } while (0) 762 763 /* 764 * NOTE: If this function failed, the m_head would be freed. 765 */ 766 static __inline struct mbuf * 767 hn_tso_fixup(struct mbuf *m_head) 768 { 769 struct ether_vlan_header *evl; 770 struct tcphdr *th; 771 int ehlen; 772 773 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 774 775 PULLUP_HDR(m_head, sizeof(*evl)); 776 evl = mtod(m_head, struct ether_vlan_header *); 777 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 778 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 779 else 780 ehlen = ETHER_HDR_LEN; 781 m_head->m_pkthdr.l2hlen = ehlen; 782 783 #ifdef INET 784 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 785 struct ip *ip; 786 int iphlen; 787 788 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 789 ip = mtodo(m_head, ehlen); 790 iphlen = ip->ip_hl << 2; 791 m_head->m_pkthdr.l3hlen = iphlen; 792 793 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 794 th = mtodo(m_head, ehlen + iphlen); 795 796 ip->ip_len = 0; 797 ip->ip_sum = 0; 798 th->th_sum = in_pseudo(ip->ip_src.s_addr, 799 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 800 } 801 #endif 802 #if defined(INET6) && defined(INET) 803 else 804 #endif 805 #ifdef INET6 806 { 807 struct ip6_hdr *ip6; 808 809 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 810 ip6 = mtodo(m_head, ehlen); 811 if (ip6->ip6_nxt != IPPROTO_TCP) { 812 m_freem(m_head); 813 return (NULL); 814 } 815 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 816 817 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 818 th = mtodo(m_head, ehlen + sizeof(*ip6)); 819 820 ip6->ip6_plen = 0; 821 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 822 } 823 #endif 824 return (m_head); 825 } 826 827 /* 828 * NOTE: If this function failed, the m_head would be freed. 
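 * (PULLUP_HDR() bails out with NULL once m_pullup() has freed the chain,
 * and the IPv6 branch frees packets whose next header is neither TCP nor
 * UDP.)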
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
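	 * (The VF shares the synthetic NIC's MAC address, cf. hn_ismyvf(),
	 * so accepting everything here is the safe superset of whatever
	 * filter the VF has programmed.)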
947 */ 948 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 949 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 950 } else { 951 filter = NDIS_PACKET_TYPE_DIRECTED; 952 if (ifp->if_flags & IFF_BROADCAST) 953 filter |= NDIS_PACKET_TYPE_BROADCAST; 954 /* TODO: support multicast list */ 955 if ((ifp->if_flags & IFF_ALLMULTI) || 956 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 957 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 958 } 959 return (hn_set_rxfilter(sc, filter)); 960 } 961 962 static void 963 hn_set_txagg(struct hn_softc *sc) 964 { 965 uint32_t size, pkts; 966 int i; 967 968 /* 969 * Setup aggregation size. 970 */ 971 if (sc->hn_agg_size < 0) 972 size = UINT32_MAX; 973 else 974 size = sc->hn_agg_size; 975 976 if (sc->hn_rndis_agg_size < size) 977 size = sc->hn_rndis_agg_size; 978 979 /* NOTE: We only aggregate packets using chimney sending buffers. */ 980 if (size > (uint32_t)sc->hn_chim_szmax) 981 size = sc->hn_chim_szmax; 982 983 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 984 /* Disable */ 985 size = 0; 986 pkts = 0; 987 goto done; 988 } 989 990 /* NOTE: Type of the per TX ring setting is 'int'. */ 991 if (size > INT_MAX) 992 size = INT_MAX; 993 994 /* 995 * Setup aggregation packet count. 996 */ 997 if (sc->hn_agg_pkts < 0) 998 pkts = UINT32_MAX; 999 else 1000 pkts = sc->hn_agg_pkts; 1001 1002 if (sc->hn_rndis_agg_pkts < pkts) 1003 pkts = sc->hn_rndis_agg_pkts; 1004 1005 if (pkts <= 1) { 1006 /* Disable */ 1007 size = 0; 1008 pkts = 0; 1009 goto done; 1010 } 1011 1012 /* NOTE: Type of the per TX ring setting is 'short'. */ 1013 if (pkts > SHRT_MAX) 1014 pkts = SHRT_MAX; 1015 1016 done: 1017 /* NOTE: Type of the per TX ring setting is 'short'. */ 1018 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1019 /* Disable */ 1020 size = 0; 1021 pkts = 0; 1022 } 1023 1024 if (bootverbose) { 1025 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1026 size, pkts, sc->hn_rndis_agg_align); 1027 } 1028 1029 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1030 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1031 1032 mtx_lock(&txr->hn_tx_lock); 1033 txr->hn_agg_szmax = size; 1034 txr->hn_agg_pktmax = pkts; 1035 txr->hn_agg_align = sc->hn_rndis_agg_align; 1036 mtx_unlock(&txr->hn_tx_lock); 1037 } 1038 } 1039 1040 static int 1041 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1042 { 1043 1044 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1045 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1046 return txr->hn_txdesc_cnt; 1047 return hn_tx_swq_depth; 1048 } 1049 1050 static int 1051 hn_rss_reconfig(struct hn_softc *sc) 1052 { 1053 int error; 1054 1055 HN_LOCK_ASSERT(sc); 1056 1057 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1058 return (ENXIO); 1059 1060 /* 1061 * Disable RSS first. 1062 * 1063 * NOTE: 1064 * Direct reconfiguration by setting the UNCHG flags does 1065 * _not_ work properly. 1066 */ 1067 if (bootverbose) 1068 if_printf(sc->hn_ifp, "disable RSS\n"); 1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1070 if (error) { 1071 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1072 return (error); 1073 } 1074 1075 /* 1076 * Reenable the RSS w/ the updated RSS key or indirect 1077 * table. 
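	 * (hn_rndis_conf_rss() is called again below, this time with
	 * NDIS_RSS_FLAG_NONE instead of NDIS_RSS_FLAG_DISABLE.)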
1078 */ 1079 if (bootverbose) 1080 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1081 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1082 if (error) { 1083 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1084 return (error); 1085 } 1086 return (0); 1087 } 1088 1089 static void 1090 hn_rss_ind_fixup(struct hn_softc *sc) 1091 { 1092 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1093 int i, nchan; 1094 1095 nchan = sc->hn_rx_ring_inuse; 1096 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1097 1098 /* 1099 * Check indirect table to make sure that all channels in it 1100 * can be used. 1101 */ 1102 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1103 if (rss->rss_ind[i] >= nchan) { 1104 if_printf(sc->hn_ifp, 1105 "RSS indirect table %d fixup: %u -> %d\n", 1106 i, rss->rss_ind[i], nchan - 1); 1107 rss->rss_ind[i] = nchan - 1; 1108 } 1109 } 1110 } 1111 1112 static int 1113 hn_ifmedia_upd(struct ifnet *ifp __unused) 1114 { 1115 1116 return EOPNOTSUPP; 1117 } 1118 1119 static void 1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1121 { 1122 struct hn_softc *sc = ifp->if_softc; 1123 1124 ifmr->ifm_status = IFM_AVALID; 1125 ifmr->ifm_active = IFM_ETHER; 1126 1127 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1128 ifmr->ifm_active |= IFM_NONE; 1129 return; 1130 } 1131 ifmr->ifm_status |= IFM_ACTIVE; 1132 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1133 } 1134 1135 static void 1136 hn_rxvf_set_task(void *xarg, int pending __unused) 1137 { 1138 struct hn_rxvf_setarg *arg = xarg; 1139 1140 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1141 } 1142 1143 static void 1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1145 { 1146 struct hn_rx_ring *rxr; 1147 struct hn_rxvf_setarg arg; 1148 struct task task; 1149 int i; 1150 1151 HN_LOCK_ASSERT(sc); 1152 1153 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1154 1155 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1156 rxr = &sc->hn_rx_ring[i]; 1157 1158 if (i < sc->hn_rx_ring_inuse) { 1159 arg.rxr = rxr; 1160 arg.vf_ifp = vf_ifp; 1161 vmbus_chan_run_task(rxr->hn_chan, &task); 1162 } else { 1163 rxr->hn_rxvf_ifp = vf_ifp; 1164 } 1165 } 1166 } 1167 1168 static bool 1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1170 { 1171 const struct ifnet *hn_ifp; 1172 1173 hn_ifp = sc->hn_ifp; 1174 1175 if (ifp == hn_ifp) 1176 return (false); 1177 1178 if (ifp->if_alloctype != IFT_ETHER) 1179 return (false); 1180 1181 /* Ignore lagg/vlan interfaces */ 1182 if (strcmp(ifp->if_dname, "lagg") == 0 || 1183 strcmp(ifp->if_dname, "vlan") == 0) 1184 return (false); 1185 1186 /* 1187 * During detach events ifp->if_addr might be NULL. 
1188 * Make sure the bcmp() below doesn't panic on that: 1189 */ 1190 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1191 return (false); 1192 1193 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1194 return (false); 1195 1196 return (true); 1197 } 1198 1199 static void 1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1201 { 1202 struct ifnet *hn_ifp; 1203 1204 HN_LOCK(sc); 1205 1206 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1207 goto out; 1208 1209 if (!hn_ismyvf(sc, ifp)) 1210 goto out; 1211 hn_ifp = sc->hn_ifp; 1212 1213 if (rxvf) { 1214 if (sc->hn_flags & HN_FLAG_RXVF) 1215 goto out; 1216 1217 sc->hn_flags |= HN_FLAG_RXVF; 1218 hn_rxfilter_config(sc); 1219 } else { 1220 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1221 goto out; 1222 1223 sc->hn_flags &= ~HN_FLAG_RXVF; 1224 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1225 hn_rxfilter_config(sc); 1226 else 1227 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1228 } 1229 1230 hn_nvs_set_datapath(sc, 1231 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1232 1233 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1234 1235 if (rxvf) { 1236 hn_vf_rss_fixup(sc, true); 1237 hn_suspend_mgmt(sc); 1238 sc->hn_link_flags &= 1239 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1240 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1241 } else { 1242 hn_vf_rss_restore(sc); 1243 hn_resume_mgmt(sc); 1244 } 1245 1246 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1247 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1248 1249 if (bootverbose) { 1250 if_printf(hn_ifp, "datapath is switched %s %s\n", 1251 rxvf ? "to" : "from", ifp->if_xname); 1252 } 1253 out: 1254 HN_UNLOCK(sc); 1255 } 1256 1257 static void 1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1259 { 1260 1261 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1262 return; 1263 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1264 } 1265 1266 static void 1267 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1268 { 1269 1270 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1271 } 1272 1273 static int 1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1275 { 1276 struct ifnet *ifp, *vf_ifp; 1277 uint64_t tmp; 1278 int error; 1279 1280 HN_LOCK_ASSERT(sc); 1281 ifp = sc->hn_ifp; 1282 vf_ifp = sc->hn_vf_ifp; 1283 1284 /* 1285 * Fix up requested capabilities w/ supported capabilities, 1286 * since the supported capabilities could have been changed. 1287 */ 1288 ifr->ifr_reqcap &= ifp->if_capabilities; 1289 /* Pass SIOCSIFCAP to VF. */ 1290 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1291 1292 /* 1293 * NOTE: 1294 * The error will be propagated to the callers, however, it 1295 * is _not_ useful here. 1296 */ 1297 1298 /* 1299 * Merge VF's enabled capabilities. 
1300 */ 1301 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; 1302 1303 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); 1304 if (ifp->if_capenable & IFCAP_TXCSUM) 1305 ifp->if_hwassist |= tmp; 1306 else 1307 ifp->if_hwassist &= ~tmp; 1308 1309 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); 1310 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 1311 ifp->if_hwassist |= tmp; 1312 else 1313 ifp->if_hwassist &= ~tmp; 1314 1315 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; 1316 if (ifp->if_capenable & IFCAP_TSO4) 1317 ifp->if_hwassist |= tmp; 1318 else 1319 ifp->if_hwassist &= ~tmp; 1320 1321 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; 1322 if (ifp->if_capenable & IFCAP_TSO6) 1323 ifp->if_hwassist |= tmp; 1324 else 1325 ifp->if_hwassist &= ~tmp; 1326 1327 return (error); 1328 } 1329 1330 static int 1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1332 { 1333 struct ifnet *vf_ifp; 1334 struct ifreq ifr; 1335 1336 HN_LOCK_ASSERT(sc); 1337 vf_ifp = sc->hn_vf_ifp; 1338 1339 memset(&ifr, 0, sizeof(ifr)); 1340 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1341 ifr.ifr_flags = vf_ifp->if_flags & 0xffff; 1342 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; 1343 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); 1344 } 1345 1346 static void 1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1348 { 1349 struct ifnet *ifp = sc->hn_ifp; 1350 int allmulti = 0; 1351 1352 HN_LOCK_ASSERT(sc); 1353 1354 /* XXX vlan(4) style mcast addr maintenance */ 1355 if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 1356 allmulti = IFF_ALLMULTI; 1357 1358 /* Always set the VF's if_flags */ 1359 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; 1360 } 1361 1362 static void 1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) 1364 { 1365 struct rm_priotracker pt; 1366 struct ifnet *hn_ifp = NULL; 1367 struct mbuf *mn; 1368 1369 /* 1370 * XXX racy, if hn(4) ever detached. 1371 */ 1372 rm_rlock(&hn_vfmap_lock, &pt); 1373 if (vf_ifp->if_index < hn_vfmap_size) 1374 hn_ifp = hn_vfmap[vf_ifp->if_index]; 1375 rm_runlock(&hn_vfmap_lock, &pt); 1376 1377 if (hn_ifp != NULL) { 1378 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1379 /* 1380 * Allow tapping on the VF. 1381 */ 1382 ETHER_BPF_MTAP(vf_ifp, mn); 1383 1384 /* 1385 * Update VF stats. 1386 */ 1387 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { 1388 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1389 mn->m_pkthdr.len); 1390 } 1391 /* 1392 * XXX IFCOUNTER_IMCAST 1393 * This stat updating is kinda invasive, since it 1394 * requires two checks on the mbuf: the length check 1395 * and the ethernet header check. As of this write, 1396 * all multicast packets go directly to hn(4), which 1397 * makes imcast stat updating in the VF a try in vian. 1398 */ 1399 1400 /* 1401 * Fix up rcvif and increase hn(4)'s ipackets. 1402 */ 1403 mn->m_pkthdr.rcvif = hn_ifp; 1404 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1405 } 1406 /* 1407 * Go through hn(4)'s if_input. 1408 */ 1409 hn_ifp->if_input(hn_ifp, m); 1410 } else { 1411 /* 1412 * In the middle of the transition; free this 1413 * mbuf chain. 
1414 */ 1415 while (m != NULL) { 1416 mn = m->m_nextpkt; 1417 m->m_nextpkt = NULL; 1418 m_freem(m); 1419 m = mn; 1420 } 1421 } 1422 } 1423 1424 static void 1425 hn_mtu_change_fixup(struct hn_softc *sc) 1426 { 1427 struct ifnet *ifp; 1428 1429 HN_LOCK_ASSERT(sc); 1430 ifp = sc->hn_ifp; 1431 1432 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1433 #if __FreeBSD_version >= 1100099 1434 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1435 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1436 #endif 1437 } 1438 1439 static uint32_t 1440 hn_rss_type_fromndis(uint32_t rss_hash) 1441 { 1442 uint32_t types = 0; 1443 1444 if (rss_hash & NDIS_HASH_IPV4) 1445 types |= RSS_TYPE_IPV4; 1446 if (rss_hash & NDIS_HASH_TCP_IPV4) 1447 types |= RSS_TYPE_TCP_IPV4; 1448 if (rss_hash & NDIS_HASH_IPV6) 1449 types |= RSS_TYPE_IPV6; 1450 if (rss_hash & NDIS_HASH_IPV6_EX) 1451 types |= RSS_TYPE_IPV6_EX; 1452 if (rss_hash & NDIS_HASH_TCP_IPV6) 1453 types |= RSS_TYPE_TCP_IPV6; 1454 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1455 types |= RSS_TYPE_TCP_IPV6_EX; 1456 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1457 types |= RSS_TYPE_UDP_IPV4; 1458 return (types); 1459 } 1460 1461 static uint32_t 1462 hn_rss_type_tondis(uint32_t types) 1463 { 1464 uint32_t rss_hash = 0; 1465 1466 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1467 ("UDP6 and UDP6EX are not supported")); 1468 1469 if (types & RSS_TYPE_IPV4) 1470 rss_hash |= NDIS_HASH_IPV4; 1471 if (types & RSS_TYPE_TCP_IPV4) 1472 rss_hash |= NDIS_HASH_TCP_IPV4; 1473 if (types & RSS_TYPE_IPV6) 1474 rss_hash |= NDIS_HASH_IPV6; 1475 if (types & RSS_TYPE_IPV6_EX) 1476 rss_hash |= NDIS_HASH_IPV6_EX; 1477 if (types & RSS_TYPE_TCP_IPV6) 1478 rss_hash |= NDIS_HASH_TCP_IPV6; 1479 if (types & RSS_TYPE_TCP_IPV6_EX) 1480 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1481 if (types & RSS_TYPE_UDP_IPV4) 1482 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1483 return (rss_hash); 1484 } 1485 1486 static void 1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1488 { 1489 int i; 1490 1491 HN_LOCK_ASSERT(sc); 1492 1493 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1494 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1495 } 1496 1497 static void 1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1499 { 1500 struct ifnet *ifp, *vf_ifp; 1501 struct ifrsshash ifrh; 1502 struct ifrsskey ifrk; 1503 int error; 1504 uint32_t my_types, diff_types, mbuf_types = 0; 1505 1506 HN_LOCK_ASSERT(sc); 1507 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1508 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1509 1510 if (sc->hn_rx_ring_inuse == 1) { 1511 /* No RSS on synthetic parts; done. */ 1512 return; 1513 } 1514 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1515 /* Synthetic parts do not support Toeplitz; done. */ 1516 return; 1517 } 1518 1519 ifp = sc->hn_ifp; 1520 vf_ifp = sc->hn_vf_ifp; 1521 1522 /* 1523 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1524 * supported. 
1525 */ 1526 memset(&ifrk, 0, sizeof(ifrk)); 1527 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); 1528 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); 1529 if (error) { 1530 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", 1531 vf_ifp->if_xname, error); 1532 goto done; 1533 } 1534 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1535 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1536 vf_ifp->if_xname, ifrk.ifrk_func); 1537 goto done; 1538 } 1539 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1540 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1541 vf_ifp->if_xname, ifrk.ifrk_keylen); 1542 goto done; 1543 } 1544 1545 /* 1546 * Extract VF's RSS hash. Only Toeplitz is supported. 1547 */ 1548 memset(&ifrh, 0, sizeof(ifrh)); 1549 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name)); 1550 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh); 1551 if (error) { 1552 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", 1553 vf_ifp->if_xname, error); 1554 goto done; 1555 } 1556 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1557 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1558 vf_ifp->if_xname, ifrh.ifrh_func); 1559 goto done; 1560 } 1561 1562 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1563 if ((ifrh.ifrh_types & my_types) == 0) { 1564 /* This disables RSS; ignore it then */ 1565 if_printf(ifp, "%s intersection of RSS types failed. " 1566 "VF %#x, mine %#x\n", vf_ifp->if_xname, 1567 ifrh.ifrh_types, my_types); 1568 goto done; 1569 } 1570 1571 diff_types = my_types ^ ifrh.ifrh_types; 1572 my_types &= ifrh.ifrh_types; 1573 mbuf_types = my_types; 1574 1575 /* 1576 * Detect RSS hash value/type confliction. 1577 * 1578 * NOTE: 1579 * We don't disable the hash type, but stop delivery the hash 1580 * value/type through mbufs on RX path. 1581 * 1582 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple 1583 * hash is delivered with type of TCP_IPV4. This means if 1584 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at 1585 * least to hn_mbuf_hash. However, given that _all_ of the 1586 * NICs implement TCP_IPV4, this will _not_ impose any issues 1587 * here. 1588 */ 1589 if ((my_types & RSS_TYPE_IPV4) && 1590 (diff_types & ifrh.ifrh_types & 1591 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1592 /* Conflict; disable IPV4 hash type/value delivery. */ 1593 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1594 mbuf_types &= ~RSS_TYPE_IPV4; 1595 } 1596 if ((my_types & RSS_TYPE_IPV6) && 1597 (diff_types & ifrh.ifrh_types & 1598 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1599 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1600 RSS_TYPE_IPV6_EX))) { 1601 /* Conflict; disable IPV6 hash type/value delivery. */ 1602 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1603 mbuf_types &= ~RSS_TYPE_IPV6; 1604 } 1605 if ((my_types & RSS_TYPE_IPV6_EX) && 1606 (diff_types & ifrh.ifrh_types & 1607 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1608 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1609 RSS_TYPE_IPV6))) { 1610 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1611 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1612 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1613 } 1614 if ((my_types & RSS_TYPE_TCP_IPV6) && 1615 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1616 /* Conflict; disable TCP_IPV6 hash type/value delivery. 
*/ 1617 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1618 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1619 } 1620 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1621 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1622 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1623 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1624 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1625 } 1626 if ((my_types & RSS_TYPE_UDP_IPV6) && 1627 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1628 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1629 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1630 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1631 } 1632 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1633 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1634 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1635 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1636 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1637 } 1638 1639 /* 1640 * Indirect table does not matter. 1641 */ 1642 1643 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1644 hn_rss_type_tondis(my_types); 1645 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1646 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1647 1648 if (reconf) { 1649 error = hn_rss_reconfig(sc); 1650 if (error) { 1651 /* XXX roll-back? */ 1652 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1653 /* XXX keep going. */ 1654 } 1655 } 1656 done: 1657 /* Hash deliverability for mbufs. */ 1658 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1659 } 1660 1661 static void 1662 hn_vf_rss_restore(struct hn_softc *sc) 1663 { 1664 1665 HN_LOCK_ASSERT(sc); 1666 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1667 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1668 1669 if (sc->hn_rx_ring_inuse == 1) 1670 goto done; 1671 1672 /* 1673 * Restore hash types. Key does _not_ matter. 1674 */ 1675 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1676 int error; 1677 1678 sc->hn_rss_hash = sc->hn_rss_hcap; 1679 error = hn_rss_reconfig(sc); 1680 if (error) { 1681 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1682 error); 1683 /* XXX keep going. */ 1684 } 1685 } 1686 done: 1687 /* Hash deliverability for mbufs. */ 1688 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1689 } 1690 1691 static void 1692 hn_xpnt_vf_setready(struct hn_softc *sc) 1693 { 1694 struct ifnet *ifp, *vf_ifp; 1695 struct ifreq ifr; 1696 1697 HN_LOCK_ASSERT(sc); 1698 ifp = sc->hn_ifp; 1699 vf_ifp = sc->hn_vf_ifp; 1700 1701 /* 1702 * Mark the VF ready. 1703 */ 1704 sc->hn_vf_rdytick = 0; 1705 1706 /* 1707 * Save information for restoration. 1708 */ 1709 sc->hn_saved_caps = ifp->if_capabilities; 1710 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1711 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1712 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1713 1714 /* 1715 * Intersect supported/enabled capabilities. 1716 * 1717 * NOTE: 1718 * if_hwassist is not changed here. 1719 */ 1720 ifp->if_capabilities &= vf_ifp->if_capabilities; 1721 ifp->if_capenable &= ifp->if_capabilities; 1722 1723 /* 1724 * Fix TSO settings. 1725 */ 1726 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1727 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1728 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1729 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1730 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1731 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1732 1733 /* 1734 * Change VF's enabled capabilities. 
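	 * (Propagate the merged if_capenable down to the VF through
	 * SIOCSIFCAP; see hn_xpnt_vf_iocsetcaps().)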
1735 */ 1736 memset(&ifr, 0, sizeof(ifr)); 1737 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1738 ifr.ifr_reqcap = ifp->if_capenable; 1739 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1740 1741 if (ifp->if_mtu != ETHERMTU) { 1742 int error; 1743 1744 /* 1745 * Change VF's MTU. 1746 */ 1747 memset(&ifr, 0, sizeof(ifr)); 1748 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1749 ifr.ifr_mtu = ifp->if_mtu; 1750 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1751 if (error) { 1752 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1753 vf_ifp->if_xname, ifp->if_mtu); 1754 if (ifp->if_mtu > ETHERMTU) { 1755 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1756 1757 /* 1758 * XXX 1759 * No need to adjust the synthetic parts' MTU; 1760 * failure of the adjustment will cause us 1761 * infinite headache. 1762 */ 1763 ifp->if_mtu = ETHERMTU; 1764 hn_mtu_change_fixup(sc); 1765 } 1766 } 1767 } 1768 } 1769 1770 static bool 1771 hn_xpnt_vf_isready(struct hn_softc *sc) 1772 { 1773 1774 HN_LOCK_ASSERT(sc); 1775 1776 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1777 return (false); 1778 1779 if (sc->hn_vf_rdytick == 0) 1780 return (true); 1781 1782 if (sc->hn_vf_rdytick > ticks) 1783 return (false); 1784 1785 /* Mark VF as ready. */ 1786 hn_xpnt_vf_setready(sc); 1787 return (true); 1788 } 1789 1790 static void 1791 hn_xpnt_vf_setenable(struct hn_softc *sc) 1792 { 1793 int i; 1794 1795 HN_LOCK_ASSERT(sc); 1796 1797 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1798 rm_wlock(&sc->hn_vf_lock); 1799 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1800 rm_wunlock(&sc->hn_vf_lock); 1801 1802 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1803 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1804 } 1805 1806 static void 1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1808 { 1809 int i; 1810 1811 HN_LOCK_ASSERT(sc); 1812 1813 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1814 rm_wlock(&sc->hn_vf_lock); 1815 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1816 if (clear_vf) 1817 sc->hn_vf_ifp = NULL; 1818 rm_wunlock(&sc->hn_vf_lock); 1819 1820 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1821 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1822 } 1823 1824 static void 1825 hn_xpnt_vf_init(struct hn_softc *sc) 1826 { 1827 int error; 1828 1829 HN_LOCK_ASSERT(sc); 1830 1831 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1832 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1833 1834 if (bootverbose) { 1835 if_printf(sc->hn_ifp, "try bringing up %s\n", 1836 sc->hn_vf_ifp->if_xname); 1837 } 1838 1839 /* 1840 * Bring the VF up. 1841 */ 1842 hn_xpnt_vf_saveifflags(sc); 1843 sc->hn_vf_ifp->if_flags |= IFF_UP; 1844 error = hn_xpnt_vf_iocsetflags(sc); 1845 if (error) { 1846 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1847 sc->hn_vf_ifp->if_xname, error); 1848 return; 1849 } 1850 1851 /* 1852 * NOTE: 1853 * Datapath setting must happen _after_ bringing the VF up. 1854 */ 1855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1856 1857 /* 1858 * NOTE: 1859 * Fixup RSS related bits _after_ the VF is brought up, since 1860 * many VFs generate RSS key during it's initialization. 1861 */ 1862 hn_vf_rss_fixup(sc, true); 1863 1864 /* Mark transparent mode VF as enabled. 
*/ 1865 hn_xpnt_vf_setenable(sc); 1866 } 1867 1868 static void 1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1870 { 1871 struct hn_softc *sc = xsc; 1872 1873 HN_LOCK(sc); 1874 1875 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1876 goto done; 1877 if (sc->hn_vf_ifp == NULL) 1878 goto done; 1879 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1880 goto done; 1881 1882 if (sc->hn_vf_rdytick != 0) { 1883 /* Mark VF as ready. */ 1884 hn_xpnt_vf_setready(sc); 1885 } 1886 1887 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1888 /* 1889 * Delayed VF initialization. 1890 */ 1891 if (bootverbose) { 1892 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1893 sc->hn_vf_ifp->if_xname); 1894 } 1895 hn_xpnt_vf_init(sc); 1896 } 1897 done: 1898 HN_UNLOCK(sc); 1899 } 1900 1901 static void 1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1903 { 1904 struct hn_softc *sc = xsc; 1905 1906 HN_LOCK(sc); 1907 1908 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1909 goto done; 1910 1911 if (!hn_ismyvf(sc, ifp)) 1912 goto done; 1913 1914 if (sc->hn_vf_ifp != NULL) { 1915 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1916 sc->hn_vf_ifp->if_xname); 1917 goto done; 1918 } 1919 1920 if (hn_xpnt_vf && ifp->if_start != NULL) { 1921 /* 1922 * ifnet.if_start is _not_ supported by transparent 1923 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1924 */ 1925 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1926 "in transparent VF mode.\n", ifp->if_xname); 1927 goto done; 1928 } 1929 1930 rm_wlock(&hn_vfmap_lock); 1931 1932 if (ifp->if_index >= hn_vfmap_size) { 1933 struct ifnet **newmap; 1934 int newsize; 1935 1936 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1937 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1938 M_WAITOK | M_ZERO); 1939 1940 memcpy(newmap, hn_vfmap, 1941 sizeof(struct ifnet *) * hn_vfmap_size); 1942 free(hn_vfmap, M_DEVBUF); 1943 hn_vfmap = newmap; 1944 hn_vfmap_size = newsize; 1945 } 1946 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1947 ("%s: ifindex %d was mapped to %s", 1948 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1949 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1950 1951 rm_wunlock(&hn_vfmap_lock); 1952 1953 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1954 rm_wlock(&sc->hn_vf_lock); 1955 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1956 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1957 sc->hn_vf_ifp = ifp; 1958 rm_wunlock(&sc->hn_vf_lock); 1959 1960 if (hn_xpnt_vf) { 1961 int wait_ticks; 1962 1963 /* 1964 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1965 * Save vf_ifp's current if_input for later restoration. 1966 */ 1967 sc->hn_vf_input = ifp->if_input; 1968 ifp->if_input = hn_xpnt_vf_input; 1969 1970 /* 1971 * Stop link status management; use the VF's. 1972 */ 1973 hn_suspend_mgmt(sc); 1974 1975 /* 1976 * Give VF sometime to complete its attach routing. 1977 */ 1978 wait_ticks = hn_xpnt_vf_attwait * hz; 1979 sc->hn_vf_rdytick = ticks + wait_ticks; 1980 1981 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1982 wait_ticks); 1983 } 1984 done: 1985 HN_UNLOCK(sc); 1986 } 1987 1988 static void 1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1990 { 1991 struct hn_softc *sc = xsc; 1992 1993 HN_LOCK(sc); 1994 1995 if (sc->hn_vf_ifp == NULL) 1996 goto done; 1997 1998 if (!hn_ismyvf(sc, ifp)) 1999 goto done; 2000 2001 if (hn_xpnt_vf) { 2002 /* 2003 * Make sure that the delayed initialization is not running. 
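 * (i.e. drain the hn_vf_init timeout task that hn_ifnet_attevent()
 * may have scheduled on hn_vf_taskq)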
2004 * 2005 * NOTE: 2006 * - This lock _must_ be released, since the hn_vf_init task 2007 * will try holding this lock. 2008 * - It is safe to release this lock here, since the 2009 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2010 * 2011 * XXX racy, if hn(4) ever detached. 2012 */ 2013 HN_UNLOCK(sc); 2014 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2015 HN_LOCK(sc); 2016 2017 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2018 sc->hn_ifp->if_xname)); 2019 ifp->if_input = sc->hn_vf_input; 2020 sc->hn_vf_input = NULL; 2021 2022 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2023 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2024 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2025 2026 if (sc->hn_vf_rdytick == 0) { 2027 /* 2028 * The VF was ready; restore some settings. 2029 */ 2030 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2031 /* 2032 * NOTE: 2033 * There is _no_ need to fixup if_capenable and 2034 * if_hwassist, since the if_capabilities before 2035 * restoration was an intersection of the VF's 2036 * if_capabilites and the synthetic device's 2037 * if_capabilites. 2038 */ 2039 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2040 sc->hn_ifp->if_hw_tsomaxsegcount = 2041 sc->hn_saved_tsosegcnt; 2042 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2043 } 2044 2045 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2046 /* 2047 * Restore RSS settings. 2048 */ 2049 hn_vf_rss_restore(sc); 2050 2051 /* 2052 * Resume link status management, which was suspended 2053 * by hn_ifnet_attevent(). 2054 */ 2055 hn_resume_mgmt(sc); 2056 } 2057 } 2058 2059 /* Mark transparent mode VF as disabled. */ 2060 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2061 2062 rm_wlock(&hn_vfmap_lock); 2063 2064 KASSERT(ifp->if_index < hn_vfmap_size, 2065 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2066 if (hn_vfmap[ifp->if_index] != NULL) { 2067 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2068 ("%s: ifindex %d was mapped to %s", 2069 ifp->if_xname, ifp->if_index, 2070 hn_vfmap[ifp->if_index]->if_xname)); 2071 hn_vfmap[ifp->if_index] = NULL; 2072 } 2073 2074 rm_wunlock(&hn_vfmap_lock); 2075 done: 2076 HN_UNLOCK(sc); 2077 } 2078 2079 static void 2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2081 { 2082 struct hn_softc *sc = xsc; 2083 2084 if (sc->hn_vf_ifp == ifp) 2085 if_link_state_change(sc->hn_ifp, link_state); 2086 } 2087 2088 static int 2089 hn_probe(device_t dev) 2090 { 2091 2092 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2093 device_set_desc(dev, "Hyper-V Network Interface"); 2094 return BUS_PROBE_DEFAULT; 2095 } 2096 return ENXIO; 2097 } 2098 2099 static int 2100 hn_attach(device_t dev) 2101 { 2102 struct hn_softc *sc = device_get_softc(dev); 2103 struct sysctl_oid_list *child; 2104 struct sysctl_ctx_list *ctx; 2105 uint8_t eaddr[ETHER_ADDR_LEN]; 2106 struct ifnet *ifp = NULL; 2107 int error, ring_cnt, tx_ring_cnt; 2108 uint32_t mtu; 2109 2110 sc->hn_dev = dev; 2111 sc->hn_prichan = vmbus_get_channel(dev); 2112 HN_LOCK_INIT(sc); 2113 rm_init(&sc->hn_vf_lock, "hnvf"); 2114 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2115 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2116 2117 /* 2118 * Initialize these tunables once. 2119 */ 2120 sc->hn_agg_size = hn_tx_agg_size; 2121 sc->hn_agg_pkts = hn_tx_agg_pkts; 2122 2123 /* 2124 * Setup taskqueue for transmission. 
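 *
 * Depending on the hn_tx_taskq_mode tunable the driver either creates
 * its own per-device TX taskqueues, shares the driver-global ones,
 * or (default) leaves sc->hn_tx_taskqs NULL so the TX rings fall
 * back to the VMBus channels' taskqueues.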
2125 */ 2126 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2127 int i; 2128 2129 sc->hn_tx_taskqs = 2130 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2131 M_DEVBUF, M_WAITOK); 2132 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2133 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2134 M_WAITOK, taskqueue_thread_enqueue, 2135 &sc->hn_tx_taskqs[i]); 2136 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2137 "%s tx%d", device_get_nameunit(dev), i); 2138 } 2139 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2140 sc->hn_tx_taskqs = hn_tx_taskque; 2141 } 2142 2143 /* 2144 * Setup taskqueue for mangement tasks, e.g. link status. 2145 */ 2146 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2147 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2148 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2149 device_get_nameunit(dev)); 2150 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2151 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2152 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2153 hn_netchg_status_taskfunc, sc); 2154 2155 if (hn_xpnt_vf) { 2156 /* 2157 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2158 */ 2159 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2160 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2161 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2162 device_get_nameunit(dev)); 2163 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2164 hn_xpnt_vf_init_taskfunc, sc); 2165 } 2166 2167 /* 2168 * Allocate ifnet and setup its name earlier, so that if_printf 2169 * can be used by functions, which will be called after 2170 * ether_ifattach(). 2171 */ 2172 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2173 ifp->if_softc = sc; 2174 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2175 2176 /* 2177 * Initialize ifmedia earlier so that it can be unconditionally 2178 * destroyed, if error happened later on. 2179 */ 2180 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2181 2182 /* 2183 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2184 * to use (tx_ring_cnt). 2185 * 2186 * NOTE: 2187 * The # of RX rings to use is same as the # of channels to use. 2188 */ 2189 ring_cnt = hn_chan_cnt; 2190 if (ring_cnt <= 0) { 2191 /* Default */ 2192 ring_cnt = mp_ncpus; 2193 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2194 ring_cnt = HN_RING_CNT_DEF_MAX; 2195 } else if (ring_cnt > mp_ncpus) { 2196 ring_cnt = mp_ncpus; 2197 } 2198 #ifdef RSS 2199 if (ring_cnt > rss_getnumbuckets()) 2200 ring_cnt = rss_getnumbuckets(); 2201 #endif 2202 2203 tx_ring_cnt = hn_tx_ring_cnt; 2204 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2205 tx_ring_cnt = ring_cnt; 2206 #ifdef HN_IFSTART_SUPPORT 2207 if (hn_use_if_start) { 2208 /* ifnet.if_start only needs one TX ring. */ 2209 tx_ring_cnt = 1; 2210 } 2211 #endif 2212 2213 /* 2214 * Set the leader CPU for channels. 2215 */ 2216 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2217 2218 /* 2219 * Create enough TX/RX rings, even if only limited number of 2220 * channels can be allocated. 2221 */ 2222 error = hn_create_tx_data(sc, tx_ring_cnt); 2223 if (error) 2224 goto failed; 2225 error = hn_create_rx_data(sc, ring_cnt); 2226 if (error) 2227 goto failed; 2228 2229 /* 2230 * Create transaction context for NVS and RNDIS transactions. 
2231 */ 2232 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2233 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2234 if (sc->hn_xact == NULL) { 2235 error = ENXIO; 2236 goto failed; 2237 } 2238 2239 /* 2240 * Install orphan handler for the revocation of this device's 2241 * primary channel. 2242 * 2243 * NOTE: 2244 * The processing order is critical here: 2245 * Install the orphan handler, _before_ testing whether this 2246 * device's primary channel has been revoked or not. 2247 */ 2248 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2249 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2250 error = ENXIO; 2251 goto failed; 2252 } 2253 2254 /* 2255 * Attach the synthetic parts, i.e. NVS and RNDIS. 2256 */ 2257 error = hn_synth_attach(sc, ETHERMTU); 2258 if (error) 2259 goto failed; 2260 2261 error = hn_rndis_get_eaddr(sc, eaddr); 2262 if (error) 2263 goto failed; 2264 2265 error = hn_rndis_get_mtu(sc, &mtu); 2266 if (error) 2267 mtu = ETHERMTU; 2268 else if (bootverbose) 2269 device_printf(dev, "RNDIS mtu %u\n", mtu); 2270 2271 #if __FreeBSD_version >= 1100099 2272 if (sc->hn_rx_ring_inuse > 1) { 2273 /* 2274 * Reduce TCP segment aggregation limit for multiple 2275 * RX rings to increase ACK timeliness. 2276 */ 2277 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2278 } 2279 #endif 2280 2281 /* 2282 * Fixup TX/RX stuffs after synthetic parts are attached. 2283 */ 2284 hn_fixup_tx_data(sc); 2285 hn_fixup_rx_data(sc); 2286 2287 ctx = device_get_sysctl_ctx(dev); 2288 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2289 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2290 &sc->hn_nvs_ver, 0, "NVS version"); 2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2292 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2293 hn_ndis_version_sysctl, "A", "NDIS version"); 2294 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2295 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2296 hn_caps_sysctl, "A", "capabilities"); 2297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2298 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2299 hn_hwassist_sysctl, "A", "hwassist"); 2300 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2301 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2303 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2304 "max # of TSO segments"); 2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2306 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2307 "max size of TSO segment"); 2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2309 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2310 hn_rxfilter_sysctl, "A", "rxfilter"); 2311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2312 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2313 hn_rss_hash_sysctl, "A", "RSS hash"); 2314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2315 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2316 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2320 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2321 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2322 #ifndef RSS 2323 /* 2324 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2325 */ 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2327 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2328 hn_rss_key_sysctl, "IU", "RSS key"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2330 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2331 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2332 #endif 2333 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2334 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2335 "RNDIS offered packet transmission aggregation size limit"); 2336 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2337 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2338 "RNDIS offered packet transmission aggregation count limit"); 2339 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2340 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2341 "RNDIS packet transmission aggregation alignment"); 2342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2343 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2344 hn_txagg_size_sysctl, "I", 2345 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2347 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2348 hn_txagg_pkts_sysctl, "I", 2349 "Packet transmission aggregation packets, " 2350 "0 -- disable, -1 -- auto"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2352 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_polling_sysctl, "I", 2354 "Polling frequency: [100,1000000], 0 disable polling"); 2355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2356 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2357 hn_vf_sysctl, "A", "Virtual Function's name"); 2358 if (!hn_xpnt_vf) { 2359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2360 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2361 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2362 } else { 2363 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2364 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2365 hn_xpnt_vf_enabled_sysctl, "I", 2366 "Transparent VF enabled"); 2367 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2368 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2369 hn_xpnt_vf_accbpf_sysctl, "I", 2370 "Accurate BPF for transparent VF"); 2371 } 2372 2373 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2374 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2375 "switch to rsc"); 2376 2377 /* 2378 * Setup the ifmedia, which has been initialized earlier. 2379 */ 2380 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2381 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2382 /* XXX ifmedia_set really should do this for us */ 2383 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2384 2385 /* 2386 * Setup the ifnet for this interface. 2387 */ 2388 2389 ifp->if_baudrate = IF_Gbps(10); 2390 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2391 ifp->if_ioctl = hn_ioctl; 2392 ifp->if_init = hn_init; 2393 #ifdef HN_IFSTART_SUPPORT 2394 if (hn_use_if_start) { 2395 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2396 2397 ifp->if_start = hn_start; 2398 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2399 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2400 IFQ_SET_READY(&ifp->if_snd); 2401 } else 2402 #endif 2403 { 2404 ifp->if_transmit = hn_transmit; 2405 ifp->if_qflush = hn_xmit_qflush; 2406 } 2407 2408 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2409 #ifdef foo 2410 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2411 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2412 #endif 2413 if (sc->hn_caps & HN_CAP_VLAN) { 2414 /* XXX not sure about VLAN_MTU. */ 2415 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2416 } 2417 2418 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2419 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2420 ifp->if_capabilities |= IFCAP_TXCSUM; 2421 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2422 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2423 if (sc->hn_caps & HN_CAP_TSO4) { 2424 ifp->if_capabilities |= IFCAP_TSO4; 2425 ifp->if_hwassist |= CSUM_IP_TSO; 2426 } 2427 if (sc->hn_caps & HN_CAP_TSO6) { 2428 ifp->if_capabilities |= IFCAP_TSO6; 2429 ifp->if_hwassist |= CSUM_IP6_TSO; 2430 } 2431 2432 /* Enable all available capabilities by default. */ 2433 ifp->if_capenable = ifp->if_capabilities; 2434 2435 /* 2436 * Disable IPv6 TSO and TXCSUM by default, they still can 2437 * be enabled through SIOCSIFCAP. 2438 */ 2439 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2440 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2441 2442 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2443 /* 2444 * Lock hn_set_tso_maxsize() to simplify its 2445 * internal logic. 2446 */ 2447 HN_LOCK(sc); 2448 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2449 HN_UNLOCK(sc); 2450 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2451 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2452 } 2453 2454 ether_ifattach(ifp, eaddr); 2455 2456 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2457 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2458 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2459 } 2460 if (mtu < ETHERMTU) { 2461 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2462 ifp->if_mtu = mtu; 2463 } 2464 2465 /* Inform the upper layer about the long frame support. */ 2466 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2467 2468 /* 2469 * Kick off link status check. 2470 */ 2471 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2472 hn_update_link_status(sc); 2473 2474 if (!hn_xpnt_vf) { 2475 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2476 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2477 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2478 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2479 } else { 2480 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2481 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2482 } 2483 2484 /* 2485 * NOTE: 2486 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2487 * since interface's LLADDR is needed; interface LLADDR is not 2488 * available when ifnet_arrival event is triggered. 2489 */ 2490 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2491 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2492 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2493 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2494 2495 return (0); 2496 failed: 2497 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2498 hn_synth_detach(sc); 2499 hn_detach(dev); 2500 return (error); 2501 } 2502 2503 static int 2504 hn_detach(device_t dev) 2505 { 2506 struct hn_softc *sc = device_get_softc(dev); 2507 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2508 2509 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2510 /* 2511 * In case that the vmbus missed the orphan handler 2512 * installation. 
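 *
 * Orphaning the xact context here wakes up any control transaction
 * that is still waiting for a reply, so detach cannot get stuck on
 * a revoked primary channel.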
2513 */ 2514 vmbus_xact_ctx_orphan(sc->hn_xact); 2515 } 2516 2517 if (sc->hn_ifaddr_evthand != NULL) 2518 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2519 if (sc->hn_ifnet_evthand != NULL) 2520 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2521 if (sc->hn_ifnet_atthand != NULL) { 2522 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2523 sc->hn_ifnet_atthand); 2524 } 2525 if (sc->hn_ifnet_dethand != NULL) { 2526 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2527 sc->hn_ifnet_dethand); 2528 } 2529 if (sc->hn_ifnet_lnkhand != NULL) 2530 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2531 2532 vf_ifp = sc->hn_vf_ifp; 2533 __compiler_membar(); 2534 if (vf_ifp != NULL) 2535 hn_ifnet_detevent(sc, vf_ifp); 2536 2537 if (device_is_attached(dev)) { 2538 HN_LOCK(sc); 2539 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2540 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2541 hn_stop(sc, true); 2542 /* 2543 * NOTE: 2544 * hn_stop() only suspends data, so managment 2545 * stuffs have to be suspended manually here. 2546 */ 2547 hn_suspend_mgmt(sc); 2548 hn_synth_detach(sc); 2549 } 2550 HN_UNLOCK(sc); 2551 ether_ifdetach(ifp); 2552 } 2553 2554 ifmedia_removeall(&sc->hn_media); 2555 hn_destroy_rx_data(sc); 2556 hn_destroy_tx_data(sc); 2557 2558 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2559 int i; 2560 2561 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2562 taskqueue_free(sc->hn_tx_taskqs[i]); 2563 free(sc->hn_tx_taskqs, M_DEVBUF); 2564 } 2565 taskqueue_free(sc->hn_mgmt_taskq0); 2566 if (sc->hn_vf_taskq != NULL) 2567 taskqueue_free(sc->hn_vf_taskq); 2568 2569 if (sc->hn_xact != NULL) { 2570 /* 2571 * Uninstall the orphan handler _before_ the xact is 2572 * destructed. 2573 */ 2574 vmbus_chan_unset_orphan(sc->hn_prichan); 2575 vmbus_xact_ctx_destroy(sc->hn_xact); 2576 } 2577 2578 if_free(ifp); 2579 2580 HN_LOCK_DESTROY(sc); 2581 rm_destroy(&sc->hn_vf_lock); 2582 return (0); 2583 } 2584 2585 static int 2586 hn_shutdown(device_t dev) 2587 { 2588 2589 return (0); 2590 } 2591 2592 static void 2593 hn_link_status(struct hn_softc *sc) 2594 { 2595 uint32_t link_status; 2596 int error; 2597 2598 error = hn_rndis_get_linkstatus(sc, &link_status); 2599 if (error) { 2600 /* XXX what to do? */ 2601 return; 2602 } 2603 2604 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2605 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2606 else 2607 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2608 if_link_state_change(sc->hn_ifp, 2609 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2610 LINK_STATE_UP : LINK_STATE_DOWN); 2611 } 2612 2613 static void 2614 hn_link_taskfunc(void *xsc, int pending __unused) 2615 { 2616 struct hn_softc *sc = xsc; 2617 2618 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2619 return; 2620 hn_link_status(sc); 2621 } 2622 2623 static void 2624 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2625 { 2626 struct hn_softc *sc = xsc; 2627 2628 /* Prevent any link status checks from running. */ 2629 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2630 2631 /* 2632 * Fake up a [link down --> link up] state change; 5 seconds 2633 * delay is used, which closely simulates miibus reaction 2634 * upon link down event. 
2635 */ 2636 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2637 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2638 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2639 &sc->hn_netchg_status, 5 * hz); 2640 } 2641 2642 static void 2643 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2644 { 2645 struct hn_softc *sc = xsc; 2646 2647 /* Re-allow link status checks. */ 2648 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2649 hn_link_status(sc); 2650 } 2651 2652 static void 2653 hn_update_link_status(struct hn_softc *sc) 2654 { 2655 2656 if (sc->hn_mgmt_taskq != NULL) 2657 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2658 } 2659 2660 static void 2661 hn_change_network(struct hn_softc *sc) 2662 { 2663 2664 if (sc->hn_mgmt_taskq != NULL) 2665 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2666 } 2667 2668 static __inline int 2669 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2670 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2671 { 2672 struct mbuf *m = *m_head; 2673 int error; 2674 2675 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2676 2677 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2678 m, segs, nsegs, BUS_DMA_NOWAIT); 2679 if (error == EFBIG) { 2680 struct mbuf *m_new; 2681 2682 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2683 if (m_new == NULL) 2684 return ENOBUFS; 2685 else 2686 *m_head = m = m_new; 2687 txr->hn_tx_collapsed++; 2688 2689 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2690 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2691 } 2692 if (!error) { 2693 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2694 BUS_DMASYNC_PREWRITE); 2695 txd->flags |= HN_TXD_FLAG_DMAMAP; 2696 } 2697 return error; 2698 } 2699 2700 static __inline int 2701 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2702 { 2703 2704 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2705 ("put an onlist txd %#x", txd->flags)); 2706 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2707 ("put an onagg txd %#x", txd->flags)); 2708 2709 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2710 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2711 return 0; 2712 2713 if (!STAILQ_EMPTY(&txd->agg_list)) { 2714 struct hn_txdesc *tmp_txd; 2715 2716 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2717 int freed __diagused; 2718 2719 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2720 ("resursive aggregation on aggregated txdesc")); 2721 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2722 ("not aggregated txdesc")); 2723 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2724 ("aggregated txdesc uses dmamap")); 2725 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2726 ("aggregated txdesc consumes " 2727 "chimney sending buffer")); 2728 KASSERT(tmp_txd->chim_size == 0, 2729 ("aggregated txdesc has non-zero " 2730 "chimney sending size")); 2731 2732 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2733 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2734 freed = hn_txdesc_put(txr, tmp_txd); 2735 KASSERT(freed, ("failed to free aggregated txdesc")); 2736 } 2737 } 2738 2739 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2740 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2741 ("chim txd uses dmamap")); 2742 hn_chim_free(txr->hn_sc, txd->chim_index); 2743 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2744 txd->chim_size = 0; 2745 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2746 bus_dmamap_sync(txr->hn_tx_data_dtag, 2747 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2748 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2749 txd->data_dmap); 2750 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2751 } 2752 2753 if (txd->m != NULL) { 2754 m_freem(txd->m); 2755 txd->m = NULL; 2756 } 2757 2758 txd->flags |= HN_TXD_FLAG_ONLIST; 2759 #ifndef HN_USE_TXDESC_BUFRING 2760 mtx_lock_spin(&txr->hn_txlist_spin); 2761 KASSERT(txr->hn_txdesc_avail >= 0 && 2762 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2763 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2764 txr->hn_txdesc_avail++; 2765 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2766 mtx_unlock_spin(&txr->hn_txlist_spin); 2767 #else /* HN_USE_TXDESC_BUFRING */ 2768 #ifdef HN_DEBUG 2769 atomic_add_int(&txr->hn_txdesc_avail, 1); 2770 #endif 2771 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2772 #endif /* !HN_USE_TXDESC_BUFRING */ 2773 2774 return 1; 2775 } 2776 2777 static __inline struct hn_txdesc * 2778 hn_txdesc_get(struct hn_tx_ring *txr) 2779 { 2780 struct hn_txdesc *txd; 2781 2782 #ifndef HN_USE_TXDESC_BUFRING 2783 mtx_lock_spin(&txr->hn_txlist_spin); 2784 txd = SLIST_FIRST(&txr->hn_txlist); 2785 if (txd != NULL) { 2786 KASSERT(txr->hn_txdesc_avail > 0, 2787 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2788 txr->hn_txdesc_avail--; 2789 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2790 } 2791 mtx_unlock_spin(&txr->hn_txlist_spin); 2792 #else 2793 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2794 #endif 2795 2796 if (txd != NULL) { 2797 #ifdef HN_USE_TXDESC_BUFRING 2798 #ifdef HN_DEBUG 2799 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2800 #endif 2801 #endif /* HN_USE_TXDESC_BUFRING */ 2802 KASSERT(txd->m == NULL && txd->refs == 0 && 2803 STAILQ_EMPTY(&txd->agg_list) && 2804 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2805 txd->chim_size == 0 && 2806 (txd->flags & HN_TXD_FLAG_ONLIST) && 2807 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2808 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2809 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2810 txd->refs = 1; 2811 } 2812 return txd; 2813 } 2814 2815 static __inline void 2816 hn_txdesc_hold(struct hn_txdesc *txd) 2817 { 2818 2819 /* 0->1 transition will never work */ 2820 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2821 atomic_add_int(&txd->refs, 1); 2822 } 2823 2824 static __inline void 2825 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2826 { 2827 2828 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2829 ("recursive aggregation on aggregating txdesc")); 2830 2831 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2832 ("already aggregated")); 2833 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2834 ("recursive aggregation on to-be-aggregated txdesc")); 2835 2836 txd->flags |= HN_TXD_FLAG_ONAGG; 2837 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2838 } 2839 2840 static bool 2841 hn_tx_ring_pending(struct hn_tx_ring *txr) 2842 { 2843 bool pending = false; 2844 2845 #ifndef HN_USE_TXDESC_BUFRING 2846 mtx_lock_spin(&txr->hn_txlist_spin); 2847 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2848 pending = true; 2849 mtx_unlock_spin(&txr->hn_txlist_spin); 2850 #else 2851 if (!buf_ring_full(txr->hn_txdesc_br)) 2852 pending = true; 2853 #endif 2854 return (pending); 2855 } 2856 2857 static __inline void 2858 hn_txeof(struct hn_tx_ring *txr) 2859 { 2860 txr->hn_has_txeof = 0; 2861 txr->hn_txeof(txr); 2862 } 2863 2864 static void 2865 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2866 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2867 { 2868 struct hn_txdesc *txd = sndc->hn_cbarg; 2869 struct 
hn_tx_ring *txr; 2870 2871 txr = txd->txr; 2872 KASSERT(txr->hn_chan == chan, 2873 ("channel mismatch, on chan%u, should be chan%u", 2874 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2875 2876 txr->hn_has_txeof = 1; 2877 hn_txdesc_put(txr, txd); 2878 2879 ++txr->hn_txdone_cnt; 2880 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2881 txr->hn_txdone_cnt = 0; 2882 if (txr->hn_oactive) 2883 hn_txeof(txr); 2884 } 2885 } 2886 2887 static void 2888 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2889 { 2890 #if defined(INET) || defined(INET6) 2891 struct epoch_tracker et; 2892 2893 NET_EPOCH_ENTER(et); 2894 tcp_lro_flush_all(&rxr->hn_lro); 2895 NET_EPOCH_EXIT(et); 2896 #endif 2897 2898 /* 2899 * NOTE: 2900 * 'txr' could be NULL, if multiple channels and 2901 * ifnet.if_start method are enabled. 2902 */ 2903 if (txr == NULL || !txr->hn_has_txeof) 2904 return; 2905 2906 txr->hn_txdone_cnt = 0; 2907 hn_txeof(txr); 2908 } 2909 2910 static __inline uint32_t 2911 hn_rndis_pktmsg_offset(uint32_t ofs) 2912 { 2913 2914 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2915 ("invalid RNDIS packet msg offset %u", ofs)); 2916 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2917 } 2918 2919 static __inline void * 2920 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2921 size_t pi_dlen, uint32_t pi_type) 2922 { 2923 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2924 struct rndis_pktinfo *pi; 2925 2926 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2927 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2928 2929 /* 2930 * Per-packet-info does not move; it only grows. 2931 * 2932 * NOTE: 2933 * rm_pktinfooffset in this phase counts from the beginning 2934 * of rndis_packet_msg. 2935 */ 2936 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2937 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2938 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2939 pkt->rm_pktinfolen); 2940 pkt->rm_pktinfolen += pi_size; 2941 2942 pi->rm_size = pi_size; 2943 pi->rm_type = pi_type; 2944 pi->rm_internal = 0; 2945 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2946 2947 return (pi->rm_data); 2948 } 2949 2950 static __inline int 2951 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2952 { 2953 struct hn_txdesc *txd; 2954 struct mbuf *m; 2955 int error, pkts; 2956 2957 txd = txr->hn_agg_txd; 2958 KASSERT(txd != NULL, ("no aggregate txdesc")); 2959 2960 /* 2961 * Since hn_txpkt() will reset this temporary stat, save 2962 * it now, so that oerrors can be updated properly, if 2963 * hn_txpkt() ever fails. 2964 */ 2965 pkts = txr->hn_stat_pkts; 2966 2967 /* 2968 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2969 * failure, save it for later freeing, if hn_txpkt() ever 2970 * fails. 2971 */ 2972 m = txd->m; 2973 error = hn_txpkt(ifp, txr, txd); 2974 if (__predict_false(error)) { 2975 /* txd is freed, but m is not. */ 2976 m_freem(m); 2977 2978 txr->hn_flush_failed++; 2979 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2980 } 2981 2982 /* Reset all aggregation states. 
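 * (done unconditionally, so the next packet always starts a fresh
 * aggregation batch)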
*/ 2983 txr->hn_agg_txd = NULL; 2984 txr->hn_agg_szleft = 0; 2985 txr->hn_agg_pktleft = 0; 2986 txr->hn_agg_prevpkt = NULL; 2987 2988 return (error); 2989 } 2990 2991 static void * 2992 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2993 int pktsize) 2994 { 2995 void *chim; 2996 2997 if (txr->hn_agg_txd != NULL) { 2998 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2999 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 3000 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 3001 int olen; 3002 3003 /* 3004 * Update the previous RNDIS packet's total length, 3005 * it can be increased due to the mandatory alignment 3006 * padding for this RNDIS packet. And update the 3007 * aggregating txdesc's chimney sending buffer size 3008 * accordingly. 3009 * 3010 * XXX 3011 * Zero-out the padding, as required by the RNDIS spec. 3012 */ 3013 olen = pkt->rm_len; 3014 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3015 agg_txd->chim_size += pkt->rm_len - olen; 3016 3017 /* Link this txdesc to the parent. */ 3018 hn_txdesc_agg(agg_txd, txd); 3019 3020 chim = (uint8_t *)pkt + pkt->rm_len; 3021 /* Save the current packet for later fixup. */ 3022 txr->hn_agg_prevpkt = chim; 3023 3024 txr->hn_agg_pktleft--; 3025 txr->hn_agg_szleft -= pktsize; 3026 if (txr->hn_agg_szleft <= 3027 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3028 /* 3029 * Probably can't aggregate more packets, 3030 * flush this aggregating txdesc proactively. 3031 */ 3032 txr->hn_agg_pktleft = 0; 3033 } 3034 /* Done! */ 3035 return (chim); 3036 } 3037 hn_flush_txagg(ifp, txr); 3038 } 3039 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3040 3041 txr->hn_tx_chimney_tried++; 3042 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3043 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3044 return (NULL); 3045 txr->hn_tx_chimney++; 3046 3047 chim = txr->hn_sc->hn_chim + 3048 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3049 3050 if (txr->hn_agg_pktmax > 1 && 3051 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3052 txr->hn_agg_txd = txd; 3053 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3054 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3055 txr->hn_agg_prevpkt = chim; 3056 } 3057 return (chim); 3058 } 3059 3060 /* 3061 * NOTE: 3062 * If this function fails, then both txd and m_head0 will be freed. 3063 */ 3064 static int 3065 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3066 struct mbuf **m_head0) 3067 { 3068 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3069 int error, nsegs, i; 3070 struct mbuf *m_head = *m_head0; 3071 struct rndis_packet_msg *pkt; 3072 uint32_t *pi_data; 3073 void *chim = NULL; 3074 int pkt_hlen, pkt_size; 3075 3076 pkt = txd->rndis_pkt; 3077 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3078 if (pkt_size < txr->hn_chim_size) { 3079 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3080 if (chim != NULL) 3081 pkt = chim; 3082 } else { 3083 if (txr->hn_agg_txd != NULL) 3084 hn_flush_txagg(ifp, txr); 3085 } 3086 3087 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3088 pkt->rm_len = m_head->m_pkthdr.len; 3089 pkt->rm_dataoffset = 0; 3090 pkt->rm_datalen = m_head->m_pkthdr.len; 3091 pkt->rm_oobdataoffset = 0; 3092 pkt->rm_oobdatalen = 0; 3093 pkt->rm_oobdataelements = 0; 3094 pkt->rm_pktinfooffset = sizeof(*pkt); 3095 pkt->rm_pktinfolen = 0; 3096 pkt->rm_vchandle = 0; 3097 pkt->rm_reserved = 0; 3098 3099 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3100 /* 3101 * Set the hash value for this packet. 
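 *
 * The value is carried to the host as an HN_NDIS_PKTINFO_TYPE_HASHVAL
 * per-packet-info element appended after the RNDIS header, i.e. the
 * message ends up laid out roughly as:
 *
 *   rndis_packet_msg | pktinfo(hash) | [more pktinfo] | payload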
3102 */ 3103 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3104 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3105 3106 if (M_HASHTYPE_ISHASH(m_head)) 3107 /* 3108 * The flowid field contains the hash value host 3109 * set in the rx queue if it is a ip forwarding pkt. 3110 * Set the same hash value so host can send on the 3111 * cpu it was received. 3112 */ 3113 *pi_data = m_head->m_pkthdr.flowid; 3114 else 3115 /* 3116 * Otherwise just put the tx queue index. 3117 */ 3118 *pi_data = txr->hn_tx_idx; 3119 } 3120 3121 if (m_head->m_flags & M_VLANTAG) { 3122 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3123 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3124 *pi_data = NDIS_VLAN_INFO_MAKE( 3125 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3126 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3127 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3128 } 3129 3130 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3131 #if defined(INET6) || defined(INET) 3132 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3133 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3134 #ifdef INET 3135 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3136 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3137 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3138 m_head->m_pkthdr.tso_segsz); 3139 } 3140 #endif 3141 #if defined(INET6) && defined(INET) 3142 else 3143 #endif 3144 #ifdef INET6 3145 { 3146 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3147 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3148 m_head->m_pkthdr.tso_segsz); 3149 } 3150 #endif 3151 #endif /* INET6 || INET */ 3152 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3153 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3154 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3155 if (m_head->m_pkthdr.csum_flags & 3156 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3157 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3158 } else { 3159 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3160 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3161 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3162 } 3163 3164 if (m_head->m_pkthdr.csum_flags & 3165 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3166 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3167 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3168 } else if (m_head->m_pkthdr.csum_flags & 3169 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3170 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3171 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3172 } 3173 } 3174 3175 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3176 /* Fixup RNDIS packet message total length */ 3177 pkt->rm_len += pkt_hlen; 3178 /* Convert RNDIS packet message offsets */ 3179 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3180 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3181 3182 /* 3183 * Fast path: Chimney sending. 
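 *
 * The RNDIS header and the payload are copied straight into the
 * reserved chimney (send buffer) slot, so no scatter/gather page
 * list is needed; only the chimney index and size are passed to
 * the host.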
3184 */ 3185 if (chim != NULL) { 3186 struct hn_txdesc *tgt_txd = txd; 3187 3188 if (txr->hn_agg_txd != NULL) { 3189 tgt_txd = txr->hn_agg_txd; 3190 #ifdef INVARIANTS 3191 *m_head0 = NULL; 3192 #endif 3193 } 3194 3195 KASSERT(pkt == chim, 3196 ("RNDIS pkt not in chimney sending buffer")); 3197 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3198 ("chimney sending buffer is not used")); 3199 tgt_txd->chim_size += pkt->rm_len; 3200 3201 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3202 ((uint8_t *)chim) + pkt_hlen); 3203 3204 txr->hn_gpa_cnt = 0; 3205 txr->hn_sendpkt = hn_txpkt_chim; 3206 goto done; 3207 } 3208 3209 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3210 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3211 ("chimney buffer is used")); 3212 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3213 3214 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3215 if (__predict_false(error)) { 3216 int freed __diagused; 3217 3218 /* 3219 * This mbuf is not linked w/ the txd yet, so free it now. 3220 */ 3221 m_freem(m_head); 3222 *m_head0 = NULL; 3223 3224 freed = hn_txdesc_put(txr, txd); 3225 KASSERT(freed != 0, 3226 ("fail to free txd upon txdma error")); 3227 3228 txr->hn_txdma_failed++; 3229 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3230 return error; 3231 } 3232 *m_head0 = m_head; 3233 3234 /* +1 RNDIS packet message */ 3235 txr->hn_gpa_cnt = nsegs + 1; 3236 3237 /* send packet with page buffer */ 3238 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3239 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3240 txr->hn_gpa[0].gpa_len = pkt_hlen; 3241 3242 /* 3243 * Fill the page buffers with mbuf info after the page 3244 * buffer for RNDIS packet message. 3245 */ 3246 for (i = 0; i < nsegs; ++i) { 3247 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3248 3249 gpa->gpa_page = atop(segs[i].ds_addr); 3250 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3251 gpa->gpa_len = segs[i].ds_len; 3252 } 3253 3254 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3255 txd->chim_size = 0; 3256 txr->hn_sendpkt = hn_txpkt_sglist; 3257 done: 3258 txd->m = m_head; 3259 3260 /* Set the completion routine */ 3261 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3262 3263 /* Update temporary stats for later use. */ 3264 txr->hn_stat_pkts++; 3265 txr->hn_stat_size += m_head->m_pkthdr.len; 3266 if (m_head->m_flags & M_MCAST) 3267 txr->hn_stat_mcasts++; 3268 3269 return 0; 3270 } 3271 3272 /* 3273 * NOTE: 3274 * If this function fails, then txd will be freed, but the mbuf 3275 * associated w/ the txd will _not_ be freed. 3276 */ 3277 static int 3278 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3279 { 3280 int error, send_failed = 0, has_bpf; 3281 3282 again: 3283 has_bpf = bpf_peers_present(ifp->if_bpf); 3284 if (has_bpf) { 3285 /* 3286 * Make sure that this txd and any aggregated txds are not 3287 * freed before ETHER_BPF_MTAP. 
3288 */ 3289 hn_txdesc_hold(txd); 3290 } 3291 error = txr->hn_sendpkt(txr, txd); 3292 if (!error) { 3293 if (has_bpf) { 3294 const struct hn_txdesc *tmp_txd; 3295 3296 ETHER_BPF_MTAP(ifp, txd->m); 3297 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3298 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3299 } 3300 3301 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3302 #ifdef HN_IFSTART_SUPPORT 3303 if (!hn_use_if_start) 3304 #endif 3305 { 3306 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3307 txr->hn_stat_size); 3308 if (txr->hn_stat_mcasts != 0) { 3309 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3310 txr->hn_stat_mcasts); 3311 } 3312 } 3313 txr->hn_pkts += txr->hn_stat_pkts; 3314 txr->hn_sends++; 3315 } 3316 if (has_bpf) 3317 hn_txdesc_put(txr, txd); 3318 3319 if (__predict_false(error)) { 3320 int freed __diagused; 3321 3322 /* 3323 * This should "really rarely" happen. 3324 * 3325 * XXX Too many RX to be acked or too many sideband 3326 * commands to run? Ask netvsc_channel_rollup() 3327 * to kick start later. 3328 */ 3329 txr->hn_has_txeof = 1; 3330 if (!send_failed) { 3331 txr->hn_send_failed++; 3332 send_failed = 1; 3333 /* 3334 * Try sending again after set hn_has_txeof; 3335 * in case that we missed the last 3336 * netvsc_channel_rollup(). 3337 */ 3338 goto again; 3339 } 3340 if_printf(ifp, "send failed\n"); 3341 3342 /* 3343 * Caller will perform further processing on the 3344 * associated mbuf, so don't free it in hn_txdesc_put(); 3345 * only unload it from the DMA map in hn_txdesc_put(), 3346 * if it was loaded. 3347 */ 3348 txd->m = NULL; 3349 freed = hn_txdesc_put(txr, txd); 3350 KASSERT(freed != 0, 3351 ("fail to free txd upon send error")); 3352 3353 txr->hn_send_failed++; 3354 } 3355 3356 /* Reset temporary stats, after this sending is done. */ 3357 txr->hn_stat_size = 0; 3358 txr->hn_stat_pkts = 0; 3359 txr->hn_stat_mcasts = 0; 3360 3361 return (error); 3362 } 3363 3364 /* 3365 * Append the specified data to the indicated mbuf chain, 3366 * Extend the mbuf chain if the new data does not fit in 3367 * existing space. 3368 * 3369 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3370 * There should be an equivalent in the kernel mbuf code, 3371 * but there does not appear to be one yet. 3372 * 3373 * Differs from m_append() in that additional mbufs are 3374 * allocated with cluster size MJUMPAGESIZE, and filled 3375 * accordingly. 3376 * 3377 * Return the last mbuf in the chain or NULL if failed to 3378 * allocate new mbuf. 3379 */ 3380 static struct mbuf * 3381 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3382 { 3383 struct mbuf *m, *n; 3384 int remainder, space; 3385 3386 for (m = m0; m->m_next != NULL; m = m->m_next) 3387 ; 3388 remainder = len; 3389 space = M_TRAILINGSPACE(m); 3390 if (space > 0) { 3391 /* 3392 * Copy into available space. 3393 */ 3394 if (space > remainder) 3395 space = remainder; 3396 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3397 m->m_len += space; 3398 cp += space; 3399 remainder -= space; 3400 } 3401 while (remainder > 0) { 3402 /* 3403 * Allocate a new mbuf; could check space 3404 * and allocate a cluster instead. 
3405 */ 3406 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3407 if (n == NULL) 3408 return NULL; 3409 n->m_len = min(MJUMPAGESIZE, remainder); 3410 bcopy(cp, mtod(n, caddr_t), n->m_len); 3411 cp += n->m_len; 3412 remainder -= n->m_len; 3413 m->m_next = n; 3414 m = n; 3415 } 3416 3417 return m; 3418 } 3419 3420 #if defined(INET) || defined(INET6) 3421 static __inline int 3422 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3423 { 3424 #if __FreeBSD_version >= 1100095 3425 if (hn_lro_mbufq_depth) { 3426 tcp_lro_queue_mbuf(lc, m); 3427 return 0; 3428 } 3429 #endif 3430 return tcp_lro_rx(lc, m, 0); 3431 } 3432 #endif 3433 3434 static int 3435 hn_rxpkt(struct hn_rx_ring *rxr) 3436 { 3437 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3438 struct mbuf *m_new, *n; 3439 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3440 int hash_type = M_HASHTYPE_NONE; 3441 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3442 int i; 3443 3444 ifp = hn_ifp; 3445 if (rxr->hn_rxvf_ifp != NULL) { 3446 /* 3447 * Non-transparent mode VF; pretend this packet is from 3448 * the VF. 3449 */ 3450 ifp = rxr->hn_rxvf_ifp; 3451 is_vf = 1; 3452 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3453 /* Transparent mode VF. */ 3454 is_vf = 1; 3455 } 3456 3457 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3458 /* 3459 * NOTE: 3460 * See the NOTE of hn_rndis_init_fixat(). This 3461 * function can be reached, immediately after the 3462 * RNDIS is initialized but before the ifnet is 3463 * setup on the hn_attach() path; drop the unexpected 3464 * packets. 3465 */ 3466 return (0); 3467 } 3468 3469 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3470 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3471 return (0); 3472 } 3473 3474 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3475 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3476 if (m_new == NULL) { 3477 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3478 return (0); 3479 } 3480 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3481 rxr->rsc.frag_len[0]); 3482 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3483 } else { 3484 /* 3485 * Get an mbuf with a cluster. For packets 2K or less, 3486 * get a standard 2K cluster. For anything larger, get a 3487 * 4K cluster. Any buffers larger than 4K can cause problems 3488 * if looped around to the Hyper-V TX channel, so avoid them. 
3489 */ 3490 size = MCLBYTES; 3491 if (rxr->rsc.pktlen > MCLBYTES) { 3492 /* 4096 */ 3493 size = MJUMPAGESIZE; 3494 } 3495 3496 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3497 if (m_new == NULL) { 3498 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3499 return (0); 3500 } 3501 3502 n = m_new; 3503 for (i = 0; i < rxr->rsc.cnt; i++) { 3504 n = hv_m_append(n, rxr->rsc.frag_len[i], 3505 rxr->rsc.frag_data[i]); 3506 if (n == NULL) { 3507 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3508 return (0); 3509 } else { 3510 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3511 } 3512 } 3513 } 3514 if (rxr->rsc.pktlen <= MHLEN) 3515 rxr->hn_small_pkts++; 3516 3517 m_new->m_pkthdr.rcvif = ifp; 3518 3519 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3520 do_csum = 0; 3521 3522 /* receive side checksum offload */ 3523 if (rxr->rsc.csum_info != NULL) { 3524 /* IP csum offload */ 3525 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3526 m_new->m_pkthdr.csum_flags |= 3527 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3528 rxr->hn_csum_ip++; 3529 } 3530 3531 /* TCP/UDP csum offload */ 3532 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3533 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3534 m_new->m_pkthdr.csum_flags |= 3535 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3536 m_new->m_pkthdr.csum_data = 0xffff; 3537 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3538 rxr->hn_csum_tcp++; 3539 else 3540 rxr->hn_csum_udp++; 3541 } 3542 3543 /* 3544 * XXX 3545 * As of this write (Oct 28th, 2016), host side will turn 3546 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3547 * the do_lro setting here is actually _not_ accurate. We 3548 * depend on the RSS hash type check to reset do_lro. 3549 */ 3550 if ((*(rxr->rsc.csum_info) & 3551 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3552 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3553 do_lro = 1; 3554 } else { 3555 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3556 if (l3proto == ETHERTYPE_IP) { 3557 if (l4proto == IPPROTO_TCP) { 3558 if (do_csum && 3559 (rxr->hn_trust_hcsum & 3560 HN_TRUST_HCSUM_TCP)) { 3561 rxr->hn_csum_trusted++; 3562 m_new->m_pkthdr.csum_flags |= 3563 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3564 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3565 m_new->m_pkthdr.csum_data = 0xffff; 3566 } 3567 do_lro = 1; 3568 } else if (l4proto == IPPROTO_UDP) { 3569 if (do_csum && 3570 (rxr->hn_trust_hcsum & 3571 HN_TRUST_HCSUM_UDP)) { 3572 rxr->hn_csum_trusted++; 3573 m_new->m_pkthdr.csum_flags |= 3574 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3575 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3576 m_new->m_pkthdr.csum_data = 0xffff; 3577 } 3578 } else if (l4proto != IPPROTO_DONE && do_csum && 3579 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3580 rxr->hn_csum_trusted++; 3581 m_new->m_pkthdr.csum_flags |= 3582 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3583 } 3584 } 3585 } 3586 3587 if (rxr->rsc.vlan_info != NULL) { 3588 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3589 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3590 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3591 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3592 m_new->m_flags |= M_VLANTAG; 3593 } 3594 3595 /* 3596 * If VF is activated (tranparent/non-transparent mode does not 3597 * matter here). 3598 * 3599 * - Disable LRO 3600 * 3601 * hn(4) will only receive broadcast packets, multicast packets, 3602 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3603 * packet types. 
3604 * 3605 * For non-transparent, we definitely _cannot_ enable LRO at 3606 * all, since the LRO flush will use hn(4) as the receiving 3607 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3608 */ 3609 if (is_vf) 3610 do_lro = 0; 3611 3612 /* 3613 * If VF is activated (tranparent/non-transparent mode does not 3614 * matter here), do _not_ mess with unsupported hash types or 3615 * functions. 3616 */ 3617 if (rxr->rsc.hash_info != NULL) { 3618 rxr->hn_rss_pkts++; 3619 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3620 if (!is_vf) 3621 hash_type = M_HASHTYPE_OPAQUE_HASH; 3622 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3623 NDIS_HASH_FUNCTION_TOEPLITZ) { 3624 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3625 rxr->hn_mbuf_hash); 3626 3627 /* 3628 * NOTE: 3629 * do_lro is resetted, if the hash types are not TCP 3630 * related. See the comment in the above csum_flags 3631 * setup section. 3632 */ 3633 switch (type) { 3634 case NDIS_HASH_IPV4: 3635 hash_type = M_HASHTYPE_RSS_IPV4; 3636 do_lro = 0; 3637 break; 3638 3639 case NDIS_HASH_TCP_IPV4: 3640 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3641 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3642 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3643 3644 if (is_vf) 3645 def_htype = M_HASHTYPE_NONE; 3646 3647 /* 3648 * UDP 4-tuple hash is delivered as 3649 * TCP 4-tuple hash. 3650 */ 3651 if (l3proto == ETHERTYPE_MAX) { 3652 hn_rxpkt_proto(m_new, 3653 &l3proto, &l4proto); 3654 } 3655 if (l3proto == ETHERTYPE_IP) { 3656 if (l4proto == IPPROTO_UDP && 3657 (rxr->hn_mbuf_hash & 3658 NDIS_HASH_UDP_IPV4_X)) { 3659 hash_type = 3660 M_HASHTYPE_RSS_UDP_IPV4; 3661 do_lro = 0; 3662 } else if (l4proto != 3663 IPPROTO_TCP) { 3664 hash_type = def_htype; 3665 do_lro = 0; 3666 } 3667 } else { 3668 hash_type = def_htype; 3669 do_lro = 0; 3670 } 3671 } 3672 break; 3673 3674 case NDIS_HASH_IPV6: 3675 hash_type = M_HASHTYPE_RSS_IPV6; 3676 do_lro = 0; 3677 break; 3678 3679 case NDIS_HASH_IPV6_EX: 3680 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3681 do_lro = 0; 3682 break; 3683 3684 case NDIS_HASH_TCP_IPV6: 3685 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3686 break; 3687 3688 case NDIS_HASH_TCP_IPV6_EX: 3689 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3690 break; 3691 } 3692 } 3693 } else if (!is_vf) { 3694 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3695 hash_type = M_HASHTYPE_OPAQUE; 3696 } 3697 M_HASHTYPE_SET(m_new, hash_type); 3698 3699 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3700 if (hn_ifp != ifp) { 3701 const struct ether_header *eh; 3702 3703 /* 3704 * Non-transparent mode VF is activated. 3705 */ 3706 3707 /* 3708 * Allow tapping on hn(4). 3709 */ 3710 ETHER_BPF_MTAP(hn_ifp, m_new); 3711 3712 /* 3713 * Update hn(4)'s stats. 3714 */ 3715 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3716 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3717 /* Checked at the beginning of this function. */ 3718 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3719 eh = mtod(m_new, struct ether_header *); 3720 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3721 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3722 } 3723 rxr->hn_pkts++; 3724 3725 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3726 #if defined(INET) || defined(INET6) 3727 struct lro_ctrl *lro = &rxr->hn_lro; 3728 3729 if (lro->lro_cnt) { 3730 rxr->hn_lro_tried++; 3731 if (hn_lro_rx(lro, m_new) == 0) { 3732 /* DONE! 
*/ 3733 return 0; 3734 } 3735 } 3736 #endif 3737 } 3738 ifp->if_input(ifp, m_new); 3739 3740 return (0); 3741 } 3742 3743 static int 3744 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3745 { 3746 struct hn_softc *sc = ifp->if_softc; 3747 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3748 struct ifnet *vf_ifp; 3749 int mask, error = 0; 3750 struct ifrsskey *ifrk; 3751 struct ifrsshash *ifrh; 3752 uint32_t mtu; 3753 3754 switch (cmd) { 3755 case SIOCSIFMTU: 3756 if (ifr->ifr_mtu > HN_MTU_MAX) { 3757 error = EINVAL; 3758 break; 3759 } 3760 3761 HN_LOCK(sc); 3762 3763 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3764 HN_UNLOCK(sc); 3765 break; 3766 } 3767 3768 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3769 /* Can't change MTU */ 3770 HN_UNLOCK(sc); 3771 error = EOPNOTSUPP; 3772 break; 3773 } 3774 3775 if (ifp->if_mtu == ifr->ifr_mtu) { 3776 HN_UNLOCK(sc); 3777 break; 3778 } 3779 3780 if (hn_xpnt_vf_isready(sc)) { 3781 vf_ifp = sc->hn_vf_ifp; 3782 ifr_vf = *ifr; 3783 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3784 sizeof(ifr_vf.ifr_name)); 3785 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3786 (caddr_t)&ifr_vf); 3787 if (error) { 3788 HN_UNLOCK(sc); 3789 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3790 vf_ifp->if_xname, ifr->ifr_mtu, error); 3791 break; 3792 } 3793 } 3794 3795 /* 3796 * Suspend this interface before the synthetic parts 3797 * are ripped. 3798 */ 3799 hn_suspend(sc); 3800 3801 /* 3802 * Detach the synthetics parts, i.e. NVS and RNDIS. 3803 */ 3804 hn_synth_detach(sc); 3805 3806 /* 3807 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3808 * with the new MTU setting. 3809 */ 3810 error = hn_synth_attach(sc, ifr->ifr_mtu); 3811 if (error) { 3812 HN_UNLOCK(sc); 3813 break; 3814 } 3815 3816 error = hn_rndis_get_mtu(sc, &mtu); 3817 if (error) 3818 mtu = ifr->ifr_mtu; 3819 else if (bootverbose) 3820 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3821 3822 /* 3823 * Commit the requested MTU, after the synthetic parts 3824 * have been successfully attached. 3825 */ 3826 if (mtu >= ifr->ifr_mtu) { 3827 mtu = ifr->ifr_mtu; 3828 } else { 3829 if_printf(ifp, "fixup mtu %d -> %u\n", 3830 ifr->ifr_mtu, mtu); 3831 } 3832 ifp->if_mtu = mtu; 3833 3834 /* 3835 * Synthetic parts' reattach may change the chimney 3836 * sending size; update it. 3837 */ 3838 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3839 hn_set_chim_size(sc, sc->hn_chim_szmax); 3840 3841 /* 3842 * Make sure that various parameters based on MTU are 3843 * still valid, after the MTU change. 3844 */ 3845 hn_mtu_change_fixup(sc); 3846 3847 /* 3848 * All done! Resume the interface now. 3849 */ 3850 hn_resume(sc); 3851 3852 if ((sc->hn_flags & HN_FLAG_RXVF) || 3853 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3854 /* 3855 * Since we have reattached the NVS part, 3856 * change the datapath to VF again; in case 3857 * that it is lost, after the NVS was detached. 3858 */ 3859 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3860 } 3861 3862 HN_UNLOCK(sc); 3863 break; 3864 3865 case SIOCSIFFLAGS: 3866 HN_LOCK(sc); 3867 3868 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3869 HN_UNLOCK(sc); 3870 break; 3871 } 3872 3873 if (hn_xpnt_vf_isready(sc)) 3874 hn_xpnt_vf_saveifflags(sc); 3875 3876 if (ifp->if_flags & IFF_UP) { 3877 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3878 /* 3879 * Caller meight hold mutex, e.g. 3880 * bpf; use busy-wait for the RNDIS 3881 * reply. 
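 *
 * HN_NO_SLEEPING() marks the softc so that the RNDIS/NVS
 * transaction code polls for the host's reply instead of
 * sleeping on it.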
3882 */ 3883 HN_NO_SLEEPING(sc); 3884 hn_rxfilter_config(sc); 3885 HN_SLEEPING_OK(sc); 3886 3887 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3888 error = hn_xpnt_vf_iocsetflags(sc); 3889 } else { 3890 hn_init_locked(sc); 3891 } 3892 } else { 3893 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3894 hn_stop(sc, false); 3895 } 3896 sc->hn_if_flags = ifp->if_flags; 3897 3898 HN_UNLOCK(sc); 3899 break; 3900 3901 case SIOCSIFCAP: 3902 HN_LOCK(sc); 3903 3904 if (hn_xpnt_vf_isready(sc)) { 3905 ifr_vf = *ifr; 3906 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3907 sizeof(ifr_vf.ifr_name)); 3908 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3909 HN_UNLOCK(sc); 3910 break; 3911 } 3912 3913 /* 3914 * Fix up requested capabilities w/ supported capabilities, 3915 * since the supported capabilities could have been changed. 3916 */ 3917 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3918 ifp->if_capenable; 3919 3920 if (mask & IFCAP_TXCSUM) { 3921 ifp->if_capenable ^= IFCAP_TXCSUM; 3922 if (ifp->if_capenable & IFCAP_TXCSUM) 3923 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3924 else 3925 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3926 } 3927 if (mask & IFCAP_TXCSUM_IPV6) { 3928 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3929 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3930 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3931 else 3932 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3933 } 3934 3935 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3936 if (mask & IFCAP_RXCSUM) 3937 ifp->if_capenable ^= IFCAP_RXCSUM; 3938 #ifdef foo 3939 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3940 if (mask & IFCAP_RXCSUM_IPV6) 3941 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3942 #endif 3943 3944 if (mask & IFCAP_LRO) 3945 ifp->if_capenable ^= IFCAP_LRO; 3946 3947 if (mask & IFCAP_TSO4) { 3948 ifp->if_capenable ^= IFCAP_TSO4; 3949 if (ifp->if_capenable & IFCAP_TSO4) 3950 ifp->if_hwassist |= CSUM_IP_TSO; 3951 else 3952 ifp->if_hwassist &= ~CSUM_IP_TSO; 3953 } 3954 if (mask & IFCAP_TSO6) { 3955 ifp->if_capenable ^= IFCAP_TSO6; 3956 if (ifp->if_capenable & IFCAP_TSO6) 3957 ifp->if_hwassist |= CSUM_IP6_TSO; 3958 else 3959 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3960 } 3961 3962 HN_UNLOCK(sc); 3963 break; 3964 3965 case SIOCADDMULTI: 3966 case SIOCDELMULTI: 3967 HN_LOCK(sc); 3968 3969 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3970 HN_UNLOCK(sc); 3971 break; 3972 } 3973 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3974 /* 3975 * Multicast uses mutex; use busy-wait for 3976 * the RNDIS reply. 3977 */ 3978 HN_NO_SLEEPING(sc); 3979 hn_rxfilter_config(sc); 3980 HN_SLEEPING_OK(sc); 3981 } 3982 3983 /* XXX vlan(4) style mcast addr maintenance */ 3984 if (hn_xpnt_vf_isready(sc)) { 3985 int old_if_flags; 3986 3987 old_if_flags = sc->hn_vf_ifp->if_flags; 3988 hn_xpnt_vf_saveifflags(sc); 3989 3990 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3991 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3992 IFF_ALLMULTI)) 3993 error = hn_xpnt_vf_iocsetflags(sc); 3994 } 3995 3996 HN_UNLOCK(sc); 3997 break; 3998 3999 case SIOCSIFMEDIA: 4000 case SIOCGIFMEDIA: 4001 HN_LOCK(sc); 4002 if (hn_xpnt_vf_isready(sc)) { 4003 /* 4004 * SIOCGIFMEDIA expects ifmediareq, so don't 4005 * create and pass ifr_vf to the VF here; just 4006 * replace the ifr_name. 4007 */ 4008 vf_ifp = sc->hn_vf_ifp; 4009 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4010 sizeof(ifr->ifr_name)); 4011 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4012 /* Restore the ifr_name. 
*/ 4013 strlcpy(ifr->ifr_name, ifp->if_xname, 4014 sizeof(ifr->ifr_name)); 4015 HN_UNLOCK(sc); 4016 break; 4017 } 4018 HN_UNLOCK(sc); 4019 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4020 break; 4021 4022 case SIOCGIFRSSHASH: 4023 ifrh = (struct ifrsshash *)data; 4024 HN_LOCK(sc); 4025 if (sc->hn_rx_ring_inuse == 1) { 4026 HN_UNLOCK(sc); 4027 ifrh->ifrh_func = RSS_FUNC_NONE; 4028 ifrh->ifrh_types = 0; 4029 break; 4030 } 4031 4032 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4033 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4034 else 4035 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4036 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4037 HN_UNLOCK(sc); 4038 break; 4039 4040 case SIOCGIFRSSKEY: 4041 ifrk = (struct ifrsskey *)data; 4042 HN_LOCK(sc); 4043 if (sc->hn_rx_ring_inuse == 1) { 4044 HN_UNLOCK(sc); 4045 ifrk->ifrk_func = RSS_FUNC_NONE; 4046 ifrk->ifrk_keylen = 0; 4047 break; 4048 } 4049 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4050 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4051 else 4052 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4053 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4054 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4055 NDIS_HASH_KEYSIZE_TOEPLITZ); 4056 HN_UNLOCK(sc); 4057 break; 4058 4059 default: 4060 error = ether_ioctl(ifp, cmd, data); 4061 break; 4062 } 4063 return (error); 4064 } 4065 4066 static void 4067 hn_stop(struct hn_softc *sc, bool detaching) 4068 { 4069 struct ifnet *ifp = sc->hn_ifp; 4070 int i; 4071 4072 HN_LOCK_ASSERT(sc); 4073 4074 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4075 ("synthetic parts were not attached")); 4076 4077 /* Clear RUNNING bit ASAP. */ 4078 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4079 4080 /* Disable polling. */ 4081 hn_polling(sc, 0); 4082 4083 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4084 KASSERT(sc->hn_vf_ifp != NULL, 4085 ("%s: VF is not attached", ifp->if_xname)); 4086 4087 /* Mark transparent mode VF as disabled. */ 4088 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4089 4090 /* 4091 * NOTE: 4092 * Datapath setting must happen _before_ bringing 4093 * the VF down. 4094 */ 4095 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4096 4097 /* 4098 * Bring the VF down. 4099 */ 4100 hn_xpnt_vf_saveifflags(sc); 4101 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4102 hn_xpnt_vf_iocsetflags(sc); 4103 } 4104 4105 /* Suspend data transfers. */ 4106 hn_suspend_data(sc); 4107 4108 /* Clear OACTIVE bit. */ 4109 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4110 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4111 sc->hn_tx_ring[i].hn_oactive = 0; 4112 4113 /* 4114 * If the non-transparent mode VF is active, make sure 4115 * that the RX filter still allows packet reception. 4116 */ 4117 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4118 hn_rxfilter_config(sc); 4119 } 4120 4121 static void 4122 hn_init_locked(struct hn_softc *sc) 4123 { 4124 struct ifnet *ifp = sc->hn_ifp; 4125 int i; 4126 4127 HN_LOCK_ASSERT(sc); 4128 4129 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4130 return; 4131 4132 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4133 return; 4134 4135 /* Configure RX filter */ 4136 hn_rxfilter_config(sc); 4137 4138 /* Clear OACTIVE bit. */ 4139 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4140 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4141 sc->hn_tx_ring[i].hn_oactive = 0; 4142 4143 /* Clear TX 'suspended' bit. */ 4144 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4145 4146 if (hn_xpnt_vf_isready(sc)) { 4147 /* Initialize transparent VF. 
*/ 4148 hn_xpnt_vf_init(sc); 4149 } 4150 4151 /* Everything is ready; unleash! */ 4152 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4153 4154 /* Re-enable polling if requested. */ 4155 if (sc->hn_pollhz > 0) 4156 hn_polling(sc, sc->hn_pollhz); 4157 } 4158 4159 static void 4160 hn_init(void *xsc) 4161 { 4162 struct hn_softc *sc = xsc; 4163 4164 HN_LOCK(sc); 4165 hn_init_locked(sc); 4166 HN_UNLOCK(sc); 4167 } 4168 4169 #if __FreeBSD_version >= 1100099 4170 4171 static int 4172 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4173 { 4174 struct hn_softc *sc = arg1; 4175 unsigned int lenlim; 4176 int error; 4177 4178 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4179 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4180 if (error || req->newptr == NULL) 4181 return error; 4182 4183 HN_LOCK(sc); 4184 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4185 lenlim > TCP_LRO_LENGTH_MAX) { 4186 HN_UNLOCK(sc); 4187 return EINVAL; 4188 } 4189 hn_set_lro_lenlim(sc, lenlim); 4190 HN_UNLOCK(sc); 4191 4192 return 0; 4193 } 4194 4195 static int 4196 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4197 { 4198 struct hn_softc *sc = arg1; 4199 int ackcnt, error, i; 4200 4201 /* 4202 * lro_ackcnt_lim is append count limit, 4203 * +1 to turn it into aggregation limit. 4204 */ 4205 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4206 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4207 if (error || req->newptr == NULL) 4208 return error; 4209 4210 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4211 return EINVAL; 4212 4213 /* 4214 * Convert aggregation limit back to append 4215 * count limit. 4216 */ 4217 --ackcnt; 4218 HN_LOCK(sc); 4219 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4220 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4221 HN_UNLOCK(sc); 4222 return 0; 4223 } 4224 4225 #endif 4226 4227 static int 4228 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4229 { 4230 struct hn_softc *sc = arg1; 4231 int hcsum = arg2; 4232 int on, error, i; 4233 4234 on = 0; 4235 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4236 on = 1; 4237 4238 error = sysctl_handle_int(oidp, &on, 0, req); 4239 if (error || req->newptr == NULL) 4240 return error; 4241 4242 HN_LOCK(sc); 4243 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4244 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4245 4246 if (on) 4247 rxr->hn_trust_hcsum |= hcsum; 4248 else 4249 rxr->hn_trust_hcsum &= ~hcsum; 4250 } 4251 HN_UNLOCK(sc); 4252 return 0; 4253 } 4254 4255 static int 4256 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4257 { 4258 struct hn_softc *sc = arg1; 4259 int chim_size, error; 4260 4261 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4262 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4263 if (error || req->newptr == NULL) 4264 return error; 4265 4266 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4267 return EINVAL; 4268 4269 HN_LOCK(sc); 4270 hn_set_chim_size(sc, chim_size); 4271 HN_UNLOCK(sc); 4272 return 0; 4273 } 4274 4275 #if __FreeBSD_version < 1100095 4276 static int 4277 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4278 { 4279 struct hn_softc *sc = arg1; 4280 int ofs = arg2, i, error; 4281 struct hn_rx_ring *rxr; 4282 uint64_t stat; 4283 4284 stat = 0; 4285 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4286 rxr = &sc->hn_rx_ring[i]; 4287 stat += *((int *)((uint8_t *)rxr + ofs)); 4288 } 4289 4290 error = sysctl_handle_64(oidp, &stat, 0, req); 4291 if (error || req->newptr == NULL) 4292 return error; 4293 4294 /* Zero out this stat. 
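 * (This handler, like the u_long and uint64_t variants below,
 * reports the sum of the field across all RX rings and treats
 * any write as a request to clear it on every ring; the value
 * written is ignored.  E.g., assuming unit 0 and the
 * "lro_queued" node attached later in hn_create_rx_data():
 *
 *	sysctl dev.hn.0.lro_queued	# read the aggregate
 *	sysctl dev.hn.0.lro_queued=0	# reset it on all rings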
*/ 4295 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4296 rxr = &sc->hn_rx_ring[i]; 4297 *((int *)((uint8_t *)rxr + ofs)) = 0; 4298 } 4299 return 0; 4300 } 4301 #else 4302 static int 4303 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4304 { 4305 struct hn_softc *sc = arg1; 4306 int ofs = arg2, i, error; 4307 struct hn_rx_ring *rxr; 4308 uint64_t stat; 4309 4310 stat = 0; 4311 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4312 rxr = &sc->hn_rx_ring[i]; 4313 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4314 } 4315 4316 error = sysctl_handle_64(oidp, &stat, 0, req); 4317 if (error || req->newptr == NULL) 4318 return error; 4319 4320 /* Zero out this stat. */ 4321 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4322 rxr = &sc->hn_rx_ring[i]; 4323 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4324 } 4325 return 0; 4326 } 4327 4328 #endif 4329 4330 static int 4331 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4332 { 4333 struct hn_softc *sc = arg1; 4334 int ofs = arg2, i, error; 4335 struct hn_rx_ring *rxr; 4336 u_long stat; 4337 4338 stat = 0; 4339 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4340 rxr = &sc->hn_rx_ring[i]; 4341 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4342 } 4343 4344 error = sysctl_handle_long(oidp, &stat, 0, req); 4345 if (error || req->newptr == NULL) 4346 return error; 4347 4348 /* Zero out this stat. */ 4349 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4350 rxr = &sc->hn_rx_ring[i]; 4351 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4352 } 4353 return 0; 4354 } 4355 4356 static int 4357 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4358 { 4359 struct hn_softc *sc = arg1; 4360 int ofs = arg2, i, error; 4361 struct hn_tx_ring *txr; 4362 u_long stat; 4363 4364 stat = 0; 4365 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4366 txr = &sc->hn_tx_ring[i]; 4367 stat += *((u_long *)((uint8_t *)txr + ofs)); 4368 } 4369 4370 error = sysctl_handle_long(oidp, &stat, 0, req); 4371 if (error || req->newptr == NULL) 4372 return error; 4373 4374 /* Zero out this stat. 
*/ 4375 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4376 txr = &sc->hn_tx_ring[i]; 4377 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4378 } 4379 return 0; 4380 } 4381 4382 static int 4383 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4384 { 4385 struct hn_softc *sc = arg1; 4386 int ofs = arg2, i, error, conf; 4387 struct hn_tx_ring *txr; 4388 4389 txr = &sc->hn_tx_ring[0]; 4390 conf = *((int *)((uint8_t *)txr + ofs)); 4391 4392 error = sysctl_handle_int(oidp, &conf, 0, req); 4393 if (error || req->newptr == NULL) 4394 return error; 4395 4396 HN_LOCK(sc); 4397 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4398 txr = &sc->hn_tx_ring[i]; 4399 *((int *)((uint8_t *)txr + ofs)) = conf; 4400 } 4401 HN_UNLOCK(sc); 4402 4403 return 0; 4404 } 4405 4406 static int 4407 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4408 { 4409 struct hn_softc *sc = arg1; 4410 int error, size; 4411 4412 size = sc->hn_agg_size; 4413 error = sysctl_handle_int(oidp, &size, 0, req); 4414 if (error || req->newptr == NULL) 4415 return (error); 4416 4417 HN_LOCK(sc); 4418 sc->hn_agg_size = size; 4419 hn_set_txagg(sc); 4420 HN_UNLOCK(sc); 4421 4422 return (0); 4423 } 4424 4425 static int 4426 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4427 { 4428 struct hn_softc *sc = arg1; 4429 int error, pkts; 4430 4431 pkts = sc->hn_agg_pkts; 4432 error = sysctl_handle_int(oidp, &pkts, 0, req); 4433 if (error || req->newptr == NULL) 4434 return (error); 4435 4436 HN_LOCK(sc); 4437 sc->hn_agg_pkts = pkts; 4438 hn_set_txagg(sc); 4439 HN_UNLOCK(sc); 4440 4441 return (0); 4442 } 4443 4444 static int 4445 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4446 { 4447 struct hn_softc *sc = arg1; 4448 int pkts; 4449 4450 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4451 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4452 } 4453 4454 static int 4455 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4456 { 4457 struct hn_softc *sc = arg1; 4458 int align; 4459 4460 align = sc->hn_tx_ring[0].hn_agg_align; 4461 return (sysctl_handle_int(oidp, &align, 0, req)); 4462 } 4463 4464 static void 4465 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4466 { 4467 if (pollhz == 0) 4468 vmbus_chan_poll_disable(chan); 4469 else 4470 vmbus_chan_poll_enable(chan, pollhz); 4471 } 4472 4473 static void 4474 hn_polling(struct hn_softc *sc, u_int pollhz) 4475 { 4476 int nsubch = sc->hn_rx_ring_inuse - 1; 4477 4478 HN_LOCK_ASSERT(sc); 4479 4480 if (nsubch > 0) { 4481 struct vmbus_channel **subch; 4482 int i; 4483 4484 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4485 for (i = 0; i < nsubch; ++i) 4486 hn_chan_polling(subch[i], pollhz); 4487 vmbus_subchan_rel(subch, nsubch); 4488 } 4489 hn_chan_polling(sc->hn_prichan, pollhz); 4490 } 4491 4492 static int 4493 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4494 { 4495 struct hn_softc *sc = arg1; 4496 int pollhz, error; 4497 4498 pollhz = sc->hn_pollhz; 4499 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4500 if (error || req->newptr == NULL) 4501 return (error); 4502 4503 if (pollhz != 0 && 4504 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4505 return (EINVAL); 4506 4507 HN_LOCK(sc); 4508 if (sc->hn_pollhz != pollhz) { 4509 sc->hn_pollhz = pollhz; 4510 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4511 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4512 hn_polling(sc, sc->hn_pollhz); 4513 } 4514 HN_UNLOCK(sc); 4515 4516 return (0); 4517 } 4518 4519 static int 4520 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4521 { 4522 struct hn_softc *sc = arg1; 4523 char verstr[16]; 4524 4525 snprintf(verstr, sizeof(verstr), "%u.%u", 4526 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4527 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4528 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4529 } 4530 4531 static int 4532 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4533 { 4534 struct hn_softc *sc = arg1; 4535 char caps_str[128]; 4536 uint32_t caps; 4537 4538 HN_LOCK(sc); 4539 caps = sc->hn_caps; 4540 HN_UNLOCK(sc); 4541 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4542 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4543 } 4544 4545 static int 4546 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4547 { 4548 struct hn_softc *sc = arg1; 4549 char assist_str[128]; 4550 uint32_t hwassist; 4551 4552 HN_LOCK(sc); 4553 hwassist = sc->hn_ifp->if_hwassist; 4554 HN_UNLOCK(sc); 4555 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4556 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4557 } 4558 4559 static int 4560 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4561 { 4562 struct hn_softc *sc = arg1; 4563 char filter_str[128]; 4564 uint32_t filter; 4565 4566 HN_LOCK(sc); 4567 filter = sc->hn_rx_filter; 4568 HN_UNLOCK(sc); 4569 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4570 NDIS_PACKET_TYPES); 4571 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4572 } 4573 4574 static int 4575 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4576 { 4577 struct hn_softc *sc = arg1; 4578 uint32_t mtu; 4579 int error; 4580 HN_LOCK(sc); 4581 error = hn_rndis_get_mtu(sc, &mtu); 4582 if (error) { 4583 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4584 goto back; 4585 } 4586 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4587 if (error || req->newptr == NULL) 4588 goto back; 4589 4590 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4591 if (error) 4592 goto back; 4593 error = hn_rndis_reconf_offload(sc, mtu); 4594 back: 4595 HN_UNLOCK(sc); 4596 return (error); 4597 } 4598 #ifndef RSS 4599 4600 static int 4601 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4602 { 4603 struct hn_softc *sc = arg1; 4604 int error; 4605 4606 HN_LOCK(sc); 4607 4608 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4609 if (error || req->newptr == NULL) 4610 goto back; 4611 4612 if ((sc->hn_flags & HN_FLAG_RXVF) || 4613 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4614 /* 4615 * RSS key is synchronized w/ VF's, don't allow users 4616 * to change it. 4617 */ 4618 error = EBUSY; 4619 goto back; 4620 } 4621 4622 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4623 if (error) 4624 goto back; 4625 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4626 4627 if (sc->hn_rx_ring_inuse > 1) { 4628 error = hn_rss_reconfig(sc); 4629 } else { 4630 /* Not RSS capable, at least for now; just save the RSS key. */ 4631 error = 0; 4632 } 4633 back: 4634 HN_UNLOCK(sc); 4635 return (error); 4636 } 4637 4638 static int 4639 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4640 { 4641 struct hn_softc *sc = arg1; 4642 int error; 4643 4644 HN_LOCK(sc); 4645 4646 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4647 if (error || req->newptr == NULL) 4648 goto back; 4649 4650 /* 4651 * Don't allow RSS indirect table change, if this interface is not 4652 * RSS capable currently. 
4653 */ 4654 if (sc->hn_rx_ring_inuse == 1) { 4655 error = EOPNOTSUPP; 4656 goto back; 4657 } 4658 4659 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4660 if (error) 4661 goto back; 4662 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4663 4664 hn_rss_ind_fixup(sc); 4665 error = hn_rss_reconfig(sc); 4666 back: 4667 HN_UNLOCK(sc); 4668 return (error); 4669 } 4670 4671 #endif /* !RSS */ 4672 4673 static int 4674 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4675 { 4676 struct hn_softc *sc = arg1; 4677 char hash_str[128]; 4678 uint32_t hash; 4679 4680 HN_LOCK(sc); 4681 hash = sc->hn_rss_hash; 4682 HN_UNLOCK(sc); 4683 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4684 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4685 } 4686 4687 static int 4688 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4689 { 4690 struct hn_softc *sc = arg1; 4691 char hash_str[128]; 4692 uint32_t hash; 4693 4694 HN_LOCK(sc); 4695 hash = sc->hn_rss_hcap; 4696 HN_UNLOCK(sc); 4697 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4698 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4699 } 4700 4701 static int 4702 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4703 { 4704 struct hn_softc *sc = arg1; 4705 char hash_str[128]; 4706 uint32_t hash; 4707 4708 HN_LOCK(sc); 4709 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4710 HN_UNLOCK(sc); 4711 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4712 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4713 } 4714 4715 static int 4716 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4717 { 4718 struct hn_softc *sc = arg1; 4719 char vf_name[IFNAMSIZ + 1]; 4720 struct ifnet *vf_ifp; 4721 4722 HN_LOCK(sc); 4723 vf_name[0] = '\0'; 4724 vf_ifp = sc->hn_vf_ifp; 4725 if (vf_ifp != NULL) 4726 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4727 HN_UNLOCK(sc); 4728 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4729 } 4730 4731 static int 4732 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4733 { 4734 struct hn_softc *sc = arg1; 4735 char vf_name[IFNAMSIZ + 1]; 4736 struct ifnet *vf_ifp; 4737 4738 HN_LOCK(sc); 4739 vf_name[0] = '\0'; 4740 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4741 if (vf_ifp != NULL) 4742 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4743 HN_UNLOCK(sc); 4744 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4745 } 4746 4747 static int 4748 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4749 { 4750 struct rm_priotracker pt; 4751 struct sbuf *sb; 4752 int error, i; 4753 bool first; 4754 4755 error = sysctl_wire_old_buffer(req, 0); 4756 if (error != 0) 4757 return (error); 4758 4759 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4760 if (sb == NULL) 4761 return (ENOMEM); 4762 4763 rm_rlock(&hn_vfmap_lock, &pt); 4764 4765 first = true; 4766 for (i = 0; i < hn_vfmap_size; ++i) { 4767 struct epoch_tracker et; 4768 struct ifnet *ifp; 4769 4770 if (hn_vfmap[i] == NULL) 4771 continue; 4772 4773 NET_EPOCH_ENTER(et); 4774 ifp = ifnet_byindex(i); 4775 if (ifp != NULL) { 4776 if (first) 4777 sbuf_printf(sb, "%s", ifp->if_xname); 4778 else 4779 sbuf_printf(sb, " %s", ifp->if_xname); 4780 first = false; 4781 } 4782 NET_EPOCH_EXIT(et); 4783 } 4784 4785 rm_runlock(&hn_vfmap_lock, &pt); 4786 4787 error = sbuf_finish(sb); 4788 sbuf_delete(sb); 4789 return (error); 4790 } 4791 4792 static int 4793 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4794 { 4795 struct rm_priotracker pt; 4796 struct sbuf *sb; 4797 int error, i; 4798 bool first; 4799 4800 error = 
sysctl_wire_old_buffer(req, 0); 4801 if (error != 0) 4802 return (error); 4803 4804 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4805 if (sb == NULL) 4806 return (ENOMEM); 4807 4808 rm_rlock(&hn_vfmap_lock, &pt); 4809 4810 first = true; 4811 for (i = 0; i < hn_vfmap_size; ++i) { 4812 struct epoch_tracker et; 4813 struct ifnet *ifp, *hn_ifp; 4814 4815 hn_ifp = hn_vfmap[i]; 4816 if (hn_ifp == NULL) 4817 continue; 4818 4819 NET_EPOCH_ENTER(et); 4820 ifp = ifnet_byindex(i); 4821 if (ifp != NULL) { 4822 if (first) { 4823 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4824 hn_ifp->if_xname); 4825 } else { 4826 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4827 hn_ifp->if_xname); 4828 } 4829 first = false; 4830 } 4831 NET_EPOCH_EXIT(et); 4832 } 4833 4834 rm_runlock(&hn_vfmap_lock, &pt); 4835 4836 error = sbuf_finish(sb); 4837 sbuf_delete(sb); 4838 return (error); 4839 } 4840 4841 static int 4842 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4843 { 4844 struct hn_softc *sc = arg1; 4845 int error, onoff = 0; 4846 4847 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4848 onoff = 1; 4849 error = sysctl_handle_int(oidp, &onoff, 0, req); 4850 if (error || req->newptr == NULL) 4851 return (error); 4852 4853 HN_LOCK(sc); 4854 /* NOTE: hn_vf_lock for hn_transmit() */ 4855 rm_wlock(&sc->hn_vf_lock); 4856 if (onoff) 4857 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4858 else 4859 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4860 rm_wunlock(&sc->hn_vf_lock); 4861 HN_UNLOCK(sc); 4862 4863 return (0); 4864 } 4865 4866 static int 4867 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4868 { 4869 struct hn_softc *sc = arg1; 4870 int enabled = 0; 4871 4872 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4873 enabled = 1; 4874 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4875 } 4876 4877 static int 4878 hn_check_iplen(const struct mbuf *m, int hoff) 4879 { 4880 const struct ip *ip; 4881 int len, iphlen, iplen; 4882 const struct tcphdr *th; 4883 int thoff; /* TCP data offset */ 4884 4885 len = hoff + sizeof(struct ip); 4886 4887 /* The packet must be at least the size of an IP header. */ 4888 if (m->m_pkthdr.len < len) 4889 return IPPROTO_DONE; 4890 4891 /* The fixed IP header must reside completely in the first mbuf. */ 4892 if (m->m_len < len) 4893 return IPPROTO_DONE; 4894 4895 ip = mtodo(m, hoff); 4896 4897 /* Bound check the packet's stated IP header length. */ 4898 iphlen = ip->ip_hl << 2; 4899 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4900 return IPPROTO_DONE; 4901 4902 /* The full IP header must reside completely in the one mbuf. */ 4903 if (m->m_len < hoff + iphlen) 4904 return IPPROTO_DONE; 4905 4906 iplen = ntohs(ip->ip_len); 4907 4908 /* 4909 * Check that the amount of data in the buffers is as 4910 * at least much as the IP header would have us expect. 4911 */ 4912 if (m->m_pkthdr.len < hoff + iplen) 4913 return IPPROTO_DONE; 4914 4915 /* 4916 * Ignore IP fragments. 4917 */ 4918 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4919 return IPPROTO_DONE; 4920 4921 /* 4922 * The TCP/IP or UDP/IP header must be entirely contained within 4923 * the first fragment of a packet. 
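 * A concrete example of the arithmetic below: with no VLAN tag,
 * hoff is sizeof(struct ether_header) (14), a minimal IPv4
 * header gives iphlen = 20 and a minimal TCP header gives
 * thoff = 20, so the TCP case effectively requires
 *
 *	m->m_len >= hoff + iphlen + thoff	(14 + 20 + 20 = 54)
 *
 * bytes to be present in the first mbuf; anything shorter, or
 * any inconsistency between iplen, iphlen and thoff, makes this
 * function return IPPROTO_DONE instead of the real protocol.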
4924 */ 4925 switch (ip->ip_p) { 4926 case IPPROTO_TCP: 4927 if (iplen < iphlen + sizeof(struct tcphdr)) 4928 return IPPROTO_DONE; 4929 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4930 return IPPROTO_DONE; 4931 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4932 thoff = th->th_off << 2; 4933 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4934 return IPPROTO_DONE; 4935 if (m->m_len < hoff + iphlen + thoff) 4936 return IPPROTO_DONE; 4937 break; 4938 case IPPROTO_UDP: 4939 if (iplen < iphlen + sizeof(struct udphdr)) 4940 return IPPROTO_DONE; 4941 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4942 return IPPROTO_DONE; 4943 break; 4944 default: 4945 if (iplen < iphlen) 4946 return IPPROTO_DONE; 4947 break; 4948 } 4949 return ip->ip_p; 4950 } 4951 4952 static void 4953 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4954 { 4955 const struct ether_header *eh; 4956 uint16_t etype; 4957 int hoff; 4958 4959 hoff = sizeof(*eh); 4960 /* Checked at the beginning of this function. */ 4961 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4962 4963 eh = mtod(m_new, const struct ether_header *); 4964 etype = ntohs(eh->ether_type); 4965 if (etype == ETHERTYPE_VLAN) { 4966 const struct ether_vlan_header *evl; 4967 4968 hoff = sizeof(*evl); 4969 if (m_new->m_len < hoff) 4970 return; 4971 evl = mtod(m_new, const struct ether_vlan_header *); 4972 etype = ntohs(evl->evl_proto); 4973 } 4974 *l3proto = etype; 4975 4976 if (etype == ETHERTYPE_IP) 4977 *l4proto = hn_check_iplen(m_new, hoff); 4978 else 4979 *l4proto = IPPROTO_DONE; 4980 } 4981 4982 static int 4983 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4984 { 4985 struct sysctl_oid_list *child; 4986 struct sysctl_ctx_list *ctx; 4987 device_t dev = sc->hn_dev; 4988 #if defined(INET) || defined(INET6) 4989 #if __FreeBSD_version >= 1100095 4990 int lroent_cnt; 4991 #endif 4992 #endif 4993 int i; 4994 4995 /* 4996 * Create RXBUF for reception. 4997 * 4998 * NOTE: 4999 * - It is shared by all channels. 5000 * - A large enough buffer is allocated, certain version of NVSes 5001 * may further limit the usable space. 
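 * - Each RX ring's hn_rxbuf pointer, set up in the loop below,
 *   refers to this same buffer; the per-ring bufring (hn_br)
 *   and temporary packet buffer (hn_pktbuf) are still allocated
 *   separately for every ring.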
5002 */ 5003 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5004 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 5005 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5006 if (sc->hn_rxbuf == NULL) { 5007 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 5008 return (ENOMEM); 5009 } 5010 5011 sc->hn_rx_ring_cnt = ring_cnt; 5012 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 5013 5014 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 5015 M_DEVBUF, M_WAITOK | M_ZERO); 5016 5017 #if defined(INET) || defined(INET6) 5018 #if __FreeBSD_version >= 1100095 5019 lroent_cnt = hn_lro_entry_count; 5020 if (lroent_cnt < TCP_LRO_ENTRIES) 5021 lroent_cnt = TCP_LRO_ENTRIES; 5022 if (bootverbose) 5023 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 5024 #endif 5025 #endif /* INET || INET6 */ 5026 5027 ctx = device_get_sysctl_ctx(dev); 5028 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 5029 5030 /* Create dev.hn.UNIT.rx sysctl tree */ 5031 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 5032 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5033 5034 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5035 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5036 5037 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5038 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5039 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5040 if (rxr->hn_br == NULL) { 5041 device_printf(dev, "allocate bufring failed\n"); 5042 return (ENOMEM); 5043 } 5044 5045 if (hn_trust_hosttcp) 5046 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5047 if (hn_trust_hostudp) 5048 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5049 if (hn_trust_hostip) 5050 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5051 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5052 rxr->hn_ifp = sc->hn_ifp; 5053 if (i < sc->hn_tx_ring_cnt) 5054 rxr->hn_txr = &sc->hn_tx_ring[i]; 5055 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5056 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5057 rxr->hn_rx_idx = i; 5058 rxr->hn_rxbuf = sc->hn_rxbuf; 5059 5060 /* 5061 * Initialize LRO. 
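 * The per-ring limits set here (HN_LRO_LENLIM_DEF and
 * HN_LRO_ACKCNT_DEF, on kernels new enough to have them) are
 * only defaults; they can be changed at runtime through the
 * dev.hn.UNIT.lro_length_lim and dev.hn.UNIT.lro_ackcnt_lim
 * sysctls attached further down, which are backed by
 * hn_lro_lenlim_sysctl() and hn_lro_ackcnt_sysctl() above.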
5062 */ 5063 #if defined(INET) || defined(INET6) 5064 #if __FreeBSD_version >= 1100095 5065 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5066 hn_lro_mbufq_depth); 5067 #else 5068 tcp_lro_init(&rxr->hn_lro); 5069 rxr->hn_lro.ifp = sc->hn_ifp; 5070 #endif 5071 #if __FreeBSD_version >= 1100099 5072 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5073 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5074 #endif 5075 #endif /* INET || INET6 */ 5076 5077 if (sc->hn_rx_sysctl_tree != NULL) { 5078 char name[16]; 5079 5080 /* 5081 * Create per RX ring sysctl tree: 5082 * dev.hn.UNIT.rx.RINGID 5083 */ 5084 snprintf(name, sizeof(name), "%d", i); 5085 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5086 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5087 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5088 5089 if (rxr->hn_rx_sysctl_tree != NULL) { 5090 SYSCTL_ADD_ULONG(ctx, 5091 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5092 OID_AUTO, "packets", 5093 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5094 "# of packets received"); 5095 SYSCTL_ADD_ULONG(ctx, 5096 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5097 OID_AUTO, "rss_pkts", 5098 CTLFLAG_RW | CTLFLAG_STATS, 5099 &rxr->hn_rss_pkts, 5100 "# of packets w/ RSS info received"); 5101 SYSCTL_ADD_ULONG(ctx, 5102 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5103 OID_AUTO, "rsc_pkts", 5104 CTLFLAG_RW | CTLFLAG_STATS, 5105 &rxr->hn_rsc_pkts, 5106 "# of RSC packets received"); 5107 SYSCTL_ADD_ULONG(ctx, 5108 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5109 OID_AUTO, "rsc_drop", 5110 CTLFLAG_RW | CTLFLAG_STATS, 5111 &rxr->hn_rsc_drop, 5112 "# of RSC fragments dropped"); 5113 SYSCTL_ADD_INT(ctx, 5114 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5115 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5116 &rxr->hn_pktbuf_len, 0, 5117 "Temporary channel packet buffer length"); 5118 } 5119 } 5120 } 5121 5122 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5123 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5124 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5125 #if __FreeBSD_version < 1100095 5126 hn_rx_stat_int_sysctl, 5127 #else 5128 hn_rx_stat_u64_sysctl, 5129 #endif 5130 "LU", "LRO queued"); 5131 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5132 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5133 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5134 #if __FreeBSD_version < 1100095 5135 hn_rx_stat_int_sysctl, 5136 #else 5137 hn_rx_stat_u64_sysctl, 5138 #endif 5139 "LU", "LRO flushed"); 5140 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5141 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5142 __offsetof(struct hn_rx_ring, hn_lro_tried), 5143 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5144 #if __FreeBSD_version >= 1100099 5145 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5146 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5147 hn_lro_lenlim_sysctl, "IU", 5148 "Max # of data bytes to be aggregated by LRO"); 5149 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5150 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5151 hn_lro_ackcnt_sysctl, "I", 5152 "Max # of ACKs to be aggregated by LRO"); 5153 #endif 5154 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5155 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5156 hn_trust_hcsum_sysctl, "I", 5157 "Trust tcp segment verification on host side, " 5158 "when csum info is missing"); 5159 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5160 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5161 
hn_trust_hcsum_sysctl, "I", 5162 "Trust udp datagram verification on host side, " 5163 "when csum info is missing"); 5164 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5165 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5166 hn_trust_hcsum_sysctl, "I", 5167 "Trust ip packet verification on host side, " 5168 "when csum info is missing"); 5169 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5170 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5171 __offsetof(struct hn_rx_ring, hn_csum_ip), 5172 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5173 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5174 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5175 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5176 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5177 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5178 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5179 __offsetof(struct hn_rx_ring, hn_csum_udp), 5180 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5181 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5182 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5183 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5184 hn_rx_stat_ulong_sysctl, "LU", 5185 "# of packets that we trust host's csum verification"); 5186 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5187 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5188 __offsetof(struct hn_rx_ring, hn_small_pkts), 5189 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5190 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5191 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5192 __offsetof(struct hn_rx_ring, hn_ack_failed), 5193 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5194 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5195 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5196 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5197 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5198 5199 return (0); 5200 } 5201 5202 static void 5203 hn_destroy_rx_data(struct hn_softc *sc) 5204 { 5205 int i; 5206 5207 if (sc->hn_rxbuf != NULL) { 5208 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5209 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5210 else 5211 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5212 sc->hn_rxbuf = NULL; 5213 } 5214 5215 if (sc->hn_rx_ring_cnt == 0) 5216 return; 5217 5218 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5219 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5220 5221 if (rxr->hn_br == NULL) 5222 continue; 5223 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5224 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5225 } else { 5226 device_printf(sc->hn_dev, 5227 "%dth channel bufring is referenced", i); 5228 } 5229 rxr->hn_br = NULL; 5230 5231 #if defined(INET) || defined(INET6) 5232 tcp_lro_free(&rxr->hn_lro); 5233 #endif 5234 free(rxr->hn_pktbuf, M_DEVBUF); 5235 } 5236 free(sc->hn_rx_ring, M_DEVBUF); 5237 sc->hn_rx_ring = NULL; 5238 5239 sc->hn_rx_ring_cnt = 0; 5240 sc->hn_rx_ring_inuse = 0; 5241 } 5242 5243 static int 5244 hn_tx_ring_create(struct hn_softc *sc, int id) 5245 { 5246 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5247 device_t dev = sc->hn_dev; 5248 bus_dma_tag_t parent_dtag; 5249 int error, i; 5250 5251 txr->hn_sc = sc; 5252 txr->hn_tx_idx = id; 5253 5254 #ifndef HN_USE_TXDESC_BUFRING 5255 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5256 #endif 5257 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 
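	/*
	 * (The two locks created above serve different purposes:
	 * hn_tx_lock serializes transmission on this ring, while
	 * hn_txlist_spin only exists when the txdesc free list is
	 * kept in an SLIST rather than a buf_ring, i.e. when
	 * HN_USE_TXDESC_BUFRING is not defined.)
	 */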
5258 5259 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5260 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5261 M_DEVBUF, M_WAITOK | M_ZERO); 5262 #ifndef HN_USE_TXDESC_BUFRING 5263 SLIST_INIT(&txr->hn_txlist); 5264 #else 5265 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5266 M_WAITOK, &txr->hn_tx_lock); 5267 #endif 5268 5269 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5270 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5271 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5272 } else { 5273 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5274 } 5275 5276 #ifdef HN_IFSTART_SUPPORT 5277 if (hn_use_if_start) { 5278 txr->hn_txeof = hn_start_txeof; 5279 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5280 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5281 } else 5282 #endif 5283 { 5284 int br_depth; 5285 5286 txr->hn_txeof = hn_xmit_txeof; 5287 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5288 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5289 5290 br_depth = hn_get_txswq_depth(txr); 5291 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5292 M_WAITOK, &txr->hn_tx_lock); 5293 } 5294 5295 txr->hn_direct_tx_size = hn_direct_tx_size; 5296 5297 /* 5298 * Always schedule transmission instead of trying to do direct 5299 * transmission. This one gives the best performance so far. 5300 */ 5301 txr->hn_sched_tx = 1; 5302 5303 parent_dtag = bus_get_dma_tag(dev); 5304 5305 /* DMA tag for RNDIS packet messages. */ 5306 error = bus_dma_tag_create(parent_dtag, /* parent */ 5307 HN_RNDIS_PKT_ALIGN, /* alignment */ 5308 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5309 BUS_SPACE_MAXADDR, /* lowaddr */ 5310 BUS_SPACE_MAXADDR, /* highaddr */ 5311 NULL, NULL, /* filter, filterarg */ 5312 HN_RNDIS_PKT_LEN, /* maxsize */ 5313 1, /* nsegments */ 5314 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5315 0, /* flags */ 5316 NULL, /* lockfunc */ 5317 NULL, /* lockfuncarg */ 5318 &txr->hn_tx_rndis_dtag); 5319 if (error) { 5320 device_printf(dev, "failed to create rndis dmatag\n"); 5321 return error; 5322 } 5323 5324 /* DMA tag for data. */ 5325 error = bus_dma_tag_create(parent_dtag, /* parent */ 5326 1, /* alignment */ 5327 HN_TX_DATA_BOUNDARY, /* boundary */ 5328 BUS_SPACE_MAXADDR, /* lowaddr */ 5329 BUS_SPACE_MAXADDR, /* highaddr */ 5330 NULL, NULL, /* filter, filterarg */ 5331 HN_TX_DATA_MAXSIZE, /* maxsize */ 5332 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5333 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5334 0, /* flags */ 5335 NULL, /* lockfunc */ 5336 NULL, /* lockfuncarg */ 5337 &txr->hn_tx_data_dtag); 5338 if (error) { 5339 device_printf(dev, "failed to create data dmatag\n"); 5340 return error; 5341 } 5342 5343 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5344 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5345 5346 txd->txr = txr; 5347 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5348 STAILQ_INIT(&txd->agg_list); 5349 5350 /* 5351 * Allocate and load RNDIS packet message. 
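 * Each txdesc gets a dedicated, HN_RNDIS_PKT_LEN sized DMA
 * buffer here; bus_dmamap_load() records its bus address in
 * rndis_pkt_paddr via hyperv_dma_map_paddr, so the per-packet
 * RNDIS header can later be handed to the host without mapping
 * it again on every send.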
5352 */ 5353 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5354 (void **)&txd->rndis_pkt, 5355 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5356 &txd->rndis_pkt_dmap); 5357 if (error) { 5358 device_printf(dev, 5359 "failed to allocate rndis_packet_msg, %d\n", i); 5360 return error; 5361 } 5362 5363 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5364 txd->rndis_pkt_dmap, 5365 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5366 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5367 BUS_DMA_NOWAIT); 5368 if (error) { 5369 device_printf(dev, 5370 "failed to load rndis_packet_msg, %d\n", i); 5371 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5372 txd->rndis_pkt, txd->rndis_pkt_dmap); 5373 return error; 5374 } 5375 5376 /* DMA map for TX data. */ 5377 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5378 &txd->data_dmap); 5379 if (error) { 5380 device_printf(dev, 5381 "failed to allocate tx data dmamap\n"); 5382 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5383 txd->rndis_pkt_dmap); 5384 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5385 txd->rndis_pkt, txd->rndis_pkt_dmap); 5386 return error; 5387 } 5388 5389 /* All set, put it to list */ 5390 txd->flags |= HN_TXD_FLAG_ONLIST; 5391 #ifndef HN_USE_TXDESC_BUFRING 5392 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5393 #else 5394 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5395 #endif 5396 } 5397 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5398 5399 if (sc->hn_tx_sysctl_tree != NULL) { 5400 struct sysctl_oid_list *child; 5401 struct sysctl_ctx_list *ctx; 5402 char name[16]; 5403 5404 /* 5405 * Create per TX ring sysctl tree: 5406 * dev.hn.UNIT.tx.RINGID 5407 */ 5408 ctx = device_get_sysctl_ctx(dev); 5409 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5410 5411 snprintf(name, sizeof(name), "%d", id); 5412 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5413 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5414 5415 if (txr->hn_tx_sysctl_tree != NULL) { 5416 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5417 5418 #ifdef HN_DEBUG 5419 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5420 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5421 "# of available TX descs"); 5422 #endif 5423 #ifdef HN_IFSTART_SUPPORT 5424 if (!hn_use_if_start) 5425 #endif 5426 { 5427 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5428 CTLFLAG_RD, &txr->hn_oactive, 0, 5429 "over active"); 5430 } 5431 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5432 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5433 "# of packets transmitted"); 5434 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5435 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5436 "# of sends"); 5437 } 5438 } 5439 5440 return 0; 5441 } 5442 5443 static void 5444 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5445 { 5446 struct hn_tx_ring *txr = txd->txr; 5447 5448 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5449 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5450 5451 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5452 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5453 txd->rndis_pkt_dmap); 5454 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5455 } 5456 5457 static void 5458 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5459 { 5460 5461 KASSERT(txd->refs == 0 || txd->refs == 1, 5462 ("invalid txd refs %d", txd->refs)); 5463 5464 /* Aggregated txds will be freed by their aggregating txd. 
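 * (refs is expected to be 0 for txds already sitting on the free
 * list and 1 for in-flight ones, per the KASSERT above; only the
 * latter, and only if they are not on some other txd's
 * aggregation list, need an explicit hn_txdesc_put() here.)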
*/ 5465 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5466 int freed __diagused; 5467 5468 freed = hn_txdesc_put(txr, txd); 5469 KASSERT(freed, ("can't free txdesc")); 5470 } 5471 } 5472 5473 static void 5474 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5475 { 5476 int i; 5477 5478 if (txr->hn_txdesc == NULL) 5479 return; 5480 5481 /* 5482 * NOTE: 5483 * Because the freeing of aggregated txds will be deferred 5484 * to the aggregating txd, two passes are used here: 5485 * - The first pass GCes any pending txds. This GC is necessary, 5486 * since if the channels are revoked, hypervisor will not 5487 * deliver send-done for all pending txds. 5488 * - The second pass frees the busdma stuffs, i.e. after all txds 5489 * were freed. 5490 */ 5491 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5492 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5493 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5494 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5495 5496 if (txr->hn_tx_data_dtag != NULL) 5497 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5498 if (txr->hn_tx_rndis_dtag != NULL) 5499 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5500 5501 #ifdef HN_USE_TXDESC_BUFRING 5502 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5503 #endif 5504 5505 free(txr->hn_txdesc, M_DEVBUF); 5506 txr->hn_txdesc = NULL; 5507 5508 if (txr->hn_mbuf_br != NULL) 5509 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5510 5511 #ifndef HN_USE_TXDESC_BUFRING 5512 mtx_destroy(&txr->hn_txlist_spin); 5513 #endif 5514 mtx_destroy(&txr->hn_tx_lock); 5515 } 5516 5517 static int 5518 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5519 { 5520 struct sysctl_oid_list *child; 5521 struct sysctl_ctx_list *ctx; 5522 int i; 5523 5524 /* 5525 * Create TXBUF for chimney sending. 5526 * 5527 * NOTE: It is shared by all channels. 
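 *       Each TX ring only keeps a size limit (hn_chim_size) for
 *       this buffer; packets no larger than that limit may be
 *       copied into it ("chimney" send) instead of being mapped
 *       for scatter/gather.  The limit is capped by hn_chim_szmax
 *       (dev.hn.UNIT.tx_chimney_max) and is tunable through
 *       dev.hn.UNIT.tx_chimney_size.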
5528 */ 5529 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5530 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5531 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5532 if (sc->hn_chim == NULL) { 5533 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5534 return (ENOMEM); 5535 } 5536 5537 sc->hn_tx_ring_cnt = ring_cnt; 5538 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5539 5540 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5541 M_DEVBUF, M_WAITOK | M_ZERO); 5542 5543 ctx = device_get_sysctl_ctx(sc->hn_dev); 5544 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5545 5546 /* Create dev.hn.UNIT.tx sysctl tree */ 5547 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5548 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5549 5550 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5551 int error; 5552 5553 error = hn_tx_ring_create(sc, i); 5554 if (error) 5555 return error; 5556 } 5557 5558 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5559 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5560 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5561 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5562 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5563 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5564 __offsetof(struct hn_tx_ring, hn_send_failed), 5565 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5566 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5567 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5568 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5569 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5570 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5571 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5572 __offsetof(struct hn_tx_ring, hn_flush_failed), 5573 hn_tx_stat_ulong_sysctl, "LU", 5574 "# of packet transmission aggregation flush failure"); 5575 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5576 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5577 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5578 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5579 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5580 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5581 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5582 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5583 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5584 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5585 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5586 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5587 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5588 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5589 "# of total TX descs"); 5590 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5591 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5592 "Chimney send packet size upper boundary"); 5593 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5594 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5595 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5596 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5597 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5598 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5599 hn_tx_conf_int_sysctl, "I", 5600 "Size of the packet for direct transmission"); 5601 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5602 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5603 __offsetof(struct hn_tx_ring, 
hn_sched_tx), 5604 hn_tx_conf_int_sysctl, "I", 5605 "Always schedule transmission " 5606 "instead of doing direct transmission"); 5607 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5608 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5609 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5610 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5611 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5612 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5613 "Applied packet transmission aggregation size"); 5614 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5615 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5616 hn_txagg_pktmax_sysctl, "I", 5617 "Applied packet transmission aggregation packets"); 5618 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5619 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5620 hn_txagg_align_sysctl, "I", 5621 "Applied packet transmission aggregation alignment"); 5622 5623 return 0; 5624 } 5625 5626 static void 5627 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5628 { 5629 int i; 5630 5631 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5632 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5633 } 5634 5635 static void 5636 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5637 { 5638 struct ifnet *ifp = sc->hn_ifp; 5639 u_int hw_tsomax; 5640 int tso_minlen; 5641 5642 HN_LOCK_ASSERT(sc); 5643 5644 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5645 return; 5646 5647 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5648 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5649 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5650 5651 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5652 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5653 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5654 5655 if (tso_maxlen < tso_minlen) 5656 tso_maxlen = tso_minlen; 5657 else if (tso_maxlen > IP_MAXPACKET) 5658 tso_maxlen = IP_MAXPACKET; 5659 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5660 tso_maxlen = sc->hn_ndis_tso_szmax; 5661 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5662 5663 if (hn_xpnt_vf_isready(sc)) { 5664 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5665 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5666 } 5667 ifp->if_hw_tsomax = hw_tsomax; 5668 if (bootverbose) 5669 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5670 } 5671 5672 static void 5673 hn_fixup_tx_data(struct hn_softc *sc) 5674 { 5675 uint64_t csum_assist; 5676 int i; 5677 5678 hn_set_chim_size(sc, sc->hn_chim_szmax); 5679 if (hn_tx_chimney_size > 0 && 5680 hn_tx_chimney_size < sc->hn_chim_szmax) 5681 hn_set_chim_size(sc, hn_tx_chimney_size); 5682 5683 csum_assist = 0; 5684 if (sc->hn_caps & HN_CAP_IPCS) 5685 csum_assist |= CSUM_IP; 5686 if (sc->hn_caps & HN_CAP_TCP4CS) 5687 csum_assist |= CSUM_IP_TCP; 5688 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5689 csum_assist |= CSUM_IP_UDP; 5690 if (sc->hn_caps & HN_CAP_TCP6CS) 5691 csum_assist |= CSUM_IP6_TCP; 5692 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5693 csum_assist |= CSUM_IP6_UDP; 5694 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5695 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5696 5697 if (sc->hn_caps & HN_CAP_HASHVAL) { 5698 /* 5699 * Support HASHVAL pktinfo on TX path. 
5700 */ 5701 if (bootverbose) 5702 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5703 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5704 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5705 } 5706 } 5707 5708 static void 5709 hn_fixup_rx_data(struct hn_softc *sc) 5710 { 5711 5712 if (sc->hn_caps & HN_CAP_UDPHASH) { 5713 int i; 5714 5715 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5716 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5717 } 5718 } 5719 5720 static void 5721 hn_destroy_tx_data(struct hn_softc *sc) 5722 { 5723 int i; 5724 5725 if (sc->hn_chim != NULL) { 5726 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5727 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5728 } else { 5729 device_printf(sc->hn_dev, 5730 "chimney sending buffer is referenced"); 5731 } 5732 sc->hn_chim = NULL; 5733 } 5734 5735 if (sc->hn_tx_ring_cnt == 0) 5736 return; 5737 5738 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5739 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5740 5741 free(sc->hn_tx_ring, M_DEVBUF); 5742 sc->hn_tx_ring = NULL; 5743 5744 sc->hn_tx_ring_cnt = 0; 5745 sc->hn_tx_ring_inuse = 0; 5746 } 5747 5748 #ifdef HN_IFSTART_SUPPORT 5749 5750 static void 5751 hn_start_taskfunc(void *xtxr, int pending __unused) 5752 { 5753 struct hn_tx_ring *txr = xtxr; 5754 5755 mtx_lock(&txr->hn_tx_lock); 5756 hn_start_locked(txr, 0); 5757 mtx_unlock(&txr->hn_tx_lock); 5758 } 5759 5760 static int 5761 hn_start_locked(struct hn_tx_ring *txr, int len) 5762 { 5763 struct hn_softc *sc = txr->hn_sc; 5764 struct ifnet *ifp = sc->hn_ifp; 5765 int sched = 0; 5766 5767 KASSERT(hn_use_if_start, 5768 ("hn_start_locked is called, when if_start is disabled")); 5769 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5770 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5771 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5772 5773 if (__predict_false(txr->hn_suspended)) 5774 return (0); 5775 5776 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5777 IFF_DRV_RUNNING) 5778 return (0); 5779 5780 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5781 struct hn_txdesc *txd; 5782 struct mbuf *m_head; 5783 int error; 5784 5785 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5786 if (m_head == NULL) 5787 break; 5788 5789 if (len > 0 && m_head->m_pkthdr.len > len) { 5790 /* 5791 * This sending could be time consuming; let callers 5792 * dispatch this packet sending (and sending of any 5793 * following up packets) to tx taskqueue. 
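 * The threshold comes from the caller: hn_start() passes
 * txr->hn_direct_tx_size when it tries the direct path, while
 * hn_start_taskfunc() and hn_start_txeof_taskfunc() pass 0,
 * i.e. no limit, so the taskqueue always drains whatever was
 * left behind here.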
5794 */ 5795 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5796 sched = 1; 5797 break; 5798 } 5799 5800 #if defined(INET6) || defined(INET) 5801 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5802 m_head = hn_tso_fixup(m_head); 5803 if (__predict_false(m_head == NULL)) { 5804 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5805 continue; 5806 } 5807 } else if (m_head->m_pkthdr.csum_flags & 5808 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5809 m_head = hn_set_hlen(m_head); 5810 if (__predict_false(m_head == NULL)) { 5811 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5812 continue; 5813 } 5814 } 5815 #endif 5816 5817 txd = hn_txdesc_get(txr); 5818 if (txd == NULL) { 5819 txr->hn_no_txdescs++; 5820 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5821 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5822 break; 5823 } 5824 5825 error = hn_encap(ifp, txr, txd, &m_head); 5826 if (error) { 5827 /* Both txd and m_head are freed */ 5828 KASSERT(txr->hn_agg_txd == NULL, 5829 ("encap failed w/ pending aggregating txdesc")); 5830 continue; 5831 } 5832 5833 if (txr->hn_agg_pktleft == 0) { 5834 if (txr->hn_agg_txd != NULL) { 5835 KASSERT(m_head == NULL, 5836 ("pending mbuf for aggregating txdesc")); 5837 error = hn_flush_txagg(ifp, txr); 5838 if (__predict_false(error)) { 5839 atomic_set_int(&ifp->if_drv_flags, 5840 IFF_DRV_OACTIVE); 5841 break; 5842 } 5843 } else { 5844 KASSERT(m_head != NULL, ("mbuf was freed")); 5845 error = hn_txpkt(ifp, txr, txd); 5846 if (__predict_false(error)) { 5847 /* txd is freed, but m_head is not */ 5848 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5849 atomic_set_int(&ifp->if_drv_flags, 5850 IFF_DRV_OACTIVE); 5851 break; 5852 } 5853 } 5854 } 5855 #ifdef INVARIANTS 5856 else { 5857 KASSERT(txr->hn_agg_txd != NULL, 5858 ("no aggregating txdesc")); 5859 KASSERT(m_head == NULL, 5860 ("pending mbuf for aggregating txdesc")); 5861 } 5862 #endif 5863 } 5864 5865 /* Flush pending aggerated transmission. */ 5866 if (txr->hn_agg_txd != NULL) 5867 hn_flush_txagg(ifp, txr); 5868 return (sched); 5869 } 5870 5871 static void 5872 hn_start(struct ifnet *ifp) 5873 { 5874 struct hn_softc *sc = ifp->if_softc; 5875 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5876 5877 if (txr->hn_sched_tx) 5878 goto do_sched; 5879 5880 if (mtx_trylock(&txr->hn_tx_lock)) { 5881 int sched; 5882 5883 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5884 mtx_unlock(&txr->hn_tx_lock); 5885 if (!sched) 5886 return; 5887 } 5888 do_sched: 5889 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5890 } 5891 5892 static void 5893 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5894 { 5895 struct hn_tx_ring *txr = xtxr; 5896 5897 mtx_lock(&txr->hn_tx_lock); 5898 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5899 hn_start_locked(txr, 0); 5900 mtx_unlock(&txr->hn_tx_lock); 5901 } 5902 5903 static void 5904 hn_start_txeof(struct hn_tx_ring *txr) 5905 { 5906 struct hn_softc *sc = txr->hn_sc; 5907 struct ifnet *ifp = sc->hn_ifp; 5908 5909 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5910 5911 if (txr->hn_sched_tx) 5912 goto do_sched; 5913 5914 if (mtx_trylock(&txr->hn_tx_lock)) { 5915 int sched; 5916 5917 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5918 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5919 mtx_unlock(&txr->hn_tx_lock); 5920 if (sched) { 5921 taskqueue_enqueue(txr->hn_tx_taskq, 5922 &txr->hn_tx_task); 5923 } 5924 } else { 5925 do_sched: 5926 /* 5927 * Release the OACTIVE earlier, with the hope, that 5928 * others could catch up. 
The task will clear the 5929 * flag again with the hn_tx_lock to avoid possible 5930 * races. 5931 */ 5932 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5933 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5934 } 5935 } 5936 5937 #endif /* HN_IFSTART_SUPPORT */ 5938 5939 static int 5940 hn_xmit(struct hn_tx_ring *txr, int len) 5941 { 5942 struct hn_softc *sc = txr->hn_sc; 5943 struct ifnet *ifp = sc->hn_ifp; 5944 struct mbuf *m_head; 5945 int sched = 0; 5946 5947 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5948 #ifdef HN_IFSTART_SUPPORT 5949 KASSERT(hn_use_if_start == 0, 5950 ("hn_xmit is called, when if_start is enabled")); 5951 #endif 5952 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5953 5954 if (__predict_false(txr->hn_suspended)) 5955 return (0); 5956 5957 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5958 return (0); 5959 5960 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5961 struct hn_txdesc *txd; 5962 int error; 5963 5964 if (len > 0 && m_head->m_pkthdr.len > len) { 5965 /* 5966 * This sending could be time consuming; let callers 5967 * dispatch this packet sending (and sending of any 5968 * following up packets) to tx taskqueue. 5969 */ 5970 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5971 sched = 1; 5972 break; 5973 } 5974 5975 txd = hn_txdesc_get(txr); 5976 if (txd == NULL) { 5977 txr->hn_no_txdescs++; 5978 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5979 txr->hn_oactive = 1; 5980 break; 5981 } 5982 5983 error = hn_encap(ifp, txr, txd, &m_head); 5984 if (error) { 5985 /* Both txd and m_head are freed; discard */ 5986 KASSERT(txr->hn_agg_txd == NULL, 5987 ("encap failed w/ pending aggregating txdesc")); 5988 drbr_advance(ifp, txr->hn_mbuf_br); 5989 continue; 5990 } 5991 5992 if (txr->hn_agg_pktleft == 0) { 5993 if (txr->hn_agg_txd != NULL) { 5994 KASSERT(m_head == NULL, 5995 ("pending mbuf for aggregating txdesc")); 5996 error = hn_flush_txagg(ifp, txr); 5997 if (__predict_false(error)) { 5998 txr->hn_oactive = 1; 5999 break; 6000 } 6001 } else { 6002 KASSERT(m_head != NULL, ("mbuf was freed")); 6003 error = hn_txpkt(ifp, txr, txd); 6004 if (__predict_false(error)) { 6005 /* txd is freed, but m_head is not */ 6006 drbr_putback(ifp, txr->hn_mbuf_br, 6007 m_head); 6008 txr->hn_oactive = 1; 6009 break; 6010 } 6011 } 6012 } 6013 #ifdef INVARIANTS 6014 else { 6015 KASSERT(txr->hn_agg_txd != NULL, 6016 ("no aggregating txdesc")); 6017 KASSERT(m_head == NULL, 6018 ("pending mbuf for aggregating txdesc")); 6019 } 6020 #endif 6021 6022 /* Sent */ 6023 drbr_advance(ifp, txr->hn_mbuf_br); 6024 } 6025 6026 /* Flush pending aggerated transmission. */ 6027 if (txr->hn_agg_txd != NULL) 6028 hn_flush_txagg(ifp, txr); 6029 return (sched); 6030 } 6031 6032 static int 6033 hn_transmit(struct ifnet *ifp, struct mbuf *m) 6034 { 6035 struct hn_softc *sc = ifp->if_softc; 6036 struct hn_tx_ring *txr; 6037 int error, idx = 0; 6038 6039 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6040 struct rm_priotracker pt; 6041 6042 rm_rlock(&sc->hn_vf_lock, &pt); 6043 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6044 struct mbuf *m_bpf = NULL; 6045 int obytes, omcast; 6046 6047 obytes = m->m_pkthdr.len; 6048 omcast = (m->m_flags & M_MCAST) != 0; 6049 6050 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6051 if (bpf_peers_present(ifp->if_bpf)) { 6052 m_bpf = m_copypacket(m, M_NOWAIT); 6053 if (m_bpf == NULL) { 6054 /* 6055 * Failed to grab a shallow 6056 * copy; tap now. 
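 * Normally the copy is tapped only after the VF's
 * if_transmit() has accepted the packet; without a copy,
 * tap the original mbuf before it is handed to the VF
 * below.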
6057 */ 6058 ETHER_BPF_MTAP(ifp, m); 6059 } 6060 } 6061 } else { 6062 ETHER_BPF_MTAP(ifp, m); 6063 } 6064 6065 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6066 rm_runlock(&sc->hn_vf_lock, &pt); 6067 6068 if (m_bpf != NULL) { 6069 if (!error) 6070 ETHER_BPF_MTAP(ifp, m_bpf); 6071 m_freem(m_bpf); 6072 } 6073 6074 if (error == ENOBUFS) { 6075 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6076 } else if (error) { 6077 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6078 } else { 6079 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6080 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6081 if (omcast) { 6082 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6083 omcast); 6084 } 6085 } 6086 return (error); 6087 } 6088 rm_runlock(&sc->hn_vf_lock, &pt); 6089 } 6090 6091 #if defined(INET6) || defined(INET) 6092 /* 6093 * Perform TSO packet header fixup or get l2/l3 header length now, 6094 * since packet headers should be cache-hot. 6095 */ 6096 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6097 m = hn_tso_fixup(m); 6098 if (__predict_false(m == NULL)) { 6099 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6100 return EIO; 6101 } 6102 } else if (m->m_pkthdr.csum_flags & 6103 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6104 m = hn_set_hlen(m); 6105 if (__predict_false(m == NULL)) { 6106 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6107 return EIO; 6108 } 6109 } 6110 #endif 6111 6112 /* 6113 * Select the TX ring based on flowid 6114 */ 6115 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6116 #ifdef RSS 6117 uint32_t bid; 6118 6119 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6120 &bid) == 0) 6121 idx = bid % sc->hn_tx_ring_inuse; 6122 else 6123 #endif 6124 { 6125 #if defined(INET6) || defined(INET) 6126 int tcpsyn = 0; 6127 6128 if (m->m_pkthdr.len < 128 && 6129 (m->m_pkthdr.csum_flags & 6130 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6131 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6132 m = hn_check_tcpsyn(m, &tcpsyn); 6133 if (__predict_false(m == NULL)) { 6134 if_inc_counter(ifp, 6135 IFCOUNTER_OERRORS, 1); 6136 return (EIO); 6137 } 6138 } 6139 #else 6140 const int tcpsyn = 0; 6141 #endif 6142 if (tcpsyn) 6143 idx = 0; 6144 else 6145 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6146 } 6147 } 6148 txr = &sc->hn_tx_ring[idx]; 6149 6150 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6151 if (error) { 6152 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6153 return error; 6154 } 6155 6156 if (txr->hn_oactive) 6157 return 0; 6158 6159 if (txr->hn_sched_tx) 6160 goto do_sched; 6161 6162 if (mtx_trylock(&txr->hn_tx_lock)) { 6163 int sched; 6164 6165 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6166 mtx_unlock(&txr->hn_tx_lock); 6167 if (!sched) 6168 return 0; 6169 } 6170 do_sched: 6171 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6172 return 0; 6173 } 6174 6175 static void 6176 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6177 { 6178 struct mbuf *m; 6179 6180 mtx_lock(&txr->hn_tx_lock); 6181 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6182 m_freem(m); 6183 mtx_unlock(&txr->hn_tx_lock); 6184 } 6185 6186 static void 6187 hn_xmit_qflush(struct ifnet *ifp) 6188 { 6189 struct hn_softc *sc = ifp->if_softc; 6190 struct rm_priotracker pt; 6191 int i; 6192 6193 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6194 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6195 if_qflush(ifp); 6196 6197 rm_rlock(&sc->hn_vf_lock, &pt); 6198 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6199 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6200 rm_runlock(&sc->hn_vf_lock, &pt); 6201 } 6202 6203 static void 6204 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6205 { 6206 6207 if (txr->hn_sched_tx) 6208 goto do_sched; 6209 6210 if (mtx_trylock(&txr->hn_tx_lock)) { 6211 int sched; 6212 6213 txr->hn_oactive = 0; 6214 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6215 mtx_unlock(&txr->hn_tx_lock); 6216 if (sched) { 6217 taskqueue_enqueue(txr->hn_tx_taskq, 6218 &txr->hn_tx_task); 6219 } 6220 } else { 6221 do_sched: 6222 /* 6223 * Release the oactive earlier, with the hope, that 6224 * others could catch up. The task will clear the 6225 * oactive again with the hn_tx_lock to avoid possible 6226 * races. 6227 */ 6228 txr->hn_oactive = 0; 6229 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6230 } 6231 } 6232 6233 static void 6234 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6235 { 6236 struct hn_tx_ring *txr = xtxr; 6237 6238 mtx_lock(&txr->hn_tx_lock); 6239 hn_xmit(txr, 0); 6240 mtx_unlock(&txr->hn_tx_lock); 6241 } 6242 6243 static void 6244 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6245 { 6246 struct hn_tx_ring *txr = xtxr; 6247 6248 mtx_lock(&txr->hn_tx_lock); 6249 txr->hn_oactive = 0; 6250 hn_xmit(txr, 0); 6251 mtx_unlock(&txr->hn_tx_lock); 6252 } 6253 6254 static int 6255 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6256 { 6257 struct vmbus_chan_br cbr; 6258 struct hn_rx_ring *rxr; 6259 struct hn_tx_ring *txr = NULL; 6260 int idx, error; 6261 6262 idx = vmbus_chan_subidx(chan); 6263 6264 /* 6265 * Link this channel to RX/TX ring. 6266 */ 6267 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6268 ("invalid channel index %d, should > 0 && < %d", 6269 idx, sc->hn_rx_ring_inuse)); 6270 rxr = &sc->hn_rx_ring[idx]; 6271 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6272 ("RX ring %d already attached", idx)); 6273 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6274 rxr->hn_chan = chan; 6275 6276 if (bootverbose) { 6277 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6278 idx, vmbus_chan_id(chan)); 6279 } 6280 6281 if (idx < sc->hn_tx_ring_inuse) { 6282 txr = &sc->hn_tx_ring[idx]; 6283 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6284 ("TX ring %d already attached", idx)); 6285 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6286 6287 txr->hn_chan = chan; 6288 if (bootverbose) { 6289 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6290 idx, vmbus_chan_id(chan)); 6291 } 6292 } 6293 6294 /* Bind this channel to a proper CPU. */ 6295 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6296 6297 /* 6298 * Open this channel 6299 */ 6300 cbr.cbr = rxr->hn_br; 6301 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6302 cbr.cbr_txsz = HN_TXBR_SIZE; 6303 cbr.cbr_rxsz = HN_RXBR_SIZE; 6304 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6305 if (error) { 6306 if (error == EISCONN) { 6307 if_printf(sc->hn_ifp, "bufring is connected after " 6308 "chan%u open failure\n", vmbus_chan_id(chan)); 6309 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6310 } else { 6311 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6312 vmbus_chan_id(chan), error); 6313 } 6314 } 6315 return (error); 6316 } 6317 6318 static void 6319 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6320 { 6321 struct hn_rx_ring *rxr; 6322 int idx, error; 6323 6324 idx = vmbus_chan_subidx(chan); 6325 6326 /* 6327 * Link this channel to RX/TX ring. 
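 * (More precisely: locate the RX/TX ring this channel was
 * linked to, so the ATTACHED flags can be cleared before
 * the channel is closed below.)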
6328 */ 6329 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6330 ("invalid channel index %d, should > 0 && < %d", 6331 idx, sc->hn_rx_ring_inuse)); 6332 rxr = &sc->hn_rx_ring[idx]; 6333 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6334 ("RX ring %d is not attached", idx)); 6335 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6336 6337 if (idx < sc->hn_tx_ring_inuse) { 6338 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6339 6340 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6341 ("TX ring %d is not attached attached", idx)); 6342 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6343 } 6344 6345 /* 6346 * Close this channel. 6347 * 6348 * NOTE: 6349 * Channel closing does _not_ destroy the target channel. 6350 */ 6351 error = vmbus_chan_close_direct(chan); 6352 if (error == EISCONN) { 6353 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6354 "after being closed\n", vmbus_chan_id(chan)); 6355 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6356 } else if (error) { 6357 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6358 vmbus_chan_id(chan), error); 6359 } 6360 } 6361 6362 static int 6363 hn_attach_subchans(struct hn_softc *sc) 6364 { 6365 struct vmbus_channel **subchans; 6366 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6367 int i, error = 0; 6368 6369 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6370 6371 /* Attach the sub-channels. */ 6372 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6373 for (i = 0; i < subchan_cnt; ++i) { 6374 int error1; 6375 6376 error1 = hn_chan_attach(sc, subchans[i]); 6377 if (error1) { 6378 error = error1; 6379 /* Move on; all channels will be detached later. */ 6380 } 6381 } 6382 vmbus_subchan_rel(subchans, subchan_cnt); 6383 6384 if (error) { 6385 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6386 } else { 6387 if (bootverbose) { 6388 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6389 subchan_cnt); 6390 } 6391 } 6392 return (error); 6393 } 6394 6395 static void 6396 hn_detach_allchans(struct hn_softc *sc) 6397 { 6398 struct vmbus_channel **subchans; 6399 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6400 int i; 6401 6402 if (subchan_cnt == 0) 6403 goto back; 6404 6405 /* Detach the sub-channels. */ 6406 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6407 for (i = 0; i < subchan_cnt; ++i) 6408 hn_chan_detach(sc, subchans[i]); 6409 vmbus_subchan_rel(subchans, subchan_cnt); 6410 6411 back: 6412 /* 6413 * Detach the primary channel, _after_ all sub-channels 6414 * are detached. 6415 */ 6416 hn_chan_detach(sc, sc->hn_prichan); 6417 6418 /* Wait for sub-channels to be destroyed, if any. */ 6419 vmbus_subchan_drain(sc->hn_prichan); 6420 6421 #ifdef INVARIANTS 6422 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6423 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6424 HN_RX_FLAG_ATTACHED) == 0, 6425 ("%dth RX ring is still attached", i)); 6426 } 6427 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6428 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6429 HN_TX_FLAG_ATTACHED) == 0, 6430 ("%dth TX ring is still attached", i)); 6431 } 6432 #endif 6433 } 6434 6435 static int 6436 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6437 { 6438 struct vmbus_channel **subchans; 6439 int nchan, rxr_cnt, error; 6440 6441 nchan = *nsubch + 1; 6442 if (nchan == 1) { 6443 /* 6444 * Multiple RX/TX rings are not requested. 6445 */ 6446 *nsubch = 0; 6447 return (0); 6448 } 6449 6450 /* 6451 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6452 * table entries. 
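 * If the query fails, RSS is simply unavailable and the
 * driver falls back to the primary channel only; this is
 * not treated as an attach error.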
6453 */ 6454 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6455 if (error) { 6456 /* No RSS; this is benign. */ 6457 *nsubch = 0; 6458 return (0); 6459 } 6460 if (bootverbose) { 6461 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6462 rxr_cnt, nchan); 6463 } 6464 6465 if (nchan > rxr_cnt) 6466 nchan = rxr_cnt; 6467 if (nchan == 1) { 6468 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6469 *nsubch = 0; 6470 return (0); 6471 } 6472 6473 /* 6474 * Allocate sub-channels from NVS. 6475 */ 6476 *nsubch = nchan - 1; 6477 error = hn_nvs_alloc_subchans(sc, nsubch); 6478 if (error || *nsubch == 0) { 6479 /* Failed to allocate sub-channels. */ 6480 *nsubch = 0; 6481 return (0); 6482 } 6483 6484 /* 6485 * Wait for all sub-channels to become ready before moving on. 6486 */ 6487 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6488 vmbus_subchan_rel(subchans, *nsubch); 6489 return (0); 6490 } 6491 6492 static bool 6493 hn_synth_attachable(const struct hn_softc *sc) 6494 { 6495 int i; 6496 6497 if (sc->hn_flags & HN_FLAG_ERRORS) 6498 return (false); 6499 6500 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6501 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6502 6503 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6504 return (false); 6505 } 6506 return (true); 6507 } 6508 6509 /* 6510 * Make sure that the RX filter is zero after the successful 6511 * RNDIS initialization. 6512 * 6513 * NOTE: 6514 * Under certain conditions on certain versions of Hyper-V, 6515 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6516 * after the successful RNDIS initialization, which breaks 6517 * the assumption of any following code (well, it breaks the 6518 * RNDIS API contract actually). Clear the RNDIS rxfilter 6519 * explicitly, drain packets sneaking through, and drain the 6520 * interrupt taskqueues scheduled due to the stealth packets. 6521 */ 6522 static void 6523 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6524 { 6525 6526 hn_disable_rx(sc); 6527 hn_drain_rxtx(sc, nchan); 6528 } 6529 6530 static int 6531 hn_synth_attach(struct hn_softc *sc, int mtu) 6532 { 6533 #define ATTACHED_NVS 0x0002 6534 #define ATTACHED_RNDIS 0x0004 6535 6536 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6537 int error, nsubch, nchan = 1, i, rndis_inited; 6538 uint32_t old_caps, attached = 0; 6539 6540 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6541 ("synthetic parts were attached")); 6542 6543 if (!hn_synth_attachable(sc)) 6544 return (ENXIO); 6545 6546 /* Save capabilities for later verification. */ 6547 old_caps = sc->hn_caps; 6548 sc->hn_caps = 0; 6549 6550 /* Clear RSS stuffs. */ 6551 sc->hn_rss_ind_size = 0; 6552 sc->hn_rss_hash = 0; 6553 sc->hn_rss_hcap = 0; 6554 6555 /* 6556 * Attach the primary channel _before_ attaching NVS and RNDIS. 6557 */ 6558 error = hn_chan_attach(sc, sc->hn_prichan); 6559 if (error) 6560 goto failed; 6561 6562 /* 6563 * Attach NVS. 6564 */ 6565 error = hn_nvs_attach(sc, mtu); 6566 if (error) 6567 goto failed; 6568 attached |= ATTACHED_NVS; 6569 6570 /* 6571 * Attach RNDIS _after_ NVS is attached. 6572 */ 6573 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6574 if (rndis_inited) 6575 attached |= ATTACHED_RNDIS; 6576 if (error) 6577 goto failed; 6578 6579 /* 6580 * Make sure capabilities are not changed. 
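 * hn_caps was cleared above and has just been re-derived
 * while attaching NVS/RNDIS; on a re-attach of an already
 * attached device the new capability set must match the
 * saved one, otherwise the attach fails and whatever was
 * brought up is rolled back.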
6581 */ 6582 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6583 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6584 old_caps, sc->hn_caps); 6585 error = ENXIO; 6586 goto failed; 6587 } 6588 6589 /* 6590 * Allocate sub-channels for multi-TX/RX rings. 6591 * 6592 * NOTE: 6593 * The # of RX rings that can be used is equivalent to the # of 6594 * channels to be requested. 6595 */ 6596 nsubch = sc->hn_rx_ring_cnt - 1; 6597 error = hn_synth_alloc_subchans(sc, &nsubch); 6598 if (error) 6599 goto failed; 6600 /* NOTE: _Full_ synthetic parts detach is required now. */ 6601 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6602 6603 /* 6604 * Set the # of TX/RX rings that could be used according to 6605 * the # of channels that NVS offered. 6606 */ 6607 nchan = nsubch + 1; 6608 hn_set_ring_inuse(sc, nchan); 6609 if (nchan == 1) { 6610 /* Only the primary channel can be used; done */ 6611 goto back; 6612 } 6613 6614 /* 6615 * Attach the sub-channels. 6616 * 6617 * NOTE: hn_set_ring_inuse() _must_ have been called. 6618 */ 6619 error = hn_attach_subchans(sc); 6620 if (error) 6621 goto failed; 6622 6623 /* 6624 * Configure RSS key and indirect table _after_ all sub-channels 6625 * are attached. 6626 */ 6627 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6628 /* 6629 * RSS key is not set yet; set it to the default RSS key. 6630 */ 6631 if (bootverbose) 6632 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6633 #ifdef RSS 6634 rss_getkey(rss->rss_key); 6635 #else 6636 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6637 #endif 6638 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6639 } 6640 6641 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6642 /* 6643 * RSS indirect table is not set yet; set it up in round- 6644 * robin fashion. 6645 */ 6646 if (bootverbose) { 6647 if_printf(sc->hn_ifp, "setup default RSS indirect " 6648 "table\n"); 6649 } 6650 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6651 uint32_t subidx; 6652 6653 #ifdef RSS 6654 subidx = rss_get_indirection_to_bucket(i); 6655 #else 6656 subidx = i; 6657 #endif 6658 rss->rss_ind[i] = subidx % nchan; 6659 } 6660 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6661 } else { 6662 /* 6663 * # of usable channels may be changed, so we have to 6664 * make sure that all entries in RSS indirect table 6665 * are valid. 6666 * 6667 * NOTE: hn_set_ring_inuse() _must_ have been called. 6668 */ 6669 hn_rss_ind_fixup(sc); 6670 } 6671 6672 sc->hn_rss_hash = sc->hn_rss_hcap; 6673 if ((sc->hn_flags & HN_FLAG_RXVF) || 6674 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6675 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6676 hn_vf_rss_fixup(sc, false); 6677 } 6678 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6679 if (error) 6680 goto failed; 6681 back: 6682 /* 6683 * Fixup transmission aggregation setup. 6684 */ 6685 hn_set_txagg(sc); 6686 hn_rndis_init_fixat(sc, nchan); 6687 return (0); 6688 6689 failed: 6690 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6691 hn_rndis_init_fixat(sc, nchan); 6692 hn_synth_detach(sc); 6693 } else { 6694 if (attached & ATTACHED_RNDIS) { 6695 hn_rndis_init_fixat(sc, nchan); 6696 hn_rndis_detach(sc); 6697 } 6698 if (attached & ATTACHED_NVS) 6699 hn_nvs_detach(sc); 6700 hn_chan_detach(sc, sc->hn_prichan); 6701 /* Restore old capabilities. */ 6702 sc->hn_caps = old_caps; 6703 } 6704 return (error); 6705 6706 #undef ATTACHED_RNDIS 6707 #undef ATTACHED_NVS 6708 } 6709 6710 /* 6711 * NOTE: 6712 * The interface must have been suspended though hn_suspend(), before 6713 * this function get called. 
6714 */ 6715 static void 6716 hn_synth_detach(struct hn_softc *sc) 6717 { 6718 6719 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6720 ("synthetic parts were not attached")); 6721 6722 /* Detach the RNDIS first. */ 6723 hn_rndis_detach(sc); 6724 6725 /* Detach NVS. */ 6726 hn_nvs_detach(sc); 6727 6728 /* Detach all of the channels. */ 6729 hn_detach_allchans(sc); 6730 6731 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6732 /* 6733 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6734 */ 6735 int error; 6736 6737 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6738 sc->hn_rxbuf_gpadl); 6739 if (error) { 6740 if_printf(sc->hn_ifp, 6741 "rxbuf gpadl disconn failed: %d\n", error); 6742 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6743 } 6744 sc->hn_rxbuf_gpadl = 0; 6745 } 6746 6747 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6748 /* 6749 * Host is post-Win2016, disconnect chimney sending buffer from 6750 * primary channel here. 6751 */ 6752 int error; 6753 6754 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6755 sc->hn_chim_gpadl); 6756 if (error) { 6757 if_printf(sc->hn_ifp, 6758 "chim gpadl disconn failed: %d\n", error); 6759 sc->hn_flags |= HN_FLAG_CHIM_REF; 6760 } 6761 sc->hn_chim_gpadl = 0; 6762 } 6763 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6764 } 6765 6766 static void 6767 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6768 { 6769 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6770 ("invalid ring count %d", ring_cnt)); 6771 6772 if (sc->hn_tx_ring_cnt > ring_cnt) 6773 sc->hn_tx_ring_inuse = ring_cnt; 6774 else 6775 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6776 sc->hn_rx_ring_inuse = ring_cnt; 6777 6778 #ifdef RSS 6779 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6780 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6781 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6782 rss_getnumbuckets()); 6783 } 6784 #endif 6785 6786 if (bootverbose) { 6787 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6788 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6789 } 6790 } 6791 6792 static void 6793 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6794 { 6795 6796 /* 6797 * NOTE: 6798 * The TX bufring will not be drained by the hypervisor, 6799 * if the primary channel is revoked. 6800 */ 6801 while (!vmbus_chan_rx_empty(chan) || 6802 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6803 !vmbus_chan_tx_empty(chan))) 6804 pause("waitch", 1); 6805 vmbus_chan_intr_drain(chan); 6806 } 6807 6808 static void 6809 hn_disable_rx(struct hn_softc *sc) 6810 { 6811 6812 /* 6813 * Disable RX by clearing RX filter forcefully. 6814 */ 6815 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6816 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6817 6818 /* 6819 * Give RNDIS enough time to flush all pending data packets. 6820 */ 6821 pause("waitrx", (200 * hz) / 1000); 6822 } 6823 6824 /* 6825 * NOTE: 6826 * RX/TX _must_ have been suspended/disabled, before this function 6827 * is called. 6828 */ 6829 static void 6830 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6831 { 6832 struct vmbus_channel **subch = NULL; 6833 int nsubch; 6834 6835 /* 6836 * Drain RX/TX bufrings and interrupts. 
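 * The sub-channels are drained first, then the primary
 * channel; hn_chan_drain() spins until a channel's bufrings
 * are empty and then drains its pending interrupt
 * processing.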
6837 */ 6838 nsubch = nchan - 1; 6839 if (nsubch > 0) 6840 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6841 6842 if (subch != NULL) { 6843 int i; 6844 6845 for (i = 0; i < nsubch; ++i) 6846 hn_chan_drain(sc, subch[i]); 6847 } 6848 hn_chan_drain(sc, sc->hn_prichan); 6849 6850 if (subch != NULL) 6851 vmbus_subchan_rel(subch, nsubch); 6852 } 6853 6854 static void 6855 hn_suspend_data(struct hn_softc *sc) 6856 { 6857 struct hn_tx_ring *txr; 6858 int i; 6859 6860 HN_LOCK_ASSERT(sc); 6861 6862 /* 6863 * Suspend TX. 6864 */ 6865 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6866 txr = &sc->hn_tx_ring[i]; 6867 6868 mtx_lock(&txr->hn_tx_lock); 6869 txr->hn_suspended = 1; 6870 mtx_unlock(&txr->hn_tx_lock); 6871 /* No one is able send more packets now. */ 6872 6873 /* 6874 * Wait for all pending sends to finish. 6875 * 6876 * NOTE: 6877 * We will _not_ receive all pending send-done, if the 6878 * primary channel is revoked. 6879 */ 6880 while (hn_tx_ring_pending(txr) && 6881 !vmbus_chan_is_revoked(sc->hn_prichan)) 6882 pause("hnwtx", 1 /* 1 tick */); 6883 } 6884 6885 /* 6886 * Disable RX. 6887 */ 6888 hn_disable_rx(sc); 6889 6890 /* 6891 * Drain RX/TX. 6892 */ 6893 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6894 6895 /* 6896 * Drain any pending TX tasks. 6897 * 6898 * NOTE: 6899 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6900 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6901 */ 6902 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6903 txr = &sc->hn_tx_ring[i]; 6904 6905 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6906 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6907 } 6908 } 6909 6910 static void 6911 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6912 { 6913 6914 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6915 } 6916 6917 static void 6918 hn_suspend_mgmt(struct hn_softc *sc) 6919 { 6920 struct task task; 6921 6922 HN_LOCK_ASSERT(sc); 6923 6924 /* 6925 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6926 * through hn_mgmt_taskq. 6927 */ 6928 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6929 vmbus_chan_run_task(sc->hn_prichan, &task); 6930 6931 /* 6932 * Make sure that all pending management tasks are completed. 6933 */ 6934 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6935 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6936 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6937 } 6938 6939 static void 6940 hn_suspend(struct hn_softc *sc) 6941 { 6942 6943 /* Disable polling. */ 6944 hn_polling(sc, 0); 6945 6946 /* 6947 * If the non-transparent mode VF is activated, the synthetic 6948 * device is receiving packets, so the data path of the 6949 * synthetic device must be suspended. 6950 */ 6951 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6952 (sc->hn_flags & HN_FLAG_RXVF)) 6953 hn_suspend_data(sc); 6954 hn_suspend_mgmt(sc); 6955 } 6956 6957 static void 6958 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6959 { 6960 int i; 6961 6962 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6963 ("invalid TX ring count %d", tx_ring_cnt)); 6964 6965 for (i = 0; i < tx_ring_cnt; ++i) { 6966 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6967 6968 mtx_lock(&txr->hn_tx_lock); 6969 txr->hn_suspended = 0; 6970 mtx_unlock(&txr->hn_tx_lock); 6971 } 6972 } 6973 6974 static void 6975 hn_resume_data(struct hn_softc *sc) 6976 { 6977 int i; 6978 6979 HN_LOCK_ASSERT(sc); 6980 6981 /* 6982 * Re-enable RX. 
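 * hn_rxfilter_config() reprograms the RNDIS RX filter,
 * which hn_disable_rx() forced to NDIS_PACKET_TYPE_NONE
 * while the data path was suspended.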
6983 */ 6984 hn_rxfilter_config(sc); 6985 6986 /* 6987 * Make sure to clear suspend status on "all" TX rings, 6988 * since hn_tx_ring_inuse can be changed after 6989 * hn_suspend_data(). 6990 */ 6991 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6992 6993 #ifdef HN_IFSTART_SUPPORT 6994 if (!hn_use_if_start) 6995 #endif 6996 { 6997 /* 6998 * Flush unused drbrs, since hn_tx_ring_inuse may be 6999 * reduced. 7000 */ 7001 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 7002 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 7003 } 7004 7005 /* 7006 * Kick start TX. 7007 */ 7008 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 7009 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 7010 7011 /* 7012 * Use txeof task, so that any pending oactive can be 7013 * cleared properly. 7014 */ 7015 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 7016 } 7017 } 7018 7019 static void 7020 hn_resume_mgmt(struct hn_softc *sc) 7021 { 7022 7023 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 7024 7025 /* 7026 * Kick off network change detection, if it was pending. 7027 * If no network change was pending, start link status 7028 * checks, which is more lightweight than network change 7029 * detection. 7030 */ 7031 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 7032 hn_change_network(sc); 7033 else 7034 hn_update_link_status(sc); 7035 } 7036 7037 static void 7038 hn_resume(struct hn_softc *sc) 7039 { 7040 7041 /* 7042 * If the non-transparent mode VF is activated, the synthetic 7043 * device have to receive packets, so the data path of the 7044 * synthetic device must be resumed. 7045 */ 7046 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7047 (sc->hn_flags & HN_FLAG_RXVF)) 7048 hn_resume_data(sc); 7049 7050 /* 7051 * Don't resume link status change if VF is attached/activated. 7052 * - In the non-transparent VF mode, the synthetic device marks 7053 * link down until the VF is deactivated; i.e. VF is down. 7054 * - In transparent VF mode, VF's media status is used until 7055 * the VF is detached. 7056 */ 7057 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7058 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7059 hn_resume_mgmt(sc); 7060 7061 /* 7062 * Re-enable polling if this interface is running and 7063 * the polling is requested. 7064 */ 7065 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7066 hn_polling(sc, sc->hn_pollhz); 7067 } 7068 7069 static void 7070 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7071 { 7072 const struct rndis_status_msg *msg; 7073 int ofs; 7074 7075 if (dlen < sizeof(*msg)) { 7076 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7077 return; 7078 } 7079 msg = data; 7080 7081 switch (msg->rm_status) { 7082 case RNDIS_STATUS_MEDIA_CONNECT: 7083 case RNDIS_STATUS_MEDIA_DISCONNECT: 7084 hn_update_link_status(sc); 7085 break; 7086 7087 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7088 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7089 /* Not really useful; ignore. 
*/ 7090 break; 7091 7092 case RNDIS_STATUS_NETWORK_CHANGE: 7093 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7094 if (dlen < ofs + msg->rm_stbuflen || 7095 msg->rm_stbuflen < sizeof(uint32_t)) { 7096 if_printf(sc->hn_ifp, "network changed\n"); 7097 } else { 7098 uint32_t change; 7099 7100 memcpy(&change, ((const uint8_t *)msg) + ofs, 7101 sizeof(change)); 7102 if_printf(sc->hn_ifp, "network changed, change %u\n", 7103 change); 7104 } 7105 hn_change_network(sc); 7106 break; 7107 7108 default: 7109 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7110 msg->rm_status); 7111 break; 7112 } 7113 } 7114 7115 static int 7116 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7117 { 7118 const struct rndis_pktinfo *pi = info_data; 7119 uint32_t mask = 0; 7120 7121 while (info_dlen != 0) { 7122 const void *data; 7123 uint32_t dlen; 7124 7125 if (__predict_false(info_dlen < sizeof(*pi))) 7126 return (EINVAL); 7127 if (__predict_false(info_dlen < pi->rm_size)) 7128 return (EINVAL); 7129 info_dlen -= pi->rm_size; 7130 7131 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7132 return (EINVAL); 7133 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7134 return (EINVAL); 7135 dlen = pi->rm_size - pi->rm_pktinfooffset; 7136 data = pi->rm_data; 7137 7138 if (pi->rm_internal == 1) { 7139 switch (pi->rm_type) { 7140 case NDIS_PKTINFO_IT_PKTINFO_ID: 7141 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7142 return (EINVAL); 7143 info->pktinfo_id = 7144 (const struct packet_info_id *)data; 7145 mask |= HN_RXINFO_PKTINFO_ID; 7146 break; 7147 7148 default: 7149 goto next; 7150 } 7151 } else { 7152 switch (pi->rm_type) { 7153 case NDIS_PKTINFO_TYPE_VLAN: 7154 if (__predict_false(dlen 7155 < NDIS_VLAN_INFO_SIZE)) 7156 return (EINVAL); 7157 info->vlan_info = (const uint32_t *)data; 7158 mask |= HN_RXINFO_VLAN; 7159 break; 7160 7161 case NDIS_PKTINFO_TYPE_CSUM: 7162 if (__predict_false(dlen 7163 < NDIS_RXCSUM_INFO_SIZE)) 7164 return (EINVAL); 7165 info->csum_info = (const uint32_t *)data; 7166 mask |= HN_RXINFO_CSUM; 7167 break; 7168 7169 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7170 if (__predict_false(dlen 7171 < HN_NDIS_HASH_VALUE_SIZE)) 7172 return (EINVAL); 7173 info->hash_value = (const uint32_t *)data; 7174 mask |= HN_RXINFO_HASHVAL; 7175 break; 7176 7177 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7178 if (__predict_false(dlen 7179 < HN_NDIS_HASH_INFO_SIZE)) 7180 return (EINVAL); 7181 info->hash_info = (const uint32_t *)data; 7182 mask |= HN_RXINFO_HASHINF; 7183 break; 7184 7185 default: 7186 goto next; 7187 } 7188 } 7189 7190 if (mask == HN_RXINFO_ALL) { 7191 /* All found; done */ 7192 break; 7193 } 7194 next: 7195 pi = (const struct rndis_pktinfo *) 7196 ((const uint8_t *)pi + pi->rm_size); 7197 } 7198 7199 /* 7200 * Final fixup. 7201 * - If there is no hash value, invalidate the hash info. 
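 *   A hash type reported without an accompanying hash
 *   value cannot be used for the mbuf flow ID, so drop it
 *   here.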
7202 */
7203 if ((mask & HN_RXINFO_HASHVAL) == 0)
7204 info->hash_info = NULL;
7205 return (0);
7206 }
7207
7208 static __inline bool
7209 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7210 {
7211
7212 if (off < check_off) {
7213 if (__predict_true(off + len <= check_off))
7214 return (false);
7215 } else if (off > check_off) {
7216 if (__predict_true(check_off + check_len <= off))
7217 return (false);
7218 }
7219 return (true);
7220 }
7221
7222 static __inline void
7223 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7224 uint32_t len, struct hn_rxinfo *info)
7225 {
7226 uint32_t cnt = rxr->rsc.cnt;
7227
7228 if (cnt) {
7229 rxr->rsc.pktlen += len;
7230 } else {
7231 rxr->rsc.vlan_info = info->vlan_info;
7232 rxr->rsc.csum_info = info->csum_info;
7233 rxr->rsc.hash_info = info->hash_info;
7234 rxr->rsc.hash_value = info->hash_value;
7235 rxr->rsc.pktlen = len;
7236 }
7237
7238 rxr->rsc.frag_data[cnt] = data;
7239 rxr->rsc.frag_len[cnt] = len;
7240 rxr->rsc.cnt++;
7241 }
7242
7243 static void
7244 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7245 {
7246 const struct rndis_packet_msg *pkt;
7247 struct hn_rxinfo info;
7248 int data_off, pktinfo_off, data_len, pktinfo_len;
7249 bool rsc_more = false;
7250
7251 /*
7252 * Check length.
7253 */
7254 if (__predict_false(dlen < sizeof(*pkt))) {
7255 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7256 return;
7257 }
7258 pkt = data;
7259
7260 if (__predict_false(dlen < pkt->rm_len)) {
7261 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7262 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7263 return;
7264 }
7265 if (__predict_false(pkt->rm_len <
7266 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7267 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7268 "msglen %u, data %u, oob %u, pktinfo %u\n",
7269 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7270 pkt->rm_pktinfolen);
7271 return;
7272 }
7273 if (__predict_false(pkt->rm_datalen == 0)) {
7274 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7275 return;
7276 }
7277
7278 /*
7279 * Check offsets.
7280 */
7281 #define IS_OFFSET_INVALID(ofs) \
7282 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7283 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7284
7285 /* XXX Hyper-V does not meet data offset alignment requirement */
7286 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7287 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7288 "data offset %u\n", pkt->rm_dataoffset);
7289 return;
7290 }
7291 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7292 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7293 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7294 "oob offset %u\n", pkt->rm_oobdataoffset);
7295 return;
7296 }
7297 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7298 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7299 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7300 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7301 return;
7302 }
7303
7304 #undef IS_OFFSET_INVALID
7305
7306 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7307 data_len = pkt->rm_datalen;
7308 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7309 pktinfo_len = pkt->rm_pktinfolen;
7310
7311 /*
7312 * Check OOB coverage.
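 * Out-of-band data is not expected on the data path; when
 * it shows up it is only validated against the message
 * length and for overlap with the data and pktinfo
 * regions, and is otherwise ignored.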
7313 */ 7314 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7315 int oob_off, oob_len; 7316 7317 if_printf(rxr->hn_ifp, "got oobdata\n"); 7318 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7319 oob_len = pkt->rm_oobdatalen; 7320 7321 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7322 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7323 "oob overflow, msglen %u, oob abs %d len %d\n", 7324 pkt->rm_len, oob_off, oob_len); 7325 return; 7326 } 7327 7328 /* 7329 * Check against data. 7330 */ 7331 if (hn_rndis_check_overlap(oob_off, oob_len, 7332 data_off, data_len)) { 7333 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7334 "oob overlaps data, oob abs %d len %d, " 7335 "data abs %d len %d\n", 7336 oob_off, oob_len, data_off, data_len); 7337 return; 7338 } 7339 7340 /* 7341 * Check against pktinfo. 7342 */ 7343 if (pktinfo_len != 0 && 7344 hn_rndis_check_overlap(oob_off, oob_len, 7345 pktinfo_off, pktinfo_len)) { 7346 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7347 "oob overlaps pktinfo, oob abs %d len %d, " 7348 "pktinfo abs %d len %d\n", 7349 oob_off, oob_len, pktinfo_off, pktinfo_len); 7350 return; 7351 } 7352 } 7353 7354 /* 7355 * Check per-packet-info coverage and find useful per-packet-info. 7356 */ 7357 info.vlan_info = NULL; 7358 info.csum_info = NULL; 7359 info.hash_info = NULL; 7360 info.pktinfo_id = NULL; 7361 7362 if (__predict_true(pktinfo_len != 0)) { 7363 bool overlap; 7364 int error; 7365 7366 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7367 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7368 "pktinfo overflow, msglen %u, " 7369 "pktinfo abs %d len %d\n", 7370 pkt->rm_len, pktinfo_off, pktinfo_len); 7371 return; 7372 } 7373 7374 /* 7375 * Check packet info coverage. 7376 */ 7377 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7378 data_off, data_len); 7379 if (__predict_false(overlap)) { 7380 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7381 "pktinfo overlap data, pktinfo abs %d len %d, " 7382 "data abs %d len %d\n", 7383 pktinfo_off, pktinfo_len, data_off, data_len); 7384 return; 7385 } 7386 7387 /* 7388 * Find useful per-packet-info. 
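 * hn_rndis_rxinfo() walks the pktinfo list and records
 * pointers to the VLAN, checksum, hash value/type and
 * packet-id entries in 'info'; a malformed list causes
 * the whole RNDIS packet to be dropped.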
7389 */ 7390 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7391 pktinfo_len, &info); 7392 if (__predict_false(error)) { 7393 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7394 "pktinfo\n"); 7395 return; 7396 } 7397 } 7398 7399 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7400 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7401 "data overflow, msglen %u, data abs %d len %d\n", 7402 pkt->rm_len, data_off, data_len); 7403 return; 7404 } 7405 7406 /* Identify RSC fragments, drop invalid packets */ 7407 if ((info.pktinfo_id != NULL) && 7408 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7409 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7410 rxr->rsc.cnt = 0; 7411 rxr->hn_rsc_pkts++; 7412 } else if (rxr->rsc.cnt == 0) 7413 goto drop; 7414 7415 rsc_more = true; 7416 7417 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7418 rsc_more = false; 7419 7420 if (rsc_more && rxr->rsc.is_last) 7421 goto drop; 7422 } else { 7423 rxr->rsc.cnt = 0; 7424 } 7425 7426 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7427 goto drop; 7428 7429 /* Store data in per rx ring structure */ 7430 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7431 data_len, &info); 7432 7433 if (rsc_more) 7434 return; 7435 7436 hn_rxpkt(rxr); 7437 rxr->rsc.cnt = 0; 7438 return; 7439 drop: 7440 rxr->hn_rsc_drop++; 7441 return; 7442 } 7443 7444 static __inline void 7445 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7446 { 7447 const struct rndis_msghdr *hdr; 7448 7449 if (__predict_false(dlen < sizeof(*hdr))) { 7450 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7451 return; 7452 } 7453 hdr = data; 7454 7455 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7456 /* Hot data path. */ 7457 hn_rndis_rx_data(rxr, data, dlen); 7458 /* Done! */ 7459 return; 7460 } 7461 7462 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7463 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7464 else 7465 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7466 } 7467 7468 static void 7469 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7470 { 7471 const struct hn_nvs_hdr *hdr; 7472 7473 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7474 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7475 return; 7476 } 7477 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7478 7479 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7480 /* Useless; ignore */ 7481 return; 7482 } 7483 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7484 } 7485 7486 static void 7487 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7488 const struct vmbus_chanpkt_hdr *pkt) 7489 { 7490 struct hn_nvs_sendctx *sndc; 7491 7492 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7493 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7494 VMBUS_CHANPKT_DATALEN(pkt)); 7495 /* 7496 * NOTE: 7497 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7498 * its callback. 
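 * The send context pointer travels through the host
 * untouched as the channel packet's transaction id, which
 * is why it can simply be cast back above.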
7499 */ 7500 } 7501 7502 static void 7503 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7504 const struct vmbus_chanpkt_hdr *pkthdr) 7505 { 7506 struct epoch_tracker et; 7507 const struct vmbus_chanpkt_rxbuf *pkt; 7508 const struct hn_nvs_hdr *nvs_hdr; 7509 int count, i, hlen; 7510 7511 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7512 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7513 return; 7514 } 7515 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7516 7517 /* Make sure that this is a RNDIS message. */ 7518 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7519 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7520 nvs_hdr->nvs_type); 7521 return; 7522 } 7523 7524 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7525 if (__predict_false(hlen < sizeof(*pkt))) { 7526 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7527 return; 7528 } 7529 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7530 7531 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7532 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7533 pkt->cp_rxbuf_id); 7534 return; 7535 } 7536 7537 count = pkt->cp_rxbuf_cnt; 7538 if (__predict_false(hlen < 7539 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7540 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7541 return; 7542 } 7543 7544 NET_EPOCH_ENTER(et); 7545 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7546 for (i = 0; i < count; ++i) { 7547 int ofs, len; 7548 7549 ofs = pkt->cp_rxbuf[i].rb_ofs; 7550 len = pkt->cp_rxbuf[i].rb_len; 7551 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7552 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7553 "ofs %d, len %d\n", i, ofs, len); 7554 continue; 7555 } 7556 7557 rxr->rsc.is_last = (i == (count - 1)); 7558 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7559 } 7560 NET_EPOCH_EXIT(et); 7561 7562 /* 7563 * Ack the consumed RXBUF associated w/ this channel packet, 7564 * so that this RXBUF can be recycled by the hypervisor. 7565 */ 7566 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7567 } 7568 7569 static void 7570 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7571 uint64_t tid) 7572 { 7573 struct hn_nvs_rndis_ack ack; 7574 int retries, error; 7575 7576 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7577 ack.nvs_status = HN_NVS_STATUS_OK; 7578 7579 retries = 0; 7580 again: 7581 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7582 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7583 if (__predict_false(error == EAGAIN)) { 7584 /* 7585 * NOTE: 7586 * This should _not_ happen in real world, since the 7587 * consumption of the TX bufring from the TX path is 7588 * controlled. 7589 */ 7590 if (rxr->hn_ack_failed == 0) 7591 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7592 rxr->hn_ack_failed++; 7593 retries++; 7594 if (retries < 10) { 7595 DELAY(100); 7596 goto again; 7597 } 7598 /* RXBUF leaks! */ 7599 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7600 } 7601 } 7602 7603 static void 7604 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7605 { 7606 struct hn_rx_ring *rxr = xrxr; 7607 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7608 7609 for (;;) { 7610 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7611 int error, pktlen; 7612 7613 pktlen = rxr->hn_pktbuf_len; 7614 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7615 if (__predict_false(error == ENOBUFS)) { 7616 void *nbuf; 7617 int nlen; 7618 7619 /* 7620 * Expand channel packet buffer. 
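 * vmbus_chan_recv_pkt() reports the size it needs in
 * 'pktlen' when it fails with ENOBUFS, so keep doubling
 * the buffer until it fits and retry the receive.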
7621 *
7622 * XXX
7623 * Use M_WAITOK here, since allocation failure
7624 * is fatal.
7625 */
7626 nlen = rxr->hn_pktbuf_len * 2;
7627 while (nlen < pktlen)
7628 nlen *= 2;
7629 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7630
7631 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7632 rxr->hn_pktbuf_len, nlen);
7633
7634 free(rxr->hn_pktbuf, M_DEVBUF);
7635 rxr->hn_pktbuf = nbuf;
7636 rxr->hn_pktbuf_len = nlen;
7637 /* Retry! */
7638 continue;
7639 } else if (__predict_false(error == EAGAIN)) {
7640 /* No more channel packets; done! */
7641 break;
7642 }
7643 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7644
7645 switch (pkt->cph_type) {
7646 case VMBUS_CHANPKT_TYPE_COMP:
7647 hn_nvs_handle_comp(sc, chan, pkt);
7648 break;
7649
7650 case VMBUS_CHANPKT_TYPE_RXBUF:
7651 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7652 break;
7653
7654 case VMBUS_CHANPKT_TYPE_INBAND:
7655 hn_nvs_handle_notify(sc, pkt);
7656 break;
7657
7658 default:
7659 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7660 pkt->cph_type);
7661 break;
7662 }
7663 }
7664 hn_chan_rollup(rxr, rxr->hn_txr);
7665 }
7666
7667 static void
7668 hn_sysinit(void *arg __unused)
7669 {
7670 int i;
7671
7672 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7673
7674 #ifdef HN_IFSTART_SUPPORT
7675 /*
7676 * Don't use ifnet.if_start if transparent VF mode is requested;
7677 * mainly due to the IFF_DRV_OACTIVE flag.
7678 */
7679 if (hn_xpnt_vf && hn_use_if_start) {
7680 hn_use_if_start = 0;
7681 printf("hn: transparent VF mode, if_transmit will be used, "
7682 "instead of if_start\n");
7683 }
7684 #endif
7685 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7686 printf("hn: invalid transparent VF attach routing "
7687 "wait timeout %d, reset to %d\n",
7688 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7689 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7690 }
7691
7692 /*
7693 * Initialize VF map.
7694 */
7695 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7696 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7697 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7698 M_WAITOK | M_ZERO);
7699
7700 /*
7701 * Fix the # of TX taskqueues.
7702 */
7703 if (hn_tx_taskq_cnt <= 0)
7704 hn_tx_taskq_cnt = 1;
7705 else if (hn_tx_taskq_cnt > mp_ncpus)
7706 hn_tx_taskq_cnt = mp_ncpus;
7707
7708 /*
7709 * Fix the TX taskqueue mode.
7710 */ 7711 switch (hn_tx_taskq_mode) { 7712 case HN_TX_TASKQ_M_INDEP: 7713 case HN_TX_TASKQ_M_GLOBAL: 7714 case HN_TX_TASKQ_M_EVTTQ: 7715 break; 7716 default: 7717 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7718 break; 7719 } 7720 7721 if (vm_guest != VM_GUEST_HV) 7722 return; 7723 7724 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7725 return; 7726 7727 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7728 M_DEVBUF, M_WAITOK); 7729 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7730 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7731 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7732 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7733 "hn tx%d", i); 7734 } 7735 } 7736 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7737 7738 static void 7739 hn_sysuninit(void *arg __unused) 7740 { 7741 7742 if (hn_tx_taskque != NULL) { 7743 int i; 7744 7745 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7746 taskqueue_free(hn_tx_taskque[i]); 7747 free(hn_tx_taskque, M_DEVBUF); 7748 } 7749 7750 if (hn_vfmap != NULL) 7751 free(hn_vfmap, M_DEVBUF); 7752 rm_destroy(&hn_vfmap_lock); 7753 7754 counter_u64_free(hn_udpcs_fixup); 7755 } 7756 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7757