1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <machine/atomic.h> 89 #include <machine/in_cksum.h> 90 91 #include <net/bpf.h> 92 #include <net/ethernet.h> 93 #include <net/if.h> 94 #include <net/if_dl.h> 95 #include <net/if_media.h> 96 #include <net/if_types.h> 97 #include <net/if_var.h> 98 #include <net/rndis.h> 99 #ifdef RSS 100 #include <net/rss_config.h> 101 #endif 102 103 #include <netinet/in_systm.h> 104 #include <netinet/in.h> 105 #include <netinet/ip.h> 106 #include <netinet/ip6.h> 107 #include <netinet/tcp.h> 108 #include <netinet/tcp_lro.h> 109 #include <netinet/udp.h> 110 111 #include <dev/hyperv/include/hyperv.h> 112 #include <dev/hyperv/include/hyperv_busdma.h> 113 #include <dev/hyperv/include/vmbus.h> 114 #include <dev/hyperv/include/vmbus_xact.h> 115 116 #include <dev/hyperv/netvsc/ndis.h> 117 #include <dev/hyperv/netvsc/if_hnreg.h> 118 #include <dev/hyperv/netvsc/if_hnvar.h> 119 #include <dev/hyperv/netvsc/hn_nvs.h> 120 #include <dev/hyperv/netvsc/hn_rndis.h> 121 122 #include "vmbus_if.h" 123 124 #define HN_IFSTART_SUPPORT 125 126 #define HN_RING_CNT_DEF_MAX 8 127 128 #define HN_VFMAP_SIZE_DEF 8 129 130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 131 132 /* YYY should get it from the underlying channel */ 133 #define HN_TX_DESC_CNT 512 134 135 #define HN_RNDIS_PKT_LEN \ 136 (sizeof(struct rndis_packet_msg) + \ 137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 143 144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 147 /* -1 for RNDIS packet message */ 148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 149 150 #define HN_DIRECT_TX_SIZE_DEF 128 151 152 #define HN_EARLY_TXEOF_THRESH 8 153 154 #define HN_PKTBUF_LEN_DEF (16 * 1024) 155 156 #define HN_LROENT_CNT_DEF 128 157 158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 160 /* YYY 2*MTU is a bit rough, but should be good enough. */ 161 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 162 163 #define HN_LRO_ACKCNT_DEF 1 164 165 #define HN_LOCK_INIT(sc) \ 166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 169 #define HN_LOCK(sc) \ 170 do { \ 171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 172 /* Relinquish cpu to avoid deadlock */ \ 173 sched_relinquish(curthread); \ 174 DELAY(1000); \ 175 } \ 176 } while (0) 177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 178 179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 181 #define HN_CSUM_IP_HWASSIST(sc) \ 182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 183 #define HN_CSUM_IP6_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 185 186 #define HN_PKTSIZE_MIN(align) \ 187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 188 HN_RNDIS_PKT_LEN, (align)) 189 #define HN_PKTSIZE(m, align) \ 190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 191 192 #ifdef RSS 193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 194 #else 195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 196 #endif 197 198 struct hn_txdesc { 199 #ifndef HN_USE_TXDESC_BUFRING 200 SLIST_ENTRY(hn_txdesc) link; 201 #endif 202 STAILQ_ENTRY(hn_txdesc) agg_link; 203 204 /* Aggregated txdescs, in sending order. */ 205 STAILQ_HEAD(, hn_txdesc) agg_list; 206 207 /* The oldest packet, if transmission aggregation happens. */ 208 struct mbuf *m; 209 struct hn_tx_ring *txr; 210 int refs; 211 uint32_t flags; /* HN_TXD_FLAG_ */ 212 struct hn_nvs_sendctx send_ctx; 213 uint32_t chim_index; 214 int chim_size; 215 216 bus_dmamap_t data_dmap; 217 218 bus_addr_t rndis_pkt_paddr; 219 struct rndis_packet_msg *rndis_pkt; 220 bus_dmamap_t rndis_pkt_dmap; 221 }; 222 223 #define HN_TXD_FLAG_ONLIST 0x0001 224 #define HN_TXD_FLAG_DMAMAP 0x0002 225 #define HN_TXD_FLAG_ONAGG 0x0004 226 227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 230 231 struct packet_info_id { 232 uint8_t ver; 233 uint8_t flag; 234 uint16_t pkt_id; 235 }; 236 237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 238 239 240 struct hn_rxinfo { 241 const uint32_t *vlan_info; 242 const uint32_t *csum_info; 243 const uint32_t *hash_info; 244 const uint32_t *hash_value; 245 const struct packet_info_id *pktinfo_id; 246 }; 247 248 struct hn_rxvf_setarg { 249 struct hn_rx_ring *rxr; 250 struct ifnet *vf_ifp; 251 }; 252 253 #define HN_RXINFO_VLAN 0x0001 254 #define HN_RXINFO_CSUM 0x0002 255 #define HN_RXINFO_HASHINF 0x0004 256 #define HN_RXINFO_HASHVAL 0x0008 257 #define HN_RXINFO_PKTINFO_ID 0x0010 258 #define HN_RXINFO_ALL \ 259 (HN_RXINFO_VLAN | \ 260 HN_RXINFO_CSUM | \ 261 HN_RXINFO_HASHINF | \ 262 HN_RXINFO_HASHVAL | \ 263 HN_RXINFO_PKTINFO_ID) 264 265 static int hn_probe(device_t); 266 static int hn_attach(device_t); 267 static int hn_detach(device_t); 268 static int hn_shutdown(device_t); 269 static void hn_chan_callback(struct vmbus_channel *, 270 void *); 271 272 static void hn_init(void *); 273 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 274 #ifdef HN_IFSTART_SUPPORT 275 static void hn_start(struct ifnet *); 276 #endif 277 static int hn_transmit(struct ifnet *, struct mbuf *); 278 static void hn_xmit_qflush(struct ifnet *); 279 static int hn_ifmedia_upd(struct ifnet *); 280 static void hn_ifmedia_sts(struct ifnet *, 281 struct ifmediareq *); 282 283 static void hn_ifnet_event(void *, struct ifnet *, int); 284 static void hn_ifaddr_event(void *, struct ifnet *); 285 static void hn_ifnet_attevent(void *, struct ifnet *); 286 static void hn_ifnet_detevent(void *, struct ifnet *); 287 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 288 289 static bool hn_ismyvf(const struct hn_softc *, 290 const struct ifnet *); 291 static void hn_rxvf_change(struct hn_softc *, 292 struct ifnet *, bool); 293 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 294 static void hn_rxvf_set_task(void *, int); 295 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 298 struct ifreq *); 299 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 300 static bool hn_xpnt_vf_isready(struct hn_softc *); 301 static void hn_xpnt_vf_setready(struct hn_softc *); 302 static void hn_xpnt_vf_init_taskfunc(void *, int); 303 static void hn_xpnt_vf_init(struct hn_softc *); 304 static void hn_xpnt_vf_setenable(struct hn_softc *); 305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 306 static void hn_vf_rss_fixup(struct hn_softc *, bool); 307 static void hn_vf_rss_restore(struct hn_softc *); 308 309 static int hn_rndis_rxinfo(const void *, int, 310 struct hn_rxinfo *); 311 static void hn_rndis_rx_data(struct hn_rx_ring *, 312 const void *, int); 313 static void hn_rndis_rx_status(struct hn_softc *, 314 const void *, int); 315 static void hn_rndis_init_fixat(struct hn_softc *, int); 316 317 static void hn_nvs_handle_notify(struct hn_softc *, 318 const struct vmbus_chanpkt_hdr *); 319 static void hn_nvs_handle_comp(struct hn_softc *, 320 struct vmbus_channel *, 321 const struct vmbus_chanpkt_hdr *); 322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 323 struct vmbus_channel *, 324 const struct vmbus_chanpkt_hdr *); 325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 326 struct vmbus_channel *, uint64_t); 327 328 #if __FreeBSD_version >= 1100099 329 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 331 #endif 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 #if __FreeBSD_version < 1100095 335 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 336 #else 337 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 338 #endif 339 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 346 #ifndef RSS 347 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 349 #endif 350 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 361 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 362 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 363 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 364 365 static void hn_stop(struct hn_softc *, bool); 366 static void hn_init_locked(struct hn_softc *); 367 static int hn_chan_attach(struct hn_softc *, 368 struct vmbus_channel *); 369 static void hn_chan_detach(struct hn_softc *, 370 struct vmbus_channel *); 371 static int hn_attach_subchans(struct hn_softc *); 372 static void hn_detach_allchans(struct hn_softc *); 373 static void hn_chan_rollup(struct hn_rx_ring *, 374 struct hn_tx_ring *); 375 static void hn_set_ring_inuse(struct hn_softc *, int); 376 static int hn_synth_attach(struct hn_softc *, int); 377 static void hn_synth_detach(struct hn_softc *); 378 static int hn_synth_alloc_subchans(struct hn_softc *, 379 int *); 380 static bool hn_synth_attachable(const struct hn_softc *); 381 static void hn_suspend(struct hn_softc *); 382 static void hn_suspend_data(struct hn_softc *); 383 static void hn_suspend_mgmt(struct hn_softc *); 384 static void hn_resume(struct hn_softc *); 385 static void hn_resume_data(struct hn_softc *); 386 static void hn_resume_mgmt(struct hn_softc *); 387 static void hn_suspend_mgmt_taskfunc(void *, int); 388 static void hn_chan_drain(struct hn_softc *, 389 struct vmbus_channel *); 390 static void hn_disable_rx(struct hn_softc *); 391 static void hn_drain_rxtx(struct hn_softc *, int); 392 static void hn_polling(struct hn_softc *, u_int); 393 static void hn_chan_polling(struct vmbus_channel *, u_int); 394 static void hn_mtu_change_fixup(struct hn_softc *); 395 396 static void hn_update_link_status(struct hn_softc *); 397 static void hn_change_network(struct hn_softc *); 398 static void hn_link_taskfunc(void *, int); 399 static void hn_netchg_init_taskfunc(void *, int); 400 static void hn_netchg_status_taskfunc(void *, int); 401 static void hn_link_status(struct hn_softc *); 402 403 static int hn_create_rx_data(struct hn_softc *, int); 404 static void hn_destroy_rx_data(struct hn_softc *); 405 static int hn_check_iplen(const struct mbuf *, int); 406 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 407 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 408 static int hn_rxfilter_config(struct hn_softc *); 409 static int hn_rss_reconfig(struct hn_softc *); 410 static void hn_rss_ind_fixup(struct hn_softc *); 411 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 412 static int hn_rxpkt(struct hn_rx_ring *); 413 static uint32_t hn_rss_type_fromndis(uint32_t); 414 static uint32_t hn_rss_type_tondis(uint32_t); 415 416 static int hn_tx_ring_create(struct hn_softc *, int); 417 static void hn_tx_ring_destroy(struct hn_tx_ring *); 418 static int hn_create_tx_data(struct hn_softc *, int); 419 static void hn_fixup_tx_data(struct hn_softc *); 420 static void hn_fixup_rx_data(struct hn_softc *); 421 static void hn_destroy_tx_data(struct hn_softc *); 422 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 423 static void hn_txdesc_gc(struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 426 struct hn_txdesc *, struct mbuf **); 427 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 428 struct hn_txdesc *); 429 static void hn_set_chim_size(struct hn_softc *, int); 430 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 431 static bool hn_tx_ring_pending(struct hn_tx_ring *); 432 static void hn_tx_ring_qflush(struct hn_tx_ring *); 433 static void hn_resume_tx(struct hn_softc *, int); 434 static void hn_set_txagg(struct hn_softc *); 435 static void *hn_try_txagg(struct ifnet *, 436 struct hn_tx_ring *, struct hn_txdesc *, 437 int); 438 static int hn_get_txswq_depth(const struct hn_tx_ring *); 439 static void hn_txpkt_done(struct hn_nvs_sendctx *, 440 struct hn_softc *, struct vmbus_channel *, 441 const void *, int); 442 static int hn_txpkt_sglist(struct hn_tx_ring *, 443 struct hn_txdesc *); 444 static int hn_txpkt_chim(struct hn_tx_ring *, 445 struct hn_txdesc *); 446 static int hn_xmit(struct hn_tx_ring *, int); 447 static void hn_xmit_taskfunc(void *, int); 448 static void hn_xmit_txeof(struct hn_tx_ring *); 449 static void hn_xmit_txeof_taskfunc(void *, int); 450 #ifdef HN_IFSTART_SUPPORT 451 static int hn_start_locked(struct hn_tx_ring *, int); 452 static void hn_start_taskfunc(void *, int); 453 static void hn_start_txeof(struct hn_tx_ring *); 454 static void hn_start_txeof_taskfunc(void *, int); 455 #endif 456 457 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 458 "Hyper-V network interface"); 459 460 /* Trust tcp segment verification on host side. */ 461 static int hn_trust_hosttcp = 1; 462 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 463 &hn_trust_hosttcp, 0, 464 "Trust tcp segment verification on host side, " 465 "when csum info is missing (global setting)"); 466 467 /* Trust udp datagrams verification on host side. */ 468 static int hn_trust_hostudp = 1; 469 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 470 &hn_trust_hostudp, 0, 471 "Trust udp datagram verification on host side, " 472 "when csum info is missing (global setting)"); 473 474 /* Trust ip packets verification on host side. */ 475 static int hn_trust_hostip = 1; 476 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 477 &hn_trust_hostip, 0, 478 "Trust ip packet verification on host side, " 479 "when csum info is missing (global setting)"); 480 481 /* 482 * Offload UDP/IPv4 checksum. 483 */ 484 static int hn_enable_udp4cs = 1; 485 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 486 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 487 488 /* 489 * Offload UDP/IPv6 checksum. 490 */ 491 static int hn_enable_udp6cs = 1; 492 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 493 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 494 495 /* Stats. */ 496 static counter_u64_t hn_udpcs_fixup; 497 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 498 &hn_udpcs_fixup, "# of UDP checksum fixup"); 499 500 /* 501 * See hn_set_hlen(). 502 * 503 * This value is for Azure. For Hyper-V, set this above 504 * 65536 to disable UDP datagram checksum fixup. 505 */ 506 static int hn_udpcs_fixup_mtu = 1420; 507 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 508 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 509 510 /* Limit TSO burst size */ 511 static int hn_tso_maxlen = IP_MAXPACKET; 512 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 513 &hn_tso_maxlen, 0, "TSO burst limit"); 514 515 /* Limit chimney send size */ 516 static int hn_tx_chimney_size = 0; 517 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 518 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 519 520 /* Limit the size of packet for direct transmission */ 521 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 522 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 523 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 524 525 /* # of LRO entries per RX ring */ 526 #if defined(INET) || defined(INET6) 527 #if __FreeBSD_version >= 1100095 528 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 529 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 530 &hn_lro_entry_count, 0, "LRO entry count"); 531 #endif 532 #endif 533 534 static int hn_tx_taskq_cnt = 1; 535 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 536 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 537 538 #define HN_TX_TASKQ_M_INDEP 0 539 #define HN_TX_TASKQ_M_GLOBAL 1 540 #define HN_TX_TASKQ_M_EVTTQ 2 541 542 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 543 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 544 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 545 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 546 547 #ifndef HN_USE_TXDESC_BUFRING 548 static int hn_use_txdesc_bufring = 0; 549 #else 550 static int hn_use_txdesc_bufring = 1; 551 #endif 552 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 553 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 554 555 #ifdef HN_IFSTART_SUPPORT 556 /* Use ifnet.if_start instead of ifnet.if_transmit */ 557 static int hn_use_if_start = 0; 558 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 559 &hn_use_if_start, 0, "Use if_start TX method"); 560 #endif 561 562 /* # of channels to use */ 563 static int hn_chan_cnt = 0; 564 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 565 &hn_chan_cnt, 0, 566 "# of channels to use; each channel has one RX ring and one TX ring"); 567 568 /* # of transmit rings to use */ 569 static int hn_tx_ring_cnt = 0; 570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 571 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 572 573 /* Software TX ring deptch */ 574 static int hn_tx_swq_depth = 0; 575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 576 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 577 578 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 579 #if __FreeBSD_version >= 1100095 580 static u_int hn_lro_mbufq_depth = 0; 581 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 582 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 583 #endif 584 585 /* Packet transmission aggregation size limit */ 586 static int hn_tx_agg_size = -1; 587 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 588 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 589 590 /* Packet transmission aggregation count limit */ 591 static int hn_tx_agg_pkts = -1; 592 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 593 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 594 595 /* VF list */ 596 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, 597 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 598 hn_vflist_sysctl, "A", 599 "VF list"); 600 601 /* VF mapping */ 602 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, 603 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 604 hn_vfmap_sysctl, "A", 605 "VF mapping"); 606 607 /* Transparent VF */ 608 static int hn_xpnt_vf = 1; 609 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, 610 &hn_xpnt_vf, 0, "Transparent VF mod"); 611 612 /* Accurate BPF support for Transparent VF */ 613 static int hn_xpnt_vf_accbpf = 0; 614 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, 615 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); 616 617 /* Extra wait for transparent VF attach routing; unit seconds. */ 618 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 619 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, 620 &hn_xpnt_vf_attwait, 0, 621 "Extra wait for transparent VF attach routing; unit: seconds"); 622 623 static u_int hn_cpu_index; /* next CPU for channel */ 624 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 625 626 static struct rmlock hn_vfmap_lock; 627 static int hn_vfmap_size; 628 static struct ifnet **hn_vfmap; 629 630 #ifndef RSS 631 static const uint8_t 632 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 633 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 634 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 635 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 636 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 637 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 638 }; 639 #endif /* !RSS */ 640 641 static const struct hyperv_guid hn_guid = { 642 .hv_guid = { 643 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 644 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } 645 }; 646 647 static device_method_t hn_methods[] = { 648 /* Device interface */ 649 DEVMETHOD(device_probe, hn_probe), 650 DEVMETHOD(device_attach, hn_attach), 651 DEVMETHOD(device_detach, hn_detach), 652 DEVMETHOD(device_shutdown, hn_shutdown), 653 DEVMETHOD_END 654 }; 655 656 static driver_t hn_driver = { 657 "hn", 658 hn_methods, 659 sizeof(struct hn_softc) 660 }; 661 662 static devclass_t hn_devclass; 663 664 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); 665 MODULE_VERSION(hn, 1); 666 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 667 668 #if __FreeBSD_version >= 1100099 669 static void 670 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 671 { 672 int i; 673 674 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 675 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 676 } 677 #endif 678 679 static int 680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 681 { 682 683 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 684 txd->chim_size == 0, ("invalid rndis sglist txd")); 685 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 686 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 687 } 688 689 static int 690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 691 { 692 struct hn_nvs_rndis rndis; 693 694 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 695 txd->chim_size > 0, ("invalid rndis chim txd")); 696 697 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 698 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 699 rndis.nvs_chim_idx = txd->chim_index; 700 rndis.nvs_chim_sz = txd->chim_size; 701 702 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 703 &rndis, sizeof(rndis), &txd->send_ctx)); 704 } 705 706 static __inline uint32_t 707 hn_chim_alloc(struct hn_softc *sc) 708 { 709 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 710 u_long *bmap = sc->hn_chim_bmap; 711 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 712 713 for (i = 0; i < bmap_cnt; ++i) { 714 int idx; 715 716 idx = ffsl(~bmap[i]); 717 if (idx == 0) 718 continue; 719 720 --idx; /* ffsl is 1-based */ 721 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 722 ("invalid i %d and idx %d", i, idx)); 723 724 if (atomic_testandset_long(&bmap[i], idx)) 725 continue; 726 727 ret = i * LONG_BIT + idx; 728 break; 729 } 730 return (ret); 731 } 732 733 static __inline void 734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 735 { 736 u_long mask; 737 uint32_t idx; 738 739 idx = chim_idx / LONG_BIT; 740 KASSERT(idx < sc->hn_chim_bmap_cnt, 741 ("invalid chimney index 0x%x", chim_idx)); 742 743 mask = 1UL << (chim_idx % LONG_BIT); 744 KASSERT(sc->hn_chim_bmap[idx] & mask, 745 ("index bitmap 0x%lx, chimney index %u, " 746 "bitmap idx %d, bitmask 0x%lx", 747 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 748 749 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 750 } 751 752 #if defined(INET6) || defined(INET) 753 754 #define PULLUP_HDR(m, len) \ 755 do { \ 756 if (__predict_false((m)->m_len < (len))) { \ 757 (m) = m_pullup((m), (len)); \ 758 if ((m) == NULL) \ 759 return (NULL); \ 760 } \ 761 } while (0) 762 763 /* 764 * NOTE: If this function failed, the m_head would be freed. 765 */ 766 static __inline struct mbuf * 767 hn_tso_fixup(struct mbuf *m_head) 768 { 769 struct ether_vlan_header *evl; 770 struct tcphdr *th; 771 int ehlen; 772 773 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 774 775 PULLUP_HDR(m_head, sizeof(*evl)); 776 evl = mtod(m_head, struct ether_vlan_header *); 777 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 778 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 779 else 780 ehlen = ETHER_HDR_LEN; 781 m_head->m_pkthdr.l2hlen = ehlen; 782 783 #ifdef INET 784 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 785 struct ip *ip; 786 int iphlen; 787 788 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 789 ip = mtodo(m_head, ehlen); 790 iphlen = ip->ip_hl << 2; 791 m_head->m_pkthdr.l3hlen = iphlen; 792 793 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 794 th = mtodo(m_head, ehlen + iphlen); 795 796 ip->ip_len = 0; 797 ip->ip_sum = 0; 798 th->th_sum = in_pseudo(ip->ip_src.s_addr, 799 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 800 } 801 #endif 802 #if defined(INET6) && defined(INET) 803 else 804 #endif 805 #ifdef INET6 806 { 807 struct ip6_hdr *ip6; 808 809 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 810 ip6 = mtodo(m_head, ehlen); 811 if (ip6->ip6_nxt != IPPROTO_TCP) { 812 m_freem(m_head); 813 return (NULL); 814 } 815 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 816 817 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 818 th = mtodo(m_head, ehlen + sizeof(*ip6)); 819 820 ip6->ip6_plen = 0; 821 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 822 } 823 #endif 824 return (m_head); 825 } 826 827 /* 828 * NOTE: If this function failed, the m_head would be freed. 829 */ 830 static __inline struct mbuf * 831 hn_set_hlen(struct mbuf *m_head) 832 { 833 const struct ether_vlan_header *evl; 834 int ehlen; 835 836 PULLUP_HDR(m_head, sizeof(*evl)); 837 evl = mtod(m_head, const struct ether_vlan_header *); 838 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 839 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 840 else 841 ehlen = ETHER_HDR_LEN; 842 m_head->m_pkthdr.l2hlen = ehlen; 843 844 #ifdef INET 845 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 846 const struct ip *ip; 847 int iphlen; 848 849 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 850 ip = mtodo(m_head, ehlen); 851 iphlen = ip->ip_hl << 2; 852 m_head->m_pkthdr.l3hlen = iphlen; 853 854 /* 855 * UDP checksum offload does not work in Azure, if the 856 * following conditions meet: 857 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 858 * - IP_DF is not set in the IP hdr. 859 * 860 * Fallback to software checksum for these UDP datagrams. 861 */ 862 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 863 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 864 (ntohs(ip->ip_off) & IP_DF) == 0) { 865 uint16_t off = ehlen + iphlen; 866 867 counter_u64_add(hn_udpcs_fixup, 1); 868 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 869 *(uint16_t *)(m_head->m_data + off + 870 m_head->m_pkthdr.csum_data) = in_cksum_skip( 871 m_head, m_head->m_pkthdr.len, off); 872 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 873 } 874 } 875 #endif 876 #if defined(INET6) && defined(INET) 877 else 878 #endif 879 #ifdef INET6 880 { 881 const struct ip6_hdr *ip6; 882 883 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 884 ip6 = mtodo(m_head, ehlen); 885 if (ip6->ip6_nxt != IPPROTO_TCP && 886 ip6->ip6_nxt != IPPROTO_UDP) { 887 m_freem(m_head); 888 return (NULL); 889 } 890 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 891 } 892 #endif 893 return (m_head); 894 } 895 896 /* 897 * NOTE: If this function failed, the m_head would be freed. 898 */ 899 static __inline struct mbuf * 900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 901 { 902 const struct tcphdr *th; 903 int ehlen, iphlen; 904 905 *tcpsyn = 0; 906 ehlen = m_head->m_pkthdr.l2hlen; 907 iphlen = m_head->m_pkthdr.l3hlen; 908 909 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 910 th = mtodo(m_head, ehlen + iphlen); 911 if (th->th_flags & TH_SYN) 912 *tcpsyn = 1; 913 return (m_head); 914 } 915 916 #undef PULLUP_HDR 917 918 #endif /* INET6 || INET */ 919 920 static int 921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 922 { 923 int error = 0; 924 925 HN_LOCK_ASSERT(sc); 926 927 if (sc->hn_rx_filter != filter) { 928 error = hn_rndis_set_rxfilter(sc, filter); 929 if (!error) 930 sc->hn_rx_filter = filter; 931 } 932 return (error); 933 } 934 935 static int 936 hn_rxfilter_config(struct hn_softc *sc) 937 { 938 struct ifnet *ifp = sc->hn_ifp; 939 uint32_t filter; 940 941 HN_LOCK_ASSERT(sc); 942 943 /* 944 * If the non-transparent mode VF is activated, we don't know how 945 * its RX filter is configured, so stick the synthetic device in 946 * the promiscous mode. 947 */ 948 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 949 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 950 } else { 951 filter = NDIS_PACKET_TYPE_DIRECTED; 952 if (ifp->if_flags & IFF_BROADCAST) 953 filter |= NDIS_PACKET_TYPE_BROADCAST; 954 /* TODO: support multicast list */ 955 if ((ifp->if_flags & IFF_ALLMULTI) || 956 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 957 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 958 } 959 return (hn_set_rxfilter(sc, filter)); 960 } 961 962 static void 963 hn_set_txagg(struct hn_softc *sc) 964 { 965 uint32_t size, pkts; 966 int i; 967 968 /* 969 * Setup aggregation size. 970 */ 971 if (sc->hn_agg_size < 0) 972 size = UINT32_MAX; 973 else 974 size = sc->hn_agg_size; 975 976 if (sc->hn_rndis_agg_size < size) 977 size = sc->hn_rndis_agg_size; 978 979 /* NOTE: We only aggregate packets using chimney sending buffers. */ 980 if (size > (uint32_t)sc->hn_chim_szmax) 981 size = sc->hn_chim_szmax; 982 983 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 984 /* Disable */ 985 size = 0; 986 pkts = 0; 987 goto done; 988 } 989 990 /* NOTE: Type of the per TX ring setting is 'int'. */ 991 if (size > INT_MAX) 992 size = INT_MAX; 993 994 /* 995 * Setup aggregation packet count. 996 */ 997 if (sc->hn_agg_pkts < 0) 998 pkts = UINT32_MAX; 999 else 1000 pkts = sc->hn_agg_pkts; 1001 1002 if (sc->hn_rndis_agg_pkts < pkts) 1003 pkts = sc->hn_rndis_agg_pkts; 1004 1005 if (pkts <= 1) { 1006 /* Disable */ 1007 size = 0; 1008 pkts = 0; 1009 goto done; 1010 } 1011 1012 /* NOTE: Type of the per TX ring setting is 'short'. */ 1013 if (pkts > SHRT_MAX) 1014 pkts = SHRT_MAX; 1015 1016 done: 1017 /* NOTE: Type of the per TX ring setting is 'short'. */ 1018 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1019 /* Disable */ 1020 size = 0; 1021 pkts = 0; 1022 } 1023 1024 if (bootverbose) { 1025 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1026 size, pkts, sc->hn_rndis_agg_align); 1027 } 1028 1029 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1030 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1031 1032 mtx_lock(&txr->hn_tx_lock); 1033 txr->hn_agg_szmax = size; 1034 txr->hn_agg_pktmax = pkts; 1035 txr->hn_agg_align = sc->hn_rndis_agg_align; 1036 mtx_unlock(&txr->hn_tx_lock); 1037 } 1038 } 1039 1040 static int 1041 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1042 { 1043 1044 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1045 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1046 return txr->hn_txdesc_cnt; 1047 return hn_tx_swq_depth; 1048 } 1049 1050 static int 1051 hn_rss_reconfig(struct hn_softc *sc) 1052 { 1053 int error; 1054 1055 HN_LOCK_ASSERT(sc); 1056 1057 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1058 return (ENXIO); 1059 1060 /* 1061 * Disable RSS first. 1062 * 1063 * NOTE: 1064 * Direct reconfiguration by setting the UNCHG flags does 1065 * _not_ work properly. 1066 */ 1067 if (bootverbose) 1068 if_printf(sc->hn_ifp, "disable RSS\n"); 1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1070 if (error) { 1071 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1072 return (error); 1073 } 1074 1075 /* 1076 * Reenable the RSS w/ the updated RSS key or indirect 1077 * table. 1078 */ 1079 if (bootverbose) 1080 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1081 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1082 if (error) { 1083 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1084 return (error); 1085 } 1086 return (0); 1087 } 1088 1089 static void 1090 hn_rss_ind_fixup(struct hn_softc *sc) 1091 { 1092 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1093 int i, nchan; 1094 1095 nchan = sc->hn_rx_ring_inuse; 1096 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1097 1098 /* 1099 * Check indirect table to make sure that all channels in it 1100 * can be used. 1101 */ 1102 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1103 if (rss->rss_ind[i] >= nchan) { 1104 if_printf(sc->hn_ifp, 1105 "RSS indirect table %d fixup: %u -> %d\n", 1106 i, rss->rss_ind[i], nchan - 1); 1107 rss->rss_ind[i] = nchan - 1; 1108 } 1109 } 1110 } 1111 1112 static int 1113 hn_ifmedia_upd(struct ifnet *ifp __unused) 1114 { 1115 1116 return EOPNOTSUPP; 1117 } 1118 1119 static void 1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1121 { 1122 struct hn_softc *sc = ifp->if_softc; 1123 1124 ifmr->ifm_status = IFM_AVALID; 1125 ifmr->ifm_active = IFM_ETHER; 1126 1127 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1128 ifmr->ifm_active |= IFM_NONE; 1129 return; 1130 } 1131 ifmr->ifm_status |= IFM_ACTIVE; 1132 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1133 } 1134 1135 static void 1136 hn_rxvf_set_task(void *xarg, int pending __unused) 1137 { 1138 struct hn_rxvf_setarg *arg = xarg; 1139 1140 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1141 } 1142 1143 static void 1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1145 { 1146 struct hn_rx_ring *rxr; 1147 struct hn_rxvf_setarg arg; 1148 struct task task; 1149 int i; 1150 1151 HN_LOCK_ASSERT(sc); 1152 1153 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1154 1155 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1156 rxr = &sc->hn_rx_ring[i]; 1157 1158 if (i < sc->hn_rx_ring_inuse) { 1159 arg.rxr = rxr; 1160 arg.vf_ifp = vf_ifp; 1161 vmbus_chan_run_task(rxr->hn_chan, &task); 1162 } else { 1163 rxr->hn_rxvf_ifp = vf_ifp; 1164 } 1165 } 1166 } 1167 1168 static bool 1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1170 { 1171 const struct ifnet *hn_ifp; 1172 1173 hn_ifp = sc->hn_ifp; 1174 1175 if (ifp == hn_ifp) 1176 return (false); 1177 1178 if (ifp->if_alloctype != IFT_ETHER) 1179 return (false); 1180 1181 /* Ignore lagg/vlan interfaces */ 1182 if (strcmp(ifp->if_dname, "lagg") == 0 || 1183 strcmp(ifp->if_dname, "vlan") == 0) 1184 return (false); 1185 1186 /* 1187 * During detach events ifp->if_addr might be NULL. 1188 * Make sure the bcmp() below doesn't panic on that: 1189 */ 1190 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1191 return (false); 1192 1193 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1194 return (false); 1195 1196 return (true); 1197 } 1198 1199 static void 1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1201 { 1202 struct ifnet *hn_ifp; 1203 1204 HN_LOCK(sc); 1205 1206 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1207 goto out; 1208 1209 if (!hn_ismyvf(sc, ifp)) 1210 goto out; 1211 hn_ifp = sc->hn_ifp; 1212 1213 if (rxvf) { 1214 if (sc->hn_flags & HN_FLAG_RXVF) 1215 goto out; 1216 1217 sc->hn_flags |= HN_FLAG_RXVF; 1218 hn_rxfilter_config(sc); 1219 } else { 1220 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1221 goto out; 1222 1223 sc->hn_flags &= ~HN_FLAG_RXVF; 1224 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1225 hn_rxfilter_config(sc); 1226 else 1227 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1228 } 1229 1230 hn_nvs_set_datapath(sc, 1231 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1232 1233 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1234 1235 if (rxvf) { 1236 hn_vf_rss_fixup(sc, true); 1237 hn_suspend_mgmt(sc); 1238 sc->hn_link_flags &= 1239 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1240 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1241 } else { 1242 hn_vf_rss_restore(sc); 1243 hn_resume_mgmt(sc); 1244 } 1245 1246 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1247 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1248 1249 if (bootverbose) { 1250 if_printf(hn_ifp, "datapath is switched %s %s\n", 1251 rxvf ? "to" : "from", ifp->if_xname); 1252 } 1253 out: 1254 HN_UNLOCK(sc); 1255 } 1256 1257 static void 1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1259 { 1260 1261 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1262 return; 1263 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1264 } 1265 1266 static void 1267 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1268 { 1269 1270 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1271 } 1272 1273 static int 1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1275 { 1276 struct ifnet *ifp, *vf_ifp; 1277 uint64_t tmp; 1278 int error; 1279 1280 HN_LOCK_ASSERT(sc); 1281 ifp = sc->hn_ifp; 1282 vf_ifp = sc->hn_vf_ifp; 1283 1284 /* 1285 * Fix up requested capabilities w/ supported capabilities, 1286 * since the supported capabilities could have been changed. 1287 */ 1288 ifr->ifr_reqcap &= ifp->if_capabilities; 1289 /* Pass SIOCSIFCAP to VF. */ 1290 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1291 1292 /* 1293 * NOTE: 1294 * The error will be propagated to the callers, however, it 1295 * is _not_ useful here. 1296 */ 1297 1298 /* 1299 * Merge VF's enabled capabilities. 1300 */ 1301 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; 1302 1303 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); 1304 if (ifp->if_capenable & IFCAP_TXCSUM) 1305 ifp->if_hwassist |= tmp; 1306 else 1307 ifp->if_hwassist &= ~tmp; 1308 1309 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); 1310 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 1311 ifp->if_hwassist |= tmp; 1312 else 1313 ifp->if_hwassist &= ~tmp; 1314 1315 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; 1316 if (ifp->if_capenable & IFCAP_TSO4) 1317 ifp->if_hwassist |= tmp; 1318 else 1319 ifp->if_hwassist &= ~tmp; 1320 1321 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; 1322 if (ifp->if_capenable & IFCAP_TSO6) 1323 ifp->if_hwassist |= tmp; 1324 else 1325 ifp->if_hwassist &= ~tmp; 1326 1327 return (error); 1328 } 1329 1330 static int 1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1332 { 1333 struct ifnet *vf_ifp; 1334 struct ifreq ifr; 1335 1336 HN_LOCK_ASSERT(sc); 1337 vf_ifp = sc->hn_vf_ifp; 1338 1339 memset(&ifr, 0, sizeof(ifr)); 1340 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1341 ifr.ifr_flags = vf_ifp->if_flags & 0xffff; 1342 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; 1343 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); 1344 } 1345 1346 static void 1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1348 { 1349 struct ifnet *ifp = sc->hn_ifp; 1350 int allmulti = 0; 1351 1352 HN_LOCK_ASSERT(sc); 1353 1354 /* XXX vlan(4) style mcast addr maintenance */ 1355 if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 1356 allmulti = IFF_ALLMULTI; 1357 1358 /* Always set the VF's if_flags */ 1359 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; 1360 } 1361 1362 static void 1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) 1364 { 1365 struct rm_priotracker pt; 1366 struct ifnet *hn_ifp = NULL; 1367 struct mbuf *mn; 1368 1369 /* 1370 * XXX racy, if hn(4) ever detached. 1371 */ 1372 rm_rlock(&hn_vfmap_lock, &pt); 1373 if (vf_ifp->if_index < hn_vfmap_size) 1374 hn_ifp = hn_vfmap[vf_ifp->if_index]; 1375 rm_runlock(&hn_vfmap_lock, &pt); 1376 1377 if (hn_ifp != NULL) { 1378 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1379 /* 1380 * Allow tapping on the VF. 1381 */ 1382 ETHER_BPF_MTAP(vf_ifp, mn); 1383 1384 /* 1385 * Update VF stats. 1386 */ 1387 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { 1388 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1389 mn->m_pkthdr.len); 1390 } 1391 /* 1392 * XXX IFCOUNTER_IMCAST 1393 * This stat updating is kinda invasive, since it 1394 * requires two checks on the mbuf: the length check 1395 * and the ethernet header check. As of this write, 1396 * all multicast packets go directly to hn(4), which 1397 * makes imcast stat updating in the VF a try in vian. 1398 */ 1399 1400 /* 1401 * Fix up rcvif and increase hn(4)'s ipackets. 1402 */ 1403 mn->m_pkthdr.rcvif = hn_ifp; 1404 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1405 } 1406 /* 1407 * Go through hn(4)'s if_input. 1408 */ 1409 hn_ifp->if_input(hn_ifp, m); 1410 } else { 1411 /* 1412 * In the middle of the transition; free this 1413 * mbuf chain. 1414 */ 1415 while (m != NULL) { 1416 mn = m->m_nextpkt; 1417 m->m_nextpkt = NULL; 1418 m_freem(m); 1419 m = mn; 1420 } 1421 } 1422 } 1423 1424 static void 1425 hn_mtu_change_fixup(struct hn_softc *sc) 1426 { 1427 struct ifnet *ifp; 1428 1429 HN_LOCK_ASSERT(sc); 1430 ifp = sc->hn_ifp; 1431 1432 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1433 #if __FreeBSD_version >= 1100099 1434 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1435 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1436 #endif 1437 } 1438 1439 static uint32_t 1440 hn_rss_type_fromndis(uint32_t rss_hash) 1441 { 1442 uint32_t types = 0; 1443 1444 if (rss_hash & NDIS_HASH_IPV4) 1445 types |= RSS_TYPE_IPV4; 1446 if (rss_hash & NDIS_HASH_TCP_IPV4) 1447 types |= RSS_TYPE_TCP_IPV4; 1448 if (rss_hash & NDIS_HASH_IPV6) 1449 types |= RSS_TYPE_IPV6; 1450 if (rss_hash & NDIS_HASH_IPV6_EX) 1451 types |= RSS_TYPE_IPV6_EX; 1452 if (rss_hash & NDIS_HASH_TCP_IPV6) 1453 types |= RSS_TYPE_TCP_IPV6; 1454 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1455 types |= RSS_TYPE_TCP_IPV6_EX; 1456 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1457 types |= RSS_TYPE_UDP_IPV4; 1458 return (types); 1459 } 1460 1461 static uint32_t 1462 hn_rss_type_tondis(uint32_t types) 1463 { 1464 uint32_t rss_hash = 0; 1465 1466 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1467 ("UDP6 and UDP6EX are not supported")); 1468 1469 if (types & RSS_TYPE_IPV4) 1470 rss_hash |= NDIS_HASH_IPV4; 1471 if (types & RSS_TYPE_TCP_IPV4) 1472 rss_hash |= NDIS_HASH_TCP_IPV4; 1473 if (types & RSS_TYPE_IPV6) 1474 rss_hash |= NDIS_HASH_IPV6; 1475 if (types & RSS_TYPE_IPV6_EX) 1476 rss_hash |= NDIS_HASH_IPV6_EX; 1477 if (types & RSS_TYPE_TCP_IPV6) 1478 rss_hash |= NDIS_HASH_TCP_IPV6; 1479 if (types & RSS_TYPE_TCP_IPV6_EX) 1480 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1481 if (types & RSS_TYPE_UDP_IPV4) 1482 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1483 return (rss_hash); 1484 } 1485 1486 static void 1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1488 { 1489 int i; 1490 1491 HN_LOCK_ASSERT(sc); 1492 1493 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1494 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1495 } 1496 1497 static void 1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1499 { 1500 struct ifnet *ifp, *vf_ifp; 1501 struct ifrsshash ifrh; 1502 struct ifrsskey ifrk; 1503 int error; 1504 uint32_t my_types, diff_types, mbuf_types = 0; 1505 1506 HN_LOCK_ASSERT(sc); 1507 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1508 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1509 1510 if (sc->hn_rx_ring_inuse == 1) { 1511 /* No RSS on synthetic parts; done. */ 1512 return; 1513 } 1514 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1515 /* Synthetic parts do not support Toeplitz; done. */ 1516 return; 1517 } 1518 1519 ifp = sc->hn_ifp; 1520 vf_ifp = sc->hn_vf_ifp; 1521 1522 /* 1523 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1524 * supported. 1525 */ 1526 memset(&ifrk, 0, sizeof(ifrk)); 1527 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); 1528 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); 1529 if (error) { 1530 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", 1531 vf_ifp->if_xname, error); 1532 goto done; 1533 } 1534 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1535 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1536 vf_ifp->if_xname, ifrk.ifrk_func); 1537 goto done; 1538 } 1539 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1540 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1541 vf_ifp->if_xname, ifrk.ifrk_keylen); 1542 goto done; 1543 } 1544 1545 /* 1546 * Extract VF's RSS hash. Only Toeplitz is supported. 1547 */ 1548 memset(&ifrh, 0, sizeof(ifrh)); 1549 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name)); 1550 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh); 1551 if (error) { 1552 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", 1553 vf_ifp->if_xname, error); 1554 goto done; 1555 } 1556 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1557 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1558 vf_ifp->if_xname, ifrh.ifrh_func); 1559 goto done; 1560 } 1561 1562 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1563 if ((ifrh.ifrh_types & my_types) == 0) { 1564 /* This disables RSS; ignore it then */ 1565 if_printf(ifp, "%s intersection of RSS types failed. " 1566 "VF %#x, mine %#x\n", vf_ifp->if_xname, 1567 ifrh.ifrh_types, my_types); 1568 goto done; 1569 } 1570 1571 diff_types = my_types ^ ifrh.ifrh_types; 1572 my_types &= ifrh.ifrh_types; 1573 mbuf_types = my_types; 1574 1575 /* 1576 * Detect RSS hash value/type confliction. 1577 * 1578 * NOTE: 1579 * We don't disable the hash type, but stop delivery the hash 1580 * value/type through mbufs on RX path. 1581 * 1582 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple 1583 * hash is delivered with type of TCP_IPV4. This means if 1584 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at 1585 * least to hn_mbuf_hash. However, given that _all_ of the 1586 * NICs implement TCP_IPV4, this will _not_ impose any issues 1587 * here. 1588 */ 1589 if ((my_types & RSS_TYPE_IPV4) && 1590 (diff_types & ifrh.ifrh_types & 1591 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1592 /* Conflict; disable IPV4 hash type/value delivery. */ 1593 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1594 mbuf_types &= ~RSS_TYPE_IPV4; 1595 } 1596 if ((my_types & RSS_TYPE_IPV6) && 1597 (diff_types & ifrh.ifrh_types & 1598 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1599 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1600 RSS_TYPE_IPV6_EX))) { 1601 /* Conflict; disable IPV6 hash type/value delivery. */ 1602 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1603 mbuf_types &= ~RSS_TYPE_IPV6; 1604 } 1605 if ((my_types & RSS_TYPE_IPV6_EX) && 1606 (diff_types & ifrh.ifrh_types & 1607 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1608 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1609 RSS_TYPE_IPV6))) { 1610 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1611 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1612 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1613 } 1614 if ((my_types & RSS_TYPE_TCP_IPV6) && 1615 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1616 /* Conflict; disable TCP_IPV6 hash type/value delivery. */ 1617 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1618 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1619 } 1620 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1621 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1622 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1623 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1624 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1625 } 1626 if ((my_types & RSS_TYPE_UDP_IPV6) && 1627 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1628 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1629 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1630 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1631 } 1632 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1633 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1634 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1635 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1636 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1637 } 1638 1639 /* 1640 * Indirect table does not matter. 1641 */ 1642 1643 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1644 hn_rss_type_tondis(my_types); 1645 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1646 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1647 1648 if (reconf) { 1649 error = hn_rss_reconfig(sc); 1650 if (error) { 1651 /* XXX roll-back? */ 1652 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1653 /* XXX keep going. */ 1654 } 1655 } 1656 done: 1657 /* Hash deliverability for mbufs. */ 1658 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1659 } 1660 1661 static void 1662 hn_vf_rss_restore(struct hn_softc *sc) 1663 { 1664 1665 HN_LOCK_ASSERT(sc); 1666 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1667 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1668 1669 if (sc->hn_rx_ring_inuse == 1) 1670 goto done; 1671 1672 /* 1673 * Restore hash types. Key does _not_ matter. 1674 */ 1675 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1676 int error; 1677 1678 sc->hn_rss_hash = sc->hn_rss_hcap; 1679 error = hn_rss_reconfig(sc); 1680 if (error) { 1681 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1682 error); 1683 /* XXX keep going. */ 1684 } 1685 } 1686 done: 1687 /* Hash deliverability for mbufs. */ 1688 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1689 } 1690 1691 static void 1692 hn_xpnt_vf_setready(struct hn_softc *sc) 1693 { 1694 struct ifnet *ifp, *vf_ifp; 1695 struct ifreq ifr; 1696 1697 HN_LOCK_ASSERT(sc); 1698 ifp = sc->hn_ifp; 1699 vf_ifp = sc->hn_vf_ifp; 1700 1701 /* 1702 * Mark the VF ready. 1703 */ 1704 sc->hn_vf_rdytick = 0; 1705 1706 /* 1707 * Save information for restoration. 1708 */ 1709 sc->hn_saved_caps = ifp->if_capabilities; 1710 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1711 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1712 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1713 1714 /* 1715 * Intersect supported/enabled capabilities. 1716 * 1717 * NOTE: 1718 * if_hwassist is not changed here. 1719 */ 1720 ifp->if_capabilities &= vf_ifp->if_capabilities; 1721 ifp->if_capenable &= ifp->if_capabilities; 1722 1723 /* 1724 * Fix TSO settings. 1725 */ 1726 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1727 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1728 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1729 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1730 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1731 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1732 1733 /* 1734 * Change VF's enabled capabilities. 1735 */ 1736 memset(&ifr, 0, sizeof(ifr)); 1737 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1738 ifr.ifr_reqcap = ifp->if_capenable; 1739 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1740 1741 if (ifp->if_mtu != ETHERMTU) { 1742 int error; 1743 1744 /* 1745 * Change VF's MTU. 1746 */ 1747 memset(&ifr, 0, sizeof(ifr)); 1748 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1749 ifr.ifr_mtu = ifp->if_mtu; 1750 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1751 if (error) { 1752 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1753 vf_ifp->if_xname, ifp->if_mtu); 1754 if (ifp->if_mtu > ETHERMTU) { 1755 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1756 1757 /* 1758 * XXX 1759 * No need to adjust the synthetic parts' MTU; 1760 * failure of the adjustment will cause us 1761 * infinite headache. 1762 */ 1763 ifp->if_mtu = ETHERMTU; 1764 hn_mtu_change_fixup(sc); 1765 } 1766 } 1767 } 1768 } 1769 1770 static bool 1771 hn_xpnt_vf_isready(struct hn_softc *sc) 1772 { 1773 1774 HN_LOCK_ASSERT(sc); 1775 1776 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1777 return (false); 1778 1779 if (sc->hn_vf_rdytick == 0) 1780 return (true); 1781 1782 if (sc->hn_vf_rdytick > ticks) 1783 return (false); 1784 1785 /* Mark VF as ready. */ 1786 hn_xpnt_vf_setready(sc); 1787 return (true); 1788 } 1789 1790 static void 1791 hn_xpnt_vf_setenable(struct hn_softc *sc) 1792 { 1793 int i; 1794 1795 HN_LOCK_ASSERT(sc); 1796 1797 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1798 rm_wlock(&sc->hn_vf_lock); 1799 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1800 rm_wunlock(&sc->hn_vf_lock); 1801 1802 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1803 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1804 } 1805 1806 static void 1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1808 { 1809 int i; 1810 1811 HN_LOCK_ASSERT(sc); 1812 1813 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1814 rm_wlock(&sc->hn_vf_lock); 1815 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1816 if (clear_vf) 1817 sc->hn_vf_ifp = NULL; 1818 rm_wunlock(&sc->hn_vf_lock); 1819 1820 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1821 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1822 } 1823 1824 static void 1825 hn_xpnt_vf_init(struct hn_softc *sc) 1826 { 1827 int error; 1828 1829 HN_LOCK_ASSERT(sc); 1830 1831 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1832 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1833 1834 if (bootverbose) { 1835 if_printf(sc->hn_ifp, "try bringing up %s\n", 1836 sc->hn_vf_ifp->if_xname); 1837 } 1838 1839 /* 1840 * Bring the VF up. 1841 */ 1842 hn_xpnt_vf_saveifflags(sc); 1843 sc->hn_vf_ifp->if_flags |= IFF_UP; 1844 error = hn_xpnt_vf_iocsetflags(sc); 1845 if (error) { 1846 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1847 sc->hn_vf_ifp->if_xname, error); 1848 return; 1849 } 1850 1851 /* 1852 * NOTE: 1853 * Datapath setting must happen _after_ bringing the VF up. 1854 */ 1855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1856 1857 /* 1858 * NOTE: 1859 * Fixup RSS related bits _after_ the VF is brought up, since 1860 * many VFs generate RSS key during it's initialization. 1861 */ 1862 hn_vf_rss_fixup(sc, true); 1863 1864 /* Mark transparent mode VF as enabled. */ 1865 hn_xpnt_vf_setenable(sc); 1866 } 1867 1868 static void 1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1870 { 1871 struct hn_softc *sc = xsc; 1872 1873 HN_LOCK(sc); 1874 1875 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1876 goto done; 1877 if (sc->hn_vf_ifp == NULL) 1878 goto done; 1879 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1880 goto done; 1881 1882 if (sc->hn_vf_rdytick != 0) { 1883 /* Mark VF as ready. */ 1884 hn_xpnt_vf_setready(sc); 1885 } 1886 1887 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1888 /* 1889 * Delayed VF initialization. 1890 */ 1891 if (bootverbose) { 1892 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1893 sc->hn_vf_ifp->if_xname); 1894 } 1895 hn_xpnt_vf_init(sc); 1896 } 1897 done: 1898 HN_UNLOCK(sc); 1899 } 1900 1901 static void 1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1903 { 1904 struct hn_softc *sc = xsc; 1905 1906 HN_LOCK(sc); 1907 1908 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1909 goto done; 1910 1911 if (!hn_ismyvf(sc, ifp)) 1912 goto done; 1913 1914 if (sc->hn_vf_ifp != NULL) { 1915 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1916 sc->hn_vf_ifp->if_xname); 1917 goto done; 1918 } 1919 1920 if (hn_xpnt_vf && ifp->if_start != NULL) { 1921 /* 1922 * ifnet.if_start is _not_ supported by transparent 1923 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1924 */ 1925 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1926 "in transparent VF mode.\n", ifp->if_xname); 1927 goto done; 1928 } 1929 1930 rm_wlock(&hn_vfmap_lock); 1931 1932 if (ifp->if_index >= hn_vfmap_size) { 1933 struct ifnet **newmap; 1934 int newsize; 1935 1936 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1937 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1938 M_WAITOK | M_ZERO); 1939 1940 memcpy(newmap, hn_vfmap, 1941 sizeof(struct ifnet *) * hn_vfmap_size); 1942 free(hn_vfmap, M_DEVBUF); 1943 hn_vfmap = newmap; 1944 hn_vfmap_size = newsize; 1945 } 1946 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1947 ("%s: ifindex %d was mapped to %s", 1948 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1949 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1950 1951 rm_wunlock(&hn_vfmap_lock); 1952 1953 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1954 rm_wlock(&sc->hn_vf_lock); 1955 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1956 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1957 sc->hn_vf_ifp = ifp; 1958 rm_wunlock(&sc->hn_vf_lock); 1959 1960 if (hn_xpnt_vf) { 1961 int wait_ticks; 1962 1963 /* 1964 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1965 * Save vf_ifp's current if_input for later restoration. 1966 */ 1967 sc->hn_vf_input = ifp->if_input; 1968 ifp->if_input = hn_xpnt_vf_input; 1969 1970 /* 1971 * Stop link status management; use the VF's. 1972 */ 1973 hn_suspend_mgmt(sc); 1974 1975 /* 1976 * Give VF sometime to complete its attach routing. 1977 */ 1978 wait_ticks = hn_xpnt_vf_attwait * hz; 1979 sc->hn_vf_rdytick = ticks + wait_ticks; 1980 1981 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1982 wait_ticks); 1983 } 1984 done: 1985 HN_UNLOCK(sc); 1986 } 1987 1988 static void 1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1990 { 1991 struct hn_softc *sc = xsc; 1992 1993 HN_LOCK(sc); 1994 1995 if (sc->hn_vf_ifp == NULL) 1996 goto done; 1997 1998 if (!hn_ismyvf(sc, ifp)) 1999 goto done; 2000 2001 if (hn_xpnt_vf) { 2002 /* 2003 * Make sure that the delayed initialization is not running. 2004 * 2005 * NOTE: 2006 * - This lock _must_ be released, since the hn_vf_init task 2007 * will try holding this lock. 2008 * - It is safe to release this lock here, since the 2009 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2010 * 2011 * XXX racy, if hn(4) ever detached. 2012 */ 2013 HN_UNLOCK(sc); 2014 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2015 HN_LOCK(sc); 2016 2017 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2018 sc->hn_ifp->if_xname)); 2019 ifp->if_input = sc->hn_vf_input; 2020 sc->hn_vf_input = NULL; 2021 2022 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2023 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2024 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2025 2026 if (sc->hn_vf_rdytick == 0) { 2027 /* 2028 * The VF was ready; restore some settings. 2029 */ 2030 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2031 /* 2032 * NOTE: 2033 * There is _no_ need to fixup if_capenable and 2034 * if_hwassist, since the if_capabilities before 2035 * restoration was an intersection of the VF's 2036 * if_capabilites and the synthetic device's 2037 * if_capabilites. 2038 */ 2039 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2040 sc->hn_ifp->if_hw_tsomaxsegcount = 2041 sc->hn_saved_tsosegcnt; 2042 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2043 } 2044 2045 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2046 /* 2047 * Restore RSS settings. 2048 */ 2049 hn_vf_rss_restore(sc); 2050 2051 /* 2052 * Resume link status management, which was suspended 2053 * by hn_ifnet_attevent(). 2054 */ 2055 hn_resume_mgmt(sc); 2056 } 2057 } 2058 2059 /* Mark transparent mode VF as disabled. */ 2060 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2061 2062 rm_wlock(&hn_vfmap_lock); 2063 2064 KASSERT(ifp->if_index < hn_vfmap_size, 2065 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2066 if (hn_vfmap[ifp->if_index] != NULL) { 2067 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2068 ("%s: ifindex %d was mapped to %s", 2069 ifp->if_xname, ifp->if_index, 2070 hn_vfmap[ifp->if_index]->if_xname)); 2071 hn_vfmap[ifp->if_index] = NULL; 2072 } 2073 2074 rm_wunlock(&hn_vfmap_lock); 2075 done: 2076 HN_UNLOCK(sc); 2077 } 2078 2079 static void 2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2081 { 2082 struct hn_softc *sc = xsc; 2083 2084 if (sc->hn_vf_ifp == ifp) 2085 if_link_state_change(sc->hn_ifp, link_state); 2086 } 2087 2088 static int 2089 hn_probe(device_t dev) 2090 { 2091 2092 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2093 device_set_desc(dev, "Hyper-V Network Interface"); 2094 return BUS_PROBE_DEFAULT; 2095 } 2096 return ENXIO; 2097 } 2098 2099 static int 2100 hn_attach(device_t dev) 2101 { 2102 struct hn_softc *sc = device_get_softc(dev); 2103 struct sysctl_oid_list *child; 2104 struct sysctl_ctx_list *ctx; 2105 uint8_t eaddr[ETHER_ADDR_LEN]; 2106 struct ifnet *ifp = NULL; 2107 int error, ring_cnt, tx_ring_cnt; 2108 uint32_t mtu; 2109 2110 sc->hn_dev = dev; 2111 sc->hn_prichan = vmbus_get_channel(dev); 2112 HN_LOCK_INIT(sc); 2113 rm_init(&sc->hn_vf_lock, "hnvf"); 2114 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2115 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2116 2117 /* 2118 * Initialize these tunables once. 2119 */ 2120 sc->hn_agg_size = hn_tx_agg_size; 2121 sc->hn_agg_pkts = hn_tx_agg_pkts; 2122 2123 /* 2124 * Setup taskqueue for transmission. 2125 */ 2126 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2127 int i; 2128 2129 sc->hn_tx_taskqs = 2130 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2131 M_DEVBUF, M_WAITOK); 2132 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2133 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2134 M_WAITOK, taskqueue_thread_enqueue, 2135 &sc->hn_tx_taskqs[i]); 2136 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2137 "%s tx%d", device_get_nameunit(dev), i); 2138 } 2139 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2140 sc->hn_tx_taskqs = hn_tx_taskque; 2141 } 2142 2143 /* 2144 * Setup taskqueue for mangement tasks, e.g. link status. 2145 */ 2146 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2147 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2148 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2149 device_get_nameunit(dev)); 2150 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2151 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2152 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2153 hn_netchg_status_taskfunc, sc); 2154 2155 if (hn_xpnt_vf) { 2156 /* 2157 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2158 */ 2159 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2160 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2161 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2162 device_get_nameunit(dev)); 2163 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2164 hn_xpnt_vf_init_taskfunc, sc); 2165 } 2166 2167 /* 2168 * Allocate ifnet and setup its name earlier, so that if_printf 2169 * can be used by functions, which will be called after 2170 * ether_ifattach(). 2171 */ 2172 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2173 ifp->if_softc = sc; 2174 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2175 2176 /* 2177 * Initialize ifmedia earlier so that it can be unconditionally 2178 * destroyed, if error happened later on. 2179 */ 2180 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2181 2182 /* 2183 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2184 * to use (tx_ring_cnt). 2185 * 2186 * NOTE: 2187 * The # of RX rings to use is same as the # of channels to use. 2188 */ 2189 ring_cnt = hn_chan_cnt; 2190 if (ring_cnt <= 0) { 2191 /* Default */ 2192 ring_cnt = mp_ncpus; 2193 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2194 ring_cnt = HN_RING_CNT_DEF_MAX; 2195 } else if (ring_cnt > mp_ncpus) { 2196 ring_cnt = mp_ncpus; 2197 } 2198 #ifdef RSS 2199 if (ring_cnt > rss_getnumbuckets()) 2200 ring_cnt = rss_getnumbuckets(); 2201 #endif 2202 2203 tx_ring_cnt = hn_tx_ring_cnt; 2204 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2205 tx_ring_cnt = ring_cnt; 2206 #ifdef HN_IFSTART_SUPPORT 2207 if (hn_use_if_start) { 2208 /* ifnet.if_start only needs one TX ring. */ 2209 tx_ring_cnt = 1; 2210 } 2211 #endif 2212 2213 /* 2214 * Set the leader CPU for channels. 2215 */ 2216 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2217 2218 /* 2219 * Create enough TX/RX rings, even if only limited number of 2220 * channels can be allocated. 2221 */ 2222 error = hn_create_tx_data(sc, tx_ring_cnt); 2223 if (error) 2224 goto failed; 2225 error = hn_create_rx_data(sc, ring_cnt); 2226 if (error) 2227 goto failed; 2228 2229 /* 2230 * Create transaction context for NVS and RNDIS transactions. 2231 */ 2232 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2233 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2234 if (sc->hn_xact == NULL) { 2235 error = ENXIO; 2236 goto failed; 2237 } 2238 2239 /* 2240 * Install orphan handler for the revocation of this device's 2241 * primary channel. 2242 * 2243 * NOTE: 2244 * The processing order is critical here: 2245 * Install the orphan handler, _before_ testing whether this 2246 * device's primary channel has been revoked or not. 2247 */ 2248 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2249 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2250 error = ENXIO; 2251 goto failed; 2252 } 2253 2254 /* 2255 * Attach the synthetic parts, i.e. NVS and RNDIS. 2256 */ 2257 error = hn_synth_attach(sc, ETHERMTU); 2258 if (error) 2259 goto failed; 2260 2261 error = hn_rndis_get_eaddr(sc, eaddr); 2262 if (error) 2263 goto failed; 2264 2265 error = hn_rndis_get_mtu(sc, &mtu); 2266 if (error) 2267 mtu = ETHERMTU; 2268 else if (bootverbose) 2269 device_printf(dev, "RNDIS mtu %u\n", mtu); 2270 2271 #if __FreeBSD_version >= 1100099 2272 if (sc->hn_rx_ring_inuse > 1) { 2273 /* 2274 * Reduce TCP segment aggregation limit for multiple 2275 * RX rings to increase ACK timeliness. 2276 */ 2277 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2278 } 2279 #endif 2280 2281 /* 2282 * Fixup TX/RX stuffs after synthetic parts are attached. 2283 */ 2284 hn_fixup_tx_data(sc); 2285 hn_fixup_rx_data(sc); 2286 2287 ctx = device_get_sysctl_ctx(dev); 2288 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2289 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2290 &sc->hn_nvs_ver, 0, "NVS version"); 2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2292 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2293 hn_ndis_version_sysctl, "A", "NDIS version"); 2294 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2295 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2296 hn_caps_sysctl, "A", "capabilities"); 2297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2298 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2299 hn_hwassist_sysctl, "A", "hwassist"); 2300 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2301 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2303 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2304 "max # of TSO segments"); 2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2306 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2307 "max size of TSO segment"); 2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2309 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2310 hn_rxfilter_sysctl, "A", "rxfilter"); 2311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2312 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2313 hn_rss_hash_sysctl, "A", "RSS hash"); 2314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2315 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2316 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2320 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2321 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2322 #ifndef RSS 2323 /* 2324 * Don't allow RSS key/indirect table changes, if RSS is defined. 2325 */ 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2327 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2328 hn_rss_key_sysctl, "IU", "RSS key"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2330 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2331 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2332 #endif 2333 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2334 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2335 "RNDIS offered packet transmission aggregation size limit"); 2336 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2337 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2338 "RNDIS offered packet transmission aggregation count limit"); 2339 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2340 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2341 "RNDIS packet transmission aggregation alignment"); 2342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2343 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2344 hn_txagg_size_sysctl, "I", 2345 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2347 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2348 hn_txagg_pkts_sysctl, "I", 2349 "Packet transmission aggregation packets, " 2350 "0 -- disable, -1 -- auto"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2352 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_polling_sysctl, "I", 2354 "Polling frequency: [100,1000000], 0 disable polling"); 2355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2356 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2357 hn_vf_sysctl, "A", "Virtual Function's name"); 2358 if (!hn_xpnt_vf) { 2359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2360 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2361 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2362 } else { 2363 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2364 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2365 hn_xpnt_vf_enabled_sysctl, "I", 2366 "Transparent VF enabled"); 2367 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2368 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2369 hn_xpnt_vf_accbpf_sysctl, "I", 2370 "Accurate BPF for transparent VF"); 2371 } 2372 2373 /* 2374 * Setup the ifmedia, which has been initialized earlier. 2375 */ 2376 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2377 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2378 /* XXX ifmedia_set really should do this for us */ 2379 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2380 2381 /* 2382 * Setup the ifnet for this interface. 2383 */ 2384 2385 ifp->if_baudrate = IF_Gbps(10); 2386 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2387 ifp->if_ioctl = hn_ioctl; 2388 ifp->if_init = hn_init; 2389 #ifdef HN_IFSTART_SUPPORT 2390 if (hn_use_if_start) { 2391 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2392 2393 ifp->if_start = hn_start; 2394 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2395 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2396 IFQ_SET_READY(&ifp->if_snd); 2397 } else 2398 #endif 2399 { 2400 ifp->if_transmit = hn_transmit; 2401 ifp->if_qflush = hn_xmit_qflush; 2402 } 2403 2404 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2405 #ifdef foo 2406 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2407 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2408 #endif 2409 if (sc->hn_caps & HN_CAP_VLAN) { 2410 /* XXX not sure about VLAN_MTU. */ 2411 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2412 } 2413 2414 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2415 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2416 ifp->if_capabilities |= IFCAP_TXCSUM; 2417 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2418 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2419 if (sc->hn_caps & HN_CAP_TSO4) { 2420 ifp->if_capabilities |= IFCAP_TSO4; 2421 ifp->if_hwassist |= CSUM_IP_TSO; 2422 } 2423 if (sc->hn_caps & HN_CAP_TSO6) { 2424 ifp->if_capabilities |= IFCAP_TSO6; 2425 ifp->if_hwassist |= CSUM_IP6_TSO; 2426 } 2427 2428 /* Enable all available capabilities by default. */ 2429 ifp->if_capenable = ifp->if_capabilities; 2430 2431 /* 2432 * Disable IPv6 TSO and TXCSUM by default, they still can 2433 * be enabled through SIOCSIFCAP. 2434 */ 2435 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2436 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2437 2438 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2439 /* 2440 * Lock hn_set_tso_maxsize() to simplify its 2441 * internal logic. 2442 */ 2443 HN_LOCK(sc); 2444 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2445 HN_UNLOCK(sc); 2446 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2447 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2448 } 2449 2450 ether_ifattach(ifp, eaddr); 2451 2452 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2453 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2454 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2455 } 2456 if (mtu < ETHERMTU) { 2457 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2458 ifp->if_mtu = mtu; 2459 } 2460 2461 /* Inform the upper layer about the long frame support. */ 2462 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2463 2464 /* 2465 * Kick off link status check. 2466 */ 2467 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2468 hn_update_link_status(sc); 2469 2470 if (!hn_xpnt_vf) { 2471 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2472 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2473 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2474 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2475 } else { 2476 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2477 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2478 } 2479 2480 /* 2481 * NOTE: 2482 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2483 * since interface's LLADDR is needed; interface LLADDR is not 2484 * available when ifnet_arrival event is triggered. 2485 */ 2486 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2487 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2488 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2489 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2490 2491 return (0); 2492 failed: 2493 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2494 hn_synth_detach(sc); 2495 hn_detach(dev); 2496 return (error); 2497 } 2498 2499 static int 2500 hn_detach(device_t dev) 2501 { 2502 struct hn_softc *sc = device_get_softc(dev); 2503 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2504 2505 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2506 /* 2507 * In case that the vmbus missed the orphan handler 2508 * installation. 2509 */ 2510 vmbus_xact_ctx_orphan(sc->hn_xact); 2511 } 2512 2513 if (sc->hn_ifaddr_evthand != NULL) 2514 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2515 if (sc->hn_ifnet_evthand != NULL) 2516 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2517 if (sc->hn_ifnet_atthand != NULL) { 2518 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2519 sc->hn_ifnet_atthand); 2520 } 2521 if (sc->hn_ifnet_dethand != NULL) { 2522 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2523 sc->hn_ifnet_dethand); 2524 } 2525 if (sc->hn_ifnet_lnkhand != NULL) 2526 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2527 2528 vf_ifp = sc->hn_vf_ifp; 2529 __compiler_membar(); 2530 if (vf_ifp != NULL) 2531 hn_ifnet_detevent(sc, vf_ifp); 2532 2533 if (device_is_attached(dev)) { 2534 HN_LOCK(sc); 2535 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2536 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2537 hn_stop(sc, true); 2538 /* 2539 * NOTE: 2540 * hn_stop() only suspends data, so managment 2541 * stuffs have to be suspended manually here. 2542 */ 2543 hn_suspend_mgmt(sc); 2544 hn_synth_detach(sc); 2545 } 2546 HN_UNLOCK(sc); 2547 ether_ifdetach(ifp); 2548 } 2549 2550 ifmedia_removeall(&sc->hn_media); 2551 hn_destroy_rx_data(sc); 2552 hn_destroy_tx_data(sc); 2553 2554 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2555 int i; 2556 2557 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2558 taskqueue_free(sc->hn_tx_taskqs[i]); 2559 free(sc->hn_tx_taskqs, M_DEVBUF); 2560 } 2561 taskqueue_free(sc->hn_mgmt_taskq0); 2562 if (sc->hn_vf_taskq != NULL) 2563 taskqueue_free(sc->hn_vf_taskq); 2564 2565 if (sc->hn_xact != NULL) { 2566 /* 2567 * Uninstall the orphan handler _before_ the xact is 2568 * destructed. 2569 */ 2570 vmbus_chan_unset_orphan(sc->hn_prichan); 2571 vmbus_xact_ctx_destroy(sc->hn_xact); 2572 } 2573 2574 if_free(ifp); 2575 2576 HN_LOCK_DESTROY(sc); 2577 rm_destroy(&sc->hn_vf_lock); 2578 return (0); 2579 } 2580 2581 static int 2582 hn_shutdown(device_t dev) 2583 { 2584 2585 return (0); 2586 } 2587 2588 static void 2589 hn_link_status(struct hn_softc *sc) 2590 { 2591 uint32_t link_status; 2592 int error; 2593 2594 error = hn_rndis_get_linkstatus(sc, &link_status); 2595 if (error) { 2596 /* XXX what to do? */ 2597 return; 2598 } 2599 2600 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2601 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2602 else 2603 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2604 if_link_state_change(sc->hn_ifp, 2605 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2606 LINK_STATE_UP : LINK_STATE_DOWN); 2607 } 2608 2609 static void 2610 hn_link_taskfunc(void *xsc, int pending __unused) 2611 { 2612 struct hn_softc *sc = xsc; 2613 2614 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2615 return; 2616 hn_link_status(sc); 2617 } 2618 2619 static void 2620 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2621 { 2622 struct hn_softc *sc = xsc; 2623 2624 /* Prevent any link status checks from running. */ 2625 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2626 2627 /* 2628 * Fake up a [link down --> link up] state change; 5 seconds 2629 * delay is used, which closely simulates miibus reaction 2630 * upon link down event. 2631 */ 2632 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2633 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2634 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2635 &sc->hn_netchg_status, 5 * hz); 2636 } 2637 2638 static void 2639 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2640 { 2641 struct hn_softc *sc = xsc; 2642 2643 /* Re-allow link status checks. */ 2644 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2645 hn_link_status(sc); 2646 } 2647 2648 static void 2649 hn_update_link_status(struct hn_softc *sc) 2650 { 2651 2652 if (sc->hn_mgmt_taskq != NULL) 2653 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2654 } 2655 2656 static void 2657 hn_change_network(struct hn_softc *sc) 2658 { 2659 2660 if (sc->hn_mgmt_taskq != NULL) 2661 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2662 } 2663 2664 static __inline int 2665 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2666 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2667 { 2668 struct mbuf *m = *m_head; 2669 int error; 2670 2671 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2672 2673 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2674 m, segs, nsegs, BUS_DMA_NOWAIT); 2675 if (error == EFBIG) { 2676 struct mbuf *m_new; 2677 2678 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2679 if (m_new == NULL) 2680 return ENOBUFS; 2681 else 2682 *m_head = m = m_new; 2683 txr->hn_tx_collapsed++; 2684 2685 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2686 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2687 } 2688 if (!error) { 2689 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2690 BUS_DMASYNC_PREWRITE); 2691 txd->flags |= HN_TXD_FLAG_DMAMAP; 2692 } 2693 return error; 2694 } 2695 2696 static __inline int 2697 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2698 { 2699 2700 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2701 ("put an onlist txd %#x", txd->flags)); 2702 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2703 ("put an onagg txd %#x", txd->flags)); 2704 2705 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2706 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2707 return 0; 2708 2709 if (!STAILQ_EMPTY(&txd->agg_list)) { 2710 struct hn_txdesc *tmp_txd; 2711 2712 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2713 int freed; 2714 2715 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2716 ("resursive aggregation on aggregated txdesc")); 2717 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2718 ("not aggregated txdesc")); 2719 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2720 ("aggregated txdesc uses dmamap")); 2721 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2722 ("aggregated txdesc consumes " 2723 "chimney sending buffer")); 2724 KASSERT(tmp_txd->chim_size == 0, 2725 ("aggregated txdesc has non-zero " 2726 "chimney sending size")); 2727 2728 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2729 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2730 freed = hn_txdesc_put(txr, tmp_txd); 2731 KASSERT(freed, ("failed to free aggregated txdesc")); 2732 } 2733 } 2734 2735 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2736 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2737 ("chim txd uses dmamap")); 2738 hn_chim_free(txr->hn_sc, txd->chim_index); 2739 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2740 txd->chim_size = 0; 2741 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2742 bus_dmamap_sync(txr->hn_tx_data_dtag, 2743 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2744 bus_dmamap_unload(txr->hn_tx_data_dtag, 2745 txd->data_dmap); 2746 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2747 } 2748 2749 if (txd->m != NULL) { 2750 m_freem(txd->m); 2751 txd->m = NULL; 2752 } 2753 2754 txd->flags |= HN_TXD_FLAG_ONLIST; 2755 #ifndef HN_USE_TXDESC_BUFRING 2756 mtx_lock_spin(&txr->hn_txlist_spin); 2757 KASSERT(txr->hn_txdesc_avail >= 0 && 2758 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2759 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2760 txr->hn_txdesc_avail++; 2761 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2762 mtx_unlock_spin(&txr->hn_txlist_spin); 2763 #else /* HN_USE_TXDESC_BUFRING */ 2764 #ifdef HN_DEBUG 2765 atomic_add_int(&txr->hn_txdesc_avail, 1); 2766 #endif 2767 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2768 #endif /* !HN_USE_TXDESC_BUFRING */ 2769 2770 return 1; 2771 } 2772 2773 static __inline struct hn_txdesc * 2774 hn_txdesc_get(struct hn_tx_ring *txr) 2775 { 2776 struct hn_txdesc *txd; 2777 2778 #ifndef HN_USE_TXDESC_BUFRING 2779 mtx_lock_spin(&txr->hn_txlist_spin); 2780 txd = SLIST_FIRST(&txr->hn_txlist); 2781 if (txd != NULL) { 2782 KASSERT(txr->hn_txdesc_avail > 0, 2783 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2784 txr->hn_txdesc_avail--; 2785 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2786 } 2787 mtx_unlock_spin(&txr->hn_txlist_spin); 2788 #else 2789 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2790 #endif 2791 2792 if (txd != NULL) { 2793 #ifdef HN_USE_TXDESC_BUFRING 2794 #ifdef HN_DEBUG 2795 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2796 #endif 2797 #endif /* HN_USE_TXDESC_BUFRING */ 2798 KASSERT(txd->m == NULL && txd->refs == 0 && 2799 STAILQ_EMPTY(&txd->agg_list) && 2800 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2801 txd->chim_size == 0 && 2802 (txd->flags & HN_TXD_FLAG_ONLIST) && 2803 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2804 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2805 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2806 txd->refs = 1; 2807 } 2808 return txd; 2809 } 2810 2811 static __inline void 2812 hn_txdesc_hold(struct hn_txdesc *txd) 2813 { 2814 2815 /* 0->1 transition will never work */ 2816 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2817 atomic_add_int(&txd->refs, 1); 2818 } 2819 2820 static __inline void 2821 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2822 { 2823 2824 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2825 ("recursive aggregation on aggregating txdesc")); 2826 2827 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2828 ("already aggregated")); 2829 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2830 ("recursive aggregation on to-be-aggregated txdesc")); 2831 2832 txd->flags |= HN_TXD_FLAG_ONAGG; 2833 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2834 } 2835 2836 static bool 2837 hn_tx_ring_pending(struct hn_tx_ring *txr) 2838 { 2839 bool pending = false; 2840 2841 #ifndef HN_USE_TXDESC_BUFRING 2842 mtx_lock_spin(&txr->hn_txlist_spin); 2843 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2844 pending = true; 2845 mtx_unlock_spin(&txr->hn_txlist_spin); 2846 #else 2847 if (!buf_ring_full(txr->hn_txdesc_br)) 2848 pending = true; 2849 #endif 2850 return (pending); 2851 } 2852 2853 static __inline void 2854 hn_txeof(struct hn_tx_ring *txr) 2855 { 2856 txr->hn_has_txeof = 0; 2857 txr->hn_txeof(txr); 2858 } 2859 2860 static void 2861 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2862 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2863 { 2864 struct hn_txdesc *txd = sndc->hn_cbarg; 2865 struct hn_tx_ring *txr; 2866 2867 txr = txd->txr; 2868 KASSERT(txr->hn_chan == chan, 2869 ("channel mismatch, on chan%u, should be chan%u", 2870 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2871 2872 txr->hn_has_txeof = 1; 2873 hn_txdesc_put(txr, txd); 2874 2875 ++txr->hn_txdone_cnt; 2876 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2877 txr->hn_txdone_cnt = 0; 2878 if (txr->hn_oactive) 2879 hn_txeof(txr); 2880 } 2881 } 2882 2883 static void 2884 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2885 { 2886 #if defined(INET) || defined(INET6) 2887 struct epoch_tracker et; 2888 2889 NET_EPOCH_ENTER(et); 2890 tcp_lro_flush_all(&rxr->hn_lro); 2891 NET_EPOCH_EXIT(et); 2892 #endif 2893 2894 /* 2895 * NOTE: 2896 * 'txr' could be NULL, if multiple channels and 2897 * ifnet.if_start method are enabled. 2898 */ 2899 if (txr == NULL || !txr->hn_has_txeof) 2900 return; 2901 2902 txr->hn_txdone_cnt = 0; 2903 hn_txeof(txr); 2904 } 2905 2906 static __inline uint32_t 2907 hn_rndis_pktmsg_offset(uint32_t ofs) 2908 { 2909 2910 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2911 ("invalid RNDIS packet msg offset %u", ofs)); 2912 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2913 } 2914 2915 static __inline void * 2916 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2917 size_t pi_dlen, uint32_t pi_type) 2918 { 2919 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2920 struct rndis_pktinfo *pi; 2921 2922 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2923 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2924 2925 /* 2926 * Per-packet-info does not move; it only grows. 2927 * 2928 * NOTE: 2929 * rm_pktinfooffset in this phase counts from the beginning 2930 * of rndis_packet_msg. 2931 */ 2932 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2933 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2934 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2935 pkt->rm_pktinfolen); 2936 pkt->rm_pktinfolen += pi_size; 2937 2938 pi->rm_size = pi_size; 2939 pi->rm_type = pi_type; 2940 pi->rm_internal = 0; 2941 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2942 2943 return (pi->rm_data); 2944 } 2945 2946 static __inline int 2947 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2948 { 2949 struct hn_txdesc *txd; 2950 struct mbuf *m; 2951 int error, pkts; 2952 2953 txd = txr->hn_agg_txd; 2954 KASSERT(txd != NULL, ("no aggregate txdesc")); 2955 2956 /* 2957 * Since hn_txpkt() will reset this temporary stat, save 2958 * it now, so that oerrors can be updated properly, if 2959 * hn_txpkt() ever fails. 2960 */ 2961 pkts = txr->hn_stat_pkts; 2962 2963 /* 2964 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2965 * failure, save it for later freeing, if hn_txpkt() ever 2966 * fails. 2967 */ 2968 m = txd->m; 2969 error = hn_txpkt(ifp, txr, txd); 2970 if (__predict_false(error)) { 2971 /* txd is freed, but m is not. */ 2972 m_freem(m); 2973 2974 txr->hn_flush_failed++; 2975 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2976 } 2977 2978 /* Reset all aggregation states. */ 2979 txr->hn_agg_txd = NULL; 2980 txr->hn_agg_szleft = 0; 2981 txr->hn_agg_pktleft = 0; 2982 txr->hn_agg_prevpkt = NULL; 2983 2984 return (error); 2985 } 2986 2987 static void * 2988 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2989 int pktsize) 2990 { 2991 void *chim; 2992 2993 if (txr->hn_agg_txd != NULL) { 2994 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2995 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2996 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2997 int olen; 2998 2999 /* 3000 * Update the previous RNDIS packet's total length, 3001 * it can be increased due to the mandatory alignment 3002 * padding for this RNDIS packet. And update the 3003 * aggregating txdesc's chimney sending buffer size 3004 * accordingly. 3005 * 3006 * XXX 3007 * Zero-out the padding, as required by the RNDIS spec. 3008 */ 3009 olen = pkt->rm_len; 3010 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3011 agg_txd->chim_size += pkt->rm_len - olen; 3012 3013 /* Link this txdesc to the parent. */ 3014 hn_txdesc_agg(agg_txd, txd); 3015 3016 chim = (uint8_t *)pkt + pkt->rm_len; 3017 /* Save the current packet for later fixup. */ 3018 txr->hn_agg_prevpkt = chim; 3019 3020 txr->hn_agg_pktleft--; 3021 txr->hn_agg_szleft -= pktsize; 3022 if (txr->hn_agg_szleft <= 3023 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3024 /* 3025 * Probably can't aggregate more packets, 3026 * flush this aggregating txdesc proactively. 3027 */ 3028 txr->hn_agg_pktleft = 0; 3029 } 3030 /* Done! */ 3031 return (chim); 3032 } 3033 hn_flush_txagg(ifp, txr); 3034 } 3035 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3036 3037 txr->hn_tx_chimney_tried++; 3038 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3039 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3040 return (NULL); 3041 txr->hn_tx_chimney++; 3042 3043 chim = txr->hn_sc->hn_chim + 3044 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3045 3046 if (txr->hn_agg_pktmax > 1 && 3047 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3048 txr->hn_agg_txd = txd; 3049 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3050 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3051 txr->hn_agg_prevpkt = chim; 3052 } 3053 return (chim); 3054 } 3055 3056 /* 3057 * NOTE: 3058 * If this function fails, then both txd and m_head0 will be freed. 3059 */ 3060 static int 3061 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3062 struct mbuf **m_head0) 3063 { 3064 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3065 int error, nsegs, i; 3066 struct mbuf *m_head = *m_head0; 3067 struct rndis_packet_msg *pkt; 3068 uint32_t *pi_data; 3069 void *chim = NULL; 3070 int pkt_hlen, pkt_size; 3071 3072 pkt = txd->rndis_pkt; 3073 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3074 if (pkt_size < txr->hn_chim_size) { 3075 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3076 if (chim != NULL) 3077 pkt = chim; 3078 } else { 3079 if (txr->hn_agg_txd != NULL) 3080 hn_flush_txagg(ifp, txr); 3081 } 3082 3083 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3084 pkt->rm_len = m_head->m_pkthdr.len; 3085 pkt->rm_dataoffset = 0; 3086 pkt->rm_datalen = m_head->m_pkthdr.len; 3087 pkt->rm_oobdataoffset = 0; 3088 pkt->rm_oobdatalen = 0; 3089 pkt->rm_oobdataelements = 0; 3090 pkt->rm_pktinfooffset = sizeof(*pkt); 3091 pkt->rm_pktinfolen = 0; 3092 pkt->rm_vchandle = 0; 3093 pkt->rm_reserved = 0; 3094 3095 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3096 /* 3097 * Set the hash value for this packet. 3098 */ 3099 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3100 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3101 3102 if (M_HASHTYPE_ISHASH(m_head)) 3103 /* 3104 * The flowid field contains the hash value host 3105 * set in the rx queue if it is a ip forwarding pkt. 3106 * Set the same hash value so host can send on the 3107 * cpu it was received. 3108 */ 3109 *pi_data = m_head->m_pkthdr.flowid; 3110 else 3111 /* 3112 * Otherwise just put the tx queue index. 3113 */ 3114 *pi_data = txr->hn_tx_idx; 3115 } 3116 3117 if (m_head->m_flags & M_VLANTAG) { 3118 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3119 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3120 *pi_data = NDIS_VLAN_INFO_MAKE( 3121 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3122 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3123 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3124 } 3125 3126 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3127 #if defined(INET6) || defined(INET) 3128 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3129 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3130 #ifdef INET 3131 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3132 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3133 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3134 m_head->m_pkthdr.tso_segsz); 3135 } 3136 #endif 3137 #if defined(INET6) && defined(INET) 3138 else 3139 #endif 3140 #ifdef INET6 3141 { 3142 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3143 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3144 m_head->m_pkthdr.tso_segsz); 3145 } 3146 #endif 3147 #endif /* INET6 || INET */ 3148 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3149 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3150 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3151 if (m_head->m_pkthdr.csum_flags & 3152 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3153 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3154 } else { 3155 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3156 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3157 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3158 } 3159 3160 if (m_head->m_pkthdr.csum_flags & 3161 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3162 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3163 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3164 } else if (m_head->m_pkthdr.csum_flags & 3165 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3166 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3167 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3168 } 3169 } 3170 3171 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3172 /* Fixup RNDIS packet message total length */ 3173 pkt->rm_len += pkt_hlen; 3174 /* Convert RNDIS packet message offsets */ 3175 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3176 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3177 3178 /* 3179 * Fast path: Chimney sending. 3180 */ 3181 if (chim != NULL) { 3182 struct hn_txdesc *tgt_txd = txd; 3183 3184 if (txr->hn_agg_txd != NULL) { 3185 tgt_txd = txr->hn_agg_txd; 3186 #ifdef INVARIANTS 3187 *m_head0 = NULL; 3188 #endif 3189 } 3190 3191 KASSERT(pkt == chim, 3192 ("RNDIS pkt not in chimney sending buffer")); 3193 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3194 ("chimney sending buffer is not used")); 3195 tgt_txd->chim_size += pkt->rm_len; 3196 3197 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3198 ((uint8_t *)chim) + pkt_hlen); 3199 3200 txr->hn_gpa_cnt = 0; 3201 txr->hn_sendpkt = hn_txpkt_chim; 3202 goto done; 3203 } 3204 3205 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3206 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3207 ("chimney buffer is used")); 3208 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3209 3210 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3211 if (__predict_false(error)) { 3212 int freed; 3213 3214 /* 3215 * This mbuf is not linked w/ the txd yet, so free it now. 3216 */ 3217 m_freem(m_head); 3218 *m_head0 = NULL; 3219 3220 freed = hn_txdesc_put(txr, txd); 3221 KASSERT(freed != 0, 3222 ("fail to free txd upon txdma error")); 3223 3224 txr->hn_txdma_failed++; 3225 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3226 return error; 3227 } 3228 *m_head0 = m_head; 3229 3230 /* +1 RNDIS packet message */ 3231 txr->hn_gpa_cnt = nsegs + 1; 3232 3233 /* send packet with page buffer */ 3234 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3235 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3236 txr->hn_gpa[0].gpa_len = pkt_hlen; 3237 3238 /* 3239 * Fill the page buffers with mbuf info after the page 3240 * buffer for RNDIS packet message. 3241 */ 3242 for (i = 0; i < nsegs; ++i) { 3243 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3244 3245 gpa->gpa_page = atop(segs[i].ds_addr); 3246 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3247 gpa->gpa_len = segs[i].ds_len; 3248 } 3249 3250 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3251 txd->chim_size = 0; 3252 txr->hn_sendpkt = hn_txpkt_sglist; 3253 done: 3254 txd->m = m_head; 3255 3256 /* Set the completion routine */ 3257 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3258 3259 /* Update temporary stats for later use. */ 3260 txr->hn_stat_pkts++; 3261 txr->hn_stat_size += m_head->m_pkthdr.len; 3262 if (m_head->m_flags & M_MCAST) 3263 txr->hn_stat_mcasts++; 3264 3265 return 0; 3266 } 3267 3268 /* 3269 * NOTE: 3270 * If this function fails, then txd will be freed, but the mbuf 3271 * associated w/ the txd will _not_ be freed. 3272 */ 3273 static int 3274 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3275 { 3276 int error, send_failed = 0, has_bpf; 3277 3278 again: 3279 has_bpf = bpf_peers_present(ifp->if_bpf); 3280 if (has_bpf) { 3281 /* 3282 * Make sure that this txd and any aggregated txds are not 3283 * freed before ETHER_BPF_MTAP. 3284 */ 3285 hn_txdesc_hold(txd); 3286 } 3287 error = txr->hn_sendpkt(txr, txd); 3288 if (!error) { 3289 if (has_bpf) { 3290 const struct hn_txdesc *tmp_txd; 3291 3292 ETHER_BPF_MTAP(ifp, txd->m); 3293 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3294 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3295 } 3296 3297 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3298 #ifdef HN_IFSTART_SUPPORT 3299 if (!hn_use_if_start) 3300 #endif 3301 { 3302 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3303 txr->hn_stat_size); 3304 if (txr->hn_stat_mcasts != 0) { 3305 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3306 txr->hn_stat_mcasts); 3307 } 3308 } 3309 txr->hn_pkts += txr->hn_stat_pkts; 3310 txr->hn_sends++; 3311 } 3312 if (has_bpf) 3313 hn_txdesc_put(txr, txd); 3314 3315 if (__predict_false(error)) { 3316 int freed; 3317 3318 /* 3319 * This should "really rarely" happen. 3320 * 3321 * XXX Too many RX to be acked or too many sideband 3322 * commands to run? Ask netvsc_channel_rollup() 3323 * to kick start later. 3324 */ 3325 txr->hn_has_txeof = 1; 3326 if (!send_failed) { 3327 txr->hn_send_failed++; 3328 send_failed = 1; 3329 /* 3330 * Try sending again after set hn_has_txeof; 3331 * in case that we missed the last 3332 * netvsc_channel_rollup(). 3333 */ 3334 goto again; 3335 } 3336 if_printf(ifp, "send failed\n"); 3337 3338 /* 3339 * Caller will perform further processing on the 3340 * associated mbuf, so don't free it in hn_txdesc_put(); 3341 * only unload it from the DMA map in hn_txdesc_put(), 3342 * if it was loaded. 3343 */ 3344 txd->m = NULL; 3345 freed = hn_txdesc_put(txr, txd); 3346 KASSERT(freed != 0, 3347 ("fail to free txd upon send error")); 3348 3349 txr->hn_send_failed++; 3350 } 3351 3352 /* Reset temporary stats, after this sending is done. */ 3353 txr->hn_stat_size = 0; 3354 txr->hn_stat_pkts = 0; 3355 txr->hn_stat_mcasts = 0; 3356 3357 return (error); 3358 } 3359 3360 /* 3361 * Append the specified data to the indicated mbuf chain, 3362 * Extend the mbuf chain if the new data does not fit in 3363 * existing space. 3364 * 3365 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3366 * There should be an equivalent in the kernel mbuf code, 3367 * but there does not appear to be one yet. 3368 * 3369 * Differs from m_append() in that additional mbufs are 3370 * allocated with cluster size MJUMPAGESIZE, and filled 3371 * accordingly. 3372 * 3373 * Return the last mbuf in the chain or NULL if failed to 3374 * allocate new mbuf. 3375 */ 3376 static struct mbuf * 3377 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3378 { 3379 struct mbuf *m, *n; 3380 int remainder, space; 3381 3382 for (m = m0; m->m_next != NULL; m = m->m_next) 3383 ; 3384 remainder = len; 3385 space = M_TRAILINGSPACE(m); 3386 if (space > 0) { 3387 /* 3388 * Copy into available space. 3389 */ 3390 if (space > remainder) 3391 space = remainder; 3392 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3393 m->m_len += space; 3394 cp += space; 3395 remainder -= space; 3396 } 3397 while (remainder > 0) { 3398 /* 3399 * Allocate a new mbuf; could check space 3400 * and allocate a cluster instead. 3401 */ 3402 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3403 if (n == NULL) 3404 return NULL; 3405 n->m_len = min(MJUMPAGESIZE, remainder); 3406 bcopy(cp, mtod(n, caddr_t), n->m_len); 3407 cp += n->m_len; 3408 remainder -= n->m_len; 3409 m->m_next = n; 3410 m = n; 3411 } 3412 3413 return m; 3414 } 3415 3416 #if defined(INET) || defined(INET6) 3417 static __inline int 3418 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3419 { 3420 #if __FreeBSD_version >= 1100095 3421 if (hn_lro_mbufq_depth) { 3422 tcp_lro_queue_mbuf(lc, m); 3423 return 0; 3424 } 3425 #endif 3426 return tcp_lro_rx(lc, m, 0); 3427 } 3428 #endif 3429 3430 static int 3431 hn_rxpkt(struct hn_rx_ring *rxr) 3432 { 3433 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3434 struct mbuf *m_new, *n; 3435 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3436 int hash_type = M_HASHTYPE_NONE; 3437 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3438 int i; 3439 3440 ifp = hn_ifp; 3441 if (rxr->hn_rxvf_ifp != NULL) { 3442 /* 3443 * Non-transparent mode VF; pretend this packet is from 3444 * the VF. 3445 */ 3446 ifp = rxr->hn_rxvf_ifp; 3447 is_vf = 1; 3448 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3449 /* Transparent mode VF. */ 3450 is_vf = 1; 3451 } 3452 3453 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3454 /* 3455 * NOTE: 3456 * See the NOTE of hn_rndis_init_fixat(). This 3457 * function can be reached, immediately after the 3458 * RNDIS is initialized but before the ifnet is 3459 * setup on the hn_attach() path; drop the unexpected 3460 * packets. 3461 */ 3462 return (0); 3463 } 3464 3465 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3466 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3467 return (0); 3468 } 3469 3470 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3471 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3472 if (m_new == NULL) { 3473 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3474 return (0); 3475 } 3476 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3477 rxr->rsc.frag_len[0]); 3478 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3479 } else { 3480 /* 3481 * Get an mbuf with a cluster. For packets 2K or less, 3482 * get a standard 2K cluster. For anything larger, get a 3483 * 4K cluster. Any buffers larger than 4K can cause problems 3484 * if looped around to the Hyper-V TX channel, so avoid them. 3485 */ 3486 size = MCLBYTES; 3487 if (rxr->rsc.pktlen > MCLBYTES) { 3488 /* 4096 */ 3489 size = MJUMPAGESIZE; 3490 } 3491 3492 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3493 if (m_new == NULL) { 3494 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3495 return (0); 3496 } 3497 3498 n = m_new; 3499 for (i = 0; i < rxr->rsc.cnt; i++) { 3500 n = hv_m_append(n, rxr->rsc.frag_len[i], 3501 rxr->rsc.frag_data[i]); 3502 if (n == NULL) { 3503 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3504 return (0); 3505 } else { 3506 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3507 } 3508 } 3509 } 3510 if (rxr->rsc.pktlen <= MHLEN) 3511 rxr->hn_small_pkts++; 3512 3513 m_new->m_pkthdr.rcvif = ifp; 3514 3515 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3516 do_csum = 0; 3517 3518 /* receive side checksum offload */ 3519 if (rxr->rsc.csum_info != NULL) { 3520 /* IP csum offload */ 3521 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3522 m_new->m_pkthdr.csum_flags |= 3523 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3524 rxr->hn_csum_ip++; 3525 } 3526 3527 /* TCP/UDP csum offload */ 3528 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3529 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3530 m_new->m_pkthdr.csum_flags |= 3531 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3532 m_new->m_pkthdr.csum_data = 0xffff; 3533 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3534 rxr->hn_csum_tcp++; 3535 else 3536 rxr->hn_csum_udp++; 3537 } 3538 3539 /* 3540 * XXX 3541 * As of this write (Oct 28th, 2016), host side will turn 3542 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3543 * the do_lro setting here is actually _not_ accurate. We 3544 * depend on the RSS hash type check to reset do_lro. 3545 */ 3546 if ((*(rxr->rsc.csum_info) & 3547 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3548 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3549 do_lro = 1; 3550 } else { 3551 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3552 if (l3proto == ETHERTYPE_IP) { 3553 if (l4proto == IPPROTO_TCP) { 3554 if (do_csum && 3555 (rxr->hn_trust_hcsum & 3556 HN_TRUST_HCSUM_TCP)) { 3557 rxr->hn_csum_trusted++; 3558 m_new->m_pkthdr.csum_flags |= 3559 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3560 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3561 m_new->m_pkthdr.csum_data = 0xffff; 3562 } 3563 do_lro = 1; 3564 } else if (l4proto == IPPROTO_UDP) { 3565 if (do_csum && 3566 (rxr->hn_trust_hcsum & 3567 HN_TRUST_HCSUM_UDP)) { 3568 rxr->hn_csum_trusted++; 3569 m_new->m_pkthdr.csum_flags |= 3570 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3571 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3572 m_new->m_pkthdr.csum_data = 0xffff; 3573 } 3574 } else if (l4proto != IPPROTO_DONE && do_csum && 3575 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3576 rxr->hn_csum_trusted++; 3577 m_new->m_pkthdr.csum_flags |= 3578 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3579 } 3580 } 3581 } 3582 3583 if (rxr->rsc.vlan_info != NULL) { 3584 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3585 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3586 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3587 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3588 m_new->m_flags |= M_VLANTAG; 3589 } 3590 3591 /* 3592 * If VF is activated (tranparent/non-transparent mode does not 3593 * matter here). 3594 * 3595 * - Disable LRO 3596 * 3597 * hn(4) will only receive broadcast packets, multicast packets, 3598 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3599 * packet types. 3600 * 3601 * For non-transparent, we definitely _cannot_ enable LRO at 3602 * all, since the LRO flush will use hn(4) as the receiving 3603 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3604 */ 3605 if (is_vf) 3606 do_lro = 0; 3607 3608 /* 3609 * If VF is activated (tranparent/non-transparent mode does not 3610 * matter here), do _not_ mess with unsupported hash types or 3611 * functions. 3612 */ 3613 if (rxr->rsc.hash_info != NULL) { 3614 rxr->hn_rss_pkts++; 3615 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3616 if (!is_vf) 3617 hash_type = M_HASHTYPE_OPAQUE_HASH; 3618 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3619 NDIS_HASH_FUNCTION_TOEPLITZ) { 3620 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3621 rxr->hn_mbuf_hash); 3622 3623 /* 3624 * NOTE: 3625 * do_lro is resetted, if the hash types are not TCP 3626 * related. See the comment in the above csum_flags 3627 * setup section. 3628 */ 3629 switch (type) { 3630 case NDIS_HASH_IPV4: 3631 hash_type = M_HASHTYPE_RSS_IPV4; 3632 do_lro = 0; 3633 break; 3634 3635 case NDIS_HASH_TCP_IPV4: 3636 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3637 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3638 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3639 3640 if (is_vf) 3641 def_htype = M_HASHTYPE_NONE; 3642 3643 /* 3644 * UDP 4-tuple hash is delivered as 3645 * TCP 4-tuple hash. 3646 */ 3647 if (l3proto == ETHERTYPE_MAX) { 3648 hn_rxpkt_proto(m_new, 3649 &l3proto, &l4proto); 3650 } 3651 if (l3proto == ETHERTYPE_IP) { 3652 if (l4proto == IPPROTO_UDP && 3653 (rxr->hn_mbuf_hash & 3654 NDIS_HASH_UDP_IPV4_X)) { 3655 hash_type = 3656 M_HASHTYPE_RSS_UDP_IPV4; 3657 do_lro = 0; 3658 } else if (l4proto != 3659 IPPROTO_TCP) { 3660 hash_type = def_htype; 3661 do_lro = 0; 3662 } 3663 } else { 3664 hash_type = def_htype; 3665 do_lro = 0; 3666 } 3667 } 3668 break; 3669 3670 case NDIS_HASH_IPV6: 3671 hash_type = M_HASHTYPE_RSS_IPV6; 3672 do_lro = 0; 3673 break; 3674 3675 case NDIS_HASH_IPV6_EX: 3676 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3677 do_lro = 0; 3678 break; 3679 3680 case NDIS_HASH_TCP_IPV6: 3681 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3682 break; 3683 3684 case NDIS_HASH_TCP_IPV6_EX: 3685 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3686 break; 3687 } 3688 } 3689 } else if (!is_vf) { 3690 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3691 hash_type = M_HASHTYPE_OPAQUE; 3692 } 3693 M_HASHTYPE_SET(m_new, hash_type); 3694 3695 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3696 if (hn_ifp != ifp) { 3697 const struct ether_header *eh; 3698 3699 /* 3700 * Non-transparent mode VF is activated. 3701 */ 3702 3703 /* 3704 * Allow tapping on hn(4). 3705 */ 3706 ETHER_BPF_MTAP(hn_ifp, m_new); 3707 3708 /* 3709 * Update hn(4)'s stats. 3710 */ 3711 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3712 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3713 /* Checked at the beginning of this function. */ 3714 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3715 eh = mtod(m_new, struct ether_header *); 3716 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3717 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3718 } 3719 rxr->hn_pkts++; 3720 3721 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3722 #if defined(INET) || defined(INET6) 3723 struct lro_ctrl *lro = &rxr->hn_lro; 3724 3725 if (lro->lro_cnt) { 3726 rxr->hn_lro_tried++; 3727 if (hn_lro_rx(lro, m_new) == 0) { 3728 /* DONE! */ 3729 return 0; 3730 } 3731 } 3732 #endif 3733 } 3734 ifp->if_input(ifp, m_new); 3735 3736 return (0); 3737 } 3738 3739 static int 3740 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3741 { 3742 struct hn_softc *sc = ifp->if_softc; 3743 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3744 struct ifnet *vf_ifp; 3745 int mask, error = 0; 3746 struct ifrsskey *ifrk; 3747 struct ifrsshash *ifrh; 3748 uint32_t mtu; 3749 3750 switch (cmd) { 3751 case SIOCSIFMTU: 3752 if (ifr->ifr_mtu > HN_MTU_MAX) { 3753 error = EINVAL; 3754 break; 3755 } 3756 3757 HN_LOCK(sc); 3758 3759 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3760 HN_UNLOCK(sc); 3761 break; 3762 } 3763 3764 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3765 /* Can't change MTU */ 3766 HN_UNLOCK(sc); 3767 error = EOPNOTSUPP; 3768 break; 3769 } 3770 3771 if (ifp->if_mtu == ifr->ifr_mtu) { 3772 HN_UNLOCK(sc); 3773 break; 3774 } 3775 3776 if (hn_xpnt_vf_isready(sc)) { 3777 vf_ifp = sc->hn_vf_ifp; 3778 ifr_vf = *ifr; 3779 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3780 sizeof(ifr_vf.ifr_name)); 3781 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3782 (caddr_t)&ifr_vf); 3783 if (error) { 3784 HN_UNLOCK(sc); 3785 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3786 vf_ifp->if_xname, ifr->ifr_mtu, error); 3787 break; 3788 } 3789 } 3790 3791 /* 3792 * Suspend this interface before the synthetic parts 3793 * are ripped. 3794 */ 3795 hn_suspend(sc); 3796 3797 /* 3798 * Detach the synthetics parts, i.e. NVS and RNDIS. 3799 */ 3800 hn_synth_detach(sc); 3801 3802 /* 3803 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3804 * with the new MTU setting. 3805 */ 3806 error = hn_synth_attach(sc, ifr->ifr_mtu); 3807 if (error) { 3808 HN_UNLOCK(sc); 3809 break; 3810 } 3811 3812 error = hn_rndis_get_mtu(sc, &mtu); 3813 if (error) 3814 mtu = ifr->ifr_mtu; 3815 else if (bootverbose) 3816 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3817 3818 /* 3819 * Commit the requested MTU, after the synthetic parts 3820 * have been successfully attached. 3821 */ 3822 if (mtu >= ifr->ifr_mtu) { 3823 mtu = ifr->ifr_mtu; 3824 } else { 3825 if_printf(ifp, "fixup mtu %d -> %u\n", 3826 ifr->ifr_mtu, mtu); 3827 } 3828 ifp->if_mtu = mtu; 3829 3830 /* 3831 * Synthetic parts' reattach may change the chimney 3832 * sending size; update it. 3833 */ 3834 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3835 hn_set_chim_size(sc, sc->hn_chim_szmax); 3836 3837 /* 3838 * Make sure that various parameters based on MTU are 3839 * still valid, after the MTU change. 3840 */ 3841 hn_mtu_change_fixup(sc); 3842 3843 /* 3844 * All done! Resume the interface now. 3845 */ 3846 hn_resume(sc); 3847 3848 if ((sc->hn_flags & HN_FLAG_RXVF) || 3849 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3850 /* 3851 * Since we have reattached the NVS part, 3852 * change the datapath to VF again; in case 3853 * that it is lost, after the NVS was detached. 3854 */ 3855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3856 } 3857 3858 HN_UNLOCK(sc); 3859 break; 3860 3861 case SIOCSIFFLAGS: 3862 HN_LOCK(sc); 3863 3864 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3865 HN_UNLOCK(sc); 3866 break; 3867 } 3868 3869 if (hn_xpnt_vf_isready(sc)) 3870 hn_xpnt_vf_saveifflags(sc); 3871 3872 if (ifp->if_flags & IFF_UP) { 3873 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3874 /* 3875 * Caller meight hold mutex, e.g. 3876 * bpf; use busy-wait for the RNDIS 3877 * reply. 3878 */ 3879 HN_NO_SLEEPING(sc); 3880 hn_rxfilter_config(sc); 3881 HN_SLEEPING_OK(sc); 3882 3883 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3884 error = hn_xpnt_vf_iocsetflags(sc); 3885 } else { 3886 hn_init_locked(sc); 3887 } 3888 } else { 3889 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3890 hn_stop(sc, false); 3891 } 3892 sc->hn_if_flags = ifp->if_flags; 3893 3894 HN_UNLOCK(sc); 3895 break; 3896 3897 case SIOCSIFCAP: 3898 HN_LOCK(sc); 3899 3900 if (hn_xpnt_vf_isready(sc)) { 3901 ifr_vf = *ifr; 3902 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3903 sizeof(ifr_vf.ifr_name)); 3904 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3905 HN_UNLOCK(sc); 3906 break; 3907 } 3908 3909 /* 3910 * Fix up requested capabilities w/ supported capabilities, 3911 * since the supported capabilities could have been changed. 3912 */ 3913 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3914 ifp->if_capenable; 3915 3916 if (mask & IFCAP_TXCSUM) { 3917 ifp->if_capenable ^= IFCAP_TXCSUM; 3918 if (ifp->if_capenable & IFCAP_TXCSUM) 3919 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3920 else 3921 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3922 } 3923 if (mask & IFCAP_TXCSUM_IPV6) { 3924 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3925 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3926 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3927 else 3928 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3929 } 3930 3931 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3932 if (mask & IFCAP_RXCSUM) 3933 ifp->if_capenable ^= IFCAP_RXCSUM; 3934 #ifdef foo 3935 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3936 if (mask & IFCAP_RXCSUM_IPV6) 3937 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3938 #endif 3939 3940 if (mask & IFCAP_LRO) 3941 ifp->if_capenable ^= IFCAP_LRO; 3942 3943 if (mask & IFCAP_TSO4) { 3944 ifp->if_capenable ^= IFCAP_TSO4; 3945 if (ifp->if_capenable & IFCAP_TSO4) 3946 ifp->if_hwassist |= CSUM_IP_TSO; 3947 else 3948 ifp->if_hwassist &= ~CSUM_IP_TSO; 3949 } 3950 if (mask & IFCAP_TSO6) { 3951 ifp->if_capenable ^= IFCAP_TSO6; 3952 if (ifp->if_capenable & IFCAP_TSO6) 3953 ifp->if_hwassist |= CSUM_IP6_TSO; 3954 else 3955 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3956 } 3957 3958 HN_UNLOCK(sc); 3959 break; 3960 3961 case SIOCADDMULTI: 3962 case SIOCDELMULTI: 3963 HN_LOCK(sc); 3964 3965 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3966 HN_UNLOCK(sc); 3967 break; 3968 } 3969 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3970 /* 3971 * Multicast uses mutex; use busy-wait for 3972 * the RNDIS reply. 3973 */ 3974 HN_NO_SLEEPING(sc); 3975 hn_rxfilter_config(sc); 3976 HN_SLEEPING_OK(sc); 3977 } 3978 3979 /* XXX vlan(4) style mcast addr maintenance */ 3980 if (hn_xpnt_vf_isready(sc)) { 3981 int old_if_flags; 3982 3983 old_if_flags = sc->hn_vf_ifp->if_flags; 3984 hn_xpnt_vf_saveifflags(sc); 3985 3986 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3987 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3988 IFF_ALLMULTI)) 3989 error = hn_xpnt_vf_iocsetflags(sc); 3990 } 3991 3992 HN_UNLOCK(sc); 3993 break; 3994 3995 case SIOCSIFMEDIA: 3996 case SIOCGIFMEDIA: 3997 HN_LOCK(sc); 3998 if (hn_xpnt_vf_isready(sc)) { 3999 /* 4000 * SIOCGIFMEDIA expects ifmediareq, so don't 4001 * create and pass ifr_vf to the VF here; just 4002 * replace the ifr_name. 4003 */ 4004 vf_ifp = sc->hn_vf_ifp; 4005 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4006 sizeof(ifr->ifr_name)); 4007 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4008 /* Restore the ifr_name. */ 4009 strlcpy(ifr->ifr_name, ifp->if_xname, 4010 sizeof(ifr->ifr_name)); 4011 HN_UNLOCK(sc); 4012 break; 4013 } 4014 HN_UNLOCK(sc); 4015 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4016 break; 4017 4018 case SIOCGIFRSSHASH: 4019 ifrh = (struct ifrsshash *)data; 4020 HN_LOCK(sc); 4021 if (sc->hn_rx_ring_inuse == 1) { 4022 HN_UNLOCK(sc); 4023 ifrh->ifrh_func = RSS_FUNC_NONE; 4024 ifrh->ifrh_types = 0; 4025 break; 4026 } 4027 4028 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4029 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4030 else 4031 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4032 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4033 HN_UNLOCK(sc); 4034 break; 4035 4036 case SIOCGIFRSSKEY: 4037 ifrk = (struct ifrsskey *)data; 4038 HN_LOCK(sc); 4039 if (sc->hn_rx_ring_inuse == 1) { 4040 HN_UNLOCK(sc); 4041 ifrk->ifrk_func = RSS_FUNC_NONE; 4042 ifrk->ifrk_keylen = 0; 4043 break; 4044 } 4045 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4046 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4047 else 4048 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4049 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4050 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4051 NDIS_HASH_KEYSIZE_TOEPLITZ); 4052 HN_UNLOCK(sc); 4053 break; 4054 4055 default: 4056 error = ether_ioctl(ifp, cmd, data); 4057 break; 4058 } 4059 return (error); 4060 } 4061 4062 static void 4063 hn_stop(struct hn_softc *sc, bool detaching) 4064 { 4065 struct ifnet *ifp = sc->hn_ifp; 4066 int i; 4067 4068 HN_LOCK_ASSERT(sc); 4069 4070 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4071 ("synthetic parts were not attached")); 4072 4073 /* Clear RUNNING bit ASAP. */ 4074 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4075 4076 /* Disable polling. */ 4077 hn_polling(sc, 0); 4078 4079 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4080 KASSERT(sc->hn_vf_ifp != NULL, 4081 ("%s: VF is not attached", ifp->if_xname)); 4082 4083 /* Mark transparent mode VF as disabled. */ 4084 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4085 4086 /* 4087 * NOTE: 4088 * Datapath setting must happen _before_ bringing 4089 * the VF down. 4090 */ 4091 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4092 4093 /* 4094 * Bring the VF down. 4095 */ 4096 hn_xpnt_vf_saveifflags(sc); 4097 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4098 hn_xpnt_vf_iocsetflags(sc); 4099 } 4100 4101 /* Suspend data transfers. */ 4102 hn_suspend_data(sc); 4103 4104 /* Clear OACTIVE bit. */ 4105 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4106 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4107 sc->hn_tx_ring[i].hn_oactive = 0; 4108 4109 /* 4110 * If the non-transparent mode VF is active, make sure 4111 * that the RX filter still allows packet reception. 4112 */ 4113 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4114 hn_rxfilter_config(sc); 4115 } 4116 4117 static void 4118 hn_init_locked(struct hn_softc *sc) 4119 { 4120 struct ifnet *ifp = sc->hn_ifp; 4121 int i; 4122 4123 HN_LOCK_ASSERT(sc); 4124 4125 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4126 return; 4127 4128 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4129 return; 4130 4131 /* Configure RX filter */ 4132 hn_rxfilter_config(sc); 4133 4134 /* Clear OACTIVE bit. */ 4135 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4136 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4137 sc->hn_tx_ring[i].hn_oactive = 0; 4138 4139 /* Clear TX 'suspended' bit. */ 4140 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4141 4142 if (hn_xpnt_vf_isready(sc)) { 4143 /* Initialize transparent VF. */ 4144 hn_xpnt_vf_init(sc); 4145 } 4146 4147 /* Everything is ready; unleash! */ 4148 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4149 4150 /* Re-enable polling if requested. */ 4151 if (sc->hn_pollhz > 0) 4152 hn_polling(sc, sc->hn_pollhz); 4153 } 4154 4155 static void 4156 hn_init(void *xsc) 4157 { 4158 struct hn_softc *sc = xsc; 4159 4160 HN_LOCK(sc); 4161 hn_init_locked(sc); 4162 HN_UNLOCK(sc); 4163 } 4164 4165 #if __FreeBSD_version >= 1100099 4166 4167 static int 4168 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4169 { 4170 struct hn_softc *sc = arg1; 4171 unsigned int lenlim; 4172 int error; 4173 4174 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4175 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4176 if (error || req->newptr == NULL) 4177 return error; 4178 4179 HN_LOCK(sc); 4180 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4181 lenlim > TCP_LRO_LENGTH_MAX) { 4182 HN_UNLOCK(sc); 4183 return EINVAL; 4184 } 4185 hn_set_lro_lenlim(sc, lenlim); 4186 HN_UNLOCK(sc); 4187 4188 return 0; 4189 } 4190 4191 static int 4192 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4193 { 4194 struct hn_softc *sc = arg1; 4195 int ackcnt, error, i; 4196 4197 /* 4198 * lro_ackcnt_lim is append count limit, 4199 * +1 to turn it into aggregation limit. 4200 */ 4201 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4202 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4203 if (error || req->newptr == NULL) 4204 return error; 4205 4206 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4207 return EINVAL; 4208 4209 /* 4210 * Convert aggregation limit back to append 4211 * count limit. 4212 */ 4213 --ackcnt; 4214 HN_LOCK(sc); 4215 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4216 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4217 HN_UNLOCK(sc); 4218 return 0; 4219 } 4220 4221 #endif 4222 4223 static int 4224 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4225 { 4226 struct hn_softc *sc = arg1; 4227 int hcsum = arg2; 4228 int on, error, i; 4229 4230 on = 0; 4231 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4232 on = 1; 4233 4234 error = sysctl_handle_int(oidp, &on, 0, req); 4235 if (error || req->newptr == NULL) 4236 return error; 4237 4238 HN_LOCK(sc); 4239 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4240 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4241 4242 if (on) 4243 rxr->hn_trust_hcsum |= hcsum; 4244 else 4245 rxr->hn_trust_hcsum &= ~hcsum; 4246 } 4247 HN_UNLOCK(sc); 4248 return 0; 4249 } 4250 4251 static int 4252 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4253 { 4254 struct hn_softc *sc = arg1; 4255 int chim_size, error; 4256 4257 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4258 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4259 if (error || req->newptr == NULL) 4260 return error; 4261 4262 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4263 return EINVAL; 4264 4265 HN_LOCK(sc); 4266 hn_set_chim_size(sc, chim_size); 4267 HN_UNLOCK(sc); 4268 return 0; 4269 } 4270 4271 #if __FreeBSD_version < 1100095 4272 static int 4273 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4274 { 4275 struct hn_softc *sc = arg1; 4276 int ofs = arg2, i, error; 4277 struct hn_rx_ring *rxr; 4278 uint64_t stat; 4279 4280 stat = 0; 4281 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4282 rxr = &sc->hn_rx_ring[i]; 4283 stat += *((int *)((uint8_t *)rxr + ofs)); 4284 } 4285 4286 error = sysctl_handle_64(oidp, &stat, 0, req); 4287 if (error || req->newptr == NULL) 4288 return error; 4289 4290 /* Zero out this stat. */ 4291 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4292 rxr = &sc->hn_rx_ring[i]; 4293 *((int *)((uint8_t *)rxr + ofs)) = 0; 4294 } 4295 return 0; 4296 } 4297 #else 4298 static int 4299 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4300 { 4301 struct hn_softc *sc = arg1; 4302 int ofs = arg2, i, error; 4303 struct hn_rx_ring *rxr; 4304 uint64_t stat; 4305 4306 stat = 0; 4307 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4308 rxr = &sc->hn_rx_ring[i]; 4309 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4310 } 4311 4312 error = sysctl_handle_64(oidp, &stat, 0, req); 4313 if (error || req->newptr == NULL) 4314 return error; 4315 4316 /* Zero out this stat. */ 4317 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4318 rxr = &sc->hn_rx_ring[i]; 4319 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4320 } 4321 return 0; 4322 } 4323 4324 #endif 4325 4326 static int 4327 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4328 { 4329 struct hn_softc *sc = arg1; 4330 int ofs = arg2, i, error; 4331 struct hn_rx_ring *rxr; 4332 u_long stat; 4333 4334 stat = 0; 4335 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4336 rxr = &sc->hn_rx_ring[i]; 4337 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4338 } 4339 4340 error = sysctl_handle_long(oidp, &stat, 0, req); 4341 if (error || req->newptr == NULL) 4342 return error; 4343 4344 /* Zero out this stat. */ 4345 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4346 rxr = &sc->hn_rx_ring[i]; 4347 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4348 } 4349 return 0; 4350 } 4351 4352 static int 4353 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4354 { 4355 struct hn_softc *sc = arg1; 4356 int ofs = arg2, i, error; 4357 struct hn_tx_ring *txr; 4358 u_long stat; 4359 4360 stat = 0; 4361 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4362 txr = &sc->hn_tx_ring[i]; 4363 stat += *((u_long *)((uint8_t *)txr + ofs)); 4364 } 4365 4366 error = sysctl_handle_long(oidp, &stat, 0, req); 4367 if (error || req->newptr == NULL) 4368 return error; 4369 4370 /* Zero out this stat. */ 4371 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4372 txr = &sc->hn_tx_ring[i]; 4373 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4374 } 4375 return 0; 4376 } 4377 4378 static int 4379 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4380 { 4381 struct hn_softc *sc = arg1; 4382 int ofs = arg2, i, error, conf; 4383 struct hn_tx_ring *txr; 4384 4385 txr = &sc->hn_tx_ring[0]; 4386 conf = *((int *)((uint8_t *)txr + ofs)); 4387 4388 error = sysctl_handle_int(oidp, &conf, 0, req); 4389 if (error || req->newptr == NULL) 4390 return error; 4391 4392 HN_LOCK(sc); 4393 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4394 txr = &sc->hn_tx_ring[i]; 4395 *((int *)((uint8_t *)txr + ofs)) = conf; 4396 } 4397 HN_UNLOCK(sc); 4398 4399 return 0; 4400 } 4401 4402 static int 4403 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4404 { 4405 struct hn_softc *sc = arg1; 4406 int error, size; 4407 4408 size = sc->hn_agg_size; 4409 error = sysctl_handle_int(oidp, &size, 0, req); 4410 if (error || req->newptr == NULL) 4411 return (error); 4412 4413 HN_LOCK(sc); 4414 sc->hn_agg_size = size; 4415 hn_set_txagg(sc); 4416 HN_UNLOCK(sc); 4417 4418 return (0); 4419 } 4420 4421 static int 4422 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4423 { 4424 struct hn_softc *sc = arg1; 4425 int error, pkts; 4426 4427 pkts = sc->hn_agg_pkts; 4428 error = sysctl_handle_int(oidp, &pkts, 0, req); 4429 if (error || req->newptr == NULL) 4430 return (error); 4431 4432 HN_LOCK(sc); 4433 sc->hn_agg_pkts = pkts; 4434 hn_set_txagg(sc); 4435 HN_UNLOCK(sc); 4436 4437 return (0); 4438 } 4439 4440 static int 4441 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4442 { 4443 struct hn_softc *sc = arg1; 4444 int pkts; 4445 4446 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4447 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4448 } 4449 4450 static int 4451 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4452 { 4453 struct hn_softc *sc = arg1; 4454 int align; 4455 4456 align = sc->hn_tx_ring[0].hn_agg_align; 4457 return (sysctl_handle_int(oidp, &align, 0, req)); 4458 } 4459 4460 static void 4461 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4462 { 4463 if (pollhz == 0) 4464 vmbus_chan_poll_disable(chan); 4465 else 4466 vmbus_chan_poll_enable(chan, pollhz); 4467 } 4468 4469 static void 4470 hn_polling(struct hn_softc *sc, u_int pollhz) 4471 { 4472 int nsubch = sc->hn_rx_ring_inuse - 1; 4473 4474 HN_LOCK_ASSERT(sc); 4475 4476 if (nsubch > 0) { 4477 struct vmbus_channel **subch; 4478 int i; 4479 4480 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4481 for (i = 0; i < nsubch; ++i) 4482 hn_chan_polling(subch[i], pollhz); 4483 vmbus_subchan_rel(subch, nsubch); 4484 } 4485 hn_chan_polling(sc->hn_prichan, pollhz); 4486 } 4487 4488 static int 4489 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4490 { 4491 struct hn_softc *sc = arg1; 4492 int pollhz, error; 4493 4494 pollhz = sc->hn_pollhz; 4495 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4496 if (error || req->newptr == NULL) 4497 return (error); 4498 4499 if (pollhz != 0 && 4500 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4501 return (EINVAL); 4502 4503 HN_LOCK(sc); 4504 if (sc->hn_pollhz != pollhz) { 4505 sc->hn_pollhz = pollhz; 4506 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4507 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4508 hn_polling(sc, sc->hn_pollhz); 4509 } 4510 HN_UNLOCK(sc); 4511 4512 return (0); 4513 } 4514 4515 static int 4516 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4517 { 4518 struct hn_softc *sc = arg1; 4519 char verstr[16]; 4520 4521 snprintf(verstr, sizeof(verstr), "%u.%u", 4522 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4523 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4524 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4525 } 4526 4527 static int 4528 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4529 { 4530 struct hn_softc *sc = arg1; 4531 char caps_str[128]; 4532 uint32_t caps; 4533 4534 HN_LOCK(sc); 4535 caps = sc->hn_caps; 4536 HN_UNLOCK(sc); 4537 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4538 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4539 } 4540 4541 static int 4542 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4543 { 4544 struct hn_softc *sc = arg1; 4545 char assist_str[128]; 4546 uint32_t hwassist; 4547 4548 HN_LOCK(sc); 4549 hwassist = sc->hn_ifp->if_hwassist; 4550 HN_UNLOCK(sc); 4551 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4552 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4553 } 4554 4555 static int 4556 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4557 { 4558 struct hn_softc *sc = arg1; 4559 char filter_str[128]; 4560 uint32_t filter; 4561 4562 HN_LOCK(sc); 4563 filter = sc->hn_rx_filter; 4564 HN_UNLOCK(sc); 4565 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4566 NDIS_PACKET_TYPES); 4567 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4568 } 4569 4570 #ifndef RSS 4571 4572 static int 4573 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4574 { 4575 struct hn_softc *sc = arg1; 4576 int error; 4577 4578 HN_LOCK(sc); 4579 4580 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4581 if (error || req->newptr == NULL) 4582 goto back; 4583 4584 if ((sc->hn_flags & HN_FLAG_RXVF) || 4585 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4586 /* 4587 * RSS key is synchronized w/ VF's, don't allow users 4588 * to change it. 4589 */ 4590 error = EBUSY; 4591 goto back; 4592 } 4593 4594 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4595 if (error) 4596 goto back; 4597 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4598 4599 if (sc->hn_rx_ring_inuse > 1) { 4600 error = hn_rss_reconfig(sc); 4601 } else { 4602 /* Not RSS capable, at least for now; just save the RSS key. */ 4603 error = 0; 4604 } 4605 back: 4606 HN_UNLOCK(sc); 4607 return (error); 4608 } 4609 4610 static int 4611 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4612 { 4613 struct hn_softc *sc = arg1; 4614 int error; 4615 4616 HN_LOCK(sc); 4617 4618 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4619 if (error || req->newptr == NULL) 4620 goto back; 4621 4622 /* 4623 * Don't allow RSS indirect table change, if this interface is not 4624 * RSS capable currently. 4625 */ 4626 if (sc->hn_rx_ring_inuse == 1) { 4627 error = EOPNOTSUPP; 4628 goto back; 4629 } 4630 4631 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4632 if (error) 4633 goto back; 4634 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4635 4636 hn_rss_ind_fixup(sc); 4637 error = hn_rss_reconfig(sc); 4638 back: 4639 HN_UNLOCK(sc); 4640 return (error); 4641 } 4642 4643 #endif /* !RSS */ 4644 4645 static int 4646 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4647 { 4648 struct hn_softc *sc = arg1; 4649 char hash_str[128]; 4650 uint32_t hash; 4651 4652 HN_LOCK(sc); 4653 hash = sc->hn_rss_hash; 4654 HN_UNLOCK(sc); 4655 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4656 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4657 } 4658 4659 static int 4660 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4661 { 4662 struct hn_softc *sc = arg1; 4663 char hash_str[128]; 4664 uint32_t hash; 4665 4666 HN_LOCK(sc); 4667 hash = sc->hn_rss_hcap; 4668 HN_UNLOCK(sc); 4669 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4670 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4671 } 4672 4673 static int 4674 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4675 { 4676 struct hn_softc *sc = arg1; 4677 char hash_str[128]; 4678 uint32_t hash; 4679 4680 HN_LOCK(sc); 4681 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4682 HN_UNLOCK(sc); 4683 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4684 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4685 } 4686 4687 static int 4688 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4689 { 4690 struct hn_softc *sc = arg1; 4691 char vf_name[IFNAMSIZ + 1]; 4692 struct ifnet *vf_ifp; 4693 4694 HN_LOCK(sc); 4695 vf_name[0] = '\0'; 4696 vf_ifp = sc->hn_vf_ifp; 4697 if (vf_ifp != NULL) 4698 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4699 HN_UNLOCK(sc); 4700 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4701 } 4702 4703 static int 4704 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4705 { 4706 struct hn_softc *sc = arg1; 4707 char vf_name[IFNAMSIZ + 1]; 4708 struct ifnet *vf_ifp; 4709 4710 HN_LOCK(sc); 4711 vf_name[0] = '\0'; 4712 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4713 if (vf_ifp != NULL) 4714 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4715 HN_UNLOCK(sc); 4716 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4717 } 4718 4719 static int 4720 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4721 { 4722 struct rm_priotracker pt; 4723 struct sbuf *sb; 4724 int error, i; 4725 bool first; 4726 4727 error = sysctl_wire_old_buffer(req, 0); 4728 if (error != 0) 4729 return (error); 4730 4731 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4732 if (sb == NULL) 4733 return (ENOMEM); 4734 4735 rm_rlock(&hn_vfmap_lock, &pt); 4736 4737 first = true; 4738 for (i = 0; i < hn_vfmap_size; ++i) { 4739 struct ifnet *ifp; 4740 4741 if (hn_vfmap[i] == NULL) 4742 continue; 4743 4744 ifp = ifnet_byindex(i); 4745 if (ifp != NULL) { 4746 if (first) 4747 sbuf_printf(sb, "%s", ifp->if_xname); 4748 else 4749 sbuf_printf(sb, " %s", ifp->if_xname); 4750 first = false; 4751 } 4752 } 4753 4754 rm_runlock(&hn_vfmap_lock, &pt); 4755 4756 error = sbuf_finish(sb); 4757 sbuf_delete(sb); 4758 return (error); 4759 } 4760 4761 static int 4762 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4763 { 4764 struct rm_priotracker pt; 4765 struct sbuf *sb; 4766 int error, i; 4767 bool first; 4768 4769 error = sysctl_wire_old_buffer(req, 0); 4770 if (error != 0) 4771 return (error); 4772 4773 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4774 if (sb == NULL) 4775 return (ENOMEM); 4776 4777 rm_rlock(&hn_vfmap_lock, &pt); 4778 4779 first = true; 4780 for (i = 0; i < hn_vfmap_size; ++i) { 4781 struct ifnet *ifp, *hn_ifp; 4782 4783 hn_ifp = hn_vfmap[i]; 4784 if (hn_ifp == NULL) 4785 continue; 4786 4787 ifp = ifnet_byindex(i); 4788 if (ifp != NULL) { 4789 if (first) { 4790 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4791 hn_ifp->if_xname); 4792 } else { 4793 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4794 hn_ifp->if_xname); 4795 } 4796 first = false; 4797 } 4798 } 4799 4800 rm_runlock(&hn_vfmap_lock, &pt); 4801 4802 error = sbuf_finish(sb); 4803 sbuf_delete(sb); 4804 return (error); 4805 } 4806 4807 static int 4808 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4809 { 4810 struct hn_softc *sc = arg1; 4811 int error, onoff = 0; 4812 4813 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4814 onoff = 1; 4815 error = sysctl_handle_int(oidp, &onoff, 0, req); 4816 if (error || req->newptr == NULL) 4817 return (error); 4818 4819 HN_LOCK(sc); 4820 /* NOTE: hn_vf_lock for hn_transmit() */ 4821 rm_wlock(&sc->hn_vf_lock); 4822 if (onoff) 4823 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4824 else 4825 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4826 rm_wunlock(&sc->hn_vf_lock); 4827 HN_UNLOCK(sc); 4828 4829 return (0); 4830 } 4831 4832 static int 4833 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4834 { 4835 struct hn_softc *sc = arg1; 4836 int enabled = 0; 4837 4838 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4839 enabled = 1; 4840 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4841 } 4842 4843 static int 4844 hn_check_iplen(const struct mbuf *m, int hoff) 4845 { 4846 const struct ip *ip; 4847 int len, iphlen, iplen; 4848 const struct tcphdr *th; 4849 int thoff; /* TCP data offset */ 4850 4851 len = hoff + sizeof(struct ip); 4852 4853 /* The packet must be at least the size of an IP header. */ 4854 if (m->m_pkthdr.len < len) 4855 return IPPROTO_DONE; 4856 4857 /* The fixed IP header must reside completely in the first mbuf. */ 4858 if (m->m_len < len) 4859 return IPPROTO_DONE; 4860 4861 ip = mtodo(m, hoff); 4862 4863 /* Bound check the packet's stated IP header length. */ 4864 iphlen = ip->ip_hl << 2; 4865 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4866 return IPPROTO_DONE; 4867 4868 /* The full IP header must reside completely in the one mbuf. */ 4869 if (m->m_len < hoff + iphlen) 4870 return IPPROTO_DONE; 4871 4872 iplen = ntohs(ip->ip_len); 4873 4874 /* 4875 * Check that the amount of data in the buffers is as 4876 * at least much as the IP header would have us expect. 4877 */ 4878 if (m->m_pkthdr.len < hoff + iplen) 4879 return IPPROTO_DONE; 4880 4881 /* 4882 * Ignore IP fragments. 4883 */ 4884 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4885 return IPPROTO_DONE; 4886 4887 /* 4888 * The TCP/IP or UDP/IP header must be entirely contained within 4889 * the first fragment of a packet. 4890 */ 4891 switch (ip->ip_p) { 4892 case IPPROTO_TCP: 4893 if (iplen < iphlen + sizeof(struct tcphdr)) 4894 return IPPROTO_DONE; 4895 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4896 return IPPROTO_DONE; 4897 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4898 thoff = th->th_off << 2; 4899 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4900 return IPPROTO_DONE; 4901 if (m->m_len < hoff + iphlen + thoff) 4902 return IPPROTO_DONE; 4903 break; 4904 case IPPROTO_UDP: 4905 if (iplen < iphlen + sizeof(struct udphdr)) 4906 return IPPROTO_DONE; 4907 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4908 return IPPROTO_DONE; 4909 break; 4910 default: 4911 if (iplen < iphlen) 4912 return IPPROTO_DONE; 4913 break; 4914 } 4915 return ip->ip_p; 4916 } 4917 4918 static void 4919 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4920 { 4921 const struct ether_header *eh; 4922 uint16_t etype; 4923 int hoff; 4924 4925 hoff = sizeof(*eh); 4926 /* Checked at the beginning of this function. */ 4927 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4928 4929 eh = mtod(m_new, const struct ether_header *); 4930 etype = ntohs(eh->ether_type); 4931 if (etype == ETHERTYPE_VLAN) { 4932 const struct ether_vlan_header *evl; 4933 4934 hoff = sizeof(*evl); 4935 if (m_new->m_len < hoff) 4936 return; 4937 evl = mtod(m_new, const struct ether_vlan_header *); 4938 etype = ntohs(evl->evl_proto); 4939 } 4940 *l3proto = etype; 4941 4942 if (etype == ETHERTYPE_IP) 4943 *l4proto = hn_check_iplen(m_new, hoff); 4944 else 4945 *l4proto = IPPROTO_DONE; 4946 } 4947 4948 static int 4949 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4950 { 4951 struct sysctl_oid_list *child; 4952 struct sysctl_ctx_list *ctx; 4953 device_t dev = sc->hn_dev; 4954 #if defined(INET) || defined(INET6) 4955 #if __FreeBSD_version >= 1100095 4956 int lroent_cnt; 4957 #endif 4958 #endif 4959 int i; 4960 4961 /* 4962 * Create RXBUF for reception. 4963 * 4964 * NOTE: 4965 * - It is shared by all channels. 4966 * - A large enough buffer is allocated, certain version of NVSes 4967 * may further limit the usable space. 4968 */ 4969 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4970 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4971 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4972 if (sc->hn_rxbuf == NULL) { 4973 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4974 return (ENOMEM); 4975 } 4976 4977 sc->hn_rx_ring_cnt = ring_cnt; 4978 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4979 4980 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4981 M_DEVBUF, M_WAITOK | M_ZERO); 4982 4983 #if defined(INET) || defined(INET6) 4984 #if __FreeBSD_version >= 1100095 4985 lroent_cnt = hn_lro_entry_count; 4986 if (lroent_cnt < TCP_LRO_ENTRIES) 4987 lroent_cnt = TCP_LRO_ENTRIES; 4988 if (bootverbose) 4989 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4990 #endif 4991 #endif /* INET || INET6 */ 4992 4993 ctx = device_get_sysctl_ctx(dev); 4994 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4995 4996 /* Create dev.hn.UNIT.rx sysctl tree */ 4997 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4998 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4999 5000 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5001 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5002 5003 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5004 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5005 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5006 if (rxr->hn_br == NULL) { 5007 device_printf(dev, "allocate bufring failed\n"); 5008 return (ENOMEM); 5009 } 5010 5011 if (hn_trust_hosttcp) 5012 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5013 if (hn_trust_hostudp) 5014 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5015 if (hn_trust_hostip) 5016 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5017 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5018 rxr->hn_ifp = sc->hn_ifp; 5019 if (i < sc->hn_tx_ring_cnt) 5020 rxr->hn_txr = &sc->hn_tx_ring[i]; 5021 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5022 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5023 rxr->hn_rx_idx = i; 5024 rxr->hn_rxbuf = sc->hn_rxbuf; 5025 5026 /* 5027 * Initialize LRO. 5028 */ 5029 #if defined(INET) || defined(INET6) 5030 #if __FreeBSD_version >= 1100095 5031 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5032 hn_lro_mbufq_depth); 5033 #else 5034 tcp_lro_init(&rxr->hn_lro); 5035 rxr->hn_lro.ifp = sc->hn_ifp; 5036 #endif 5037 #if __FreeBSD_version >= 1100099 5038 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5039 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5040 #endif 5041 #endif /* INET || INET6 */ 5042 5043 if (sc->hn_rx_sysctl_tree != NULL) { 5044 char name[16]; 5045 5046 /* 5047 * Create per RX ring sysctl tree: 5048 * dev.hn.UNIT.rx.RINGID 5049 */ 5050 snprintf(name, sizeof(name), "%d", i); 5051 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5052 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5053 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5054 5055 if (rxr->hn_rx_sysctl_tree != NULL) { 5056 SYSCTL_ADD_ULONG(ctx, 5057 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5058 OID_AUTO, "packets", 5059 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5060 "# of packets received"); 5061 SYSCTL_ADD_ULONG(ctx, 5062 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5063 OID_AUTO, "rss_pkts", 5064 CTLFLAG_RW | CTLFLAG_STATS, 5065 &rxr->hn_rss_pkts, 5066 "# of packets w/ RSS info received"); 5067 SYSCTL_ADD_ULONG(ctx, 5068 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5069 OID_AUTO, "rsc_pkts", 5070 CTLFLAG_RW | CTLFLAG_STATS, 5071 &rxr->hn_rsc_pkts, 5072 "# of RSC packets received"); 5073 SYSCTL_ADD_ULONG(ctx, 5074 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5075 OID_AUTO, "rsc_drop", 5076 CTLFLAG_RW | CTLFLAG_STATS, 5077 &rxr->hn_rsc_drop, 5078 "# of RSC fragments dropped"); 5079 SYSCTL_ADD_INT(ctx, 5080 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5081 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5082 &rxr->hn_pktbuf_len, 0, 5083 "Temporary channel packet buffer length"); 5084 } 5085 } 5086 } 5087 5088 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5089 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5090 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5091 #if __FreeBSD_version < 1100095 5092 hn_rx_stat_int_sysctl, 5093 #else 5094 hn_rx_stat_u64_sysctl, 5095 #endif 5096 "LU", "LRO queued"); 5097 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5098 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5099 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5100 #if __FreeBSD_version < 1100095 5101 hn_rx_stat_int_sysctl, 5102 #else 5103 hn_rx_stat_u64_sysctl, 5104 #endif 5105 "LU", "LRO flushed"); 5106 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5107 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5108 __offsetof(struct hn_rx_ring, hn_lro_tried), 5109 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5110 #if __FreeBSD_version >= 1100099 5111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5112 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5113 hn_lro_lenlim_sysctl, "IU", 5114 "Max # of data bytes to be aggregated by LRO"); 5115 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5116 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5117 hn_lro_ackcnt_sysctl, "I", 5118 "Max # of ACKs to be aggregated by LRO"); 5119 #endif 5120 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5121 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5122 hn_trust_hcsum_sysctl, "I", 5123 "Trust tcp segment verification on host side, " 5124 "when csum info is missing"); 5125 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5126 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5127 hn_trust_hcsum_sysctl, "I", 5128 "Trust udp datagram verification on host side, " 5129 "when csum info is missing"); 5130 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5131 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5132 hn_trust_hcsum_sysctl, "I", 5133 "Trust ip packet verification on host side, " 5134 "when csum info is missing"); 5135 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5136 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5137 __offsetof(struct hn_rx_ring, hn_csum_ip), 5138 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5139 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5140 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5141 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5142 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5143 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5144 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5145 __offsetof(struct hn_rx_ring, hn_csum_udp), 5146 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5147 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5148 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5149 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5150 hn_rx_stat_ulong_sysctl, "LU", 5151 "# of packets that we trust host's csum verification"); 5152 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5153 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5154 __offsetof(struct hn_rx_ring, hn_small_pkts), 5155 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5156 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5157 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5158 __offsetof(struct hn_rx_ring, hn_ack_failed), 5159 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5160 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5161 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5162 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5163 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5164 5165 return (0); 5166 } 5167 5168 static void 5169 hn_destroy_rx_data(struct hn_softc *sc) 5170 { 5171 int i; 5172 5173 if (sc->hn_rxbuf != NULL) { 5174 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5175 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5176 else 5177 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5178 sc->hn_rxbuf = NULL; 5179 } 5180 5181 if (sc->hn_rx_ring_cnt == 0) 5182 return; 5183 5184 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5185 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5186 5187 if (rxr->hn_br == NULL) 5188 continue; 5189 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5190 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5191 } else { 5192 device_printf(sc->hn_dev, 5193 "%dth channel bufring is referenced", i); 5194 } 5195 rxr->hn_br = NULL; 5196 5197 #if defined(INET) || defined(INET6) 5198 tcp_lro_free(&rxr->hn_lro); 5199 #endif 5200 free(rxr->hn_pktbuf, M_DEVBUF); 5201 } 5202 free(sc->hn_rx_ring, M_DEVBUF); 5203 sc->hn_rx_ring = NULL; 5204 5205 sc->hn_rx_ring_cnt = 0; 5206 sc->hn_rx_ring_inuse = 0; 5207 } 5208 5209 static int 5210 hn_tx_ring_create(struct hn_softc *sc, int id) 5211 { 5212 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5213 device_t dev = sc->hn_dev; 5214 bus_dma_tag_t parent_dtag; 5215 int error, i; 5216 5217 txr->hn_sc = sc; 5218 txr->hn_tx_idx = id; 5219 5220 #ifndef HN_USE_TXDESC_BUFRING 5221 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5222 #endif 5223 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5224 5225 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5226 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5227 M_DEVBUF, M_WAITOK | M_ZERO); 5228 #ifndef HN_USE_TXDESC_BUFRING 5229 SLIST_INIT(&txr->hn_txlist); 5230 #else 5231 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5232 M_WAITOK, &txr->hn_tx_lock); 5233 #endif 5234 5235 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5236 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5237 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5238 } else { 5239 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5240 } 5241 5242 #ifdef HN_IFSTART_SUPPORT 5243 if (hn_use_if_start) { 5244 txr->hn_txeof = hn_start_txeof; 5245 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5246 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5247 } else 5248 #endif 5249 { 5250 int br_depth; 5251 5252 txr->hn_txeof = hn_xmit_txeof; 5253 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5254 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5255 5256 br_depth = hn_get_txswq_depth(txr); 5257 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5258 M_WAITOK, &txr->hn_tx_lock); 5259 } 5260 5261 txr->hn_direct_tx_size = hn_direct_tx_size; 5262 5263 /* 5264 * Always schedule transmission instead of trying to do direct 5265 * transmission. This one gives the best performance so far. 5266 */ 5267 txr->hn_sched_tx = 1; 5268 5269 parent_dtag = bus_get_dma_tag(dev); 5270 5271 /* DMA tag for RNDIS packet messages. */ 5272 error = bus_dma_tag_create(parent_dtag, /* parent */ 5273 HN_RNDIS_PKT_ALIGN, /* alignment */ 5274 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5275 BUS_SPACE_MAXADDR, /* lowaddr */ 5276 BUS_SPACE_MAXADDR, /* highaddr */ 5277 NULL, NULL, /* filter, filterarg */ 5278 HN_RNDIS_PKT_LEN, /* maxsize */ 5279 1, /* nsegments */ 5280 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5281 0, /* flags */ 5282 NULL, /* lockfunc */ 5283 NULL, /* lockfuncarg */ 5284 &txr->hn_tx_rndis_dtag); 5285 if (error) { 5286 device_printf(dev, "failed to create rndis dmatag\n"); 5287 return error; 5288 } 5289 5290 /* DMA tag for data. */ 5291 error = bus_dma_tag_create(parent_dtag, /* parent */ 5292 1, /* alignment */ 5293 HN_TX_DATA_BOUNDARY, /* boundary */ 5294 BUS_SPACE_MAXADDR, /* lowaddr */ 5295 BUS_SPACE_MAXADDR, /* highaddr */ 5296 NULL, NULL, /* filter, filterarg */ 5297 HN_TX_DATA_MAXSIZE, /* maxsize */ 5298 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5299 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5300 0, /* flags */ 5301 NULL, /* lockfunc */ 5302 NULL, /* lockfuncarg */ 5303 &txr->hn_tx_data_dtag); 5304 if (error) { 5305 device_printf(dev, "failed to create data dmatag\n"); 5306 return error; 5307 } 5308 5309 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5310 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5311 5312 txd->txr = txr; 5313 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5314 STAILQ_INIT(&txd->agg_list); 5315 5316 /* 5317 * Allocate and load RNDIS packet message. 5318 */ 5319 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5320 (void **)&txd->rndis_pkt, 5321 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5322 &txd->rndis_pkt_dmap); 5323 if (error) { 5324 device_printf(dev, 5325 "failed to allocate rndis_packet_msg, %d\n", i); 5326 return error; 5327 } 5328 5329 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5330 txd->rndis_pkt_dmap, 5331 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5332 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5333 BUS_DMA_NOWAIT); 5334 if (error) { 5335 device_printf(dev, 5336 "failed to load rndis_packet_msg, %d\n", i); 5337 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5338 txd->rndis_pkt, txd->rndis_pkt_dmap); 5339 return error; 5340 } 5341 5342 /* DMA map for TX data. */ 5343 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5344 &txd->data_dmap); 5345 if (error) { 5346 device_printf(dev, 5347 "failed to allocate tx data dmamap\n"); 5348 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5349 txd->rndis_pkt_dmap); 5350 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5351 txd->rndis_pkt, txd->rndis_pkt_dmap); 5352 return error; 5353 } 5354 5355 /* All set, put it to list */ 5356 txd->flags |= HN_TXD_FLAG_ONLIST; 5357 #ifndef HN_USE_TXDESC_BUFRING 5358 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5359 #else 5360 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5361 #endif 5362 } 5363 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5364 5365 if (sc->hn_tx_sysctl_tree != NULL) { 5366 struct sysctl_oid_list *child; 5367 struct sysctl_ctx_list *ctx; 5368 char name[16]; 5369 5370 /* 5371 * Create per TX ring sysctl tree: 5372 * dev.hn.UNIT.tx.RINGID 5373 */ 5374 ctx = device_get_sysctl_ctx(dev); 5375 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5376 5377 snprintf(name, sizeof(name), "%d", id); 5378 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5379 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5380 5381 if (txr->hn_tx_sysctl_tree != NULL) { 5382 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5383 5384 #ifdef HN_DEBUG 5385 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5386 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5387 "# of available TX descs"); 5388 #endif 5389 #ifdef HN_IFSTART_SUPPORT 5390 if (!hn_use_if_start) 5391 #endif 5392 { 5393 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5394 CTLFLAG_RD, &txr->hn_oactive, 0, 5395 "over active"); 5396 } 5397 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5398 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5399 "# of packets transmitted"); 5400 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5401 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5402 "# of sends"); 5403 } 5404 } 5405 5406 return 0; 5407 } 5408 5409 static void 5410 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5411 { 5412 struct hn_tx_ring *txr = txd->txr; 5413 5414 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5415 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5416 5417 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5418 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5419 txd->rndis_pkt_dmap); 5420 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5421 } 5422 5423 static void 5424 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5425 { 5426 5427 KASSERT(txd->refs == 0 || txd->refs == 1, 5428 ("invalid txd refs %d", txd->refs)); 5429 5430 /* Aggregated txds will be freed by their aggregating txd. */ 5431 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5432 int freed; 5433 5434 freed = hn_txdesc_put(txr, txd); 5435 KASSERT(freed, ("can't free txdesc")); 5436 } 5437 } 5438 5439 static void 5440 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5441 { 5442 int i; 5443 5444 if (txr->hn_txdesc == NULL) 5445 return; 5446 5447 /* 5448 * NOTE: 5449 * Because the freeing of aggregated txds will be deferred 5450 * to the aggregating txd, two passes are used here: 5451 * - The first pass GCes any pending txds. This GC is necessary, 5452 * since if the channels are revoked, hypervisor will not 5453 * deliver send-done for all pending txds. 5454 * - The second pass frees the busdma stuffs, i.e. after all txds 5455 * were freed. 5456 */ 5457 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5458 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5459 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5460 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5461 5462 if (txr->hn_tx_data_dtag != NULL) 5463 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5464 if (txr->hn_tx_rndis_dtag != NULL) 5465 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5466 5467 #ifdef HN_USE_TXDESC_BUFRING 5468 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5469 #endif 5470 5471 free(txr->hn_txdesc, M_DEVBUF); 5472 txr->hn_txdesc = NULL; 5473 5474 if (txr->hn_mbuf_br != NULL) 5475 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5476 5477 #ifndef HN_USE_TXDESC_BUFRING 5478 mtx_destroy(&txr->hn_txlist_spin); 5479 #endif 5480 mtx_destroy(&txr->hn_tx_lock); 5481 } 5482 5483 static int 5484 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5485 { 5486 struct sysctl_oid_list *child; 5487 struct sysctl_ctx_list *ctx; 5488 int i; 5489 5490 /* 5491 * Create TXBUF for chimney sending. 5492 * 5493 * NOTE: It is shared by all channels. 5494 */ 5495 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5496 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5497 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5498 if (sc->hn_chim == NULL) { 5499 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5500 return (ENOMEM); 5501 } 5502 5503 sc->hn_tx_ring_cnt = ring_cnt; 5504 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5505 5506 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5507 M_DEVBUF, M_WAITOK | M_ZERO); 5508 5509 ctx = device_get_sysctl_ctx(sc->hn_dev); 5510 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5511 5512 /* Create dev.hn.UNIT.tx sysctl tree */ 5513 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5514 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5515 5516 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5517 int error; 5518 5519 error = hn_tx_ring_create(sc, i); 5520 if (error) 5521 return error; 5522 } 5523 5524 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5525 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5526 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5527 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5528 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5529 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5530 __offsetof(struct hn_tx_ring, hn_send_failed), 5531 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5532 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5533 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5534 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5535 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5536 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5537 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5538 __offsetof(struct hn_tx_ring, hn_flush_failed), 5539 hn_tx_stat_ulong_sysctl, "LU", 5540 "# of packet transmission aggregation flush failure"); 5541 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5542 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5543 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5544 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5545 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5546 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5547 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5548 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5549 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5550 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5551 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5552 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5553 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5554 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5555 "# of total TX descs"); 5556 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5557 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5558 "Chimney send packet size upper boundary"); 5559 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5560 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5561 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5562 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5563 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5564 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5565 hn_tx_conf_int_sysctl, "I", 5566 "Size of the packet for direct transmission"); 5567 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5568 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5569 __offsetof(struct hn_tx_ring, hn_sched_tx), 5570 hn_tx_conf_int_sysctl, "I", 5571 "Always schedule transmission " 5572 "instead of doing direct transmission"); 5573 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5574 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5575 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5576 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5577 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5578 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5579 "Applied packet transmission aggregation size"); 5580 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5581 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5582 hn_txagg_pktmax_sysctl, "I", 5583 "Applied packet transmission aggregation packets"); 5584 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5585 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5586 hn_txagg_align_sysctl, "I", 5587 "Applied packet transmission aggregation alignment"); 5588 5589 return 0; 5590 } 5591 5592 static void 5593 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5594 { 5595 int i; 5596 5597 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5598 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5599 } 5600 5601 static void 5602 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5603 { 5604 struct ifnet *ifp = sc->hn_ifp; 5605 u_int hw_tsomax; 5606 int tso_minlen; 5607 5608 HN_LOCK_ASSERT(sc); 5609 5610 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5611 return; 5612 5613 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5614 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5615 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5616 5617 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5618 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5619 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5620 5621 if (tso_maxlen < tso_minlen) 5622 tso_maxlen = tso_minlen; 5623 else if (tso_maxlen > IP_MAXPACKET) 5624 tso_maxlen = IP_MAXPACKET; 5625 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5626 tso_maxlen = sc->hn_ndis_tso_szmax; 5627 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5628 5629 if (hn_xpnt_vf_isready(sc)) { 5630 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5631 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5632 } 5633 ifp->if_hw_tsomax = hw_tsomax; 5634 if (bootverbose) 5635 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5636 } 5637 5638 static void 5639 hn_fixup_tx_data(struct hn_softc *sc) 5640 { 5641 uint64_t csum_assist; 5642 int i; 5643 5644 hn_set_chim_size(sc, sc->hn_chim_szmax); 5645 if (hn_tx_chimney_size > 0 && 5646 hn_tx_chimney_size < sc->hn_chim_szmax) 5647 hn_set_chim_size(sc, hn_tx_chimney_size); 5648 5649 csum_assist = 0; 5650 if (sc->hn_caps & HN_CAP_IPCS) 5651 csum_assist |= CSUM_IP; 5652 if (sc->hn_caps & HN_CAP_TCP4CS) 5653 csum_assist |= CSUM_IP_TCP; 5654 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5655 csum_assist |= CSUM_IP_UDP; 5656 if (sc->hn_caps & HN_CAP_TCP6CS) 5657 csum_assist |= CSUM_IP6_TCP; 5658 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5659 csum_assist |= CSUM_IP6_UDP; 5660 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5661 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5662 5663 if (sc->hn_caps & HN_CAP_HASHVAL) { 5664 /* 5665 * Support HASHVAL pktinfo on TX path. 5666 */ 5667 if (bootverbose) 5668 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5669 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5670 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5671 } 5672 } 5673 5674 static void 5675 hn_fixup_rx_data(struct hn_softc *sc) 5676 { 5677 5678 if (sc->hn_caps & HN_CAP_UDPHASH) { 5679 int i; 5680 5681 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5682 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5683 } 5684 } 5685 5686 static void 5687 hn_destroy_tx_data(struct hn_softc *sc) 5688 { 5689 int i; 5690 5691 if (sc->hn_chim != NULL) { 5692 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5693 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5694 } else { 5695 device_printf(sc->hn_dev, 5696 "chimney sending buffer is referenced"); 5697 } 5698 sc->hn_chim = NULL; 5699 } 5700 5701 if (sc->hn_tx_ring_cnt == 0) 5702 return; 5703 5704 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5705 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5706 5707 free(sc->hn_tx_ring, M_DEVBUF); 5708 sc->hn_tx_ring = NULL; 5709 5710 sc->hn_tx_ring_cnt = 0; 5711 sc->hn_tx_ring_inuse = 0; 5712 } 5713 5714 #ifdef HN_IFSTART_SUPPORT 5715 5716 static void 5717 hn_start_taskfunc(void *xtxr, int pending __unused) 5718 { 5719 struct hn_tx_ring *txr = xtxr; 5720 5721 mtx_lock(&txr->hn_tx_lock); 5722 hn_start_locked(txr, 0); 5723 mtx_unlock(&txr->hn_tx_lock); 5724 } 5725 5726 static int 5727 hn_start_locked(struct hn_tx_ring *txr, int len) 5728 { 5729 struct hn_softc *sc = txr->hn_sc; 5730 struct ifnet *ifp = sc->hn_ifp; 5731 int sched = 0; 5732 5733 KASSERT(hn_use_if_start, 5734 ("hn_start_locked is called, when if_start is disabled")); 5735 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5736 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5737 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5738 5739 if (__predict_false(txr->hn_suspended)) 5740 return (0); 5741 5742 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5743 IFF_DRV_RUNNING) 5744 return (0); 5745 5746 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5747 struct hn_txdesc *txd; 5748 struct mbuf *m_head; 5749 int error; 5750 5751 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5752 if (m_head == NULL) 5753 break; 5754 5755 if (len > 0 && m_head->m_pkthdr.len > len) { 5756 /* 5757 * This sending could be time consuming; let callers 5758 * dispatch this packet sending (and sending of any 5759 * following up packets) to tx taskqueue. 5760 */ 5761 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5762 sched = 1; 5763 break; 5764 } 5765 5766 #if defined(INET6) || defined(INET) 5767 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5768 m_head = hn_tso_fixup(m_head); 5769 if (__predict_false(m_head == NULL)) { 5770 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5771 continue; 5772 } 5773 } else if (m_head->m_pkthdr.csum_flags & 5774 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5775 m_head = hn_set_hlen(m_head); 5776 if (__predict_false(m_head == NULL)) { 5777 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5778 continue; 5779 } 5780 } 5781 #endif 5782 5783 txd = hn_txdesc_get(txr); 5784 if (txd == NULL) { 5785 txr->hn_no_txdescs++; 5786 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5787 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5788 break; 5789 } 5790 5791 error = hn_encap(ifp, txr, txd, &m_head); 5792 if (error) { 5793 /* Both txd and m_head are freed */ 5794 KASSERT(txr->hn_agg_txd == NULL, 5795 ("encap failed w/ pending aggregating txdesc")); 5796 continue; 5797 } 5798 5799 if (txr->hn_agg_pktleft == 0) { 5800 if (txr->hn_agg_txd != NULL) { 5801 KASSERT(m_head == NULL, 5802 ("pending mbuf for aggregating txdesc")); 5803 error = hn_flush_txagg(ifp, txr); 5804 if (__predict_false(error)) { 5805 atomic_set_int(&ifp->if_drv_flags, 5806 IFF_DRV_OACTIVE); 5807 break; 5808 } 5809 } else { 5810 KASSERT(m_head != NULL, ("mbuf was freed")); 5811 error = hn_txpkt(ifp, txr, txd); 5812 if (__predict_false(error)) { 5813 /* txd is freed, but m_head is not */ 5814 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5815 atomic_set_int(&ifp->if_drv_flags, 5816 IFF_DRV_OACTIVE); 5817 break; 5818 } 5819 } 5820 } 5821 #ifdef INVARIANTS 5822 else { 5823 KASSERT(txr->hn_agg_txd != NULL, 5824 ("no aggregating txdesc")); 5825 KASSERT(m_head == NULL, 5826 ("pending mbuf for aggregating txdesc")); 5827 } 5828 #endif 5829 } 5830 5831 /* Flush pending aggerated transmission. */ 5832 if (txr->hn_agg_txd != NULL) 5833 hn_flush_txagg(ifp, txr); 5834 return (sched); 5835 } 5836 5837 static void 5838 hn_start(struct ifnet *ifp) 5839 { 5840 struct hn_softc *sc = ifp->if_softc; 5841 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5842 5843 if (txr->hn_sched_tx) 5844 goto do_sched; 5845 5846 if (mtx_trylock(&txr->hn_tx_lock)) { 5847 int sched; 5848 5849 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5850 mtx_unlock(&txr->hn_tx_lock); 5851 if (!sched) 5852 return; 5853 } 5854 do_sched: 5855 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5856 } 5857 5858 static void 5859 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5860 { 5861 struct hn_tx_ring *txr = xtxr; 5862 5863 mtx_lock(&txr->hn_tx_lock); 5864 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5865 hn_start_locked(txr, 0); 5866 mtx_unlock(&txr->hn_tx_lock); 5867 } 5868 5869 static void 5870 hn_start_txeof(struct hn_tx_ring *txr) 5871 { 5872 struct hn_softc *sc = txr->hn_sc; 5873 struct ifnet *ifp = sc->hn_ifp; 5874 5875 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5876 5877 if (txr->hn_sched_tx) 5878 goto do_sched; 5879 5880 if (mtx_trylock(&txr->hn_tx_lock)) { 5881 int sched; 5882 5883 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5884 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5885 mtx_unlock(&txr->hn_tx_lock); 5886 if (sched) { 5887 taskqueue_enqueue(txr->hn_tx_taskq, 5888 &txr->hn_tx_task); 5889 } 5890 } else { 5891 do_sched: 5892 /* 5893 * Release the OACTIVE earlier, with the hope, that 5894 * others could catch up. The task will clear the 5895 * flag again with the hn_tx_lock to avoid possible 5896 * races. 5897 */ 5898 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5899 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5900 } 5901 } 5902 5903 #endif /* HN_IFSTART_SUPPORT */ 5904 5905 static int 5906 hn_xmit(struct hn_tx_ring *txr, int len) 5907 { 5908 struct hn_softc *sc = txr->hn_sc; 5909 struct ifnet *ifp = sc->hn_ifp; 5910 struct mbuf *m_head; 5911 int sched = 0; 5912 5913 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5914 #ifdef HN_IFSTART_SUPPORT 5915 KASSERT(hn_use_if_start == 0, 5916 ("hn_xmit is called, when if_start is enabled")); 5917 #endif 5918 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5919 5920 if (__predict_false(txr->hn_suspended)) 5921 return (0); 5922 5923 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5924 return (0); 5925 5926 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5927 struct hn_txdesc *txd; 5928 int error; 5929 5930 if (len > 0 && m_head->m_pkthdr.len > len) { 5931 /* 5932 * This sending could be time consuming; let callers 5933 * dispatch this packet sending (and sending of any 5934 * following up packets) to tx taskqueue. 5935 */ 5936 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5937 sched = 1; 5938 break; 5939 } 5940 5941 txd = hn_txdesc_get(txr); 5942 if (txd == NULL) { 5943 txr->hn_no_txdescs++; 5944 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5945 txr->hn_oactive = 1; 5946 break; 5947 } 5948 5949 error = hn_encap(ifp, txr, txd, &m_head); 5950 if (error) { 5951 /* Both txd and m_head are freed; discard */ 5952 KASSERT(txr->hn_agg_txd == NULL, 5953 ("encap failed w/ pending aggregating txdesc")); 5954 drbr_advance(ifp, txr->hn_mbuf_br); 5955 continue; 5956 } 5957 5958 if (txr->hn_agg_pktleft == 0) { 5959 if (txr->hn_agg_txd != NULL) { 5960 KASSERT(m_head == NULL, 5961 ("pending mbuf for aggregating txdesc")); 5962 error = hn_flush_txagg(ifp, txr); 5963 if (__predict_false(error)) { 5964 txr->hn_oactive = 1; 5965 break; 5966 } 5967 } else { 5968 KASSERT(m_head != NULL, ("mbuf was freed")); 5969 error = hn_txpkt(ifp, txr, txd); 5970 if (__predict_false(error)) { 5971 /* txd is freed, but m_head is not */ 5972 drbr_putback(ifp, txr->hn_mbuf_br, 5973 m_head); 5974 txr->hn_oactive = 1; 5975 break; 5976 } 5977 } 5978 } 5979 #ifdef INVARIANTS 5980 else { 5981 KASSERT(txr->hn_agg_txd != NULL, 5982 ("no aggregating txdesc")); 5983 KASSERT(m_head == NULL, 5984 ("pending mbuf for aggregating txdesc")); 5985 } 5986 #endif 5987 5988 /* Sent */ 5989 drbr_advance(ifp, txr->hn_mbuf_br); 5990 } 5991 5992 /* Flush pending aggerated transmission. */ 5993 if (txr->hn_agg_txd != NULL) 5994 hn_flush_txagg(ifp, txr); 5995 return (sched); 5996 } 5997 5998 static int 5999 hn_transmit(struct ifnet *ifp, struct mbuf *m) 6000 { 6001 struct hn_softc *sc = ifp->if_softc; 6002 struct hn_tx_ring *txr; 6003 int error, idx = 0; 6004 6005 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6006 struct rm_priotracker pt; 6007 6008 rm_rlock(&sc->hn_vf_lock, &pt); 6009 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6010 struct mbuf *m_bpf = NULL; 6011 int obytes, omcast; 6012 6013 obytes = m->m_pkthdr.len; 6014 omcast = (m->m_flags & M_MCAST) != 0; 6015 6016 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6017 if (bpf_peers_present(ifp->if_bpf)) { 6018 m_bpf = m_copypacket(m, M_NOWAIT); 6019 if (m_bpf == NULL) { 6020 /* 6021 * Failed to grab a shallow 6022 * copy; tap now. 6023 */ 6024 ETHER_BPF_MTAP(ifp, m); 6025 } 6026 } 6027 } else { 6028 ETHER_BPF_MTAP(ifp, m); 6029 } 6030 6031 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6032 rm_runlock(&sc->hn_vf_lock, &pt); 6033 6034 if (m_bpf != NULL) { 6035 if (!error) 6036 ETHER_BPF_MTAP(ifp, m_bpf); 6037 m_freem(m_bpf); 6038 } 6039 6040 if (error == ENOBUFS) { 6041 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6042 } else if (error) { 6043 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6044 } else { 6045 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6046 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6047 if (omcast) { 6048 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6049 omcast); 6050 } 6051 } 6052 return (error); 6053 } 6054 rm_runlock(&sc->hn_vf_lock, &pt); 6055 } 6056 6057 #if defined(INET6) || defined(INET) 6058 /* 6059 * Perform TSO packet header fixup or get l2/l3 header length now, 6060 * since packet headers should be cache-hot. 6061 */ 6062 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6063 m = hn_tso_fixup(m); 6064 if (__predict_false(m == NULL)) { 6065 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6066 return EIO; 6067 } 6068 } else if (m->m_pkthdr.csum_flags & 6069 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6070 m = hn_set_hlen(m); 6071 if (__predict_false(m == NULL)) { 6072 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6073 return EIO; 6074 } 6075 } 6076 #endif 6077 6078 /* 6079 * Select the TX ring based on flowid 6080 */ 6081 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6082 #ifdef RSS 6083 uint32_t bid; 6084 6085 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6086 &bid) == 0) 6087 idx = bid % sc->hn_tx_ring_inuse; 6088 else 6089 #endif 6090 { 6091 #if defined(INET6) || defined(INET) 6092 int tcpsyn = 0; 6093 6094 if (m->m_pkthdr.len < 128 && 6095 (m->m_pkthdr.csum_flags & 6096 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6097 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6098 m = hn_check_tcpsyn(m, &tcpsyn); 6099 if (__predict_false(m == NULL)) { 6100 if_inc_counter(ifp, 6101 IFCOUNTER_OERRORS, 1); 6102 return (EIO); 6103 } 6104 } 6105 #else 6106 const int tcpsyn = 0; 6107 #endif 6108 if (tcpsyn) 6109 idx = 0; 6110 else 6111 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6112 } 6113 } 6114 txr = &sc->hn_tx_ring[idx]; 6115 6116 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6117 if (error) { 6118 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6119 return error; 6120 } 6121 6122 if (txr->hn_oactive) 6123 return 0; 6124 6125 if (txr->hn_sched_tx) 6126 goto do_sched; 6127 6128 if (mtx_trylock(&txr->hn_tx_lock)) { 6129 int sched; 6130 6131 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6132 mtx_unlock(&txr->hn_tx_lock); 6133 if (!sched) 6134 return 0; 6135 } 6136 do_sched: 6137 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6138 return 0; 6139 } 6140 6141 static void 6142 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6143 { 6144 struct mbuf *m; 6145 6146 mtx_lock(&txr->hn_tx_lock); 6147 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6148 m_freem(m); 6149 mtx_unlock(&txr->hn_tx_lock); 6150 } 6151 6152 static void 6153 hn_xmit_qflush(struct ifnet *ifp) 6154 { 6155 struct hn_softc *sc = ifp->if_softc; 6156 struct rm_priotracker pt; 6157 int i; 6158 6159 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6160 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6161 if_qflush(ifp); 6162 6163 rm_rlock(&sc->hn_vf_lock, &pt); 6164 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6165 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6166 rm_runlock(&sc->hn_vf_lock, &pt); 6167 } 6168 6169 static void 6170 hn_xmit_txeof(struct hn_tx_ring *txr) 6171 { 6172 6173 if (txr->hn_sched_tx) 6174 goto do_sched; 6175 6176 if (mtx_trylock(&txr->hn_tx_lock)) { 6177 int sched; 6178 6179 txr->hn_oactive = 0; 6180 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6181 mtx_unlock(&txr->hn_tx_lock); 6182 if (sched) { 6183 taskqueue_enqueue(txr->hn_tx_taskq, 6184 &txr->hn_tx_task); 6185 } 6186 } else { 6187 do_sched: 6188 /* 6189 * Release the oactive earlier, with the hope, that 6190 * others could catch up. The task will clear the 6191 * oactive again with the hn_tx_lock to avoid possible 6192 * races. 6193 */ 6194 txr->hn_oactive = 0; 6195 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6196 } 6197 } 6198 6199 static void 6200 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6201 { 6202 struct hn_tx_ring *txr = xtxr; 6203 6204 mtx_lock(&txr->hn_tx_lock); 6205 hn_xmit(txr, 0); 6206 mtx_unlock(&txr->hn_tx_lock); 6207 } 6208 6209 static void 6210 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6211 { 6212 struct hn_tx_ring *txr = xtxr; 6213 6214 mtx_lock(&txr->hn_tx_lock); 6215 txr->hn_oactive = 0; 6216 hn_xmit(txr, 0); 6217 mtx_unlock(&txr->hn_tx_lock); 6218 } 6219 6220 static int 6221 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6222 { 6223 struct vmbus_chan_br cbr; 6224 struct hn_rx_ring *rxr; 6225 struct hn_tx_ring *txr = NULL; 6226 int idx, error; 6227 6228 idx = vmbus_chan_subidx(chan); 6229 6230 /* 6231 * Link this channel to RX/TX ring. 6232 */ 6233 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6234 ("invalid channel index %d, should > 0 && < %d", 6235 idx, sc->hn_rx_ring_inuse)); 6236 rxr = &sc->hn_rx_ring[idx]; 6237 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6238 ("RX ring %d already attached", idx)); 6239 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6240 rxr->hn_chan = chan; 6241 6242 if (bootverbose) { 6243 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6244 idx, vmbus_chan_id(chan)); 6245 } 6246 6247 if (idx < sc->hn_tx_ring_inuse) { 6248 txr = &sc->hn_tx_ring[idx]; 6249 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6250 ("TX ring %d already attached", idx)); 6251 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6252 6253 txr->hn_chan = chan; 6254 if (bootverbose) { 6255 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6256 idx, vmbus_chan_id(chan)); 6257 } 6258 } 6259 6260 /* Bind this channel to a proper CPU. */ 6261 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6262 6263 /* 6264 * Open this channel 6265 */ 6266 cbr.cbr = rxr->hn_br; 6267 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6268 cbr.cbr_txsz = HN_TXBR_SIZE; 6269 cbr.cbr_rxsz = HN_RXBR_SIZE; 6270 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6271 if (error) { 6272 if (error == EISCONN) { 6273 if_printf(sc->hn_ifp, "bufring is connected after " 6274 "chan%u open failure\n", vmbus_chan_id(chan)); 6275 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6276 } else { 6277 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6278 vmbus_chan_id(chan), error); 6279 } 6280 } 6281 return (error); 6282 } 6283 6284 static void 6285 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6286 { 6287 struct hn_rx_ring *rxr; 6288 int idx, error; 6289 6290 idx = vmbus_chan_subidx(chan); 6291 6292 /* 6293 * Link this channel to RX/TX ring. 6294 */ 6295 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6296 ("invalid channel index %d, should > 0 && < %d", 6297 idx, sc->hn_rx_ring_inuse)); 6298 rxr = &sc->hn_rx_ring[idx]; 6299 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6300 ("RX ring %d is not attached", idx)); 6301 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6302 6303 if (idx < sc->hn_tx_ring_inuse) { 6304 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6305 6306 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6307 ("TX ring %d is not attached attached", idx)); 6308 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6309 } 6310 6311 /* 6312 * Close this channel. 6313 * 6314 * NOTE: 6315 * Channel closing does _not_ destroy the target channel. 6316 */ 6317 error = vmbus_chan_close_direct(chan); 6318 if (error == EISCONN) { 6319 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6320 "after being closed\n", vmbus_chan_id(chan)); 6321 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6322 } else if (error) { 6323 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6324 vmbus_chan_id(chan), error); 6325 } 6326 } 6327 6328 static int 6329 hn_attach_subchans(struct hn_softc *sc) 6330 { 6331 struct vmbus_channel **subchans; 6332 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6333 int i, error = 0; 6334 6335 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6336 6337 /* Attach the sub-channels. */ 6338 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6339 for (i = 0; i < subchan_cnt; ++i) { 6340 int error1; 6341 6342 error1 = hn_chan_attach(sc, subchans[i]); 6343 if (error1) { 6344 error = error1; 6345 /* Move on; all channels will be detached later. */ 6346 } 6347 } 6348 vmbus_subchan_rel(subchans, subchan_cnt); 6349 6350 if (error) { 6351 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6352 } else { 6353 if (bootverbose) { 6354 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6355 subchan_cnt); 6356 } 6357 } 6358 return (error); 6359 } 6360 6361 static void 6362 hn_detach_allchans(struct hn_softc *sc) 6363 { 6364 struct vmbus_channel **subchans; 6365 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6366 int i; 6367 6368 if (subchan_cnt == 0) 6369 goto back; 6370 6371 /* Detach the sub-channels. */ 6372 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6373 for (i = 0; i < subchan_cnt; ++i) 6374 hn_chan_detach(sc, subchans[i]); 6375 vmbus_subchan_rel(subchans, subchan_cnt); 6376 6377 back: 6378 /* 6379 * Detach the primary channel, _after_ all sub-channels 6380 * are detached. 6381 */ 6382 hn_chan_detach(sc, sc->hn_prichan); 6383 6384 /* Wait for sub-channels to be destroyed, if any. */ 6385 vmbus_subchan_drain(sc->hn_prichan); 6386 6387 #ifdef INVARIANTS 6388 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6389 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6390 HN_RX_FLAG_ATTACHED) == 0, 6391 ("%dth RX ring is still attached", i)); 6392 } 6393 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6394 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6395 HN_TX_FLAG_ATTACHED) == 0, 6396 ("%dth TX ring is still attached", i)); 6397 } 6398 #endif 6399 } 6400 6401 static int 6402 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6403 { 6404 struct vmbus_channel **subchans; 6405 int nchan, rxr_cnt, error; 6406 6407 nchan = *nsubch + 1; 6408 if (nchan == 1) { 6409 /* 6410 * Multiple RX/TX rings are not requested. 6411 */ 6412 *nsubch = 0; 6413 return (0); 6414 } 6415 6416 /* 6417 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6418 * table entries. 6419 */ 6420 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6421 if (error) { 6422 /* No RSS; this is benign. */ 6423 *nsubch = 0; 6424 return (0); 6425 } 6426 if (bootverbose) { 6427 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6428 rxr_cnt, nchan); 6429 } 6430 6431 if (nchan > rxr_cnt) 6432 nchan = rxr_cnt; 6433 if (nchan == 1) { 6434 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6435 *nsubch = 0; 6436 return (0); 6437 } 6438 6439 /* 6440 * Allocate sub-channels from NVS. 6441 */ 6442 *nsubch = nchan - 1; 6443 error = hn_nvs_alloc_subchans(sc, nsubch); 6444 if (error || *nsubch == 0) { 6445 /* Failed to allocate sub-channels. */ 6446 *nsubch = 0; 6447 return (0); 6448 } 6449 6450 /* 6451 * Wait for all sub-channels to become ready before moving on. 6452 */ 6453 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6454 vmbus_subchan_rel(subchans, *nsubch); 6455 return (0); 6456 } 6457 6458 static bool 6459 hn_synth_attachable(const struct hn_softc *sc) 6460 { 6461 int i; 6462 6463 if (sc->hn_flags & HN_FLAG_ERRORS) 6464 return (false); 6465 6466 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6467 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6468 6469 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6470 return (false); 6471 } 6472 return (true); 6473 } 6474 6475 /* 6476 * Make sure that the RX filter is zero after the successful 6477 * RNDIS initialization. 6478 * 6479 * NOTE: 6480 * Under certain conditions on certain versions of Hyper-V, 6481 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6482 * after the successful RNDIS initialization, which breaks 6483 * the assumption of any following code (well, it breaks the 6484 * RNDIS API contract actually). Clear the RNDIS rxfilter 6485 * explicitly, drain packets sneaking through, and drain the 6486 * interrupt taskqueues scheduled due to the stealth packets. 6487 */ 6488 static void 6489 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6490 { 6491 6492 hn_disable_rx(sc); 6493 hn_drain_rxtx(sc, nchan); 6494 } 6495 6496 static int 6497 hn_synth_attach(struct hn_softc *sc, int mtu) 6498 { 6499 #define ATTACHED_NVS 0x0002 6500 #define ATTACHED_RNDIS 0x0004 6501 6502 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6503 int error, nsubch, nchan = 1, i, rndis_inited; 6504 uint32_t old_caps, attached = 0; 6505 6506 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6507 ("synthetic parts were attached")); 6508 6509 if (!hn_synth_attachable(sc)) 6510 return (ENXIO); 6511 6512 /* Save capabilities for later verification. */ 6513 old_caps = sc->hn_caps; 6514 sc->hn_caps = 0; 6515 6516 /* Clear RSS stuffs. */ 6517 sc->hn_rss_ind_size = 0; 6518 sc->hn_rss_hash = 0; 6519 sc->hn_rss_hcap = 0; 6520 6521 /* 6522 * Attach the primary channel _before_ attaching NVS and RNDIS. 6523 */ 6524 error = hn_chan_attach(sc, sc->hn_prichan); 6525 if (error) 6526 goto failed; 6527 6528 /* 6529 * Attach NVS. 6530 */ 6531 error = hn_nvs_attach(sc, mtu); 6532 if (error) 6533 goto failed; 6534 attached |= ATTACHED_NVS; 6535 6536 /* 6537 * Attach RNDIS _after_ NVS is attached. 6538 */ 6539 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6540 if (rndis_inited) 6541 attached |= ATTACHED_RNDIS; 6542 if (error) 6543 goto failed; 6544 6545 /* 6546 * Make sure capabilities are not changed. 6547 */ 6548 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6549 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6550 old_caps, sc->hn_caps); 6551 error = ENXIO; 6552 goto failed; 6553 } 6554 6555 /* 6556 * Allocate sub-channels for multi-TX/RX rings. 6557 * 6558 * NOTE: 6559 * The # of RX rings that can be used is equivalent to the # of 6560 * channels to be requested. 6561 */ 6562 nsubch = sc->hn_rx_ring_cnt - 1; 6563 error = hn_synth_alloc_subchans(sc, &nsubch); 6564 if (error) 6565 goto failed; 6566 /* NOTE: _Full_ synthetic parts detach is required now. */ 6567 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6568 6569 /* 6570 * Set the # of TX/RX rings that could be used according to 6571 * the # of channels that NVS offered. 6572 */ 6573 nchan = nsubch + 1; 6574 hn_set_ring_inuse(sc, nchan); 6575 if (nchan == 1) { 6576 /* Only the primary channel can be used; done */ 6577 goto back; 6578 } 6579 6580 /* 6581 * Attach the sub-channels. 6582 * 6583 * NOTE: hn_set_ring_inuse() _must_ have been called. 6584 */ 6585 error = hn_attach_subchans(sc); 6586 if (error) 6587 goto failed; 6588 6589 /* 6590 * Configure RSS key and indirect table _after_ all sub-channels 6591 * are attached. 6592 */ 6593 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6594 /* 6595 * RSS key is not set yet; set it to the default RSS key. 6596 */ 6597 if (bootverbose) 6598 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6599 #ifdef RSS 6600 rss_getkey(rss->rss_key); 6601 #else 6602 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6603 #endif 6604 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6605 } 6606 6607 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6608 /* 6609 * RSS indirect table is not set yet; set it up in round- 6610 * robin fashion. 6611 */ 6612 if (bootverbose) { 6613 if_printf(sc->hn_ifp, "setup default RSS indirect " 6614 "table\n"); 6615 } 6616 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6617 uint32_t subidx; 6618 6619 #ifdef RSS 6620 subidx = rss_get_indirection_to_bucket(i); 6621 #else 6622 subidx = i; 6623 #endif 6624 rss->rss_ind[i] = subidx % nchan; 6625 } 6626 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6627 } else { 6628 /* 6629 * # of usable channels may be changed, so we have to 6630 * make sure that all entries in RSS indirect table 6631 * are valid. 6632 * 6633 * NOTE: hn_set_ring_inuse() _must_ have been called. 6634 */ 6635 hn_rss_ind_fixup(sc); 6636 } 6637 6638 sc->hn_rss_hash = sc->hn_rss_hcap; 6639 if ((sc->hn_flags & HN_FLAG_RXVF) || 6640 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6641 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6642 hn_vf_rss_fixup(sc, false); 6643 } 6644 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6645 if (error) 6646 goto failed; 6647 back: 6648 /* 6649 * Fixup transmission aggregation setup. 6650 */ 6651 hn_set_txagg(sc); 6652 hn_rndis_init_fixat(sc, nchan); 6653 return (0); 6654 6655 failed: 6656 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6657 hn_rndis_init_fixat(sc, nchan); 6658 hn_synth_detach(sc); 6659 } else { 6660 if (attached & ATTACHED_RNDIS) { 6661 hn_rndis_init_fixat(sc, nchan); 6662 hn_rndis_detach(sc); 6663 } 6664 if (attached & ATTACHED_NVS) 6665 hn_nvs_detach(sc); 6666 hn_chan_detach(sc, sc->hn_prichan); 6667 /* Restore old capabilities. */ 6668 sc->hn_caps = old_caps; 6669 } 6670 return (error); 6671 6672 #undef ATTACHED_RNDIS 6673 #undef ATTACHED_NVS 6674 } 6675 6676 /* 6677 * NOTE: 6678 * The interface must have been suspended though hn_suspend(), before 6679 * this function get called. 6680 */ 6681 static void 6682 hn_synth_detach(struct hn_softc *sc) 6683 { 6684 6685 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6686 ("synthetic parts were not attached")); 6687 6688 /* Detach the RNDIS first. */ 6689 hn_rndis_detach(sc); 6690 6691 /* Detach NVS. */ 6692 hn_nvs_detach(sc); 6693 6694 /* Detach all of the channels. */ 6695 hn_detach_allchans(sc); 6696 6697 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6698 /* 6699 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6700 */ 6701 int error; 6702 6703 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6704 sc->hn_rxbuf_gpadl); 6705 if (error) { 6706 if_printf(sc->hn_ifp, 6707 "rxbuf gpadl disconn failed: %d\n", error); 6708 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6709 } 6710 sc->hn_rxbuf_gpadl = 0; 6711 } 6712 6713 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6714 /* 6715 * Host is post-Win2016, disconnect chimney sending buffer from 6716 * primary channel here. 6717 */ 6718 int error; 6719 6720 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6721 sc->hn_chim_gpadl); 6722 if (error) { 6723 if_printf(sc->hn_ifp, 6724 "chim gpadl disconn failed: %d\n", error); 6725 sc->hn_flags |= HN_FLAG_CHIM_REF; 6726 } 6727 sc->hn_chim_gpadl = 0; 6728 } 6729 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6730 } 6731 6732 static void 6733 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6734 { 6735 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6736 ("invalid ring count %d", ring_cnt)); 6737 6738 if (sc->hn_tx_ring_cnt > ring_cnt) 6739 sc->hn_tx_ring_inuse = ring_cnt; 6740 else 6741 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6742 sc->hn_rx_ring_inuse = ring_cnt; 6743 6744 #ifdef RSS 6745 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6746 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6747 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6748 rss_getnumbuckets()); 6749 } 6750 #endif 6751 6752 if (bootverbose) { 6753 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6754 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6755 } 6756 } 6757 6758 static void 6759 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6760 { 6761 6762 /* 6763 * NOTE: 6764 * The TX bufring will not be drained by the hypervisor, 6765 * if the primary channel is revoked. 6766 */ 6767 while (!vmbus_chan_rx_empty(chan) || 6768 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6769 !vmbus_chan_tx_empty(chan))) 6770 pause("waitch", 1); 6771 vmbus_chan_intr_drain(chan); 6772 } 6773 6774 static void 6775 hn_disable_rx(struct hn_softc *sc) 6776 { 6777 6778 /* 6779 * Disable RX by clearing RX filter forcefully. 6780 */ 6781 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6782 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6783 6784 /* 6785 * Give RNDIS enough time to flush all pending data packets. 6786 */ 6787 pause("waitrx", (200 * hz) / 1000); 6788 } 6789 6790 /* 6791 * NOTE: 6792 * RX/TX _must_ have been suspended/disabled, before this function 6793 * is called. 6794 */ 6795 static void 6796 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6797 { 6798 struct vmbus_channel **subch = NULL; 6799 int nsubch; 6800 6801 /* 6802 * Drain RX/TX bufrings and interrupts. 6803 */ 6804 nsubch = nchan - 1; 6805 if (nsubch > 0) 6806 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6807 6808 if (subch != NULL) { 6809 int i; 6810 6811 for (i = 0; i < nsubch; ++i) 6812 hn_chan_drain(sc, subch[i]); 6813 } 6814 hn_chan_drain(sc, sc->hn_prichan); 6815 6816 if (subch != NULL) 6817 vmbus_subchan_rel(subch, nsubch); 6818 } 6819 6820 static void 6821 hn_suspend_data(struct hn_softc *sc) 6822 { 6823 struct hn_tx_ring *txr; 6824 int i; 6825 6826 HN_LOCK_ASSERT(sc); 6827 6828 /* 6829 * Suspend TX. 6830 */ 6831 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6832 txr = &sc->hn_tx_ring[i]; 6833 6834 mtx_lock(&txr->hn_tx_lock); 6835 txr->hn_suspended = 1; 6836 mtx_unlock(&txr->hn_tx_lock); 6837 /* No one is able send more packets now. */ 6838 6839 /* 6840 * Wait for all pending sends to finish. 6841 * 6842 * NOTE: 6843 * We will _not_ receive all pending send-done, if the 6844 * primary channel is revoked. 6845 */ 6846 while (hn_tx_ring_pending(txr) && 6847 !vmbus_chan_is_revoked(sc->hn_prichan)) 6848 pause("hnwtx", 1 /* 1 tick */); 6849 } 6850 6851 /* 6852 * Disable RX. 6853 */ 6854 hn_disable_rx(sc); 6855 6856 /* 6857 * Drain RX/TX. 6858 */ 6859 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6860 6861 /* 6862 * Drain any pending TX tasks. 6863 * 6864 * NOTE: 6865 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6866 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6867 */ 6868 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6869 txr = &sc->hn_tx_ring[i]; 6870 6871 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6872 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6873 } 6874 } 6875 6876 static void 6877 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6878 { 6879 6880 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6881 } 6882 6883 static void 6884 hn_suspend_mgmt(struct hn_softc *sc) 6885 { 6886 struct task task; 6887 6888 HN_LOCK_ASSERT(sc); 6889 6890 /* 6891 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6892 * through hn_mgmt_taskq. 6893 */ 6894 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6895 vmbus_chan_run_task(sc->hn_prichan, &task); 6896 6897 /* 6898 * Make sure that all pending management tasks are completed. 6899 */ 6900 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6901 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6902 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6903 } 6904 6905 static void 6906 hn_suspend(struct hn_softc *sc) 6907 { 6908 6909 /* Disable polling. */ 6910 hn_polling(sc, 0); 6911 6912 /* 6913 * If the non-transparent mode VF is activated, the synthetic 6914 * device is receiving packets, so the data path of the 6915 * synthetic device must be suspended. 6916 */ 6917 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6918 (sc->hn_flags & HN_FLAG_RXVF)) 6919 hn_suspend_data(sc); 6920 hn_suspend_mgmt(sc); 6921 } 6922 6923 static void 6924 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6925 { 6926 int i; 6927 6928 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6929 ("invalid TX ring count %d", tx_ring_cnt)); 6930 6931 for (i = 0; i < tx_ring_cnt; ++i) { 6932 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6933 6934 mtx_lock(&txr->hn_tx_lock); 6935 txr->hn_suspended = 0; 6936 mtx_unlock(&txr->hn_tx_lock); 6937 } 6938 } 6939 6940 static void 6941 hn_resume_data(struct hn_softc *sc) 6942 { 6943 int i; 6944 6945 HN_LOCK_ASSERT(sc); 6946 6947 /* 6948 * Re-enable RX. 6949 */ 6950 hn_rxfilter_config(sc); 6951 6952 /* 6953 * Make sure to clear suspend status on "all" TX rings, 6954 * since hn_tx_ring_inuse can be changed after 6955 * hn_suspend_data(). 6956 */ 6957 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6958 6959 #ifdef HN_IFSTART_SUPPORT 6960 if (!hn_use_if_start) 6961 #endif 6962 { 6963 /* 6964 * Flush unused drbrs, since hn_tx_ring_inuse may be 6965 * reduced. 6966 */ 6967 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6968 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6969 } 6970 6971 /* 6972 * Kick start TX. 6973 */ 6974 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6975 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6976 6977 /* 6978 * Use txeof task, so that any pending oactive can be 6979 * cleared properly. 6980 */ 6981 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6982 } 6983 } 6984 6985 static void 6986 hn_resume_mgmt(struct hn_softc *sc) 6987 { 6988 6989 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6990 6991 /* 6992 * Kick off network change detection, if it was pending. 6993 * If no network change was pending, start link status 6994 * checks, which is more lightweight than network change 6995 * detection. 6996 */ 6997 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6998 hn_change_network(sc); 6999 else 7000 hn_update_link_status(sc); 7001 } 7002 7003 static void 7004 hn_resume(struct hn_softc *sc) 7005 { 7006 7007 /* 7008 * If the non-transparent mode VF is activated, the synthetic 7009 * device have to receive packets, so the data path of the 7010 * synthetic device must be resumed. 7011 */ 7012 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7013 (sc->hn_flags & HN_FLAG_RXVF)) 7014 hn_resume_data(sc); 7015 7016 /* 7017 * Don't resume link status change if VF is attached/activated. 7018 * - In the non-transparent VF mode, the synthetic device marks 7019 * link down until the VF is deactivated; i.e. VF is down. 7020 * - In transparent VF mode, VF's media status is used until 7021 * the VF is detached. 7022 */ 7023 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7024 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7025 hn_resume_mgmt(sc); 7026 7027 /* 7028 * Re-enable polling if this interface is running and 7029 * the polling is requested. 7030 */ 7031 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7032 hn_polling(sc, sc->hn_pollhz); 7033 } 7034 7035 static void 7036 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7037 { 7038 const struct rndis_status_msg *msg; 7039 int ofs; 7040 7041 if (dlen < sizeof(*msg)) { 7042 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7043 return; 7044 } 7045 msg = data; 7046 7047 switch (msg->rm_status) { 7048 case RNDIS_STATUS_MEDIA_CONNECT: 7049 case RNDIS_STATUS_MEDIA_DISCONNECT: 7050 hn_update_link_status(sc); 7051 break; 7052 7053 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7054 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7055 /* Not really useful; ignore. */ 7056 break; 7057 7058 case RNDIS_STATUS_NETWORK_CHANGE: 7059 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7060 if (dlen < ofs + msg->rm_stbuflen || 7061 msg->rm_stbuflen < sizeof(uint32_t)) { 7062 if_printf(sc->hn_ifp, "network changed\n"); 7063 } else { 7064 uint32_t change; 7065 7066 memcpy(&change, ((const uint8_t *)msg) + ofs, 7067 sizeof(change)); 7068 if_printf(sc->hn_ifp, "network changed, change %u\n", 7069 change); 7070 } 7071 hn_change_network(sc); 7072 break; 7073 7074 default: 7075 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7076 msg->rm_status); 7077 break; 7078 } 7079 } 7080 7081 static int 7082 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7083 { 7084 const struct rndis_pktinfo *pi = info_data; 7085 uint32_t mask = 0; 7086 7087 while (info_dlen != 0) { 7088 const void *data; 7089 uint32_t dlen; 7090 7091 if (__predict_false(info_dlen < sizeof(*pi))) 7092 return (EINVAL); 7093 if (__predict_false(info_dlen < pi->rm_size)) 7094 return (EINVAL); 7095 info_dlen -= pi->rm_size; 7096 7097 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7098 return (EINVAL); 7099 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7100 return (EINVAL); 7101 dlen = pi->rm_size - pi->rm_pktinfooffset; 7102 data = pi->rm_data; 7103 7104 if (pi->rm_internal == 1) { 7105 switch (pi->rm_type) { 7106 case NDIS_PKTINFO_IT_PKTINFO_ID: 7107 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7108 return (EINVAL); 7109 info->pktinfo_id = 7110 (const struct packet_info_id *)data; 7111 mask |= HN_RXINFO_PKTINFO_ID; 7112 break; 7113 7114 default: 7115 goto next; 7116 } 7117 } else { 7118 switch (pi->rm_type) { 7119 case NDIS_PKTINFO_TYPE_VLAN: 7120 if (__predict_false(dlen 7121 < NDIS_VLAN_INFO_SIZE)) 7122 return (EINVAL); 7123 info->vlan_info = (const uint32_t *)data; 7124 mask |= HN_RXINFO_VLAN; 7125 break; 7126 7127 case NDIS_PKTINFO_TYPE_CSUM: 7128 if (__predict_false(dlen 7129 < NDIS_RXCSUM_INFO_SIZE)) 7130 return (EINVAL); 7131 info->csum_info = (const uint32_t *)data; 7132 mask |= HN_RXINFO_CSUM; 7133 break; 7134 7135 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7136 if (__predict_false(dlen 7137 < HN_NDIS_HASH_VALUE_SIZE)) 7138 return (EINVAL); 7139 info->hash_value = (const uint32_t *)data; 7140 mask |= HN_RXINFO_HASHVAL; 7141 break; 7142 7143 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7144 if (__predict_false(dlen 7145 < HN_NDIS_HASH_INFO_SIZE)) 7146 return (EINVAL); 7147 info->hash_info = (const uint32_t *)data; 7148 mask |= HN_RXINFO_HASHINF; 7149 break; 7150 7151 default: 7152 goto next; 7153 } 7154 } 7155 7156 if (mask == HN_RXINFO_ALL) { 7157 /* All found; done */ 7158 break; 7159 } 7160 next: 7161 pi = (const struct rndis_pktinfo *) 7162 ((const uint8_t *)pi + pi->rm_size); 7163 } 7164 7165 /* 7166 * Final fixup. 7167 * - If there is no hash value, invalidate the hash info. 7168 */ 7169 if ((mask & HN_RXINFO_HASHVAL) == 0) 7170 info->hash_info = NULL; 7171 return (0); 7172 } 7173 7174 static __inline bool 7175 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7176 { 7177 7178 if (off < check_off) { 7179 if (__predict_true(off + len <= check_off)) 7180 return (false); 7181 } else if (off > check_off) { 7182 if (__predict_true(check_off + check_len <= off)) 7183 return (false); 7184 } 7185 return (true); 7186 } 7187 7188 static __inline void 7189 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7190 uint32_t len, struct hn_rxinfo *info) 7191 { 7192 uint32_t cnt = rxr->rsc.cnt; 7193 7194 if (cnt) { 7195 rxr->rsc.pktlen += len; 7196 } else { 7197 rxr->rsc.vlan_info = info->vlan_info; 7198 rxr->rsc.csum_info = info->csum_info; 7199 rxr->rsc.hash_info = info->hash_info; 7200 rxr->rsc.hash_value = info->hash_value; 7201 rxr->rsc.pktlen = len; 7202 } 7203 7204 rxr->rsc.frag_data[cnt] = data; 7205 rxr->rsc.frag_len[cnt] = len; 7206 rxr->rsc.cnt++; 7207 } 7208 7209 static void 7210 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7211 { 7212 const struct rndis_packet_msg *pkt; 7213 struct hn_rxinfo info; 7214 int data_off, pktinfo_off, data_len, pktinfo_len; 7215 bool rsc_more= false; 7216 7217 /* 7218 * Check length. 7219 */ 7220 if (__predict_false(dlen < sizeof(*pkt))) { 7221 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7222 return; 7223 } 7224 pkt = data; 7225 7226 if (__predict_false(dlen < pkt->rm_len)) { 7227 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7228 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7229 return; 7230 } 7231 if (__predict_false(pkt->rm_len < 7232 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7233 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7234 "msglen %u, data %u, oob %u, pktinfo %u\n", 7235 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7236 pkt->rm_pktinfolen); 7237 return; 7238 } 7239 if (__predict_false(pkt->rm_datalen == 0)) { 7240 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7241 return; 7242 } 7243 7244 /* 7245 * Check offests. 7246 */ 7247 #define IS_OFFSET_INVALID(ofs) \ 7248 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7249 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7250 7251 /* XXX Hyper-V does not meet data offset alignment requirement */ 7252 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7253 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7254 "data offset %u\n", pkt->rm_dataoffset); 7255 return; 7256 } 7257 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7258 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7259 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7260 "oob offset %u\n", pkt->rm_oobdataoffset); 7261 return; 7262 } 7263 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7264 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7265 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7266 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7267 return; 7268 } 7269 7270 #undef IS_OFFSET_INVALID 7271 7272 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7273 data_len = pkt->rm_datalen; 7274 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7275 pktinfo_len = pkt->rm_pktinfolen; 7276 7277 /* 7278 * Check OOB coverage. 7279 */ 7280 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7281 int oob_off, oob_len; 7282 7283 if_printf(rxr->hn_ifp, "got oobdata\n"); 7284 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7285 oob_len = pkt->rm_oobdatalen; 7286 7287 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7288 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7289 "oob overflow, msglen %u, oob abs %d len %d\n", 7290 pkt->rm_len, oob_off, oob_len); 7291 return; 7292 } 7293 7294 /* 7295 * Check against data. 7296 */ 7297 if (hn_rndis_check_overlap(oob_off, oob_len, 7298 data_off, data_len)) { 7299 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7300 "oob overlaps data, oob abs %d len %d, " 7301 "data abs %d len %d\n", 7302 oob_off, oob_len, data_off, data_len); 7303 return; 7304 } 7305 7306 /* 7307 * Check against pktinfo. 7308 */ 7309 if (pktinfo_len != 0 && 7310 hn_rndis_check_overlap(oob_off, oob_len, 7311 pktinfo_off, pktinfo_len)) { 7312 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7313 "oob overlaps pktinfo, oob abs %d len %d, " 7314 "pktinfo abs %d len %d\n", 7315 oob_off, oob_len, pktinfo_off, pktinfo_len); 7316 return; 7317 } 7318 } 7319 7320 /* 7321 * Check per-packet-info coverage and find useful per-packet-info. 7322 */ 7323 info.vlan_info = NULL; 7324 info.csum_info = NULL; 7325 info.hash_info = NULL; 7326 info.pktinfo_id = NULL; 7327 7328 if (__predict_true(pktinfo_len != 0)) { 7329 bool overlap; 7330 int error; 7331 7332 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7333 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7334 "pktinfo overflow, msglen %u, " 7335 "pktinfo abs %d len %d\n", 7336 pkt->rm_len, pktinfo_off, pktinfo_len); 7337 return; 7338 } 7339 7340 /* 7341 * Check packet info coverage. 7342 */ 7343 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7344 data_off, data_len); 7345 if (__predict_false(overlap)) { 7346 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7347 "pktinfo overlap data, pktinfo abs %d len %d, " 7348 "data abs %d len %d\n", 7349 pktinfo_off, pktinfo_len, data_off, data_len); 7350 return; 7351 } 7352 7353 /* 7354 * Find useful per-packet-info. 7355 */ 7356 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7357 pktinfo_len, &info); 7358 if (__predict_false(error)) { 7359 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7360 "pktinfo\n"); 7361 return; 7362 } 7363 } 7364 7365 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7366 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7367 "data overflow, msglen %u, data abs %d len %d\n", 7368 pkt->rm_len, data_off, data_len); 7369 return; 7370 } 7371 7372 /* Identify RSC fragments, drop invalid packets */ 7373 if ((info.pktinfo_id != NULL) && 7374 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7375 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7376 rxr->rsc.cnt = 0; 7377 rxr->hn_rsc_pkts++; 7378 } else if (rxr->rsc.cnt == 0) 7379 goto drop; 7380 7381 rsc_more = true; 7382 7383 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7384 rsc_more = false; 7385 7386 if (rsc_more && rxr->rsc.is_last) 7387 goto drop; 7388 } else { 7389 rxr->rsc.cnt = 0; 7390 } 7391 7392 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7393 goto drop; 7394 7395 /* Store data in per rx ring structure */ 7396 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7397 data_len, &info); 7398 7399 if (rsc_more) 7400 return; 7401 7402 hn_rxpkt(rxr); 7403 rxr->rsc.cnt = 0; 7404 return; 7405 drop: 7406 rxr->hn_rsc_drop++; 7407 return; 7408 } 7409 7410 static __inline void 7411 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7412 { 7413 const struct rndis_msghdr *hdr; 7414 7415 if (__predict_false(dlen < sizeof(*hdr))) { 7416 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7417 return; 7418 } 7419 hdr = data; 7420 7421 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7422 /* Hot data path. */ 7423 hn_rndis_rx_data(rxr, data, dlen); 7424 /* Done! */ 7425 return; 7426 } 7427 7428 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7429 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7430 else 7431 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7432 } 7433 7434 static void 7435 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7436 { 7437 const struct hn_nvs_hdr *hdr; 7438 7439 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7440 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7441 return; 7442 } 7443 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7444 7445 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7446 /* Useless; ignore */ 7447 return; 7448 } 7449 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7450 } 7451 7452 static void 7453 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7454 const struct vmbus_chanpkt_hdr *pkt) 7455 { 7456 struct hn_nvs_sendctx *sndc; 7457 7458 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7459 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7460 VMBUS_CHANPKT_DATALEN(pkt)); 7461 /* 7462 * NOTE: 7463 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7464 * its callback. 7465 */ 7466 } 7467 7468 static void 7469 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7470 const struct vmbus_chanpkt_hdr *pkthdr) 7471 { 7472 struct epoch_tracker et; 7473 const struct vmbus_chanpkt_rxbuf *pkt; 7474 const struct hn_nvs_hdr *nvs_hdr; 7475 int count, i, hlen; 7476 7477 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7478 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7479 return; 7480 } 7481 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7482 7483 /* Make sure that this is a RNDIS message. */ 7484 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7485 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7486 nvs_hdr->nvs_type); 7487 return; 7488 } 7489 7490 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7491 if (__predict_false(hlen < sizeof(*pkt))) { 7492 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7493 return; 7494 } 7495 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7496 7497 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7498 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7499 pkt->cp_rxbuf_id); 7500 return; 7501 } 7502 7503 count = pkt->cp_rxbuf_cnt; 7504 if (__predict_false(hlen < 7505 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7506 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7507 return; 7508 } 7509 7510 NET_EPOCH_ENTER(et); 7511 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7512 for (i = 0; i < count; ++i) { 7513 int ofs, len; 7514 7515 ofs = pkt->cp_rxbuf[i].rb_ofs; 7516 len = pkt->cp_rxbuf[i].rb_len; 7517 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7518 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7519 "ofs %d, len %d\n", i, ofs, len); 7520 continue; 7521 } 7522 7523 rxr->rsc.is_last = (i == (count - 1)); 7524 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7525 } 7526 NET_EPOCH_EXIT(et); 7527 7528 /* 7529 * Ack the consumed RXBUF associated w/ this channel packet, 7530 * so that this RXBUF can be recycled by the hypervisor. 7531 */ 7532 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7533 } 7534 7535 static void 7536 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7537 uint64_t tid) 7538 { 7539 struct hn_nvs_rndis_ack ack; 7540 int retries, error; 7541 7542 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7543 ack.nvs_status = HN_NVS_STATUS_OK; 7544 7545 retries = 0; 7546 again: 7547 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7548 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7549 if (__predict_false(error == EAGAIN)) { 7550 /* 7551 * NOTE: 7552 * This should _not_ happen in real world, since the 7553 * consumption of the TX bufring from the TX path is 7554 * controlled. 7555 */ 7556 if (rxr->hn_ack_failed == 0) 7557 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7558 rxr->hn_ack_failed++; 7559 retries++; 7560 if (retries < 10) { 7561 DELAY(100); 7562 goto again; 7563 } 7564 /* RXBUF leaks! */ 7565 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7566 } 7567 } 7568 7569 static void 7570 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7571 { 7572 struct hn_rx_ring *rxr = xrxr; 7573 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7574 7575 for (;;) { 7576 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7577 int error, pktlen; 7578 7579 pktlen = rxr->hn_pktbuf_len; 7580 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7581 if (__predict_false(error == ENOBUFS)) { 7582 void *nbuf; 7583 int nlen; 7584 7585 /* 7586 * Expand channel packet buffer. 7587 * 7588 * XXX 7589 * Use M_WAITOK here, since allocation failure 7590 * is fatal. 7591 */ 7592 nlen = rxr->hn_pktbuf_len * 2; 7593 while (nlen < pktlen) 7594 nlen *= 2; 7595 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7596 7597 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7598 rxr->hn_pktbuf_len, nlen); 7599 7600 free(rxr->hn_pktbuf, M_DEVBUF); 7601 rxr->hn_pktbuf = nbuf; 7602 rxr->hn_pktbuf_len = nlen; 7603 /* Retry! */ 7604 continue; 7605 } else if (__predict_false(error == EAGAIN)) { 7606 /* No more channel packets; done! */ 7607 break; 7608 } 7609 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7610 7611 switch (pkt->cph_type) { 7612 case VMBUS_CHANPKT_TYPE_COMP: 7613 hn_nvs_handle_comp(sc, chan, pkt); 7614 break; 7615 7616 case VMBUS_CHANPKT_TYPE_RXBUF: 7617 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7618 break; 7619 7620 case VMBUS_CHANPKT_TYPE_INBAND: 7621 hn_nvs_handle_notify(sc, pkt); 7622 break; 7623 7624 default: 7625 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7626 pkt->cph_type); 7627 break; 7628 } 7629 } 7630 hn_chan_rollup(rxr, rxr->hn_txr); 7631 } 7632 7633 static void 7634 hn_sysinit(void *arg __unused) 7635 { 7636 int i; 7637 7638 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7639 7640 #ifdef HN_IFSTART_SUPPORT 7641 /* 7642 * Don't use ifnet.if_start if transparent VF mode is requested; 7643 * mainly due to the IFF_DRV_OACTIVE flag. 7644 */ 7645 if (hn_xpnt_vf && hn_use_if_start) { 7646 hn_use_if_start = 0; 7647 printf("hn: tranparent VF mode, if_transmit will be used, " 7648 "instead of if_start\n"); 7649 } 7650 #endif 7651 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7652 printf("hn: invalid transparent VF attach routing " 7653 "wait timeout %d, reset to %d\n", 7654 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7655 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7656 } 7657 7658 /* 7659 * Initialize VF map. 7660 */ 7661 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7662 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7663 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7664 M_WAITOK | M_ZERO); 7665 7666 /* 7667 * Fix the # of TX taskqueues. 7668 */ 7669 if (hn_tx_taskq_cnt <= 0) 7670 hn_tx_taskq_cnt = 1; 7671 else if (hn_tx_taskq_cnt > mp_ncpus) 7672 hn_tx_taskq_cnt = mp_ncpus; 7673 7674 /* 7675 * Fix the TX taskqueue mode. 7676 */ 7677 switch (hn_tx_taskq_mode) { 7678 case HN_TX_TASKQ_M_INDEP: 7679 case HN_TX_TASKQ_M_GLOBAL: 7680 case HN_TX_TASKQ_M_EVTTQ: 7681 break; 7682 default: 7683 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7684 break; 7685 } 7686 7687 if (vm_guest != VM_GUEST_HV) 7688 return; 7689 7690 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7691 return; 7692 7693 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7694 M_DEVBUF, M_WAITOK); 7695 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7696 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7697 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7698 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7699 "hn tx%d", i); 7700 } 7701 } 7702 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7703 7704 static void 7705 hn_sysuninit(void *arg __unused) 7706 { 7707 7708 if (hn_tx_taskque != NULL) { 7709 int i; 7710 7711 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7712 taskqueue_free(hn_tx_taskque[i]); 7713 free(hn_tx_taskque, M_DEVBUF); 7714 } 7715 7716 if (hn_vfmap != NULL) 7717 free(hn_vfmap, M_DEVBUF); 7718 rm_destroy(&hn_vfmap_lock); 7719 7720 counter_u64_free(hn_udpcs_fixup); 7721 } 7722 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7723