/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_update_vf {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *, void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *, struct ifmediareq *);

static int	hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *, const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *, const void *, int);
static void	hn_rndis_init_fixat(struct hn_softc *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
		    uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *, int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *, struct vmbus_channel *);
static void	hn_disable_rx(struct hn_softc *);
static void	hn_drain_rxtx(struct hn_softc *, int);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
#ifndef RSS
static int	hn_rss_reconfig(struct hn_softc *);
#endif
static void	hn_rss_ind_fixup(struct hn_softc *);
static int	hn_rxpkt(struct hn_rx_ring *, const void *, int,
		    const struct hn_rxinfo *);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *,
		    struct vmbus_channel *, const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque; /* shared TX taskqueues */

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will be freed.
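 *
 * What the fixups below do: zero the IPv4 total length and checksum
 * (or the IPv6 payload length) and seed th_sum with only the
 * pseudo-header checksum, which is the usual header preparation for
 * LSO/TSO offload.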
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_flags & IFF_PROMISC) ||
	    (sc->hn_flags & HN_FLAG_VF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

#ifndef RSS
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
#endif	/* !RSS */

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
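	 * Entries that reference a channel beyond the number currently
	 * in use are clamped to the last usable channel (nchan - 1)
	 * below.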
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_update_vf_task(void *arg, int pending __unused)
{
	struct hn_update_vf *uv = arg;

	uv->rxr->hn_vf = uv->vf;
}

static void
hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
{
	struct hn_rx_ring *rxr;
	struct hn_update_vf uv;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_update_vf_task, &uv);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			uv.rxr = rxr;
			uv.vf = vf;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_vf = vf;
		}
	}
}

static void
hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		goto out;

	if (ifp->if_alloctype != IFT_ETHER)
		goto out;

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		goto out;

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		goto out;

	/* Now we're sure 'ifp' is a real VF device. */
	if (vf) {
		if (sc->hn_flags & HN_FLAG_VF)
			goto out;

		sc->hn_flags |= HN_FLAG_VF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_VF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_VF;
		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);

	hn_update_vf(sc, vf ? ifp : NULL);

	if (vf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
	    vf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose)
		if_printf(hn_ifp, "Data path is switched %s %s\n",
"to" : "from", if_name(ifp)); 1008 out: 1009 HN_UNLOCK(sc); 1010 } 1011 1012 static void 1013 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1014 { 1015 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1016 return; 1017 1018 hn_set_vf(arg, ifp, event == IFNET_EVENT_UP); 1019 } 1020 1021 static void 1022 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1023 { 1024 hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP); 1025 } 1026 1027 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ 1028 static const struct hyperv_guid g_net_vsc_device_type = { 1029 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 1030 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} 1031 }; 1032 1033 static int 1034 hn_probe(device_t dev) 1035 { 1036 1037 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, 1038 &g_net_vsc_device_type) == 0) { 1039 device_set_desc(dev, "Hyper-V Network Interface"); 1040 return BUS_PROBE_DEFAULT; 1041 } 1042 return ENXIO; 1043 } 1044 1045 static int 1046 hn_attach(device_t dev) 1047 { 1048 struct hn_softc *sc = device_get_softc(dev); 1049 struct sysctl_oid_list *child; 1050 struct sysctl_ctx_list *ctx; 1051 uint8_t eaddr[ETHER_ADDR_LEN]; 1052 struct ifnet *ifp = NULL; 1053 int error, ring_cnt, tx_ring_cnt; 1054 1055 sc->hn_dev = dev; 1056 sc->hn_prichan = vmbus_get_channel(dev); 1057 HN_LOCK_INIT(sc); 1058 1059 /* 1060 * Initialize these tunables once. 1061 */ 1062 sc->hn_agg_size = hn_tx_agg_size; 1063 sc->hn_agg_pkts = hn_tx_agg_pkts; 1064 1065 /* 1066 * Setup taskqueue for transmission. 1067 */ 1068 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 1069 int i; 1070 1071 sc->hn_tx_taskqs = 1072 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 1073 M_DEVBUF, M_WAITOK); 1074 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 1075 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 1076 M_WAITOK, taskqueue_thread_enqueue, 1077 &sc->hn_tx_taskqs[i]); 1078 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 1079 "%s tx%d", device_get_nameunit(dev), i); 1080 } 1081 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 1082 sc->hn_tx_taskqs = hn_tx_taskque; 1083 } 1084 1085 /* 1086 * Setup taskqueue for mangement tasks, e.g. link status. 1087 */ 1088 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 1089 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 1090 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 1091 device_get_nameunit(dev)); 1092 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 1093 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 1094 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 1095 hn_netchg_status_taskfunc, sc); 1096 1097 /* 1098 * Allocate ifnet and setup its name earlier, so that if_printf 1099 * can be used by functions, which will be called after 1100 * ether_ifattach(). 1101 */ 1102 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 1103 ifp->if_softc = sc; 1104 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 1105 1106 /* 1107 * Initialize ifmedia earlier so that it can be unconditionally 1108 * destroyed, if error happened later on. 1109 */ 1110 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 1111 1112 /* 1113 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 1114 * to use (tx_ring_cnt). 1115 * 1116 * NOTE: 1117 * The # of RX rings to use is same as the # of channels to use. 
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuff after the synthetic parts are attached.
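	 * The per-ring checksum/TSO assist settings established here are
	 * what ifp->if_hwassist and the TX capabilities below are derived
	 * from.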
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default; they can still
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);

	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuff has to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destroyed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	tcp_lro_flush_all(&rxr->hn_lro);
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	return (pi->rm_data);
}

static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}

static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length;
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
1855 */ 1856 olen = pkt->rm_len; 1857 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 1858 agg_txd->chim_size += pkt->rm_len - olen; 1859 1860 /* Link this txdesc to the parent. */ 1861 hn_txdesc_agg(agg_txd, txd); 1862 1863 chim = (uint8_t *)pkt + pkt->rm_len; 1864 /* Save the current packet for later fixup. */ 1865 txr->hn_agg_prevpkt = chim; 1866 1867 txr->hn_agg_pktleft--; 1868 txr->hn_agg_szleft -= pktsize; 1869 if (txr->hn_agg_szleft <= 1870 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1871 /* 1872 * Probably can't aggregate more packets, 1873 * flush this aggregating txdesc proactively. 1874 */ 1875 txr->hn_agg_pktleft = 0; 1876 } 1877 /* Done! */ 1878 return (chim); 1879 } 1880 hn_flush_txagg(ifp, txr); 1881 } 1882 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 1883 1884 txr->hn_tx_chimney_tried++; 1885 txd->chim_index = hn_chim_alloc(txr->hn_sc); 1886 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 1887 return (NULL); 1888 txr->hn_tx_chimney++; 1889 1890 chim = txr->hn_sc->hn_chim + 1891 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 1892 1893 if (txr->hn_agg_pktmax > 1 && 1894 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1895 txr->hn_agg_txd = txd; 1896 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 1897 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 1898 txr->hn_agg_prevpkt = chim; 1899 } 1900 return (chim); 1901 } 1902 1903 /* 1904 * NOTE: 1905 * If this function fails, then both txd and m_head0 will be freed. 1906 */ 1907 static int 1908 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1909 struct mbuf **m_head0) 1910 { 1911 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 1912 int error, nsegs, i; 1913 struct mbuf *m_head = *m_head0; 1914 struct rndis_packet_msg *pkt; 1915 uint32_t *pi_data; 1916 void *chim = NULL; 1917 int pkt_hlen, pkt_size; 1918 1919 pkt = txd->rndis_pkt; 1920 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 1921 if (pkt_size < txr->hn_chim_size) { 1922 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 1923 if (chim != NULL) 1924 pkt = chim; 1925 } else { 1926 if (txr->hn_agg_txd != NULL) 1927 hn_flush_txagg(ifp, txr); 1928 } 1929 1930 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 1931 pkt->rm_len = m_head->m_pkthdr.len; 1932 pkt->rm_dataoffset = 0; 1933 pkt->rm_datalen = m_head->m_pkthdr.len; 1934 pkt->rm_oobdataoffset = 0; 1935 pkt->rm_oobdatalen = 0; 1936 pkt->rm_oobdataelements = 0; 1937 pkt->rm_pktinfooffset = sizeof(*pkt); 1938 pkt->rm_pktinfolen = 0; 1939 pkt->rm_vchandle = 0; 1940 pkt->rm_reserved = 0; 1941 1942 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 1943 /* 1944 * Set the hash value for this packet, so that the host could 1945 * dispatch the TX done event for this packet back to this TX 1946 * ring's channel. 
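 * The hash value written below is simply this TX ring's index.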
1947 */ 1948 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1949 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 1950 *pi_data = txr->hn_tx_idx; 1951 } 1952 1953 if (m_head->m_flags & M_VLANTAG) { 1954 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1955 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 1956 *pi_data = NDIS_VLAN_INFO_MAKE( 1957 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 1958 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 1959 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 1960 } 1961 1962 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 1963 #if defined(INET6) || defined(INET) 1964 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1965 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 1966 #ifdef INET 1967 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 1968 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, 1969 m_head->m_pkthdr.tso_segsz); 1970 } 1971 #endif 1972 #if defined(INET6) && defined(INET) 1973 else 1974 #endif 1975 #ifdef INET6 1976 { 1977 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, 1978 m_head->m_pkthdr.tso_segsz); 1979 } 1980 #endif 1981 #endif /* INET6 || INET */ 1982 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 1983 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1984 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 1985 if (m_head->m_pkthdr.csum_flags & 1986 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 1987 *pi_data = NDIS_TXCSUM_INFO_IPV6; 1988 } else { 1989 *pi_data = NDIS_TXCSUM_INFO_IPV4; 1990 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 1991 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 1992 } 1993 1994 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) 1995 *pi_data |= NDIS_TXCSUM_INFO_TCPCS; 1996 else if (m_head->m_pkthdr.csum_flags & 1997 (CSUM_IP_UDP | CSUM_IP6_UDP)) 1998 *pi_data |= NDIS_TXCSUM_INFO_UDPCS; 1999 } 2000 2001 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 2002 /* Fixup RNDIS packet message total length */ 2003 pkt->rm_len += pkt_hlen; 2004 /* Convert RNDIS packet message offsets */ 2005 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 2006 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 2007 2008 /* 2009 * Fast path: Chimney sending. 2010 */ 2011 if (chim != NULL) { 2012 struct hn_txdesc *tgt_txd = txd; 2013 2014 if (txr->hn_agg_txd != NULL) { 2015 tgt_txd = txr->hn_agg_txd; 2016 #ifdef INVARIANTS 2017 *m_head0 = NULL; 2018 #endif 2019 } 2020 2021 KASSERT(pkt == chim, 2022 ("RNDIS pkt not in chimney sending buffer")); 2023 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 2024 ("chimney sending buffer is not used")); 2025 tgt_txd->chim_size += pkt->rm_len; 2026 2027 m_copydata(m_head, 0, m_head->m_pkthdr.len, 2028 ((uint8_t *)chim) + pkt_hlen); 2029 2030 txr->hn_gpa_cnt = 0; 2031 txr->hn_sendpkt = hn_txpkt_chim; 2032 goto done; 2033 } 2034 2035 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 2036 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2037 ("chimney buffer is used")); 2038 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 2039 2040 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 2041 if (__predict_false(error)) { 2042 int freed; 2043 2044 /* 2045 * This mbuf is not linked w/ the txd yet, so free it now. 
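 * hn_txdesc_put() below then only recycles the descriptor itself.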
2046 */ 2047 m_freem(m_head); 2048 *m_head0 = NULL; 2049 2050 freed = hn_txdesc_put(txr, txd); 2051 KASSERT(freed != 0, 2052 ("fail to free txd upon txdma error")); 2053 2054 txr->hn_txdma_failed++; 2055 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 2056 return error; 2057 } 2058 *m_head0 = m_head; 2059 2060 /* +1 RNDIS packet message */ 2061 txr->hn_gpa_cnt = nsegs + 1; 2062 2063 /* send packet with page buffer */ 2064 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 2065 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 2066 txr->hn_gpa[0].gpa_len = pkt_hlen; 2067 2068 /* 2069 * Fill the page buffers with mbuf info after the page 2070 * buffer for RNDIS packet message. 2071 */ 2072 for (i = 0; i < nsegs; ++i) { 2073 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 2074 2075 gpa->gpa_page = atop(segs[i].ds_addr); 2076 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 2077 gpa->gpa_len = segs[i].ds_len; 2078 } 2079 2080 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2081 txd->chim_size = 0; 2082 txr->hn_sendpkt = hn_txpkt_sglist; 2083 done: 2084 txd->m = m_head; 2085 2086 /* Set the completion routine */ 2087 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 2088 2089 /* Update temporary stats for later use. */ 2090 txr->hn_stat_pkts++; 2091 txr->hn_stat_size += m_head->m_pkthdr.len; 2092 if (m_head->m_flags & M_MCAST) 2093 txr->hn_stat_mcasts++; 2094 2095 return 0; 2096 } 2097 2098 /* 2099 * NOTE: 2100 * If this function fails, then txd will be freed, but the mbuf 2101 * associated w/ the txd will _not_ be freed. 2102 */ 2103 static int 2104 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 2105 { 2106 int error, send_failed = 0, has_bpf; 2107 2108 again: 2109 has_bpf = bpf_peers_present(ifp->if_bpf); 2110 if (has_bpf) { 2111 /* 2112 * Make sure that this txd and any aggregated txds are not 2113 * freed before ETHER_BPF_MTAP. 2114 */ 2115 hn_txdesc_hold(txd); 2116 } 2117 error = txr->hn_sendpkt(txr, txd); 2118 if (!error) { 2119 if (has_bpf) { 2120 const struct hn_txdesc *tmp_txd; 2121 2122 ETHER_BPF_MTAP(ifp, txd->m); 2123 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 2124 ETHER_BPF_MTAP(ifp, tmp_txd->m); 2125 } 2126 2127 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 2128 #ifdef HN_IFSTART_SUPPORT 2129 if (!hn_use_if_start) 2130 #endif 2131 { 2132 if_inc_counter(ifp, IFCOUNTER_OBYTES, 2133 txr->hn_stat_size); 2134 if (txr->hn_stat_mcasts != 0) { 2135 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 2136 txr->hn_stat_mcasts); 2137 } 2138 } 2139 txr->hn_pkts += txr->hn_stat_pkts; 2140 txr->hn_sends++; 2141 } 2142 if (has_bpf) 2143 hn_txdesc_put(txr, txd); 2144 2145 if (__predict_false(error)) { 2146 int freed; 2147 2148 /* 2149 * This should "really rarely" happen. 2150 * 2151 * XXX Too many RX to be acked or too many sideband 2152 * commands to run? Ask netvsc_channel_rollup() 2153 * to kick start later. 2154 */ 2155 txr->hn_has_txeof = 1; 2156 if (!send_failed) { 2157 txr->hn_send_failed++; 2158 send_failed = 1; 2159 /* 2160 * Try sending again after set hn_has_txeof; 2161 * in case that we missed the last 2162 * netvsc_channel_rollup(). 2163 */ 2164 goto again; 2165 } 2166 if_printf(ifp, "send failed\n"); 2167 2168 /* 2169 * Caller will perform further processing on the 2170 * associated mbuf, so don't free it in hn_txdesc_put(); 2171 * only unload it from the DMA map in hn_txdesc_put(), 2172 * if it was loaded. 
2173 */ 2174 txd->m = NULL; 2175 freed = hn_txdesc_put(txr, txd); 2176 KASSERT(freed != 0, 2177 ("fail to free txd upon send error")); 2178 2179 txr->hn_send_failed++; 2180 } 2181 2182 /* Reset temporary stats, after this sending is done. */ 2183 txr->hn_stat_size = 0; 2184 txr->hn_stat_pkts = 0; 2185 txr->hn_stat_mcasts = 0; 2186 2187 return (error); 2188 } 2189 2190 /* 2191 * Append the specified data to the indicated mbuf chain, 2192 * Extend the mbuf chain if the new data does not fit in 2193 * existing space. 2194 * 2195 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2196 * There should be an equivalent in the kernel mbuf code, 2197 * but there does not appear to be one yet. 2198 * 2199 * Differs from m_append() in that additional mbufs are 2200 * allocated with cluster size MJUMPAGESIZE, and filled 2201 * accordingly. 2202 * 2203 * Return 1 if able to complete the job; otherwise 0. 2204 */ 2205 static int 2206 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2207 { 2208 struct mbuf *m, *n; 2209 int remainder, space; 2210 2211 for (m = m0; m->m_next != NULL; m = m->m_next) 2212 ; 2213 remainder = len; 2214 space = M_TRAILINGSPACE(m); 2215 if (space > 0) { 2216 /* 2217 * Copy into available space. 2218 */ 2219 if (space > remainder) 2220 space = remainder; 2221 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2222 m->m_len += space; 2223 cp += space; 2224 remainder -= space; 2225 } 2226 while (remainder > 0) { 2227 /* 2228 * Allocate a new mbuf; could check space 2229 * and allocate a cluster instead. 2230 */ 2231 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2232 if (n == NULL) 2233 break; 2234 n->m_len = min(MJUMPAGESIZE, remainder); 2235 bcopy(cp, mtod(n, caddr_t), n->m_len); 2236 cp += n->m_len; 2237 remainder -= n->m_len; 2238 m->m_next = n; 2239 m = n; 2240 } 2241 if (m0->m_flags & M_PKTHDR) 2242 m0->m_pkthdr.len += len - remainder; 2243 2244 return (remainder == 0); 2245 } 2246 2247 #if defined(INET) || defined(INET6) 2248 static __inline int 2249 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2250 { 2251 #if __FreeBSD_version >= 1100095 2252 if (hn_lro_mbufq_depth) { 2253 tcp_lro_queue_mbuf(lc, m); 2254 return 0; 2255 } 2256 #endif 2257 return tcp_lro_rx(lc, m, 0); 2258 } 2259 #endif 2260 2261 static int 2262 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2263 const struct hn_rxinfo *info) 2264 { 2265 struct ifnet *ifp; 2266 struct mbuf *m_new; 2267 int size, do_lro = 0, do_csum = 1; 2268 int hash_type; 2269 2270 /* If the VF is active, inject the packet through the VF */ 2271 ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp; 2272 2273 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 2274 /* 2275 * NOTE: 2276 * See the NOTE of hn_rndis_init_fixat(). This 2277 * function can be reached, immediately after the 2278 * RNDIS is initialized but before the ifnet is 2279 * setup on the hn_attach() path; drop the unexpected 2280 * packets. 2281 */ 2282 return (0); 2283 } 2284 2285 if (dlen <= MHLEN) { 2286 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2287 if (m_new == NULL) { 2288 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2289 return (0); 2290 } 2291 memcpy(mtod(m_new, void *), data, dlen); 2292 m_new->m_pkthdr.len = m_new->m_len = dlen; 2293 rxr->hn_small_pkts++; 2294 } else { 2295 /* 2296 * Get an mbuf with a cluster. For packets 2K or less, 2297 * get a standard 2K cluster. For anything larger, get a 2298 * 4K cluster. Any buffers larger than 4K can cause problems 2299 * if looped around to the Hyper-V TX channel, so avoid them. 
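 * (MCLBYTES is a 2K cluster; MJUMPAGESIZE is a page-sized cluster,
 * typically 4K.)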
2300 */ 2301 size = MCLBYTES; 2302 if (dlen > MCLBYTES) { 2303 /* 4096 */ 2304 size = MJUMPAGESIZE; 2305 } 2306 2307 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2308 if (m_new == NULL) { 2309 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2310 return (0); 2311 } 2312 2313 hv_m_append(m_new, dlen, data); 2314 } 2315 m_new->m_pkthdr.rcvif = ifp; 2316 2317 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2318 do_csum = 0; 2319 2320 /* receive side checksum offload */ 2321 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2322 /* IP csum offload */ 2323 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2324 m_new->m_pkthdr.csum_flags |= 2325 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2326 rxr->hn_csum_ip++; 2327 } 2328 2329 /* TCP/UDP csum offload */ 2330 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2331 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2332 m_new->m_pkthdr.csum_flags |= 2333 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2334 m_new->m_pkthdr.csum_data = 0xffff; 2335 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2336 rxr->hn_csum_tcp++; 2337 else 2338 rxr->hn_csum_udp++; 2339 } 2340 2341 /* 2342 * XXX 2343 * As of this write (Oct 28th, 2016), host side will turn 2344 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2345 * the do_lro setting here is actually _not_ accurate. We 2346 * depend on the RSS hash type check to reset do_lro. 2347 */ 2348 if ((info->csum_info & 2349 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2350 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2351 do_lro = 1; 2352 } else { 2353 const struct ether_header *eh; 2354 uint16_t etype; 2355 int hoff; 2356 2357 hoff = sizeof(*eh); 2358 if (m_new->m_len < hoff) 2359 goto skip; 2360 eh = mtod(m_new, struct ether_header *); 2361 etype = ntohs(eh->ether_type); 2362 if (etype == ETHERTYPE_VLAN) { 2363 const struct ether_vlan_header *evl; 2364 2365 hoff = sizeof(*evl); 2366 if (m_new->m_len < hoff) 2367 goto skip; 2368 evl = mtod(m_new, struct ether_vlan_header *); 2369 etype = ntohs(evl->evl_proto); 2370 } 2371 2372 if (etype == ETHERTYPE_IP) { 2373 int pr; 2374 2375 pr = hn_check_iplen(m_new, hoff); 2376 if (pr == IPPROTO_TCP) { 2377 if (do_csum && 2378 (rxr->hn_trust_hcsum & 2379 HN_TRUST_HCSUM_TCP)) { 2380 rxr->hn_csum_trusted++; 2381 m_new->m_pkthdr.csum_flags |= 2382 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2383 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2384 m_new->m_pkthdr.csum_data = 0xffff; 2385 } 2386 do_lro = 1; 2387 } else if (pr == IPPROTO_UDP) { 2388 if (do_csum && 2389 (rxr->hn_trust_hcsum & 2390 HN_TRUST_HCSUM_UDP)) { 2391 rxr->hn_csum_trusted++; 2392 m_new->m_pkthdr.csum_flags |= 2393 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2394 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2395 m_new->m_pkthdr.csum_data = 0xffff; 2396 } 2397 } else if (pr != IPPROTO_DONE && do_csum && 2398 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2399 rxr->hn_csum_trusted++; 2400 m_new->m_pkthdr.csum_flags |= 2401 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2402 } 2403 } 2404 } 2405 skip: 2406 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2407 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2408 NDIS_VLAN_INFO_ID(info->vlan_info), 2409 NDIS_VLAN_INFO_PRI(info->vlan_info), 2410 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2411 m_new->m_flags |= M_VLANTAG; 2412 } 2413 2414 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2415 rxr->hn_rss_pkts++; 2416 m_new->m_pkthdr.flowid = info->hash_value; 2417 hash_type = M_HASHTYPE_OPAQUE_HASH; 2418 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2419 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2420 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2421 2422 /* 2423 * NOTE: 2424 * do_lro is reset if the hash types are not TCP 2425 * related. See the comment in the above csum_flags 2426 * setup section. 2427 */ 2428 switch (type) { 2429 case NDIS_HASH_IPV4: 2430 hash_type = M_HASHTYPE_RSS_IPV4; 2431 do_lro = 0; 2432 break; 2433 2434 case NDIS_HASH_TCP_IPV4: 2435 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2436 break; 2437 2438 case NDIS_HASH_IPV6: 2439 hash_type = M_HASHTYPE_RSS_IPV6; 2440 do_lro = 0; 2441 break; 2442 2443 case NDIS_HASH_IPV6_EX: 2444 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2445 do_lro = 0; 2446 break; 2447 2448 case NDIS_HASH_TCP_IPV6: 2449 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2450 break; 2451 2452 case NDIS_HASH_TCP_IPV6_EX: 2453 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2454 break; 2455 } 2456 } 2457 } else { 2458 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2459 hash_type = M_HASHTYPE_OPAQUE; 2460 } 2461 M_HASHTYPE_SET(m_new, hash_type); 2462 2463 /* 2464 * Note: Moved RX completion back to hv_nv_on_receive() so all 2465 * messages (not just data messages) will trigger a response. 2466 */ 2467 2468 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2469 rxr->hn_pkts++; 2470 2471 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2472 #if defined(INET) || defined(INET6) 2473 struct lro_ctrl *lro = &rxr->hn_lro; 2474 2475 if (lro->lro_cnt) { 2476 rxr->hn_lro_tried++; 2477 if (hn_lro_rx(lro, m_new) == 0) { 2478 /* DONE! */ 2479 return 0; 2480 } 2481 } 2482 #endif 2483 } 2484 2485 /* We're not holding the lock here, so don't release it */ 2486 (*ifp->if_input)(ifp, m_new); 2487 2488 return (0); 2489 } 2490 2491 static int 2492 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2493 { 2494 struct hn_softc *sc = ifp->if_softc; 2495 struct ifreq *ifr = (struct ifreq *)data; 2496 int mask, error = 0; 2497 2498 switch (cmd) { 2499 case SIOCSIFMTU: 2500 if (ifr->ifr_mtu > HN_MTU_MAX) { 2501 error = EINVAL; 2502 break; 2503 } 2504 2505 HN_LOCK(sc); 2506 2507 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2508 HN_UNLOCK(sc); 2509 break; 2510 } 2511 2512 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2513 /* Can't change MTU */ 2514 HN_UNLOCK(sc); 2515 error = EOPNOTSUPP; 2516 break; 2517 } 2518 2519 if (ifp->if_mtu == ifr->ifr_mtu) { 2520 HN_UNLOCK(sc); 2521 break; 2522 } 2523 2524 /* 2525 * Suspend this interface before the synthetic parts 2526 * are ripped. 2527 */ 2528 hn_suspend(sc); 2529 2530 /* 2531 * Detach the synthetic parts, i.e. NVS and RNDIS. 2532 */ 2533 hn_synth_detach(sc); 2534 2535 /* 2536 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2537 * with the new MTU setting. 2538 */ 2539 error = hn_synth_attach(sc, ifr->ifr_mtu); 2540 if (error) { 2541 HN_UNLOCK(sc); 2542 break; 2543 } 2544 2545 /* 2546 * Commit the requested MTU, after the synthetic parts 2547 * have been successfully attached. 2548 */ 2549 ifp->if_mtu = ifr->ifr_mtu; 2550 2551 /* 2552 * Make sure that various parameters based on MTU are 2553 * still valid, after the MTU change. 2554 */ 2555 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2556 hn_set_chim_size(sc, sc->hn_chim_szmax); 2557 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2558 #if __FreeBSD_version >= 1100099 2559 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2560 HN_LRO_LENLIM_MIN(ifp)) 2561 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2562 #endif 2563 2564 /* 2565 * All done! Resume the interface now.
2566 */ 2567 hn_resume(sc); 2568 2569 HN_UNLOCK(sc); 2570 break; 2571 2572 case SIOCSIFFLAGS: 2573 HN_LOCK(sc); 2574 2575 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2576 HN_UNLOCK(sc); 2577 break; 2578 } 2579 2580 if (ifp->if_flags & IFF_UP) { 2581 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2582 /* 2583 * Caller meight hold mutex, e.g. 2584 * bpf; use busy-wait for the RNDIS 2585 * reply. 2586 */ 2587 HN_NO_SLEEPING(sc); 2588 hn_rxfilter_config(sc); 2589 HN_SLEEPING_OK(sc); 2590 } else { 2591 hn_init_locked(sc); 2592 } 2593 } else { 2594 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2595 hn_stop(sc, false); 2596 } 2597 sc->hn_if_flags = ifp->if_flags; 2598 2599 HN_UNLOCK(sc); 2600 break; 2601 2602 case SIOCSIFCAP: 2603 HN_LOCK(sc); 2604 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2605 2606 if (mask & IFCAP_TXCSUM) { 2607 ifp->if_capenable ^= IFCAP_TXCSUM; 2608 if (ifp->if_capenable & IFCAP_TXCSUM) 2609 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2610 else 2611 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2612 } 2613 if (mask & IFCAP_TXCSUM_IPV6) { 2614 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2615 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2616 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2617 else 2618 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2619 } 2620 2621 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2622 if (mask & IFCAP_RXCSUM) 2623 ifp->if_capenable ^= IFCAP_RXCSUM; 2624 #ifdef foo 2625 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2626 if (mask & IFCAP_RXCSUM_IPV6) 2627 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2628 #endif 2629 2630 if (mask & IFCAP_LRO) 2631 ifp->if_capenable ^= IFCAP_LRO; 2632 2633 if (mask & IFCAP_TSO4) { 2634 ifp->if_capenable ^= IFCAP_TSO4; 2635 if (ifp->if_capenable & IFCAP_TSO4) 2636 ifp->if_hwassist |= CSUM_IP_TSO; 2637 else 2638 ifp->if_hwassist &= ~CSUM_IP_TSO; 2639 } 2640 if (mask & IFCAP_TSO6) { 2641 ifp->if_capenable ^= IFCAP_TSO6; 2642 if (ifp->if_capenable & IFCAP_TSO6) 2643 ifp->if_hwassist |= CSUM_IP6_TSO; 2644 else 2645 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2646 } 2647 2648 HN_UNLOCK(sc); 2649 break; 2650 2651 case SIOCADDMULTI: 2652 case SIOCDELMULTI: 2653 HN_LOCK(sc); 2654 2655 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2656 HN_UNLOCK(sc); 2657 break; 2658 } 2659 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2660 /* 2661 * Multicast uses mutex; use busy-wait for 2662 * the RNDIS reply. 2663 */ 2664 HN_NO_SLEEPING(sc); 2665 hn_rxfilter_config(sc); 2666 HN_SLEEPING_OK(sc); 2667 } 2668 2669 HN_UNLOCK(sc); 2670 break; 2671 2672 case SIOCSIFMEDIA: 2673 case SIOCGIFMEDIA: 2674 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2675 break; 2676 2677 default: 2678 error = ether_ioctl(ifp, cmd, data); 2679 break; 2680 } 2681 return (error); 2682 } 2683 2684 static void 2685 hn_stop(struct hn_softc *sc, bool detaching) 2686 { 2687 struct ifnet *ifp = sc->hn_ifp; 2688 int i; 2689 2690 HN_LOCK_ASSERT(sc); 2691 2692 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2693 ("synthetic parts were not attached")); 2694 2695 /* Disable polling. */ 2696 hn_polling(sc, 0); 2697 2698 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2699 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2700 hn_suspend_data(sc); 2701 2702 /* Clear OACTIVE bit. */ 2703 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2704 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2705 sc->hn_tx_ring[i].hn_oactive = 0; 2706 2707 /* 2708 * If the VF is active, make sure the filter is not 0, even if 2709 * the synthetic NIC is down. 
2710 */ 2711 if (!detaching && (sc->hn_flags & HN_FLAG_VF)) 2712 hn_rxfilter_config(sc); 2713 } 2714 2715 static void 2716 hn_init_locked(struct hn_softc *sc) 2717 { 2718 struct ifnet *ifp = sc->hn_ifp; 2719 int i; 2720 2721 HN_LOCK_ASSERT(sc); 2722 2723 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2724 return; 2725 2726 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2727 return; 2728 2729 /* Configure RX filter */ 2730 hn_rxfilter_config(sc); 2731 2732 /* Clear OACTIVE bit. */ 2733 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2734 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2735 sc->hn_tx_ring[i].hn_oactive = 0; 2736 2737 /* Clear TX 'suspended' bit. */ 2738 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2739 2740 /* Everything is ready; unleash! */ 2741 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2742 2743 /* Re-enable polling if requested. */ 2744 if (sc->hn_pollhz > 0) 2745 hn_polling(sc, sc->hn_pollhz); 2746 } 2747 2748 static void 2749 hn_init(void *xsc) 2750 { 2751 struct hn_softc *sc = xsc; 2752 2753 HN_LOCK(sc); 2754 hn_init_locked(sc); 2755 HN_UNLOCK(sc); 2756 } 2757 2758 #if __FreeBSD_version >= 1100099 2759 2760 static int 2761 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2762 { 2763 struct hn_softc *sc = arg1; 2764 unsigned int lenlim; 2765 int error; 2766 2767 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2768 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2769 if (error || req->newptr == NULL) 2770 return error; 2771 2772 HN_LOCK(sc); 2773 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2774 lenlim > TCP_LRO_LENGTH_MAX) { 2775 HN_UNLOCK(sc); 2776 return EINVAL; 2777 } 2778 hn_set_lro_lenlim(sc, lenlim); 2779 HN_UNLOCK(sc); 2780 2781 return 0; 2782 } 2783 2784 static int 2785 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2786 { 2787 struct hn_softc *sc = arg1; 2788 int ackcnt, error, i; 2789 2790 /* 2791 * lro_ackcnt_lim is append count limit, 2792 * +1 to turn it into aggregation limit. 2793 */ 2794 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2795 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2796 if (error || req->newptr == NULL) 2797 return error; 2798 2799 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2800 return EINVAL; 2801 2802 /* 2803 * Convert aggregation limit back to append 2804 * count limit. 
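 * e.g. a requested aggregation limit of 2 becomes an append count
 * limit of 1.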
2805 */ 2806 --ackcnt; 2807 HN_LOCK(sc); 2808 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2809 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2810 HN_UNLOCK(sc); 2811 return 0; 2812 } 2813 2814 #endif 2815 2816 static int 2817 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2818 { 2819 struct hn_softc *sc = arg1; 2820 int hcsum = arg2; 2821 int on, error, i; 2822 2823 on = 0; 2824 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2825 on = 1; 2826 2827 error = sysctl_handle_int(oidp, &on, 0, req); 2828 if (error || req->newptr == NULL) 2829 return error; 2830 2831 HN_LOCK(sc); 2832 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2833 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2834 2835 if (on) 2836 rxr->hn_trust_hcsum |= hcsum; 2837 else 2838 rxr->hn_trust_hcsum &= ~hcsum; 2839 } 2840 HN_UNLOCK(sc); 2841 return 0; 2842 } 2843 2844 static int 2845 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2846 { 2847 struct hn_softc *sc = arg1; 2848 int chim_size, error; 2849 2850 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2851 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2852 if (error || req->newptr == NULL) 2853 return error; 2854 2855 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2856 return EINVAL; 2857 2858 HN_LOCK(sc); 2859 hn_set_chim_size(sc, chim_size); 2860 HN_UNLOCK(sc); 2861 return 0; 2862 } 2863 2864 #if __FreeBSD_version < 1100095 2865 static int 2866 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2867 { 2868 struct hn_softc *sc = arg1; 2869 int ofs = arg2, i, error; 2870 struct hn_rx_ring *rxr; 2871 uint64_t stat; 2872 2873 stat = 0; 2874 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2875 rxr = &sc->hn_rx_ring[i]; 2876 stat += *((int *)((uint8_t *)rxr + ofs)); 2877 } 2878 2879 error = sysctl_handle_64(oidp, &stat, 0, req); 2880 if (error || req->newptr == NULL) 2881 return error; 2882 2883 /* Zero out this stat. */ 2884 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2885 rxr = &sc->hn_rx_ring[i]; 2886 *((int *)((uint8_t *)rxr + ofs)) = 0; 2887 } 2888 return 0; 2889 } 2890 #else 2891 static int 2892 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2893 { 2894 struct hn_softc *sc = arg1; 2895 int ofs = arg2, i, error; 2896 struct hn_rx_ring *rxr; 2897 uint64_t stat; 2898 2899 stat = 0; 2900 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2901 rxr = &sc->hn_rx_ring[i]; 2902 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2903 } 2904 2905 error = sysctl_handle_64(oidp, &stat, 0, req); 2906 if (error || req->newptr == NULL) 2907 return error; 2908 2909 /* Zero out this stat. */ 2910 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2911 rxr = &sc->hn_rx_ring[i]; 2912 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2913 } 2914 return 0; 2915 } 2916 2917 #endif 2918 2919 static int 2920 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2921 { 2922 struct hn_softc *sc = arg1; 2923 int ofs = arg2, i, error; 2924 struct hn_rx_ring *rxr; 2925 u_long stat; 2926 2927 stat = 0; 2928 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2929 rxr = &sc->hn_rx_ring[i]; 2930 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2931 } 2932 2933 error = sysctl_handle_long(oidp, &stat, 0, req); 2934 if (error || req->newptr == NULL) 2935 return error; 2936 2937 /* Zero out this stat. 
*/ 2938 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2939 rxr = &sc->hn_rx_ring[i]; 2940 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2941 } 2942 return 0; 2943 } 2944 2945 static int 2946 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2947 { 2948 struct hn_softc *sc = arg1; 2949 int ofs = arg2, i, error; 2950 struct hn_tx_ring *txr; 2951 u_long stat; 2952 2953 stat = 0; 2954 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2955 txr = &sc->hn_tx_ring[i]; 2956 stat += *((u_long *)((uint8_t *)txr + ofs)); 2957 } 2958 2959 error = sysctl_handle_long(oidp, &stat, 0, req); 2960 if (error || req->newptr == NULL) 2961 return error; 2962 2963 /* Zero out this stat. */ 2964 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2965 txr = &sc->hn_tx_ring[i]; 2966 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2967 } 2968 return 0; 2969 } 2970 2971 static int 2972 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2973 { 2974 struct hn_softc *sc = arg1; 2975 int ofs = arg2, i, error, conf; 2976 struct hn_tx_ring *txr; 2977 2978 txr = &sc->hn_tx_ring[0]; 2979 conf = *((int *)((uint8_t *)txr + ofs)); 2980 2981 error = sysctl_handle_int(oidp, &conf, 0, req); 2982 if (error || req->newptr == NULL) 2983 return error; 2984 2985 HN_LOCK(sc); 2986 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2987 txr = &sc->hn_tx_ring[i]; 2988 *((int *)((uint8_t *)txr + ofs)) = conf; 2989 } 2990 HN_UNLOCK(sc); 2991 2992 return 0; 2993 } 2994 2995 static int 2996 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2997 { 2998 struct hn_softc *sc = arg1; 2999 int error, size; 3000 3001 size = sc->hn_agg_size; 3002 error = sysctl_handle_int(oidp, &size, 0, req); 3003 if (error || req->newptr == NULL) 3004 return (error); 3005 3006 HN_LOCK(sc); 3007 sc->hn_agg_size = size; 3008 hn_set_txagg(sc); 3009 HN_UNLOCK(sc); 3010 3011 return (0); 3012 } 3013 3014 static int 3015 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 3016 { 3017 struct hn_softc *sc = arg1; 3018 int error, pkts; 3019 3020 pkts = sc->hn_agg_pkts; 3021 error = sysctl_handle_int(oidp, &pkts, 0, req); 3022 if (error || req->newptr == NULL) 3023 return (error); 3024 3025 HN_LOCK(sc); 3026 sc->hn_agg_pkts = pkts; 3027 hn_set_txagg(sc); 3028 HN_UNLOCK(sc); 3029 3030 return (0); 3031 } 3032 3033 static int 3034 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 3035 { 3036 struct hn_softc *sc = arg1; 3037 int pkts; 3038 3039 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 3040 return (sysctl_handle_int(oidp, &pkts, 0, req)); 3041 } 3042 3043 static int 3044 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 3045 { 3046 struct hn_softc *sc = arg1; 3047 int align; 3048 3049 align = sc->hn_tx_ring[0].hn_agg_align; 3050 return (sysctl_handle_int(oidp, &align, 0, req)); 3051 } 3052 3053 static void 3054 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 3055 { 3056 if (pollhz == 0) 3057 vmbus_chan_poll_disable(chan); 3058 else 3059 vmbus_chan_poll_enable(chan, pollhz); 3060 } 3061 3062 static void 3063 hn_polling(struct hn_softc *sc, u_int pollhz) 3064 { 3065 int nsubch = sc->hn_rx_ring_inuse - 1; 3066 3067 HN_LOCK_ASSERT(sc); 3068 3069 if (nsubch > 0) { 3070 struct vmbus_channel **subch; 3071 int i; 3072 3073 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 3074 for (i = 0; i < nsubch; ++i) 3075 hn_chan_polling(subch[i], pollhz); 3076 vmbus_subchan_rel(subch, nsubch); 3077 } 3078 hn_chan_polling(sc->hn_prichan, pollhz); 3079 } 3080 3081 static int 3082 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 3083 { 3084 struct hn_softc *sc = arg1; 3085 int pollhz, error; 3086 3087 pollhz = sc->hn_pollhz; 3088 error = sysctl_handle_int(oidp, &pollhz, 0, req); 3089 
if (error || req->newptr == NULL) 3090 return (error); 3091 3092 if (pollhz != 0 && 3093 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 3094 return (EINVAL); 3095 3096 HN_LOCK(sc); 3097 if (sc->hn_pollhz != pollhz) { 3098 sc->hn_pollhz = pollhz; 3099 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 3100 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 3101 hn_polling(sc, sc->hn_pollhz); 3102 } 3103 HN_UNLOCK(sc); 3104 3105 return (0); 3106 } 3107 3108 static int 3109 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 3110 { 3111 struct hn_softc *sc = arg1; 3112 char verstr[16]; 3113 3114 snprintf(verstr, sizeof(verstr), "%u.%u", 3115 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 3116 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 3117 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 3118 } 3119 3120 static int 3121 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 3122 { 3123 struct hn_softc *sc = arg1; 3124 char caps_str[128]; 3125 uint32_t caps; 3126 3127 HN_LOCK(sc); 3128 caps = sc->hn_caps; 3129 HN_UNLOCK(sc); 3130 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 3131 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 3132 } 3133 3134 static int 3135 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 3136 { 3137 struct hn_softc *sc = arg1; 3138 char assist_str[128]; 3139 uint32_t hwassist; 3140 3141 HN_LOCK(sc); 3142 hwassist = sc->hn_ifp->if_hwassist; 3143 HN_UNLOCK(sc); 3144 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 3145 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 3146 } 3147 3148 static int 3149 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 3150 { 3151 struct hn_softc *sc = arg1; 3152 char filter_str[128]; 3153 uint32_t filter; 3154 3155 HN_LOCK(sc); 3156 filter = sc->hn_rx_filter; 3157 HN_UNLOCK(sc); 3158 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3159 NDIS_PACKET_TYPES); 3160 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3161 } 3162 3163 #ifndef RSS 3164 3165 static int 3166 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3167 { 3168 struct hn_softc *sc = arg1; 3169 int error; 3170 3171 HN_LOCK(sc); 3172 3173 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3174 if (error || req->newptr == NULL) 3175 goto back; 3176 3177 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3178 if (error) 3179 goto back; 3180 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 3181 3182 if (sc->hn_rx_ring_inuse > 1) { 3183 error = hn_rss_reconfig(sc); 3184 } else { 3185 /* Not RSS capable, at least for now; just save the RSS key. */ 3186 error = 0; 3187 } 3188 back: 3189 HN_UNLOCK(sc); 3190 return (error); 3191 } 3192 3193 static int 3194 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 3195 { 3196 struct hn_softc *sc = arg1; 3197 int error; 3198 3199 HN_LOCK(sc); 3200 3201 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3202 if (error || req->newptr == NULL) 3203 goto back; 3204 3205 /* 3206 * Don't allow RSS indirect table change, if this interface is not 3207 * RSS capable currently. 
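 * (i.e. when only a single RX ring is currently in use)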
3208 */ 3209 if (sc->hn_rx_ring_inuse == 1) { 3210 error = EOPNOTSUPP; 3211 goto back; 3212 } 3213 3214 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3215 if (error) 3216 goto back; 3217 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 3218 3219 hn_rss_ind_fixup(sc); 3220 error = hn_rss_reconfig(sc); 3221 back: 3222 HN_UNLOCK(sc); 3223 return (error); 3224 } 3225 3226 #endif /* !RSS */ 3227 3228 static int 3229 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 3230 { 3231 struct hn_softc *sc = arg1; 3232 char hash_str[128]; 3233 uint32_t hash; 3234 3235 HN_LOCK(sc); 3236 hash = sc->hn_rss_hash; 3237 HN_UNLOCK(sc); 3238 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 3239 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 3240 } 3241 3242 static int 3243 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 3244 { 3245 struct hn_softc *sc = arg1; 3246 char vf_name[128]; 3247 struct ifnet *vf; 3248 3249 HN_LOCK(sc); 3250 vf_name[0] = '\0'; 3251 vf = sc->hn_rx_ring[0].hn_vf; 3252 if (vf != NULL) 3253 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf)); 3254 HN_UNLOCK(sc); 3255 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 3256 } 3257 3258 static int 3259 hn_check_iplen(const struct mbuf *m, int hoff) 3260 { 3261 const struct ip *ip; 3262 int len, iphlen, iplen; 3263 const struct tcphdr *th; 3264 int thoff; /* TCP data offset */ 3265 3266 len = hoff + sizeof(struct ip); 3267 3268 /* The packet must be at least the size of an IP header. */ 3269 if (m->m_pkthdr.len < len) 3270 return IPPROTO_DONE; 3271 3272 /* The fixed IP header must reside completely in the first mbuf. */ 3273 if (m->m_len < len) 3274 return IPPROTO_DONE; 3275 3276 ip = mtodo(m, hoff); 3277 3278 /* Bounds-check the packet's stated IP header length. */ 3279 iphlen = ip->ip_hl << 2; 3280 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3281 return IPPROTO_DONE; 3282 3283 /* The full IP header must reside completely in the one mbuf. */ 3284 if (m->m_len < hoff + iphlen) 3285 return IPPROTO_DONE; 3286 3287 iplen = ntohs(ip->ip_len); 3288 3289 /* 3290 * Check that the amount of data in the buffers is at 3291 * least as much as the IP header would have us expect. 3292 */ 3293 if (m->m_pkthdr.len < hoff + iplen) 3294 return IPPROTO_DONE; 3295 3296 /* 3297 * Ignore IP fragments. 3298 */ 3299 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3300 return IPPROTO_DONE; 3301 3302 /* 3303 * The TCP/IP or UDP/IP header must be entirely contained within 3304 * the first fragment of a packet.
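 * For TCP this includes any TCP options, as indicated by th_off.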
3305 */ 3306 switch (ip->ip_p) { 3307 case IPPROTO_TCP: 3308 if (iplen < iphlen + sizeof(struct tcphdr)) 3309 return IPPROTO_DONE; 3310 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3311 return IPPROTO_DONE; 3312 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3313 thoff = th->th_off << 2; 3314 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3315 return IPPROTO_DONE; 3316 if (m->m_len < hoff + iphlen + thoff) 3317 return IPPROTO_DONE; 3318 break; 3319 case IPPROTO_UDP: 3320 if (iplen < iphlen + sizeof(struct udphdr)) 3321 return IPPROTO_DONE; 3322 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3323 return IPPROTO_DONE; 3324 break; 3325 default: 3326 if (iplen < iphlen) 3327 return IPPROTO_DONE; 3328 break; 3329 } 3330 return ip->ip_p; 3331 } 3332 3333 static int 3334 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3335 { 3336 struct sysctl_oid_list *child; 3337 struct sysctl_ctx_list *ctx; 3338 device_t dev = sc->hn_dev; 3339 #if defined(INET) || defined(INET6) 3340 #if __FreeBSD_version >= 1100095 3341 int lroent_cnt; 3342 #endif 3343 #endif 3344 int i; 3345 3346 /* 3347 * Create RXBUF for reception. 3348 * 3349 * NOTE: 3350 * - It is shared by all channels. 3351 * - A large enough buffer is allocated, certain version of NVSes 3352 * may further limit the usable space. 3353 */ 3354 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3355 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3356 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3357 if (sc->hn_rxbuf == NULL) { 3358 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3359 return (ENOMEM); 3360 } 3361 3362 sc->hn_rx_ring_cnt = ring_cnt; 3363 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3364 3365 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3366 M_DEVBUF, M_WAITOK | M_ZERO); 3367 3368 #if defined(INET) || defined(INET6) 3369 #if __FreeBSD_version >= 1100095 3370 lroent_cnt = hn_lro_entry_count; 3371 if (lroent_cnt < TCP_LRO_ENTRIES) 3372 lroent_cnt = TCP_LRO_ENTRIES; 3373 if (bootverbose) 3374 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3375 #endif 3376 #endif /* INET || INET6 */ 3377 3378 ctx = device_get_sysctl_ctx(dev); 3379 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3380 3381 /* Create dev.hn.UNIT.rx sysctl tree */ 3382 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3383 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3384 3385 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3386 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3387 3388 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3389 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3390 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3391 if (rxr->hn_br == NULL) { 3392 device_printf(dev, "allocate bufring failed\n"); 3393 return (ENOMEM); 3394 } 3395 3396 if (hn_trust_hosttcp) 3397 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3398 if (hn_trust_hostudp) 3399 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3400 if (hn_trust_hostip) 3401 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3402 rxr->hn_ifp = sc->hn_ifp; 3403 if (i < sc->hn_tx_ring_cnt) 3404 rxr->hn_txr = &sc->hn_tx_ring[i]; 3405 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3406 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3407 rxr->hn_rx_idx = i; 3408 rxr->hn_rxbuf = sc->hn_rxbuf; 3409 3410 /* 3411 * Initialize LRO. 
3412 */ 3413 #if defined(INET) || defined(INET6) 3414 #if __FreeBSD_version >= 1100095 3415 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3416 hn_lro_mbufq_depth); 3417 #else 3418 tcp_lro_init(&rxr->hn_lro); 3419 rxr->hn_lro.ifp = sc->hn_ifp; 3420 #endif 3421 #if __FreeBSD_version >= 1100099 3422 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3423 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3424 #endif 3425 #endif /* INET || INET6 */ 3426 3427 if (sc->hn_rx_sysctl_tree != NULL) { 3428 char name[16]; 3429 3430 /* 3431 * Create per RX ring sysctl tree: 3432 * dev.hn.UNIT.rx.RINGID 3433 */ 3434 snprintf(name, sizeof(name), "%d", i); 3435 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3436 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3437 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3438 3439 if (rxr->hn_rx_sysctl_tree != NULL) { 3440 SYSCTL_ADD_ULONG(ctx, 3441 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3442 OID_AUTO, "packets", CTLFLAG_RW, 3443 &rxr->hn_pkts, "# of packets received"); 3444 SYSCTL_ADD_ULONG(ctx, 3445 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3446 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3447 &rxr->hn_rss_pkts, 3448 "# of packets w/ RSS info received"); 3449 SYSCTL_ADD_INT(ctx, 3450 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3451 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3452 &rxr->hn_pktbuf_len, 0, 3453 "Temporary channel packet buffer length"); 3454 } 3455 } 3456 } 3457 3458 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3459 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3460 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3461 #if __FreeBSD_version < 1100095 3462 hn_rx_stat_int_sysctl, 3463 #else 3464 hn_rx_stat_u64_sysctl, 3465 #endif 3466 "LU", "LRO queued"); 3467 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3468 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3469 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3470 #if __FreeBSD_version < 1100095 3471 hn_rx_stat_int_sysctl, 3472 #else 3473 hn_rx_stat_u64_sysctl, 3474 #endif 3475 "LU", "LRO flushed"); 3476 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3477 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3478 __offsetof(struct hn_rx_ring, hn_lro_tried), 3479 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3480 #if __FreeBSD_version >= 1100099 3481 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3482 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3483 hn_lro_lenlim_sysctl, "IU", 3484 "Max # of data bytes to be aggregated by LRO"); 3485 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3486 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3487 hn_lro_ackcnt_sysctl, "I", 3488 "Max # of ACKs to be aggregated by LRO"); 3489 #endif 3490 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3491 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3492 hn_trust_hcsum_sysctl, "I", 3493 "Trust tcp segment verification on host side, " 3494 "when csum info is missing"); 3495 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3496 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3497 hn_trust_hcsum_sysctl, "I", 3498 "Trust udp datagram verification on host side, " 3499 "when csum info is missing"); 3500 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3501 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3502 hn_trust_hcsum_sysctl, "I", 3503 "Trust ip packet verification on host side, " 3504 "when csum info is missing"); 3505 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3506 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3507
__offsetof(struct hn_rx_ring, hn_csum_ip), 3508 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3509 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3510 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3511 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3512 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3513 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3514 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3515 __offsetof(struct hn_rx_ring, hn_csum_udp), 3516 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3517 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3518 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3519 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3520 hn_rx_stat_ulong_sysctl, "LU", 3521 "# of packets that we trust host's csum verification"); 3522 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3523 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3524 __offsetof(struct hn_rx_ring, hn_small_pkts), 3525 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3526 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3527 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3528 __offsetof(struct hn_rx_ring, hn_ack_failed), 3529 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3530 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3531 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3532 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3533 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3534 3535 return (0); 3536 } 3537 3538 static void 3539 hn_destroy_rx_data(struct hn_softc *sc) 3540 { 3541 int i; 3542 3543 if (sc->hn_rxbuf != NULL) { 3544 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3545 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3546 else 3547 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3548 sc->hn_rxbuf = NULL; 3549 } 3550 3551 if (sc->hn_rx_ring_cnt == 0) 3552 return; 3553 3554 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3555 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3556 3557 if (rxr->hn_br == NULL) 3558 continue; 3559 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3560 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3561 } else { 3562 device_printf(sc->hn_dev, 3563 "%dth channel bufring is referenced", i); 3564 } 3565 rxr->hn_br = NULL; 3566 3567 #if defined(INET) || defined(INET6) 3568 tcp_lro_free(&rxr->hn_lro); 3569 #endif 3570 free(rxr->hn_pktbuf, M_DEVBUF); 3571 } 3572 free(sc->hn_rx_ring, M_DEVBUF); 3573 sc->hn_rx_ring = NULL; 3574 3575 sc->hn_rx_ring_cnt = 0; 3576 sc->hn_rx_ring_inuse = 0; 3577 } 3578 3579 static int 3580 hn_tx_ring_create(struct hn_softc *sc, int id) 3581 { 3582 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3583 device_t dev = sc->hn_dev; 3584 bus_dma_tag_t parent_dtag; 3585 int error, i; 3586 3587 txr->hn_sc = sc; 3588 txr->hn_tx_idx = id; 3589 3590 #ifndef HN_USE_TXDESC_BUFRING 3591 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3592 #endif 3593 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3594 3595 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3596 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3597 M_DEVBUF, M_WAITOK | M_ZERO); 3598 #ifndef HN_USE_TXDESC_BUFRING 3599 SLIST_INIT(&txr->hn_txlist); 3600 #else 3601 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3602 M_WAITOK, &txr->hn_tx_lock); 3603 #endif 3604 3605 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3606 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3607 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3608 } else { 3609 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3610 } 3611 3612 #ifdef HN_IFSTART_SUPPORT 3613 if (hn_use_if_start) { 3614 txr->hn_txeof = hn_start_txeof; 3615 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3616 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3617 } else 3618 #endif 3619 { 3620 int br_depth; 3621 3622 txr->hn_txeof = hn_xmit_txeof; 3623 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3624 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3625 3626 br_depth = hn_get_txswq_depth(txr); 3627 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3628 M_WAITOK, &txr->hn_tx_lock); 3629 } 3630 3631 txr->hn_direct_tx_size = hn_direct_tx_size; 3632 3633 /* 3634 * Always schedule transmission instead of trying to do direct 3635 * transmission. This one gives the best performance so far. 3636 */ 3637 txr->hn_sched_tx = 1; 3638 3639 parent_dtag = bus_get_dma_tag(dev); 3640 3641 /* DMA tag for RNDIS packet messages. */ 3642 error = bus_dma_tag_create(parent_dtag, /* parent */ 3643 HN_RNDIS_PKT_ALIGN, /* alignment */ 3644 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3645 BUS_SPACE_MAXADDR, /* lowaddr */ 3646 BUS_SPACE_MAXADDR, /* highaddr */ 3647 NULL, NULL, /* filter, filterarg */ 3648 HN_RNDIS_PKT_LEN, /* maxsize */ 3649 1, /* nsegments */ 3650 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3651 0, /* flags */ 3652 NULL, /* lockfunc */ 3653 NULL, /* lockfuncarg */ 3654 &txr->hn_tx_rndis_dtag); 3655 if (error) { 3656 device_printf(dev, "failed to create rndis dmatag\n"); 3657 return error; 3658 } 3659 3660 /* DMA tag for data. */ 3661 error = bus_dma_tag_create(parent_dtag, /* parent */ 3662 1, /* alignment */ 3663 HN_TX_DATA_BOUNDARY, /* boundary */ 3664 BUS_SPACE_MAXADDR, /* lowaddr */ 3665 BUS_SPACE_MAXADDR, /* highaddr */ 3666 NULL, NULL, /* filter, filterarg */ 3667 HN_TX_DATA_MAXSIZE, /* maxsize */ 3668 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3669 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3670 0, /* flags */ 3671 NULL, /* lockfunc */ 3672 NULL, /* lockfuncarg */ 3673 &txr->hn_tx_data_dtag); 3674 if (error) { 3675 device_printf(dev, "failed to create data dmatag\n"); 3676 return error; 3677 } 3678 3679 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3680 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3681 3682 txd->txr = txr; 3683 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3684 STAILQ_INIT(&txd->agg_list); 3685 3686 /* 3687 * Allocate and load RNDIS packet message. 3688 */ 3689 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3690 (void **)&txd->rndis_pkt, 3691 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3692 &txd->rndis_pkt_dmap); 3693 if (error) { 3694 device_printf(dev, 3695 "failed to allocate rndis_packet_msg, %d\n", i); 3696 return error; 3697 } 3698 3699 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3700 txd->rndis_pkt_dmap, 3701 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3702 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3703 BUS_DMA_NOWAIT); 3704 if (error) { 3705 device_printf(dev, 3706 "failed to load rndis_packet_msg, %d\n", i); 3707 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3708 txd->rndis_pkt, txd->rndis_pkt_dmap); 3709 return error; 3710 } 3711 3712 /* DMA map for TX data. 
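 * (one map per txdesc, used to DMA-load mbufs that are sent via the
 * sglist path instead of the chimney buffer)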
*/ 3713 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3714 &txd->data_dmap); 3715 if (error) { 3716 device_printf(dev, 3717 "failed to allocate tx data dmamap\n"); 3718 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3719 txd->rndis_pkt_dmap); 3720 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3721 txd->rndis_pkt, txd->rndis_pkt_dmap); 3722 return error; 3723 } 3724 3725 /* All set, put it to list */ 3726 txd->flags |= HN_TXD_FLAG_ONLIST; 3727 #ifndef HN_USE_TXDESC_BUFRING 3728 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3729 #else 3730 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3731 #endif 3732 } 3733 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3734 3735 if (sc->hn_tx_sysctl_tree != NULL) { 3736 struct sysctl_oid_list *child; 3737 struct sysctl_ctx_list *ctx; 3738 char name[16]; 3739 3740 /* 3741 * Create per TX ring sysctl tree: 3742 * dev.hn.UNIT.tx.RINGID 3743 */ 3744 ctx = device_get_sysctl_ctx(dev); 3745 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3746 3747 snprintf(name, sizeof(name), "%d", id); 3748 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3749 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3750 3751 if (txr->hn_tx_sysctl_tree != NULL) { 3752 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3753 3754 #ifdef HN_DEBUG 3755 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3756 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3757 "# of available TX descs"); 3758 #endif 3759 #ifdef HN_IFSTART_SUPPORT 3760 if (!hn_use_if_start) 3761 #endif 3762 { 3763 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3764 CTLFLAG_RD, &txr->hn_oactive, 0, 3765 "over active"); 3766 } 3767 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3768 CTLFLAG_RW, &txr->hn_pkts, 3769 "# of packets transmitted"); 3770 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3771 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3772 } 3773 } 3774 3775 return 0; 3776 } 3777 3778 static void 3779 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3780 { 3781 struct hn_tx_ring *txr = txd->txr; 3782 3783 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3784 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3785 3786 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3787 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3788 txd->rndis_pkt_dmap); 3789 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3790 } 3791 3792 static void 3793 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3794 { 3795 3796 KASSERT(txd->refs == 0 || txd->refs == 1, 3797 ("invalid txd refs %d", txd->refs)); 3798 3799 /* Aggregated txds will be freed by their aggregating txd. */ 3800 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3801 int freed; 3802 3803 freed = hn_txdesc_put(txr, txd); 3804 KASSERT(freed, ("can't free txdesc")); 3805 } 3806 } 3807 3808 static void 3809 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3810 { 3811 int i; 3812 3813 if (txr->hn_txdesc == NULL) 3814 return; 3815 3816 /* 3817 * NOTE: 3818 * Because the freeing of aggregated txds will be deferred 3819 * to the aggregating txd, two passes are used here: 3820 * - The first pass GCes any pending txds. This GC is necessary, 3821 * since if the channels are revoked, hypervisor will not 3822 * deliver send-done for all pending txds. 3823 * - The second pass frees the busdma stuffs, i.e. after all txds 3824 * were freed. 
3825 */ 3826 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3827 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3828 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3829 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3830 3831 if (txr->hn_tx_data_dtag != NULL) 3832 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3833 if (txr->hn_tx_rndis_dtag != NULL) 3834 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3835 3836 #ifdef HN_USE_TXDESC_BUFRING 3837 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3838 #endif 3839 3840 free(txr->hn_txdesc, M_DEVBUF); 3841 txr->hn_txdesc = NULL; 3842 3843 if (txr->hn_mbuf_br != NULL) 3844 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3845 3846 #ifndef HN_USE_TXDESC_BUFRING 3847 mtx_destroy(&txr->hn_txlist_spin); 3848 #endif 3849 mtx_destroy(&txr->hn_tx_lock); 3850 } 3851 3852 static int 3853 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3854 { 3855 struct sysctl_oid_list *child; 3856 struct sysctl_ctx_list *ctx; 3857 int i; 3858 3859 /* 3860 * Create TXBUF for chimney sending. 3861 * 3862 * NOTE: It is shared by all channels. 3863 */ 3864 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3865 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3866 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3867 if (sc->hn_chim == NULL) { 3868 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3869 return (ENOMEM); 3870 } 3871 3872 sc->hn_tx_ring_cnt = ring_cnt; 3873 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3874 3875 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3876 M_DEVBUF, M_WAITOK | M_ZERO); 3877 3878 ctx = device_get_sysctl_ctx(sc->hn_dev); 3879 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3880 3881 /* Create dev.hn.UNIT.tx sysctl tree */ 3882 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3883 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3884 3885 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3886 int error; 3887 3888 error = hn_tx_ring_create(sc, i); 3889 if (error) 3890 return error; 3891 } 3892 3893 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3894 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3895 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3896 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3897 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3898 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3899 __offsetof(struct hn_tx_ring, hn_send_failed), 3900 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3901 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3902 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3903 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3904 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3905 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3906 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3907 __offsetof(struct hn_tx_ring, hn_flush_failed), 3908 hn_tx_stat_ulong_sysctl, "LU", 3909 "# of packet transmission aggregation flush failure"); 3910 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3911 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3912 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3913 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3914 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3915 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3916 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3917 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3918 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3919 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3920 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3921 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3922 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3923 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3924 "# of total TX descs"); 3925 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3926 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3927 "Chimney send packet size upper boundary"); 3928 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3929 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3930 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3931 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3932 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3933 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3934 hn_tx_conf_int_sysctl, "I", 3935 "Size of the packet for direct transmission"); 3936 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3937 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3938 __offsetof(struct hn_tx_ring, hn_sched_tx), 3939 hn_tx_conf_int_sysctl, "I", 3940 "Always schedule transmission " 3941 "instead of doing direct transmission"); 3942 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3943 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3944 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3945 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3946 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3947 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3948 "Applied packet transmission aggregation size"); 3949 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3950 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3951 hn_txagg_pktmax_sysctl, "I", 3952 "Applied packet transmission aggregation packets"); 3953 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3954 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3955 hn_txagg_align_sysctl, "I", 3956 "Applied packet transmission aggregation alignment"); 3957 3958 return 0; 3959 } 3960 3961 static void 3962 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3963 { 3964 int i; 3965 3966 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3967 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3968 } 3969 3970 static void 3971 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3972 { 3973 struct ifnet *ifp = sc->hn_ifp; 3974 int tso_minlen; 3975 3976 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3977 return; 3978 3979 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3980 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3981 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3982 3983 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3984 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3985 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3986 3987 if (tso_maxlen < tso_minlen) 3988 tso_maxlen = tso_minlen; 3989 else if (tso_maxlen > IP_MAXPACKET) 3990 tso_maxlen = IP_MAXPACKET; 3991 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3992 tso_maxlen = sc->hn_ndis_tso_szmax; 3993 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3994 if (bootverbose) 3995 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3996 } 3997 3998 static void 3999 hn_fixup_tx_data(struct hn_softc *sc) 4000 { 4001 uint64_t csum_assist; 4002 int i; 4003 4004 hn_set_chim_size(sc, sc->hn_chim_szmax); 4005 if (hn_tx_chimney_size > 0 && 4006 hn_tx_chimney_size < sc->hn_chim_szmax) 4007 hn_set_chim_size(sc, hn_tx_chimney_size); 4008 4009 csum_assist = 0; 4010 if (sc->hn_caps & HN_CAP_IPCS) 4011 csum_assist |= CSUM_IP; 4012 if (sc->hn_caps & HN_CAP_TCP4CS) 4013 csum_assist |= CSUM_IP_TCP; 4014 if (sc->hn_caps & HN_CAP_UDP4CS) 4015 
csum_assist |= CSUM_IP_UDP; 4016 if (sc->hn_caps & HN_CAP_TCP6CS) 4017 csum_assist |= CSUM_IP6_TCP; 4018 if (sc->hn_caps & HN_CAP_UDP6CS) 4019 csum_assist |= CSUM_IP6_UDP; 4020 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4021 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 4022 4023 if (sc->hn_caps & HN_CAP_HASHVAL) { 4024 /* 4025 * Support HASHVAL pktinfo on TX path. 4026 */ 4027 if (bootverbose) 4028 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 4029 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4030 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 4031 } 4032 } 4033 4034 static void 4035 hn_destroy_tx_data(struct hn_softc *sc) 4036 { 4037 int i; 4038 4039 if (sc->hn_chim != NULL) { 4040 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 4041 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 4042 } else { 4043 device_printf(sc->hn_dev, 4044 "chimney sending buffer is referenced"); 4045 } 4046 sc->hn_chim = NULL; 4047 } 4048 4049 if (sc->hn_tx_ring_cnt == 0) 4050 return; 4051 4052 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4053 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 4054 4055 free(sc->hn_tx_ring, M_DEVBUF); 4056 sc->hn_tx_ring = NULL; 4057 4058 sc->hn_tx_ring_cnt = 0; 4059 sc->hn_tx_ring_inuse = 0; 4060 } 4061 4062 #ifdef HN_IFSTART_SUPPORT 4063 4064 static void 4065 hn_start_taskfunc(void *xtxr, int pending __unused) 4066 { 4067 struct hn_tx_ring *txr = xtxr; 4068 4069 mtx_lock(&txr->hn_tx_lock); 4070 hn_start_locked(txr, 0); 4071 mtx_unlock(&txr->hn_tx_lock); 4072 } 4073 4074 static int 4075 hn_start_locked(struct hn_tx_ring *txr, int len) 4076 { 4077 struct hn_softc *sc = txr->hn_sc; 4078 struct ifnet *ifp = sc->hn_ifp; 4079 int sched = 0; 4080 4081 KASSERT(hn_use_if_start, 4082 ("hn_start_locked is called, when if_start is disabled")); 4083 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4084 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4085 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4086 4087 if (__predict_false(txr->hn_suspended)) 4088 return (0); 4089 4090 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 4091 IFF_DRV_RUNNING) 4092 return (0); 4093 4094 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 4095 struct hn_txdesc *txd; 4096 struct mbuf *m_head; 4097 int error; 4098 4099 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 4100 if (m_head == NULL) 4101 break; 4102 4103 if (len > 0 && m_head->m_pkthdr.len > len) { 4104 /* 4105 * This sending could be time consuming; let callers 4106 * dispatch this packet sending (and sending of any 4107 * following up packets) to tx taskqueue. 
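* (The packet is prepended back to if_snd and "sched" is set, so the caller can punt the rest of the queue to the TX taskqueue instead of sending inline.)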
4108 */ 4109 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4110 sched = 1; 4111 break; 4112 } 4113 4114 #if defined(INET6) || defined(INET) 4115 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 4116 m_head = hn_tso_fixup(m_head); 4117 if (__predict_false(m_head == NULL)) { 4118 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4119 continue; 4120 } 4121 } 4122 #endif 4123 4124 txd = hn_txdesc_get(txr); 4125 if (txd == NULL) { 4126 txr->hn_no_txdescs++; 4127 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4128 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4129 break; 4130 } 4131 4132 error = hn_encap(ifp, txr, txd, &m_head); 4133 if (error) { 4134 /* Both txd and m_head are freed */ 4135 KASSERT(txr->hn_agg_txd == NULL, 4136 ("encap failed w/ pending aggregating txdesc")); 4137 continue; 4138 } 4139 4140 if (txr->hn_agg_pktleft == 0) { 4141 if (txr->hn_agg_txd != NULL) { 4142 KASSERT(m_head == NULL, 4143 ("pending mbuf for aggregating txdesc")); 4144 error = hn_flush_txagg(ifp, txr); 4145 if (__predict_false(error)) { 4146 atomic_set_int(&ifp->if_drv_flags, 4147 IFF_DRV_OACTIVE); 4148 break; 4149 } 4150 } else { 4151 KASSERT(m_head != NULL, ("mbuf was freed")); 4152 error = hn_txpkt(ifp, txr, txd); 4153 if (__predict_false(error)) { 4154 /* txd is freed, but m_head is not */ 4155 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4156 atomic_set_int(&ifp->if_drv_flags, 4157 IFF_DRV_OACTIVE); 4158 break; 4159 } 4160 } 4161 } 4162 #ifdef INVARIANTS 4163 else { 4164 KASSERT(txr->hn_agg_txd != NULL, 4165 ("no aggregating txdesc")); 4166 KASSERT(m_head == NULL, 4167 ("pending mbuf for aggregating txdesc")); 4168 } 4169 #endif 4170 } 4171 4172 /* Flush pending aggerated transmission. */ 4173 if (txr->hn_agg_txd != NULL) 4174 hn_flush_txagg(ifp, txr); 4175 return (sched); 4176 } 4177 4178 static void 4179 hn_start(struct ifnet *ifp) 4180 { 4181 struct hn_softc *sc = ifp->if_softc; 4182 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 4183 4184 if (txr->hn_sched_tx) 4185 goto do_sched; 4186 4187 if (mtx_trylock(&txr->hn_tx_lock)) { 4188 int sched; 4189 4190 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4191 mtx_unlock(&txr->hn_tx_lock); 4192 if (!sched) 4193 return; 4194 } 4195 do_sched: 4196 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4197 } 4198 4199 static void 4200 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 4201 { 4202 struct hn_tx_ring *txr = xtxr; 4203 4204 mtx_lock(&txr->hn_tx_lock); 4205 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 4206 hn_start_locked(txr, 0); 4207 mtx_unlock(&txr->hn_tx_lock); 4208 } 4209 4210 static void 4211 hn_start_txeof(struct hn_tx_ring *txr) 4212 { 4213 struct hn_softc *sc = txr->hn_sc; 4214 struct ifnet *ifp = sc->hn_ifp; 4215 4216 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4217 4218 if (txr->hn_sched_tx) 4219 goto do_sched; 4220 4221 if (mtx_trylock(&txr->hn_tx_lock)) { 4222 int sched; 4223 4224 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4225 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4226 mtx_unlock(&txr->hn_tx_lock); 4227 if (sched) { 4228 taskqueue_enqueue(txr->hn_tx_taskq, 4229 &txr->hn_tx_task); 4230 } 4231 } else { 4232 do_sched: 4233 /* 4234 * Release the OACTIVE earlier, with the hope, that 4235 * others could catch up. The task will clear the 4236 * flag again with the hn_tx_lock to avoid possible 4237 * races. 
4238 */ 4239 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4240 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4241 } 4242 } 4243 4244 #endif /* HN_IFSTART_SUPPORT */ 4245 4246 static int 4247 hn_xmit(struct hn_tx_ring *txr, int len) 4248 { 4249 struct hn_softc *sc = txr->hn_sc; 4250 struct ifnet *ifp = sc->hn_ifp; 4251 struct mbuf *m_head; 4252 int sched = 0; 4253 4254 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4255 #ifdef HN_IFSTART_SUPPORT 4256 KASSERT(hn_use_if_start == 0, 4257 ("hn_xmit is called, when if_start is enabled")); 4258 #endif 4259 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4260 4261 if (__predict_false(txr->hn_suspended)) 4262 return (0); 4263 4264 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4265 return (0); 4266 4267 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4268 struct hn_txdesc *txd; 4269 int error; 4270 4271 if (len > 0 && m_head->m_pkthdr.len > len) { 4272 /* 4273 * This sending could be time consuming; let callers 4274 * dispatch this packet sending (and sending of any 4275 * following up packets) to tx taskqueue. 4276 */ 4277 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4278 sched = 1; 4279 break; 4280 } 4281 4282 txd = hn_txdesc_get(txr); 4283 if (txd == NULL) { 4284 txr->hn_no_txdescs++; 4285 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4286 txr->hn_oactive = 1; 4287 break; 4288 } 4289 4290 error = hn_encap(ifp, txr, txd, &m_head); 4291 if (error) { 4292 /* Both txd and m_head are freed; discard */ 4293 KASSERT(txr->hn_agg_txd == NULL, 4294 ("encap failed w/ pending aggregating txdesc")); 4295 drbr_advance(ifp, txr->hn_mbuf_br); 4296 continue; 4297 } 4298 4299 if (txr->hn_agg_pktleft == 0) { 4300 if (txr->hn_agg_txd != NULL) { 4301 KASSERT(m_head == NULL, 4302 ("pending mbuf for aggregating txdesc")); 4303 error = hn_flush_txagg(ifp, txr); 4304 if (__predict_false(error)) { 4305 txr->hn_oactive = 1; 4306 break; 4307 } 4308 } else { 4309 KASSERT(m_head != NULL, ("mbuf was freed")); 4310 error = hn_txpkt(ifp, txr, txd); 4311 if (__predict_false(error)) { 4312 /* txd is freed, but m_head is not */ 4313 drbr_putback(ifp, txr->hn_mbuf_br, 4314 m_head); 4315 txr->hn_oactive = 1; 4316 break; 4317 } 4318 } 4319 } 4320 #ifdef INVARIANTS 4321 else { 4322 KASSERT(txr->hn_agg_txd != NULL, 4323 ("no aggregating txdesc")); 4324 KASSERT(m_head == NULL, 4325 ("pending mbuf for aggregating txdesc")); 4326 } 4327 #endif 4328 4329 /* Sent */ 4330 drbr_advance(ifp, txr->hn_mbuf_br); 4331 } 4332 4333 /* Flush pending aggerated transmission. */ 4334 if (txr->hn_agg_txd != NULL) 4335 hn_flush_txagg(ifp, txr); 4336 return (sched); 4337 } 4338 4339 static int 4340 hn_transmit(struct ifnet *ifp, struct mbuf *m) 4341 { 4342 struct hn_softc *sc = ifp->if_softc; 4343 struct hn_tx_ring *txr; 4344 int error, idx = 0; 4345 4346 #if defined(INET6) || defined(INET) 4347 /* 4348 * Perform TSO packet header fixup now, since the TSO 4349 * packet header should be cache-hot. 
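* (If hn_tso_fixup() fails here, the packet is counted as an output error and EIO is returned to the caller.)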
4350 */ 4351 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4352 m = hn_tso_fixup(m); 4353 if (__predict_false(m == NULL)) { 4354 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4355 return EIO; 4356 } 4357 } 4358 #endif 4359 4360 /* 4361 * Select the TX ring based on flowid 4362 */ 4363 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4364 #ifdef RSS 4365 uint32_t bid; 4366 4367 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4368 &bid) == 0) 4369 idx = bid % sc->hn_tx_ring_inuse; 4370 else 4371 #endif 4372 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4373 } 4374 txr = &sc->hn_tx_ring[idx]; 4375 4376 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4377 if (error) { 4378 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4379 return error; 4380 } 4381 4382 if (txr->hn_oactive) 4383 return 0; 4384 4385 if (txr->hn_sched_tx) 4386 goto do_sched; 4387 4388 if (mtx_trylock(&txr->hn_tx_lock)) { 4389 int sched; 4390 4391 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4392 mtx_unlock(&txr->hn_tx_lock); 4393 if (!sched) 4394 return 0; 4395 } 4396 do_sched: 4397 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4398 return 0; 4399 } 4400 4401 static void 4402 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4403 { 4404 struct mbuf *m; 4405 4406 mtx_lock(&txr->hn_tx_lock); 4407 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4408 m_freem(m); 4409 mtx_unlock(&txr->hn_tx_lock); 4410 } 4411 4412 static void 4413 hn_xmit_qflush(struct ifnet *ifp) 4414 { 4415 struct hn_softc *sc = ifp->if_softc; 4416 int i; 4417 4418 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4419 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4420 if_qflush(ifp); 4421 } 4422 4423 static void 4424 hn_xmit_txeof(struct hn_tx_ring *txr) 4425 { 4426 4427 if (txr->hn_sched_tx) 4428 goto do_sched; 4429 4430 if (mtx_trylock(&txr->hn_tx_lock)) { 4431 int sched; 4432 4433 txr->hn_oactive = 0; 4434 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4435 mtx_unlock(&txr->hn_tx_lock); 4436 if (sched) { 4437 taskqueue_enqueue(txr->hn_tx_taskq, 4438 &txr->hn_tx_task); 4439 } 4440 } else { 4441 do_sched: 4442 /* 4443 * Release the oactive earlier, with the hope, that 4444 * others could catch up. The task will clear the 4445 * oactive again with the hn_tx_lock to avoid possible 4446 * races. 4447 */ 4448 txr->hn_oactive = 0; 4449 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4450 } 4451 } 4452 4453 static void 4454 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4455 { 4456 struct hn_tx_ring *txr = xtxr; 4457 4458 mtx_lock(&txr->hn_tx_lock); 4459 hn_xmit(txr, 0); 4460 mtx_unlock(&txr->hn_tx_lock); 4461 } 4462 4463 static void 4464 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4465 { 4466 struct hn_tx_ring *txr = xtxr; 4467 4468 mtx_lock(&txr->hn_tx_lock); 4469 txr->hn_oactive = 0; 4470 hn_xmit(txr, 0); 4471 mtx_unlock(&txr->hn_tx_lock); 4472 } 4473 4474 static int 4475 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4476 { 4477 struct vmbus_chan_br cbr; 4478 struct hn_rx_ring *rxr; 4479 struct hn_tx_ring *txr = NULL; 4480 int idx, error; 4481 4482 idx = vmbus_chan_subidx(chan); 4483 4484 /* 4485 * Link this channel to RX/TX ring. 
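* (The sub-channel index picks the RX ring directly; a TX ring is linked only when the index is below hn_tx_ring_inuse.)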
4486 */ 4487 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4488 ("invalid channel index %d, should >= 0 && < %d", 4489 idx, sc->hn_rx_ring_inuse)); 4490 rxr = &sc->hn_rx_ring[idx]; 4491 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 4492 ("RX ring %d already attached", idx)); 4493 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 4494 rxr->hn_chan = chan; 4495 4496 if (bootverbose) { 4497 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 4498 idx, vmbus_chan_id(chan)); 4499 } 4500 4501 if (idx < sc->hn_tx_ring_inuse) { 4502 txr = &sc->hn_tx_ring[idx]; 4503 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 4504 ("TX ring %d already attached", idx)); 4505 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 4506 4507 txr->hn_chan = chan; 4508 if (bootverbose) { 4509 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 4510 idx, vmbus_chan_id(chan)); 4511 } 4512 } 4513 4514 /* Bind this channel to a proper CPU. */ 4515 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 4516 4517 /* 4518 * Open this channel 4519 */ 4520 cbr.cbr = rxr->hn_br; 4521 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 4522 cbr.cbr_txsz = HN_TXBR_SIZE; 4523 cbr.cbr_rxsz = HN_RXBR_SIZE; 4524 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 4525 if (error) { 4526 if (error == EISCONN) { 4527 if_printf(sc->hn_ifp, "bufring is connected after " 4528 "chan%u open failure\n", vmbus_chan_id(chan)); 4529 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4530 } else { 4531 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 4532 vmbus_chan_id(chan), error); 4533 } 4534 } 4535 return (error); 4536 } 4537 4538 static void 4539 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 4540 { 4541 struct hn_rx_ring *rxr; 4542 int idx, error; 4543 4544 idx = vmbus_chan_subidx(chan); 4545 4546 /* 4547 * Unlink this channel from the RX/TX ring. 4548 */ 4549 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4550 ("invalid channel index %d, should >= 0 && < %d", 4551 idx, sc->hn_rx_ring_inuse)); 4552 rxr = &sc->hn_rx_ring[idx]; 4553 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 4554 ("RX ring %d is not attached", idx)); 4555 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 4556 4557 if (idx < sc->hn_tx_ring_inuse) { 4558 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 4559 4560 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 4561 ("TX ring %d is not attached", idx)); 4562 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 4563 } 4564 4565 /* 4566 * Close this channel. 4567 * 4568 * NOTE: 4569 * Channel closing does _not_ destroy the target channel. 4570 */ 4571 error = vmbus_chan_close_direct(chan); 4572 if (error == EISCONN) { 4573 if_printf(sc->hn_ifp, "chan%u bufring is connected " 4574 "after being closed\n", vmbus_chan_id(chan)); 4575 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4576 } else if (error) { 4577 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 4578 vmbus_chan_id(chan), error); 4579 } 4580 } 4581 4582 static int 4583 hn_attach_subchans(struct hn_softc *sc) 4584 { 4585 struct vmbus_channel **subchans; 4586 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4587 int i, error = 0; 4588 4589 KASSERT(subchan_cnt > 0, ("no sub-channels")); 4590 4591 /* Attach the sub-channels. */ 4592 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4593 for (i = 0; i < subchan_cnt; ++i) { 4594 int error1; 4595 4596 error1 = hn_chan_attach(sc, subchans[i]); 4597 if (error1) { 4598 error = error1; 4599 /* Move on; all channels will be detached later.
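* (The error is remembered and reported once the remaining sub-channels have been tried.)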
*/ 4600 } 4601 } 4602 vmbus_subchan_rel(subchans, subchan_cnt); 4603 4604 if (error) { 4605 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4606 } else { 4607 if (bootverbose) { 4608 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4609 subchan_cnt); 4610 } 4611 } 4612 return (error); 4613 } 4614 4615 static void 4616 hn_detach_allchans(struct hn_softc *sc) 4617 { 4618 struct vmbus_channel **subchans; 4619 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4620 int i; 4621 4622 if (subchan_cnt == 0) 4623 goto back; 4624 4625 /* Detach the sub-channels. */ 4626 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4627 for (i = 0; i < subchan_cnt; ++i) 4628 hn_chan_detach(sc, subchans[i]); 4629 vmbus_subchan_rel(subchans, subchan_cnt); 4630 4631 back: 4632 /* 4633 * Detach the primary channel, _after_ all sub-channels 4634 * are detached. 4635 */ 4636 hn_chan_detach(sc, sc->hn_prichan); 4637 4638 /* Wait for sub-channels to be destroyed, if any. */ 4639 vmbus_subchan_drain(sc->hn_prichan); 4640 4641 #ifdef INVARIANTS 4642 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4643 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4644 HN_RX_FLAG_ATTACHED) == 0, 4645 ("%dth RX ring is still attached", i)); 4646 } 4647 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4648 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4649 HN_TX_FLAG_ATTACHED) == 0, 4650 ("%dth TX ring is still attached", i)); 4651 } 4652 #endif 4653 } 4654 4655 static int 4656 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4657 { 4658 struct vmbus_channel **subchans; 4659 int nchan, rxr_cnt, error; 4660 4661 nchan = *nsubch + 1; 4662 if (nchan == 1) { 4663 /* 4664 * Multiple RX/TX rings are not requested. 4665 */ 4666 *nsubch = 0; 4667 return (0); 4668 } 4669 4670 /* 4671 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4672 * table entries. 4673 */ 4674 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4675 if (error) { 4676 /* No RSS; this is benign. */ 4677 *nsubch = 0; 4678 return (0); 4679 } 4680 if (bootverbose) { 4681 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4682 rxr_cnt, nchan); 4683 } 4684 4685 if (nchan > rxr_cnt) 4686 nchan = rxr_cnt; 4687 if (nchan == 1) { 4688 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4689 *nsubch = 0; 4690 return (0); 4691 } 4692 4693 /* 4694 * Allocate sub-channels from NVS. 4695 */ 4696 *nsubch = nchan - 1; 4697 error = hn_nvs_alloc_subchans(sc, nsubch); 4698 if (error || *nsubch == 0) { 4699 /* Failed to allocate sub-channels. */ 4700 *nsubch = 0; 4701 return (0); 4702 } 4703 4704 /* 4705 * Wait for all sub-channels to become ready before moving on. 4706 */ 4707 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4708 vmbus_subchan_rel(subchans, *nsubch); 4709 return (0); 4710 } 4711 4712 static bool 4713 hn_synth_attachable(const struct hn_softc *sc) 4714 { 4715 int i; 4716 4717 if (sc->hn_flags & HN_FLAG_ERRORS) 4718 return (false); 4719 4720 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4721 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4722 4723 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4724 return (false); 4725 } 4726 return (true); 4727 } 4728 4729 /* 4730 * Make sure that the RX filter is zero after the successful 4731 * RNDIS initialization. 
4732 * 4733 * NOTE: 4734 * Under certain conditions on certain versions of Hyper-V, 4735 * the RNDIS rxfilter is _not_ zero on the hypervisor side 4736 * after the successful RNDIS initialization, which breaks 4737 * the assumption of any following code (well, it breaks the 4738 * RNDIS API contract actually). Clear the RNDIS rxfilter 4739 * explicitly, drain packets sneaking through, and drain the 4740 * interrupt taskqueues scheduled due to the stealth packets. 4741 */ 4742 static void 4743 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 4744 { 4745 4746 hn_disable_rx(sc); 4747 hn_drain_rxtx(sc, nchan); 4748 } 4749 4750 static int 4751 hn_synth_attach(struct hn_softc *sc, int mtu) 4752 { 4753 #define ATTACHED_NVS 0x0002 4754 #define ATTACHED_RNDIS 0x0004 4755 4756 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4757 int error, nsubch, nchan = 1, i, rndis_inited; 4758 uint32_t old_caps, attached = 0; 4759 4760 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4761 ("synthetic parts were attached")); 4762 4763 if (!hn_synth_attachable(sc)) 4764 return (ENXIO); 4765 4766 /* Save capabilities for later verification. */ 4767 old_caps = sc->hn_caps; 4768 sc->hn_caps = 0; 4769 4770 /* Clear RSS stuffs. */ 4771 sc->hn_rss_ind_size = 0; 4772 sc->hn_rss_hash = 0; 4773 4774 /* 4775 * Attach the primary channel _before_ attaching NVS and RNDIS. 4776 */ 4777 error = hn_chan_attach(sc, sc->hn_prichan); 4778 if (error) 4779 goto failed; 4780 4781 /* 4782 * Attach NVS. 4783 */ 4784 error = hn_nvs_attach(sc, mtu); 4785 if (error) 4786 goto failed; 4787 attached |= ATTACHED_NVS; 4788 4789 /* 4790 * Attach RNDIS _after_ NVS is attached. 4791 */ 4792 error = hn_rndis_attach(sc, mtu, &rndis_inited); 4793 if (rndis_inited) 4794 attached |= ATTACHED_RNDIS; 4795 if (error) 4796 goto failed; 4797 4798 /* 4799 * Make sure capabilities are not changed. 4800 */ 4801 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4802 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4803 old_caps, sc->hn_caps); 4804 error = ENXIO; 4805 goto failed; 4806 } 4807 4808 /* 4809 * Allocate sub-channels for multi-TX/RX rings. 4810 * 4811 * NOTE: 4812 * The # of RX rings that can be used is equivalent to the # of 4813 * channels to be requested. 4814 */ 4815 nsubch = sc->hn_rx_ring_cnt - 1; 4816 error = hn_synth_alloc_subchans(sc, &nsubch); 4817 if (error) 4818 goto failed; 4819 /* NOTE: _Full_ synthetic parts detach is required now. */ 4820 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4821 4822 /* 4823 * Set the # of TX/RX rings that could be used according to 4824 * the # of channels that NVS offered. 4825 */ 4826 nchan = nsubch + 1; 4827 hn_set_ring_inuse(sc, nchan); 4828 if (nchan == 1) { 4829 /* Only the primary channel can be used; done */ 4830 goto back; 4831 } 4832 4833 /* 4834 * Attach the sub-channels. 4835 * 4836 * NOTE: hn_set_ring_inuse() _must_ have been called. 4837 */ 4838 error = hn_attach_subchans(sc); 4839 if (error) 4840 goto failed; 4841 4842 /* 4843 * Configure RSS key and indirect table _after_ all sub-channels 4844 * are attached. 4845 */ 4846 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4847 /* 4848 * RSS key is not set yet; set it to the default RSS key. 
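* (With the RSS kernel option the key comes from rss_getkey(); otherwise the driver's default Toeplitz key, hn_rss_key_default, is copied in.)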
4849 */ 4850 if (bootverbose) 4851 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4852 #ifdef RSS 4853 rss_getkey(rss->rss_key); 4854 #else 4855 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4856 #endif 4857 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4858 } 4859 4860 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4861 /* 4862 * RSS indirect table is not set yet; set it up in round- 4863 * robin fashion. 4864 */ 4865 if (bootverbose) { 4866 if_printf(sc->hn_ifp, "setup default RSS indirect " 4867 "table\n"); 4868 } 4869 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 4870 uint32_t subidx; 4871 4872 #ifdef RSS 4873 subidx = rss_get_indirection_to_bucket(i); 4874 #else 4875 subidx = i; 4876 #endif 4877 rss->rss_ind[i] = subidx % nchan; 4878 } 4879 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4880 } else { 4881 /* 4882 * # of usable channels may be changed, so we have to 4883 * make sure that all entries in RSS indirect table 4884 * are valid. 4885 * 4886 * NOTE: hn_set_ring_inuse() _must_ have been called. 4887 */ 4888 hn_rss_ind_fixup(sc); 4889 } 4890 4891 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 4892 if (error) 4893 goto failed; 4894 back: 4895 /* 4896 * Fixup transmission aggregation setup. 4897 */ 4898 hn_set_txagg(sc); 4899 hn_rndis_init_fixat(sc, nchan); 4900 return (0); 4901 4902 failed: 4903 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 4904 hn_rndis_init_fixat(sc, nchan); 4905 hn_synth_detach(sc); 4906 } else { 4907 if (attached & ATTACHED_RNDIS) { 4908 hn_rndis_init_fixat(sc, nchan); 4909 hn_rndis_detach(sc); 4910 } 4911 if (attached & ATTACHED_NVS) 4912 hn_nvs_detach(sc); 4913 hn_chan_detach(sc, sc->hn_prichan); 4914 /* Restore old capabilities. */ 4915 sc->hn_caps = old_caps; 4916 } 4917 return (error); 4918 4919 #undef ATTACHED_RNDIS 4920 #undef ATTACHED_NVS 4921 } 4922 4923 /* 4924 * NOTE: 4925 * The interface must have been suspended though hn_suspend(), before 4926 * this function get called. 4927 */ 4928 static void 4929 hn_synth_detach(struct hn_softc *sc) 4930 { 4931 4932 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4933 ("synthetic parts were not attached")); 4934 4935 /* Detach the RNDIS first. */ 4936 hn_rndis_detach(sc); 4937 4938 /* Detach NVS. */ 4939 hn_nvs_detach(sc); 4940 4941 /* Detach all of the channels. */ 4942 hn_detach_allchans(sc); 4943 4944 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 4945 } 4946 4947 static void 4948 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 4949 { 4950 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 4951 ("invalid ring count %d", ring_cnt)); 4952 4953 if (sc->hn_tx_ring_cnt > ring_cnt) 4954 sc->hn_tx_ring_inuse = ring_cnt; 4955 else 4956 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 4957 sc->hn_rx_ring_inuse = ring_cnt; 4958 4959 #ifdef RSS 4960 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 4961 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 4962 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 4963 rss_getnumbuckets()); 4964 } 4965 #endif 4966 4967 if (bootverbose) { 4968 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 4969 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 4970 } 4971 } 4972 4973 static void 4974 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 4975 { 4976 4977 /* 4978 * NOTE: 4979 * The TX bufring will not be drained by the hypervisor, 4980 * if the primary channel is revoked. 
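* (Hence the loop below always waits for the RX bufring to drain, but only waits for the TX bufring while the primary channel has not been revoked.)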
4981 */ 4982 while (!vmbus_chan_rx_empty(chan) || 4983 (!vmbus_chan_is_revoked(sc->hn_prichan) && 4984 !vmbus_chan_tx_empty(chan))) 4985 pause("waitch", 1); 4986 vmbus_chan_intr_drain(chan); 4987 } 4988 4989 static void 4990 hn_disable_rx(struct hn_softc *sc) 4991 { 4992 4993 /* 4994 * Disable RX by clearing RX filter forcefully. 4995 */ 4996 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 4997 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 4998 4999 /* 5000 * Give RNDIS enough time to flush all pending data packets. 5001 */ 5002 pause("waitrx", (200 * hz) / 1000); 5003 } 5004 5005 /* 5006 * NOTE: 5007 * RX/TX _must_ have been suspended/disabled, before this function 5008 * is called. 5009 */ 5010 static void 5011 hn_drain_rxtx(struct hn_softc *sc, int nchan) 5012 { 5013 struct vmbus_channel **subch = NULL; 5014 int nsubch; 5015 5016 /* 5017 * Drain RX/TX bufrings and interrupts. 5018 */ 5019 nsubch = nchan - 1; 5020 if (nsubch > 0) 5021 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 5022 5023 if (subch != NULL) { 5024 int i; 5025 5026 for (i = 0; i < nsubch; ++i) 5027 hn_chan_drain(sc, subch[i]); 5028 } 5029 hn_chan_drain(sc, sc->hn_prichan); 5030 5031 if (subch != NULL) 5032 vmbus_subchan_rel(subch, nsubch); 5033 } 5034 5035 static void 5036 hn_suspend_data(struct hn_softc *sc) 5037 { 5038 struct hn_tx_ring *txr; 5039 int i; 5040 5041 HN_LOCK_ASSERT(sc); 5042 5043 /* 5044 * Suspend TX. 5045 */ 5046 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5047 txr = &sc->hn_tx_ring[i]; 5048 5049 mtx_lock(&txr->hn_tx_lock); 5050 txr->hn_suspended = 1; 5051 mtx_unlock(&txr->hn_tx_lock); 5052 /* No one is able send more packets now. */ 5053 5054 /* 5055 * Wait for all pending sends to finish. 5056 * 5057 * NOTE: 5058 * We will _not_ receive all pending send-done, if the 5059 * primary channel is revoked. 5060 */ 5061 while (hn_tx_ring_pending(txr) && 5062 !vmbus_chan_is_revoked(sc->hn_prichan)) 5063 pause("hnwtx", 1 /* 1 tick */); 5064 } 5065 5066 /* 5067 * Disable RX. 5068 */ 5069 hn_disable_rx(sc); 5070 5071 /* 5072 * Drain RX/TX. 5073 */ 5074 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 5075 5076 /* 5077 * Drain any pending TX tasks. 5078 * 5079 * NOTE: 5080 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 5081 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 5082 */ 5083 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5084 txr = &sc->hn_tx_ring[i]; 5085 5086 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 5087 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 5088 } 5089 } 5090 5091 static void 5092 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 5093 { 5094 5095 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 5096 } 5097 5098 static void 5099 hn_suspend_mgmt(struct hn_softc *sc) 5100 { 5101 struct task task; 5102 5103 HN_LOCK_ASSERT(sc); 5104 5105 /* 5106 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 5107 * through hn_mgmt_taskq. 5108 */ 5109 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 5110 vmbus_chan_run_task(sc->hn_prichan, &task); 5111 5112 /* 5113 * Make sure that all pending management tasks are completed. 5114 */ 5115 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 5116 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 5117 taskqueue_drain_all(sc->hn_mgmt_taskq0); 5118 } 5119 5120 static void 5121 hn_suspend(struct hn_softc *sc) 5122 { 5123 5124 /* Disable polling. 
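(Polling is re-enabled by hn_resume() when the interface is running and hn_pollhz is non-zero.)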
*/ 5125 hn_polling(sc, 0); 5126 5127 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5128 (sc->hn_flags & HN_FLAG_VF)) 5129 hn_suspend_data(sc); 5130 hn_suspend_mgmt(sc); 5131 } 5132 5133 static void 5134 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 5135 { 5136 int i; 5137 5138 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 5139 ("invalid TX ring count %d", tx_ring_cnt)); 5140 5141 for (i = 0; i < tx_ring_cnt; ++i) { 5142 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5143 5144 mtx_lock(&txr->hn_tx_lock); 5145 txr->hn_suspended = 0; 5146 mtx_unlock(&txr->hn_tx_lock); 5147 } 5148 } 5149 5150 static void 5151 hn_resume_data(struct hn_softc *sc) 5152 { 5153 int i; 5154 5155 HN_LOCK_ASSERT(sc); 5156 5157 /* 5158 * Re-enable RX. 5159 */ 5160 hn_rxfilter_config(sc); 5161 5162 /* 5163 * Make sure to clear suspend status on "all" TX rings, 5164 * since hn_tx_ring_inuse can be changed after 5165 * hn_suspend_data(). 5166 */ 5167 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 5168 5169 #ifdef HN_IFSTART_SUPPORT 5170 if (!hn_use_if_start) 5171 #endif 5172 { 5173 /* 5174 * Flush unused drbrs, since hn_tx_ring_inuse may be 5175 * reduced. 5176 */ 5177 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 5178 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 5179 } 5180 5181 /* 5182 * Kick start TX. 5183 */ 5184 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5185 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5186 5187 /* 5188 * Use txeof task, so that any pending oactive can be 5189 * cleared properly. 5190 */ 5191 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5192 } 5193 } 5194 5195 static void 5196 hn_resume_mgmt(struct hn_softc *sc) 5197 { 5198 5199 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 5200 5201 /* 5202 * Kick off network change detection, if it was pending. 5203 * If no network change was pending, start link status 5204 * checks, which is more lightweight than network change 5205 * detection. 5206 */ 5207 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 5208 hn_change_network(sc); 5209 else 5210 hn_update_link_status(sc); 5211 } 5212 5213 static void 5214 hn_resume(struct hn_softc *sc) 5215 { 5216 5217 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5218 (sc->hn_flags & HN_FLAG_VF)) 5219 hn_resume_data(sc); 5220 5221 /* 5222 * When the VF is activated, the synthetic interface is changed 5223 * to DOWN in hn_set_vf(). Here, if the VF is still active, we 5224 * don't call hn_resume_mgmt() until the VF is deactivated in 5225 * hn_set_vf(). 5226 */ 5227 if (!(sc->hn_flags & HN_FLAG_VF)) 5228 hn_resume_mgmt(sc); 5229 5230 /* 5231 * Re-enable polling if this interface is running and 5232 * the polling is requested. 5233 */ 5234 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 5235 hn_polling(sc, sc->hn_pollhz); 5236 } 5237 5238 static void 5239 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 5240 { 5241 const struct rndis_status_msg *msg; 5242 int ofs; 5243 5244 if (dlen < sizeof(*msg)) { 5245 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 5246 return; 5247 } 5248 msg = data; 5249 5250 switch (msg->rm_status) { 5251 case RNDIS_STATUS_MEDIA_CONNECT: 5252 case RNDIS_STATUS_MEDIA_DISCONNECT: 5253 hn_update_link_status(sc); 5254 break; 5255 5256 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 5257 /* Not really useful; ignore. 
*/ 5258 break; 5259 5260 case RNDIS_STATUS_NETWORK_CHANGE: 5261 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 5262 if (dlen < ofs + msg->rm_stbuflen || 5263 msg->rm_stbuflen < sizeof(uint32_t)) { 5264 if_printf(sc->hn_ifp, "network changed\n"); 5265 } else { 5266 uint32_t change; 5267 5268 memcpy(&change, ((const uint8_t *)msg) + ofs, 5269 sizeof(change)); 5270 if_printf(sc->hn_ifp, "network changed, change %u\n", 5271 change); 5272 } 5273 hn_change_network(sc); 5274 break; 5275 5276 default: 5277 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 5278 msg->rm_status); 5279 break; 5280 } 5281 } 5282 5283 static int 5284 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 5285 { 5286 const struct rndis_pktinfo *pi = info_data; 5287 uint32_t mask = 0; 5288 5289 while (info_dlen != 0) { 5290 const void *data; 5291 uint32_t dlen; 5292 5293 if (__predict_false(info_dlen < sizeof(*pi))) 5294 return (EINVAL); 5295 if (__predict_false(info_dlen < pi->rm_size)) 5296 return (EINVAL); 5297 info_dlen -= pi->rm_size; 5298 5299 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 5300 return (EINVAL); 5301 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 5302 return (EINVAL); 5303 dlen = pi->rm_size - pi->rm_pktinfooffset; 5304 data = pi->rm_data; 5305 5306 switch (pi->rm_type) { 5307 case NDIS_PKTINFO_TYPE_VLAN: 5308 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 5309 return (EINVAL); 5310 info->vlan_info = *((const uint32_t *)data); 5311 mask |= HN_RXINFO_VLAN; 5312 break; 5313 5314 case NDIS_PKTINFO_TYPE_CSUM: 5315 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 5316 return (EINVAL); 5317 info->csum_info = *((const uint32_t *)data); 5318 mask |= HN_RXINFO_CSUM; 5319 break; 5320 5321 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5322 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5323 return (EINVAL); 5324 info->hash_value = *((const uint32_t *)data); 5325 mask |= HN_RXINFO_HASHVAL; 5326 break; 5327 5328 case HN_NDIS_PKTINFO_TYPE_HASHINF: 5329 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 5330 return (EINVAL); 5331 info->hash_info = *((const uint32_t *)data); 5332 mask |= HN_RXINFO_HASHINF; 5333 break; 5334 5335 default: 5336 goto next; 5337 } 5338 5339 if (mask == HN_RXINFO_ALL) { 5340 /* All found; done */ 5341 break; 5342 } 5343 next: 5344 pi = (const struct rndis_pktinfo *) 5345 ((const uint8_t *)pi + pi->rm_size); 5346 } 5347 5348 /* 5349 * Final fixup. 5350 * - If there is no hash value, invalidate the hash info. 5351 */ 5352 if ((mask & HN_RXINFO_HASHVAL) == 0) 5353 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 5354 return (0); 5355 } 5356 5357 static __inline bool 5358 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 5359 { 5360 5361 if (off < check_off) { 5362 if (__predict_true(off + len <= check_off)) 5363 return (false); 5364 } else if (off > check_off) { 5365 if (__predict_true(check_off + check_len <= off)) 5366 return (false); 5367 } 5368 return (true); 5369 } 5370 5371 static void 5372 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5373 { 5374 const struct rndis_packet_msg *pkt; 5375 struct hn_rxinfo info; 5376 int data_off, pktinfo_off, data_len, pktinfo_len; 5377 5378 /* 5379 * Check length. 
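* (The message must cover the fixed packet header, rm_len must fit within the received data, and the data/OOB/pktinfo lengths must in turn fit within rm_len.)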
5380 */ 5381 if (__predict_false(dlen < sizeof(*pkt))) { 5382 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 5383 return; 5384 } 5385 pkt = data; 5386 5387 if (__predict_false(dlen < pkt->rm_len)) { 5388 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 5389 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 5390 return; 5391 } 5392 if (__predict_false(pkt->rm_len < 5393 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 5394 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 5395 "msglen %u, data %u, oob %u, pktinfo %u\n", 5396 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 5397 pkt->rm_pktinfolen); 5398 return; 5399 } 5400 if (__predict_false(pkt->rm_datalen == 0)) { 5401 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 5402 return; 5403 } 5404 5405 /* 5406 * Check offests. 5407 */ 5408 #define IS_OFFSET_INVALID(ofs) \ 5409 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 5410 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 5411 5412 /* XXX Hyper-V does not meet data offset alignment requirement */ 5413 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 5414 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5415 "data offset %u\n", pkt->rm_dataoffset); 5416 return; 5417 } 5418 if (__predict_false(pkt->rm_oobdataoffset > 0 && 5419 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 5420 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5421 "oob offset %u\n", pkt->rm_oobdataoffset); 5422 return; 5423 } 5424 if (__predict_true(pkt->rm_pktinfooffset > 0) && 5425 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 5426 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5427 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 5428 return; 5429 } 5430 5431 #undef IS_OFFSET_INVALID 5432 5433 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 5434 data_len = pkt->rm_datalen; 5435 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 5436 pktinfo_len = pkt->rm_pktinfolen; 5437 5438 /* 5439 * Check OOB coverage. 5440 */ 5441 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5442 int oob_off, oob_len; 5443 5444 if_printf(rxr->hn_ifp, "got oobdata\n"); 5445 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5446 oob_len = pkt->rm_oobdatalen; 5447 5448 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5449 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5450 "oob overflow, msglen %u, oob abs %d len %d\n", 5451 pkt->rm_len, oob_off, oob_len); 5452 return; 5453 } 5454 5455 /* 5456 * Check against data. 5457 */ 5458 if (hn_rndis_check_overlap(oob_off, oob_len, 5459 data_off, data_len)) { 5460 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5461 "oob overlaps data, oob abs %d len %d, " 5462 "data abs %d len %d\n", 5463 oob_off, oob_len, data_off, data_len); 5464 return; 5465 } 5466 5467 /* 5468 * Check against pktinfo. 5469 */ 5470 if (pktinfo_len != 0 && 5471 hn_rndis_check_overlap(oob_off, oob_len, 5472 pktinfo_off, pktinfo_len)) { 5473 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5474 "oob overlaps pktinfo, oob abs %d len %d, " 5475 "pktinfo abs %d len %d\n", 5476 oob_off, oob_len, pktinfo_off, pktinfo_len); 5477 return; 5478 } 5479 } 5480 5481 /* 5482 * Check per-packet-info coverage and find useful per-packet-info. 
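* (VLAN, checksum and hash info start out as the INVALID markers below and are only overwritten by hn_rndis_rxinfo() when the corresponding pktinfo is present and sane.)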
5483 */ 5484 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5485 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5486 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5487 if (__predict_true(pktinfo_len != 0)) { 5488 bool overlap; 5489 int error; 5490 5491 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5492 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5493 "pktinfo overflow, msglen %u, " 5494 "pktinfo abs %d len %d\n", 5495 pkt->rm_len, pktinfo_off, pktinfo_len); 5496 return; 5497 } 5498 5499 /* 5500 * Check packet info coverage. 5501 */ 5502 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5503 data_off, data_len); 5504 if (__predict_false(overlap)) { 5505 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5506 "pktinfo overlap data, pktinfo abs %d len %d, " 5507 "data abs %d len %d\n", 5508 pktinfo_off, pktinfo_len, data_off, data_len); 5509 return; 5510 } 5511 5512 /* 5513 * Find useful per-packet-info. 5514 */ 5515 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5516 pktinfo_len, &info); 5517 if (__predict_false(error)) { 5518 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5519 "pktinfo\n"); 5520 return; 5521 } 5522 } 5523 5524 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5525 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5526 "data overflow, msglen %u, data abs %d len %d\n", 5527 pkt->rm_len, data_off, data_len); 5528 return; 5529 } 5530 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5531 } 5532 5533 static __inline void 5534 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5535 { 5536 const struct rndis_msghdr *hdr; 5537 5538 if (__predict_false(dlen < sizeof(*hdr))) { 5539 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5540 return; 5541 } 5542 hdr = data; 5543 5544 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5545 /* Hot data path. */ 5546 hn_rndis_rx_data(rxr, data, dlen); 5547 /* Done! */ 5548 return; 5549 } 5550 5551 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5552 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5553 else 5554 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5555 } 5556 5557 static void 5558 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5559 { 5560 const struct hn_nvs_hdr *hdr; 5561 5562 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5563 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5564 return; 5565 } 5566 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5567 5568 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5569 /* Useless; ignore */ 5570 return; 5571 } 5572 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5573 } 5574 5575 static void 5576 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5577 const struct vmbus_chanpkt_hdr *pkt) 5578 { 5579 struct hn_nvs_sendctx *sndc; 5580 5581 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5582 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5583 VMBUS_CHANPKT_DATALEN(pkt)); 5584 /* 5585 * NOTE: 5586 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5587 * its callback. 
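* (The context is recovered from cph_xactid above, i.e. the transaction id that carried the send-context pointer for this request.)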
5588 */ 5589 } 5590 5591 static void 5592 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5593 const struct vmbus_chanpkt_hdr *pkthdr) 5594 { 5595 const struct vmbus_chanpkt_rxbuf *pkt; 5596 const struct hn_nvs_hdr *nvs_hdr; 5597 int count, i, hlen; 5598 5599 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5600 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5601 return; 5602 } 5603 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5604 5605 /* Make sure that this is a RNDIS message. */ 5606 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5607 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5608 nvs_hdr->nvs_type); 5609 return; 5610 } 5611 5612 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5613 if (__predict_false(hlen < sizeof(*pkt))) { 5614 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5615 return; 5616 } 5617 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5618 5619 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5620 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5621 pkt->cp_rxbuf_id); 5622 return; 5623 } 5624 5625 count = pkt->cp_rxbuf_cnt; 5626 if (__predict_false(hlen < 5627 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5628 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5629 return; 5630 } 5631 5632 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5633 for (i = 0; i < count; ++i) { 5634 int ofs, len; 5635 5636 ofs = pkt->cp_rxbuf[i].rb_ofs; 5637 len = pkt->cp_rxbuf[i].rb_len; 5638 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5639 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5640 "ofs %d, len %d\n", i, ofs, len); 5641 continue; 5642 } 5643 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5644 } 5645 5646 /* 5647 * Ack the consumed RXBUF associated w/ this channel packet, 5648 * so that this RXBUF can be recycled by the hypervisor. 5649 */ 5650 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5651 } 5652 5653 static void 5654 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5655 uint64_t tid) 5656 { 5657 struct hn_nvs_rndis_ack ack; 5658 int retries, error; 5659 5660 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5661 ack.nvs_status = HN_NVS_STATUS_OK; 5662 5663 retries = 0; 5664 again: 5665 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5666 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5667 if (__predict_false(error == EAGAIN)) { 5668 /* 5669 * NOTE: 5670 * This should _not_ happen in real world, since the 5671 * consumption of the TX bufring from the TX path is 5672 * controlled. 5673 */ 5674 if (rxr->hn_ack_failed == 0) 5675 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5676 rxr->hn_ack_failed++; 5677 retries++; 5678 if (retries < 10) { 5679 DELAY(100); 5680 goto again; 5681 } 5682 /* RXBUF leaks! */ 5683 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5684 } 5685 } 5686 5687 static void 5688 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5689 { 5690 struct hn_rx_ring *rxr = xrxr; 5691 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5692 5693 for (;;) { 5694 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5695 int error, pktlen; 5696 5697 pktlen = rxr->hn_pktbuf_len; 5698 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5699 if (__predict_false(error == ENOBUFS)) { 5700 void *nbuf; 5701 int nlen; 5702 5703 /* 5704 * Expand channel packet buffer. 5705 * 5706 * XXX 5707 * Use M_WAITOK here, since allocation failure 5708 * is fatal. 
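* (The buffer is doubled until it can hold the pktlen reported back by vmbus_chan_recv_pkt(), the old buffer is freed, and the receive is then retried.)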
5709 */ 5710 nlen = rxr->hn_pktbuf_len * 2; 5711 while (nlen < pktlen) 5712 nlen *= 2; 5713 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5714 5715 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5716 rxr->hn_pktbuf_len, nlen); 5717 5718 free(rxr->hn_pktbuf, M_DEVBUF); 5719 rxr->hn_pktbuf = nbuf; 5720 rxr->hn_pktbuf_len = nlen; 5721 /* Retry! */ 5722 continue; 5723 } else if (__predict_false(error == EAGAIN)) { 5724 /* No more channel packets; done! */ 5725 break; 5726 } 5727 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5728 5729 switch (pkt->cph_type) { 5730 case VMBUS_CHANPKT_TYPE_COMP: 5731 hn_nvs_handle_comp(sc, chan, pkt); 5732 break; 5733 5734 case VMBUS_CHANPKT_TYPE_RXBUF: 5735 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5736 break; 5737 5738 case VMBUS_CHANPKT_TYPE_INBAND: 5739 hn_nvs_handle_notify(sc, pkt); 5740 break; 5741 5742 default: 5743 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5744 pkt->cph_type); 5745 break; 5746 } 5747 } 5748 hn_chan_rollup(rxr, rxr->hn_txr); 5749 } 5750 5751 static void 5752 hn_tx_taskq_create(void *arg __unused) 5753 { 5754 int i; 5755 5756 /* 5757 * Fix the # of TX taskqueues. 5758 */ 5759 if (hn_tx_taskq_cnt <= 0) 5760 hn_tx_taskq_cnt = 1; 5761 else if (hn_tx_taskq_cnt > mp_ncpus) 5762 hn_tx_taskq_cnt = mp_ncpus; 5763 5764 /* 5765 * Fix the TX taskqueue mode. 5766 */ 5767 switch (hn_tx_taskq_mode) { 5768 case HN_TX_TASKQ_M_INDEP: 5769 case HN_TX_TASKQ_M_GLOBAL: 5770 case HN_TX_TASKQ_M_EVTTQ: 5771 break; 5772 default: 5773 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5774 break; 5775 } 5776 5777 if (vm_guest != VM_GUEST_HV) 5778 return; 5779 5780 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5781 return; 5782 5783 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5784 M_DEVBUF, M_WAITOK); 5785 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5786 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5787 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5788 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5789 "hn tx%d", i); 5790 } 5791 } 5792 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5793 hn_tx_taskq_create, NULL); 5794 5795 static void 5796 hn_tx_taskq_destroy(void *arg __unused) 5797 { 5798 5799 if (hn_tx_taskque != NULL) { 5800 int i; 5801 5802 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5803 taskqueue_free(hn_tx_taskque[i]); 5804 free(hn_tx_taskque, M_DEVBUF); 5805 } 5806 } 5807 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5808 hn_tx_taskq_destroy, NULL); 5809
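/* Example (assuming the first hn(4) instance is hn0): the TX statistics and knobs registered in hn_create_tx_data() appear under the per-device sysctl tree, e.g. "sysctl dev.hn.0.tx_chimney_size" for the chimney send size limit and "sysctl dev.hn.0.no_txdescs" for the TX descriptor shortage count. */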