/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_update_vf {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *, void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *, struct ifmediareq *);

static int	hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *, const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *, const void *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
		    uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *, int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *, struct vmbus_channel *);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
#ifndef RSS
static int	hn_rss_reconfig(struct hn_softc *);
#endif
static void	hn_rss_ind_fixup(struct hn_softc *);
static int	hn_rxpkt(struct hn_rx_ring *, const void *, int,
		    const struct hn_rxinfo *);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *,
		    struct vmbus_channel *, const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
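
/*
 * Module-global state shared by all hn(4) instances: the round-robin
 * index used to pick the leader CPU for a newly attached device's
 * channels, and the TX taskqueues shared across devices when
 * hw.hn.tx_taskq_mode selects the global mode.
 */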
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_flags & IFF_PROMISC) ||
	    (sc->hn_flags & HN_FLAG_VF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

#ifndef RSS
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
#endif	/* !RSS */

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_update_vf_task(void *arg, int pending __unused)
{
	struct hn_update_vf *uv = arg;

	uv->rxr->hn_vf = uv->vf;
}

static void
hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
{
	struct hn_rx_ring *rxr;
	struct hn_update_vf uv;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_update_vf_task, &uv);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			uv.rxr = rxr;
			uv.vf = vf;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_vf = vf;
		}
	}
}

static void
hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		goto out;

	if (ifp->if_alloctype != IFT_ETHER)
		goto out;

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		goto out;

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		goto out;

	/* Now we're sure 'ifp' is a real VF device. */
	if (vf) {
		if (sc->hn_flags & HN_FLAG_VF)
			goto out;

		sc->hn_flags |= HN_FLAG_VF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_VF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_VF;
		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);

	hn_update_vf(sc, vf ? ifp : NULL);

	if (vf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
	    vf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose)
		if_printf(hn_ifp, "Data path is switched %s %s\n",
		    vf ? "to" : "from", if_name(ifp));
out:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{
	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;

	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{
	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);

	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuffs have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}
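
/*
 * Per-channel receive/transmit rollup: flush packets buffered by LRO
 * and, when this channel has a TX ring with pending completions, run
 * its transmit-completion handler.
 */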
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	tcp_lro_flush_all(&rxr->hn_lro);
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	return (pi->rm_data);
}

static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}

static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
1852 */ 1853 olen = pkt->rm_len; 1854 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 1855 agg_txd->chim_size += pkt->rm_len - olen; 1856 1857 /* Link this txdesc to the parent. */ 1858 hn_txdesc_agg(agg_txd, txd); 1859 1860 chim = (uint8_t *)pkt + pkt->rm_len; 1861 /* Save the current packet for later fixup. */ 1862 txr->hn_agg_prevpkt = chim; 1863 1864 txr->hn_agg_pktleft--; 1865 txr->hn_agg_szleft -= pktsize; 1866 if (txr->hn_agg_szleft <= 1867 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1868 /* 1869 * Probably can't aggregate more packets, 1870 * flush this aggregating txdesc proactively. 1871 */ 1872 txr->hn_agg_pktleft = 0; 1873 } 1874 /* Done! */ 1875 return (chim); 1876 } 1877 hn_flush_txagg(ifp, txr); 1878 } 1879 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 1880 1881 txr->hn_tx_chimney_tried++; 1882 txd->chim_index = hn_chim_alloc(txr->hn_sc); 1883 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 1884 return (NULL); 1885 txr->hn_tx_chimney++; 1886 1887 chim = txr->hn_sc->hn_chim + 1888 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 1889 1890 if (txr->hn_agg_pktmax > 1 && 1891 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1892 txr->hn_agg_txd = txd; 1893 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 1894 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 1895 txr->hn_agg_prevpkt = chim; 1896 } 1897 return (chim); 1898 } 1899 1900 /* 1901 * NOTE: 1902 * If this function fails, then both txd and m_head0 will be freed. 1903 */ 1904 static int 1905 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1906 struct mbuf **m_head0) 1907 { 1908 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 1909 int error, nsegs, i; 1910 struct mbuf *m_head = *m_head0; 1911 struct rndis_packet_msg *pkt; 1912 uint32_t *pi_data; 1913 void *chim = NULL; 1914 int pkt_hlen, pkt_size; 1915 1916 pkt = txd->rndis_pkt; 1917 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 1918 if (pkt_size < txr->hn_chim_size) { 1919 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 1920 if (chim != NULL) 1921 pkt = chim; 1922 } else { 1923 if (txr->hn_agg_txd != NULL) 1924 hn_flush_txagg(ifp, txr); 1925 } 1926 1927 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 1928 pkt->rm_len = m_head->m_pkthdr.len; 1929 pkt->rm_dataoffset = 0; 1930 pkt->rm_datalen = m_head->m_pkthdr.len; 1931 pkt->rm_oobdataoffset = 0; 1932 pkt->rm_oobdatalen = 0; 1933 pkt->rm_oobdataelements = 0; 1934 pkt->rm_pktinfooffset = sizeof(*pkt); 1935 pkt->rm_pktinfolen = 0; 1936 pkt->rm_vchandle = 0; 1937 pkt->rm_reserved = 0; 1938 1939 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 1940 /* 1941 * Set the hash value for this packet, so that the host could 1942 * dispatch the TX done event for this packet back to this TX 1943 * ring's channel. 
1944 */ 1945 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1946 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 1947 *pi_data = txr->hn_tx_idx; 1948 } 1949 1950 if (m_head->m_flags & M_VLANTAG) { 1951 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1952 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 1953 *pi_data = NDIS_VLAN_INFO_MAKE( 1954 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 1955 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 1956 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 1957 } 1958 1959 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 1960 #if defined(INET6) || defined(INET) 1961 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1962 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 1963 #ifdef INET 1964 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 1965 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, 1966 m_head->m_pkthdr.tso_segsz); 1967 } 1968 #endif 1969 #if defined(INET6) && defined(INET) 1970 else 1971 #endif 1972 #ifdef INET6 1973 { 1974 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, 1975 m_head->m_pkthdr.tso_segsz); 1976 } 1977 #endif 1978 #endif /* INET6 || INET */ 1979 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 1980 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1981 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 1982 if (m_head->m_pkthdr.csum_flags & 1983 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 1984 *pi_data = NDIS_TXCSUM_INFO_IPV6; 1985 } else { 1986 *pi_data = NDIS_TXCSUM_INFO_IPV4; 1987 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 1988 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 1989 } 1990 1991 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) 1992 *pi_data |= NDIS_TXCSUM_INFO_TCPCS; 1993 else if (m_head->m_pkthdr.csum_flags & 1994 (CSUM_IP_UDP | CSUM_IP6_UDP)) 1995 *pi_data |= NDIS_TXCSUM_INFO_UDPCS; 1996 } 1997 1998 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 1999 /* Fixup RNDIS packet message total length */ 2000 pkt->rm_len += pkt_hlen; 2001 /* Convert RNDIS packet message offsets */ 2002 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 2003 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 2004 2005 /* 2006 * Fast path: Chimney sending. 2007 */ 2008 if (chim != NULL) { 2009 struct hn_txdesc *tgt_txd = txd; 2010 2011 if (txr->hn_agg_txd != NULL) { 2012 tgt_txd = txr->hn_agg_txd; 2013 #ifdef INVARIANTS 2014 *m_head0 = NULL; 2015 #endif 2016 } 2017 2018 KASSERT(pkt == chim, 2019 ("RNDIS pkt not in chimney sending buffer")); 2020 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 2021 ("chimney sending buffer is not used")); 2022 tgt_txd->chim_size += pkt->rm_len; 2023 2024 m_copydata(m_head, 0, m_head->m_pkthdr.len, 2025 ((uint8_t *)chim) + pkt_hlen); 2026 2027 txr->hn_gpa_cnt = 0; 2028 txr->hn_sendpkt = hn_txpkt_chim; 2029 goto done; 2030 } 2031 2032 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 2033 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2034 ("chimney buffer is used")); 2035 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 2036 2037 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 2038 if (__predict_false(error)) { 2039 int freed; 2040 2041 /* 2042 * This mbuf is not linked w/ the txd yet, so free it now. 
2043 */ 2044 m_freem(m_head); 2045 *m_head0 = NULL; 2046 2047 freed = hn_txdesc_put(txr, txd); 2048 KASSERT(freed != 0, 2049 ("fail to free txd upon txdma error")); 2050 2051 txr->hn_txdma_failed++; 2052 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 2053 return error; 2054 } 2055 *m_head0 = m_head; 2056 2057 /* +1 RNDIS packet message */ 2058 txr->hn_gpa_cnt = nsegs + 1; 2059 2060 /* send packet with page buffer */ 2061 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 2062 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 2063 txr->hn_gpa[0].gpa_len = pkt_hlen; 2064 2065 /* 2066 * Fill the page buffers with mbuf info after the page 2067 * buffer for RNDIS packet message. 2068 */ 2069 for (i = 0; i < nsegs; ++i) { 2070 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 2071 2072 gpa->gpa_page = atop(segs[i].ds_addr); 2073 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 2074 gpa->gpa_len = segs[i].ds_len; 2075 } 2076 2077 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2078 txd->chim_size = 0; 2079 txr->hn_sendpkt = hn_txpkt_sglist; 2080 done: 2081 txd->m = m_head; 2082 2083 /* Set the completion routine */ 2084 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 2085 2086 /* Update temporary stats for later use. */ 2087 txr->hn_stat_pkts++; 2088 txr->hn_stat_size += m_head->m_pkthdr.len; 2089 if (m_head->m_flags & M_MCAST) 2090 txr->hn_stat_mcasts++; 2091 2092 return 0; 2093 } 2094 2095 /* 2096 * NOTE: 2097 * If this function fails, then txd will be freed, but the mbuf 2098 * associated w/ the txd will _not_ be freed. 2099 */ 2100 static int 2101 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 2102 { 2103 int error, send_failed = 0, has_bpf; 2104 2105 again: 2106 has_bpf = bpf_peers_present(ifp->if_bpf); 2107 if (has_bpf) { 2108 /* 2109 * Make sure that this txd and any aggregated txds are not 2110 * freed before ETHER_BPF_MTAP. 2111 */ 2112 hn_txdesc_hold(txd); 2113 } 2114 error = txr->hn_sendpkt(txr, txd); 2115 if (!error) { 2116 if (has_bpf) { 2117 const struct hn_txdesc *tmp_txd; 2118 2119 ETHER_BPF_MTAP(ifp, txd->m); 2120 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 2121 ETHER_BPF_MTAP(ifp, tmp_txd->m); 2122 } 2123 2124 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 2125 #ifdef HN_IFSTART_SUPPORT 2126 if (!hn_use_if_start) 2127 #endif 2128 { 2129 if_inc_counter(ifp, IFCOUNTER_OBYTES, 2130 txr->hn_stat_size); 2131 if (txr->hn_stat_mcasts != 0) { 2132 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 2133 txr->hn_stat_mcasts); 2134 } 2135 } 2136 txr->hn_pkts += txr->hn_stat_pkts; 2137 txr->hn_sends++; 2138 } 2139 if (has_bpf) 2140 hn_txdesc_put(txr, txd); 2141 2142 if (__predict_false(error)) { 2143 int freed; 2144 2145 /* 2146 * This should "really rarely" happen. 2147 * 2148 * XXX Too many RX to be acked or too many sideband 2149 * commands to run? Ask netvsc_channel_rollup() 2150 * to kick start later. 2151 */ 2152 txr->hn_has_txeof = 1; 2153 if (!send_failed) { 2154 txr->hn_send_failed++; 2155 send_failed = 1; 2156 /* 2157 * Try sending again after set hn_has_txeof; 2158 * in case that we missed the last 2159 * netvsc_channel_rollup(). 2160 */ 2161 goto again; 2162 } 2163 if_printf(ifp, "send failed\n"); 2164 2165 /* 2166 * Caller will perform further processing on the 2167 * associated mbuf, so don't free it in hn_txdesc_put(); 2168 * only unload it from the DMA map in hn_txdesc_put(), 2169 * if it was loaded. 
2170 */ 2171 txd->m = NULL; 2172 freed = hn_txdesc_put(txr, txd); 2173 KASSERT(freed != 0, 2174 ("fail to free txd upon send error")); 2175 2176 txr->hn_send_failed++; 2177 } 2178 2179 /* Reset temporary stats, after this sending is done. */ 2180 txr->hn_stat_size = 0; 2181 txr->hn_stat_pkts = 0; 2182 txr->hn_stat_mcasts = 0; 2183 2184 return (error); 2185 } 2186 2187 /* 2188 * Append the specified data to the indicated mbuf chain, 2189 * Extend the mbuf chain if the new data does not fit in 2190 * existing space. 2191 * 2192 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2193 * There should be an equivalent in the kernel mbuf code, 2194 * but there does not appear to be one yet. 2195 * 2196 * Differs from m_append() in that additional mbufs are 2197 * allocated with cluster size MJUMPAGESIZE, and filled 2198 * accordingly. 2199 * 2200 * Return 1 if able to complete the job; otherwise 0. 2201 */ 2202 static int 2203 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2204 { 2205 struct mbuf *m, *n; 2206 int remainder, space; 2207 2208 for (m = m0; m->m_next != NULL; m = m->m_next) 2209 ; 2210 remainder = len; 2211 space = M_TRAILINGSPACE(m); 2212 if (space > 0) { 2213 /* 2214 * Copy into available space. 2215 */ 2216 if (space > remainder) 2217 space = remainder; 2218 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2219 m->m_len += space; 2220 cp += space; 2221 remainder -= space; 2222 } 2223 while (remainder > 0) { 2224 /* 2225 * Allocate a new mbuf; could check space 2226 * and allocate a cluster instead. 2227 */ 2228 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2229 if (n == NULL) 2230 break; 2231 n->m_len = min(MJUMPAGESIZE, remainder); 2232 bcopy(cp, mtod(n, caddr_t), n->m_len); 2233 cp += n->m_len; 2234 remainder -= n->m_len; 2235 m->m_next = n; 2236 m = n; 2237 } 2238 if (m0->m_flags & M_PKTHDR) 2239 m0->m_pkthdr.len += len - remainder; 2240 2241 return (remainder == 0); 2242 } 2243 2244 #if defined(INET) || defined(INET6) 2245 static __inline int 2246 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2247 { 2248 #if __FreeBSD_version >= 1100095 2249 if (hn_lro_mbufq_depth) { 2250 tcp_lro_queue_mbuf(lc, m); 2251 return 0; 2252 } 2253 #endif 2254 return tcp_lro_rx(lc, m, 0); 2255 } 2256 #endif 2257 2258 static int 2259 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2260 const struct hn_rxinfo *info) 2261 { 2262 struct ifnet *ifp; 2263 struct mbuf *m_new; 2264 int size, do_lro = 0, do_csum = 1; 2265 int hash_type; 2266 2267 /* If the VF is active, inject the packet through the VF */ 2268 ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp; 2269 2270 if (dlen <= MHLEN) { 2271 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2272 if (m_new == NULL) { 2273 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2274 return (0); 2275 } 2276 memcpy(mtod(m_new, void *), data, dlen); 2277 m_new->m_pkthdr.len = m_new->m_len = dlen; 2278 rxr->hn_small_pkts++; 2279 } else { 2280 /* 2281 * Get an mbuf with a cluster. For packets 2K or less, 2282 * get a standard 2K cluster. For anything larger, get a 2283 * 4K cluster. Any buffers larger than 4K can cause problems 2284 * if looped around to the Hyper-V TX channel, so avoid them. 
2285 */ 2286 size = MCLBYTES; 2287 if (dlen > MCLBYTES) { 2288 /* 4096 */ 2289 size = MJUMPAGESIZE; 2290 } 2291 2292 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2293 if (m_new == NULL) { 2294 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2295 return (0); 2296 } 2297 2298 hv_m_append(m_new, dlen, data); 2299 } 2300 m_new->m_pkthdr.rcvif = ifp; 2301 2302 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2303 do_csum = 0; 2304 2305 /* receive side checksum offload */ 2306 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2307 /* IP csum offload */ 2308 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2309 m_new->m_pkthdr.csum_flags |= 2310 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2311 rxr->hn_csum_ip++; 2312 } 2313 2314 /* TCP/UDP csum offload */ 2315 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2316 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2317 m_new->m_pkthdr.csum_flags |= 2318 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2319 m_new->m_pkthdr.csum_data = 0xffff; 2320 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2321 rxr->hn_csum_tcp++; 2322 else 2323 rxr->hn_csum_udp++; 2324 } 2325 2326 /* 2327 * XXX 2328 * As of this write (Oct 28th, 2016), host side will turn 2329 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2330 * the do_lro setting here is actually _not_ accurate. We 2331 * depend on the RSS hash type check to reset do_lro. 2332 */ 2333 if ((info->csum_info & 2334 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2335 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2336 do_lro = 1; 2337 } else { 2338 const struct ether_header *eh; 2339 uint16_t etype; 2340 int hoff; 2341 2342 hoff = sizeof(*eh); 2343 if (m_new->m_len < hoff) 2344 goto skip; 2345 eh = mtod(m_new, struct ether_header *); 2346 etype = ntohs(eh->ether_type); 2347 if (etype == ETHERTYPE_VLAN) { 2348 const struct ether_vlan_header *evl; 2349 2350 hoff = sizeof(*evl); 2351 if (m_new->m_len < hoff) 2352 goto skip; 2353 evl = mtod(m_new, struct ether_vlan_header *); 2354 etype = ntohs(evl->evl_proto); 2355 } 2356 2357 if (etype == ETHERTYPE_IP) { 2358 int pr; 2359 2360 pr = hn_check_iplen(m_new, hoff); 2361 if (pr == IPPROTO_TCP) { 2362 if (do_csum && 2363 (rxr->hn_trust_hcsum & 2364 HN_TRUST_HCSUM_TCP)) { 2365 rxr->hn_csum_trusted++; 2366 m_new->m_pkthdr.csum_flags |= 2367 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2368 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2369 m_new->m_pkthdr.csum_data = 0xffff; 2370 } 2371 do_lro = 1; 2372 } else if (pr == IPPROTO_UDP) { 2373 if (do_csum && 2374 (rxr->hn_trust_hcsum & 2375 HN_TRUST_HCSUM_UDP)) { 2376 rxr->hn_csum_trusted++; 2377 m_new->m_pkthdr.csum_flags |= 2378 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2379 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2380 m_new->m_pkthdr.csum_data = 0xffff; 2381 } 2382 } else if (pr != IPPROTO_DONE && do_csum && 2383 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2384 rxr->hn_csum_trusted++; 2385 m_new->m_pkthdr.csum_flags |= 2386 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2387 } 2388 } 2389 } 2390 skip: 2391 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2392 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2393 NDIS_VLAN_INFO_ID(info->vlan_info), 2394 NDIS_VLAN_INFO_PRI(info->vlan_info), 2395 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2396 m_new->m_flags |= M_VLANTAG; 2397 } 2398 2399 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2400 rxr->hn_rss_pkts++; 2401 m_new->m_pkthdr.flowid = info->hash_value; 2402 hash_type = M_HASHTYPE_OPAQUE_HASH; 2403 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2404 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2405 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2406 2407 /* 2408 * NOTE: 2409 * do_lro is resetted, if the hash types are not TCP 2410 * related. See the comment in the above csum_flags 2411 * setup section. 2412 */ 2413 switch (type) { 2414 case NDIS_HASH_IPV4: 2415 hash_type = M_HASHTYPE_RSS_IPV4; 2416 do_lro = 0; 2417 break; 2418 2419 case NDIS_HASH_TCP_IPV4: 2420 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2421 break; 2422 2423 case NDIS_HASH_IPV6: 2424 hash_type = M_HASHTYPE_RSS_IPV6; 2425 do_lro = 0; 2426 break; 2427 2428 case NDIS_HASH_IPV6_EX: 2429 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2430 do_lro = 0; 2431 break; 2432 2433 case NDIS_HASH_TCP_IPV6: 2434 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2435 break; 2436 2437 case NDIS_HASH_TCP_IPV6_EX: 2438 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2439 break; 2440 } 2441 } 2442 } else { 2443 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2444 hash_type = M_HASHTYPE_OPAQUE; 2445 } 2446 M_HASHTYPE_SET(m_new, hash_type); 2447 2448 /* 2449 * Note: Moved RX completion back to hv_nv_on_receive() so all 2450 * messages (not just data messages) will trigger a response. 2451 */ 2452 2453 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2454 rxr->hn_pkts++; 2455 2456 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2457 #if defined(INET) || defined(INET6) 2458 struct lro_ctrl *lro = &rxr->hn_lro; 2459 2460 if (lro->lro_cnt) { 2461 rxr->hn_lro_tried++; 2462 if (hn_lro_rx(lro, m_new) == 0) { 2463 /* DONE! */ 2464 return 0; 2465 } 2466 } 2467 #endif 2468 } 2469 2470 /* We're not holding the lock here, so don't release it */ 2471 (*ifp->if_input)(ifp, m_new); 2472 2473 return (0); 2474 } 2475 2476 static int 2477 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2478 { 2479 struct hn_softc *sc = ifp->if_softc; 2480 struct ifreq *ifr = (struct ifreq *)data; 2481 int mask, error = 0; 2482 2483 switch (cmd) { 2484 case SIOCSIFMTU: 2485 if (ifr->ifr_mtu > HN_MTU_MAX) { 2486 error = EINVAL; 2487 break; 2488 } 2489 2490 HN_LOCK(sc); 2491 2492 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2493 HN_UNLOCK(sc); 2494 break; 2495 } 2496 2497 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2498 /* Can't change MTU */ 2499 HN_UNLOCK(sc); 2500 error = EOPNOTSUPP; 2501 break; 2502 } 2503 2504 if (ifp->if_mtu == ifr->ifr_mtu) { 2505 HN_UNLOCK(sc); 2506 break; 2507 } 2508 2509 /* 2510 * Suspend this interface before the synthetic parts 2511 * are ripped. 2512 */ 2513 hn_suspend(sc); 2514 2515 /* 2516 * Detach the synthetics parts, i.e. NVS and RNDIS. 2517 */ 2518 hn_synth_detach(sc); 2519 2520 /* 2521 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2522 * with the new MTU setting. 2523 */ 2524 error = hn_synth_attach(sc, ifr->ifr_mtu); 2525 if (error) { 2526 HN_UNLOCK(sc); 2527 break; 2528 } 2529 2530 /* 2531 * Commit the requested MTU, after the synthetic parts 2532 * have been successfully attached. 2533 */ 2534 ifp->if_mtu = ifr->ifr_mtu; 2535 2536 /* 2537 * Make sure that various parameters based on MTU are 2538 * still valid, after the MTU change. 2539 */ 2540 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2541 hn_set_chim_size(sc, sc->hn_chim_szmax); 2542 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2543 #if __FreeBSD_version >= 1100099 2544 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2545 HN_LRO_LENLIM_MIN(ifp)) 2546 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2547 #endif 2548 2549 /* 2550 * All done! Resume the interface now. 
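 *
 * Note that the ordering above matters: if_mtu is committed only
 * after hn_synth_attach() succeeds, so a failed reattach leaves
 * the old MTU in place and hn_resume() is skipped in that error
 * path.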
2551 */ 2552 hn_resume(sc); 2553 2554 HN_UNLOCK(sc); 2555 break; 2556 2557 case SIOCSIFFLAGS: 2558 HN_LOCK(sc); 2559 2560 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2561 HN_UNLOCK(sc); 2562 break; 2563 } 2564 2565 if (ifp->if_flags & IFF_UP) { 2566 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2567 /* 2568 * Caller meight hold mutex, e.g. 2569 * bpf; use busy-wait for the RNDIS 2570 * reply. 2571 */ 2572 HN_NO_SLEEPING(sc); 2573 hn_rxfilter_config(sc); 2574 HN_SLEEPING_OK(sc); 2575 } else { 2576 hn_init_locked(sc); 2577 } 2578 } else { 2579 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2580 hn_stop(sc, false); 2581 } 2582 sc->hn_if_flags = ifp->if_flags; 2583 2584 HN_UNLOCK(sc); 2585 break; 2586 2587 case SIOCSIFCAP: 2588 HN_LOCK(sc); 2589 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2590 2591 if (mask & IFCAP_TXCSUM) { 2592 ifp->if_capenable ^= IFCAP_TXCSUM; 2593 if (ifp->if_capenable & IFCAP_TXCSUM) 2594 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2595 else 2596 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2597 } 2598 if (mask & IFCAP_TXCSUM_IPV6) { 2599 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2600 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2601 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2602 else 2603 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2604 } 2605 2606 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2607 if (mask & IFCAP_RXCSUM) 2608 ifp->if_capenable ^= IFCAP_RXCSUM; 2609 #ifdef foo 2610 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2611 if (mask & IFCAP_RXCSUM_IPV6) 2612 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2613 #endif 2614 2615 if (mask & IFCAP_LRO) 2616 ifp->if_capenable ^= IFCAP_LRO; 2617 2618 if (mask & IFCAP_TSO4) { 2619 ifp->if_capenable ^= IFCAP_TSO4; 2620 if (ifp->if_capenable & IFCAP_TSO4) 2621 ifp->if_hwassist |= CSUM_IP_TSO; 2622 else 2623 ifp->if_hwassist &= ~CSUM_IP_TSO; 2624 } 2625 if (mask & IFCAP_TSO6) { 2626 ifp->if_capenable ^= IFCAP_TSO6; 2627 if (ifp->if_capenable & IFCAP_TSO6) 2628 ifp->if_hwassist |= CSUM_IP6_TSO; 2629 else 2630 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2631 } 2632 2633 HN_UNLOCK(sc); 2634 break; 2635 2636 case SIOCADDMULTI: 2637 case SIOCDELMULTI: 2638 HN_LOCK(sc); 2639 2640 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2641 HN_UNLOCK(sc); 2642 break; 2643 } 2644 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2645 /* 2646 * Multicast uses mutex; use busy-wait for 2647 * the RNDIS reply. 2648 */ 2649 HN_NO_SLEEPING(sc); 2650 hn_rxfilter_config(sc); 2651 HN_SLEEPING_OK(sc); 2652 } 2653 2654 HN_UNLOCK(sc); 2655 break; 2656 2657 case SIOCSIFMEDIA: 2658 case SIOCGIFMEDIA: 2659 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2660 break; 2661 2662 default: 2663 error = ether_ioctl(ifp, cmd, data); 2664 break; 2665 } 2666 return (error); 2667 } 2668 2669 static void 2670 hn_stop(struct hn_softc *sc, bool detaching) 2671 { 2672 struct ifnet *ifp = sc->hn_ifp; 2673 int i; 2674 2675 HN_LOCK_ASSERT(sc); 2676 2677 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2678 ("synthetic parts were not attached")); 2679 2680 /* Disable polling. */ 2681 hn_polling(sc, 0); 2682 2683 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2684 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2685 hn_suspend_data(sc); 2686 2687 /* Clear OACTIVE bit. */ 2688 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2689 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2690 sc->hn_tx_ring[i].hn_oactive = 0; 2691 2692 /* 2693 * If the VF is active, make sure the filter is not 0, even if 2694 * the synthetic NIC is down. 
2695 */ 2696 if (!detaching && (sc->hn_flags & HN_FLAG_VF)) 2697 hn_rxfilter_config(sc); 2698 } 2699 2700 static void 2701 hn_init_locked(struct hn_softc *sc) 2702 { 2703 struct ifnet *ifp = sc->hn_ifp; 2704 int i; 2705 2706 HN_LOCK_ASSERT(sc); 2707 2708 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2709 return; 2710 2711 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2712 return; 2713 2714 /* Configure RX filter */ 2715 hn_rxfilter_config(sc); 2716 2717 /* Clear OACTIVE bit. */ 2718 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2719 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2720 sc->hn_tx_ring[i].hn_oactive = 0; 2721 2722 /* Clear TX 'suspended' bit. */ 2723 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2724 2725 /* Everything is ready; unleash! */ 2726 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2727 2728 /* Re-enable polling if requested. */ 2729 if (sc->hn_pollhz > 0) 2730 hn_polling(sc, sc->hn_pollhz); 2731 } 2732 2733 static void 2734 hn_init(void *xsc) 2735 { 2736 struct hn_softc *sc = xsc; 2737 2738 HN_LOCK(sc); 2739 hn_init_locked(sc); 2740 HN_UNLOCK(sc); 2741 } 2742 2743 #if __FreeBSD_version >= 1100099 2744 2745 static int 2746 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2747 { 2748 struct hn_softc *sc = arg1; 2749 unsigned int lenlim; 2750 int error; 2751 2752 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2753 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2754 if (error || req->newptr == NULL) 2755 return error; 2756 2757 HN_LOCK(sc); 2758 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2759 lenlim > TCP_LRO_LENGTH_MAX) { 2760 HN_UNLOCK(sc); 2761 return EINVAL; 2762 } 2763 hn_set_lro_lenlim(sc, lenlim); 2764 HN_UNLOCK(sc); 2765 2766 return 0; 2767 } 2768 2769 static int 2770 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2771 { 2772 struct hn_softc *sc = arg1; 2773 int ackcnt, error, i; 2774 2775 /* 2776 * lro_ackcnt_lim is append count limit, 2777 * +1 to turn it into aggregation limit. 2778 */ 2779 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2780 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2781 if (error || req->newptr == NULL) 2782 return error; 2783 2784 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2785 return EINVAL; 2786 2787 /* 2788 * Convert aggregation limit back to append 2789 * count limit. 
2790 */ 2791 --ackcnt; 2792 HN_LOCK(sc); 2793 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2794 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2795 HN_UNLOCK(sc); 2796 return 0; 2797 } 2798 2799 #endif 2800 2801 static int 2802 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2803 { 2804 struct hn_softc *sc = arg1; 2805 int hcsum = arg2; 2806 int on, error, i; 2807 2808 on = 0; 2809 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2810 on = 1; 2811 2812 error = sysctl_handle_int(oidp, &on, 0, req); 2813 if (error || req->newptr == NULL) 2814 return error; 2815 2816 HN_LOCK(sc); 2817 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2818 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2819 2820 if (on) 2821 rxr->hn_trust_hcsum |= hcsum; 2822 else 2823 rxr->hn_trust_hcsum &= ~hcsum; 2824 } 2825 HN_UNLOCK(sc); 2826 return 0; 2827 } 2828 2829 static int 2830 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2831 { 2832 struct hn_softc *sc = arg1; 2833 int chim_size, error; 2834 2835 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2836 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2837 if (error || req->newptr == NULL) 2838 return error; 2839 2840 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2841 return EINVAL; 2842 2843 HN_LOCK(sc); 2844 hn_set_chim_size(sc, chim_size); 2845 HN_UNLOCK(sc); 2846 return 0; 2847 } 2848 2849 #if __FreeBSD_version < 1100095 2850 static int 2851 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2852 { 2853 struct hn_softc *sc = arg1; 2854 int ofs = arg2, i, error; 2855 struct hn_rx_ring *rxr; 2856 uint64_t stat; 2857 2858 stat = 0; 2859 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2860 rxr = &sc->hn_rx_ring[i]; 2861 stat += *((int *)((uint8_t *)rxr + ofs)); 2862 } 2863 2864 error = sysctl_handle_64(oidp, &stat, 0, req); 2865 if (error || req->newptr == NULL) 2866 return error; 2867 2868 /* Zero out this stat. */ 2869 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2870 rxr = &sc->hn_rx_ring[i]; 2871 *((int *)((uint8_t *)rxr + ofs)) = 0; 2872 } 2873 return 0; 2874 } 2875 #else 2876 static int 2877 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2878 { 2879 struct hn_softc *sc = arg1; 2880 int ofs = arg2, i, error; 2881 struct hn_rx_ring *rxr; 2882 uint64_t stat; 2883 2884 stat = 0; 2885 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2886 rxr = &sc->hn_rx_ring[i]; 2887 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2888 } 2889 2890 error = sysctl_handle_64(oidp, &stat, 0, req); 2891 if (error || req->newptr == NULL) 2892 return error; 2893 2894 /* Zero out this stat. */ 2895 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2896 rxr = &sc->hn_rx_ring[i]; 2897 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2898 } 2899 return 0; 2900 } 2901 2902 #endif 2903 2904 static int 2905 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2906 { 2907 struct hn_softc *sc = arg1; 2908 int ofs = arg2, i, error; 2909 struct hn_rx_ring *rxr; 2910 u_long stat; 2911 2912 stat = 0; 2913 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2914 rxr = &sc->hn_rx_ring[i]; 2915 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2916 } 2917 2918 error = sysctl_handle_long(oidp, &stat, 0, req); 2919 if (error || req->newptr == NULL) 2920 return error; 2921 2922 /* Zero out this stat. 
*/ 2923 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2924 rxr = &sc->hn_rx_ring[i]; 2925 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2926 } 2927 return 0; 2928 } 2929 2930 static int 2931 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2932 { 2933 struct hn_softc *sc = arg1; 2934 int ofs = arg2, i, error; 2935 struct hn_tx_ring *txr; 2936 u_long stat; 2937 2938 stat = 0; 2939 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2940 txr = &sc->hn_tx_ring[i]; 2941 stat += *((u_long *)((uint8_t *)txr + ofs)); 2942 } 2943 2944 error = sysctl_handle_long(oidp, &stat, 0, req); 2945 if (error || req->newptr == NULL) 2946 return error; 2947 2948 /* Zero out this stat. */ 2949 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2950 txr = &sc->hn_tx_ring[i]; 2951 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2952 } 2953 return 0; 2954 } 2955 2956 static int 2957 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2958 { 2959 struct hn_softc *sc = arg1; 2960 int ofs = arg2, i, error, conf; 2961 struct hn_tx_ring *txr; 2962 2963 txr = &sc->hn_tx_ring[0]; 2964 conf = *((int *)((uint8_t *)txr + ofs)); 2965 2966 error = sysctl_handle_int(oidp, &conf, 0, req); 2967 if (error || req->newptr == NULL) 2968 return error; 2969 2970 HN_LOCK(sc); 2971 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2972 txr = &sc->hn_tx_ring[i]; 2973 *((int *)((uint8_t *)txr + ofs)) = conf; 2974 } 2975 HN_UNLOCK(sc); 2976 2977 return 0; 2978 } 2979 2980 static int 2981 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2982 { 2983 struct hn_softc *sc = arg1; 2984 int error, size; 2985 2986 size = sc->hn_agg_size; 2987 error = sysctl_handle_int(oidp, &size, 0, req); 2988 if (error || req->newptr == NULL) 2989 return (error); 2990 2991 HN_LOCK(sc); 2992 sc->hn_agg_size = size; 2993 hn_set_txagg(sc); 2994 HN_UNLOCK(sc); 2995 2996 return (0); 2997 } 2998 2999 static int 3000 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 3001 { 3002 struct hn_softc *sc = arg1; 3003 int error, pkts; 3004 3005 pkts = sc->hn_agg_pkts; 3006 error = sysctl_handle_int(oidp, &pkts, 0, req); 3007 if (error || req->newptr == NULL) 3008 return (error); 3009 3010 HN_LOCK(sc); 3011 sc->hn_agg_pkts = pkts; 3012 hn_set_txagg(sc); 3013 HN_UNLOCK(sc); 3014 3015 return (0); 3016 } 3017 3018 static int 3019 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 3020 { 3021 struct hn_softc *sc = arg1; 3022 int pkts; 3023 3024 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 3025 return (sysctl_handle_int(oidp, &pkts, 0, req)); 3026 } 3027 3028 static int 3029 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 3030 { 3031 struct hn_softc *sc = arg1; 3032 int align; 3033 3034 align = sc->hn_tx_ring[0].hn_agg_align; 3035 return (sysctl_handle_int(oidp, &align, 0, req)); 3036 } 3037 3038 static void 3039 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 3040 { 3041 if (pollhz == 0) 3042 vmbus_chan_poll_disable(chan); 3043 else 3044 vmbus_chan_poll_enable(chan, pollhz); 3045 } 3046 3047 static void 3048 hn_polling(struct hn_softc *sc, u_int pollhz) 3049 { 3050 int nsubch = sc->hn_rx_ring_inuse - 1; 3051 3052 HN_LOCK_ASSERT(sc); 3053 3054 if (nsubch > 0) { 3055 struct vmbus_channel **subch; 3056 int i; 3057 3058 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 3059 for (i = 0; i < nsubch; ++i) 3060 hn_chan_polling(subch[i], pollhz); 3061 vmbus_subchan_rel(subch, nsubch); 3062 } 3063 hn_chan_polling(sc->hn_prichan, pollhz); 3064 } 3065 3066 static int 3067 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 3068 { 3069 struct hn_softc *sc = arg1; 3070 int pollhz, error; 3071 3072 pollhz = sc->hn_pollhz; 3073 error = sysctl_handle_int(oidp, &pollhz, 0, req); 3074 
if (error || req->newptr == NULL) 3075 return (error); 3076 3077 if (pollhz != 0 && 3078 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 3079 return (EINVAL); 3080 3081 HN_LOCK(sc); 3082 if (sc->hn_pollhz != pollhz) { 3083 sc->hn_pollhz = pollhz; 3084 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 3085 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 3086 hn_polling(sc, sc->hn_pollhz); 3087 } 3088 HN_UNLOCK(sc); 3089 3090 return (0); 3091 } 3092 3093 static int 3094 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 3095 { 3096 struct hn_softc *sc = arg1; 3097 char verstr[16]; 3098 3099 snprintf(verstr, sizeof(verstr), "%u.%u", 3100 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 3101 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 3102 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 3103 } 3104 3105 static int 3106 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 3107 { 3108 struct hn_softc *sc = arg1; 3109 char caps_str[128]; 3110 uint32_t caps; 3111 3112 HN_LOCK(sc); 3113 caps = sc->hn_caps; 3114 HN_UNLOCK(sc); 3115 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 3116 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 3117 } 3118 3119 static int 3120 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 3121 { 3122 struct hn_softc *sc = arg1; 3123 char assist_str[128]; 3124 uint32_t hwassist; 3125 3126 HN_LOCK(sc); 3127 hwassist = sc->hn_ifp->if_hwassist; 3128 HN_UNLOCK(sc); 3129 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 3130 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 3131 } 3132 3133 static int 3134 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 3135 { 3136 struct hn_softc *sc = arg1; 3137 char filter_str[128]; 3138 uint32_t filter; 3139 3140 HN_LOCK(sc); 3141 filter = sc->hn_rx_filter; 3142 HN_UNLOCK(sc); 3143 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3144 NDIS_PACKET_TYPES); 3145 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3146 } 3147 3148 #ifndef RSS 3149 3150 static int 3151 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3152 { 3153 struct hn_softc *sc = arg1; 3154 int error; 3155 3156 HN_LOCK(sc); 3157 3158 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3159 if (error || req->newptr == NULL) 3160 goto back; 3161 3162 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3163 if (error) 3164 goto back; 3165 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 3166 3167 if (sc->hn_rx_ring_inuse > 1) { 3168 error = hn_rss_reconfig(sc); 3169 } else { 3170 /* Not RSS capable, at least for now; just save the RSS key. */ 3171 error = 0; 3172 } 3173 back: 3174 HN_UNLOCK(sc); 3175 return (error); 3176 } 3177 3178 static int 3179 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 3180 { 3181 struct hn_softc *sc = arg1; 3182 int error; 3183 3184 HN_LOCK(sc); 3185 3186 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3187 if (error || req->newptr == NULL) 3188 goto back; 3189 3190 /* 3191 * Don't allow RSS indirect table change, if this interface is not 3192 * RSS capable currently. 
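 *
 * (With a single RX ring in use there is nothing for the
 * indirection table to spread traffic across, hence the
 * EOPNOTSUPP below.)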
3193 */ 3194 if (sc->hn_rx_ring_inuse == 1) { 3195 error = EOPNOTSUPP; 3196 goto back; 3197 } 3198 3199 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3200 if (error) 3201 goto back; 3202 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 3203 3204 hn_rss_ind_fixup(sc); 3205 error = hn_rss_reconfig(sc); 3206 back: 3207 HN_UNLOCK(sc); 3208 return (error); 3209 } 3210 3211 #endif /* !RSS */ 3212 3213 static int 3214 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 3215 { 3216 struct hn_softc *sc = arg1; 3217 char hash_str[128]; 3218 uint32_t hash; 3219 3220 HN_LOCK(sc); 3221 hash = sc->hn_rss_hash; 3222 HN_UNLOCK(sc); 3223 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 3224 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 3225 } 3226 3227 static int 3228 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 3229 { 3230 struct hn_softc *sc = arg1; 3231 char vf_name[128]; 3232 struct ifnet *vf; 3233 3234 HN_LOCK(sc); 3235 vf_name[0] = '\0'; 3236 vf = sc->hn_rx_ring[0].hn_vf; 3237 if (vf != NULL) 3238 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf)); 3239 HN_UNLOCK(sc); 3240 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 3241 } 3242 3243 static int 3244 hn_check_iplen(const struct mbuf *m, int hoff) 3245 { 3246 const struct ip *ip; 3247 int len, iphlen, iplen; 3248 const struct tcphdr *th; 3249 int thoff; /* TCP data offset */ 3250 3251 len = hoff + sizeof(struct ip); 3252 3253 /* The packet must be at least the size of an IP header. */ 3254 if (m->m_pkthdr.len < len) 3255 return IPPROTO_DONE; 3256 3257 /* The fixed IP header must reside completely in the first mbuf. */ 3258 if (m->m_len < len) 3259 return IPPROTO_DONE; 3260 3261 ip = mtodo(m, hoff); 3262 3263 /* Bound check the packet's stated IP header length. */ 3264 iphlen = ip->ip_hl << 2; 3265 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3266 return IPPROTO_DONE; 3267 3268 /* The full IP header must reside completely in the one mbuf. */ 3269 if (m->m_len < hoff + iphlen) 3270 return IPPROTO_DONE; 3271 3272 iplen = ntohs(ip->ip_len); 3273 3274 /* 3275 * Check that the amount of data in the buffers is as 3276 * at least much as the IP header would have us expect. 3277 */ 3278 if (m->m_pkthdr.len < hoff + iplen) 3279 return IPPROTO_DONE; 3280 3281 /* 3282 * Ignore IP fragments. 3283 */ 3284 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3285 return IPPROTO_DONE; 3286 3287 /* 3288 * The TCP/IP or UDP/IP header must be entirely contained within 3289 * the first fragment of a packet. 
3290 */ 3291 switch (ip->ip_p) { 3292 case IPPROTO_TCP: 3293 if (iplen < iphlen + sizeof(struct tcphdr)) 3294 return IPPROTO_DONE; 3295 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3296 return IPPROTO_DONE; 3297 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3298 thoff = th->th_off << 2; 3299 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3300 return IPPROTO_DONE; 3301 if (m->m_len < hoff + iphlen + thoff) 3302 return IPPROTO_DONE; 3303 break; 3304 case IPPROTO_UDP: 3305 if (iplen < iphlen + sizeof(struct udphdr)) 3306 return IPPROTO_DONE; 3307 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3308 return IPPROTO_DONE; 3309 break; 3310 default: 3311 if (iplen < iphlen) 3312 return IPPROTO_DONE; 3313 break; 3314 } 3315 return ip->ip_p; 3316 } 3317 3318 static int 3319 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3320 { 3321 struct sysctl_oid_list *child; 3322 struct sysctl_ctx_list *ctx; 3323 device_t dev = sc->hn_dev; 3324 #if defined(INET) || defined(INET6) 3325 #if __FreeBSD_version >= 1100095 3326 int lroent_cnt; 3327 #endif 3328 #endif 3329 int i; 3330 3331 /* 3332 * Create RXBUF for reception. 3333 * 3334 * NOTE: 3335 * - It is shared by all channels. 3336 * - A large enough buffer is allocated, certain version of NVSes 3337 * may further limit the usable space. 3338 */ 3339 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3340 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3341 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3342 if (sc->hn_rxbuf == NULL) { 3343 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3344 return (ENOMEM); 3345 } 3346 3347 sc->hn_rx_ring_cnt = ring_cnt; 3348 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3349 3350 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3351 M_DEVBUF, M_WAITOK | M_ZERO); 3352 3353 #if defined(INET) || defined(INET6) 3354 #if __FreeBSD_version >= 1100095 3355 lroent_cnt = hn_lro_entry_count; 3356 if (lroent_cnt < TCP_LRO_ENTRIES) 3357 lroent_cnt = TCP_LRO_ENTRIES; 3358 if (bootverbose) 3359 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3360 #endif 3361 #endif /* INET || INET6 */ 3362 3363 ctx = device_get_sysctl_ctx(dev); 3364 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3365 3366 /* Create dev.hn.UNIT.rx sysctl tree */ 3367 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3368 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3369 3370 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3371 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3372 3373 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3374 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3375 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3376 if (rxr->hn_br == NULL) { 3377 device_printf(dev, "allocate bufring failed\n"); 3378 return (ENOMEM); 3379 } 3380 3381 if (hn_trust_hosttcp) 3382 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3383 if (hn_trust_hostudp) 3384 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3385 if (hn_trust_hostip) 3386 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3387 rxr->hn_ifp = sc->hn_ifp; 3388 if (i < sc->hn_tx_ring_cnt) 3389 rxr->hn_txr = &sc->hn_tx_ring[i]; 3390 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3391 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3392 rxr->hn_rx_idx = i; 3393 rxr->hn_rxbuf = sc->hn_rxbuf; 3394 3395 /* 3396 * Initialize LRO. 
3397 */ 3398 #if defined(INET) || defined(INET6) 3399 #if __FreeBSD_version >= 1100095 3400 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3401 hn_lro_mbufq_depth); 3402 #else 3403 tcp_lro_init(&rxr->hn_lro); 3404 rxr->hn_lro.ifp = sc->hn_ifp; 3405 #endif 3406 #if __FreeBSD_version >= 1100099 3407 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3408 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3409 #endif 3410 #endif /* INET || INET6 */ 3411 3412 if (sc->hn_rx_sysctl_tree != NULL) { 3413 char name[16]; 3414 3415 /* 3416 * Create per RX ring sysctl tree: 3417 * dev.hn.UNIT.rx.RINGID 3418 */ 3419 snprintf(name, sizeof(name), "%d", i); 3420 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3421 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3422 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3423 3424 if (rxr->hn_rx_sysctl_tree != NULL) { 3425 SYSCTL_ADD_ULONG(ctx, 3426 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3427 OID_AUTO, "packets", CTLFLAG_RW, 3428 &rxr->hn_pkts, "# of packets received"); 3429 SYSCTL_ADD_ULONG(ctx, 3430 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3431 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3432 &rxr->hn_rss_pkts, 3433 "# of packets w/ RSS info received"); 3434 SYSCTL_ADD_INT(ctx, 3435 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3436 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3437 &rxr->hn_pktbuf_len, 0, 3438 "Temporary channel packet buffer length"); 3439 } 3440 } 3441 } 3442 3443 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3444 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3445 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3446 #if __FreeBSD_version < 1100095 3447 hn_rx_stat_int_sysctl, 3448 #else 3449 hn_rx_stat_u64_sysctl, 3450 #endif 3451 "LU", "LRO queued"); 3452 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3453 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3454 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3455 #if __FreeBSD_version < 1100095 3456 hn_rx_stat_int_sysctl, 3457 #else 3458 hn_rx_stat_u64_sysctl, 3459 #endif 3460 "LU", "LRO flushed"); 3461 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3462 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3463 __offsetof(struct hn_rx_ring, hn_lro_tried), 3464 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3465 #if __FreeBSD_version >= 1100099 3466 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3467 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3468 hn_lro_lenlim_sysctl, "IU", 3469 "Max # of data bytes to be aggregated by LRO"); 3470 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3471 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3472 hn_lro_ackcnt_sysctl, "I", 3473 "Max # of ACKs to be aggregated by LRO"); 3474 #endif 3475 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3476 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3477 hn_trust_hcsum_sysctl, "I", 3478 "Trust tcp segement verification on host side, " 3479 "when csum info is missing"); 3480 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3481 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3482 hn_trust_hcsum_sysctl, "I", 3483 "Trust udp datagram verification on host side, " 3484 "when csum info is missing"); 3485 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3486 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3487 hn_trust_hcsum_sysctl, "I", 3488 "Trust ip packet verification on host side, " 3489 "when csum info is missing"); 3490 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3491 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3492 
__offsetof(struct hn_rx_ring, hn_csum_ip), 3493 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3494 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3495 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3496 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3497 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3498 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3499 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3500 __offsetof(struct hn_rx_ring, hn_csum_udp), 3501 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3502 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3503 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3504 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3505 hn_rx_stat_ulong_sysctl, "LU", 3506 "# of packets that we trust host's csum verification"); 3507 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3508 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3509 __offsetof(struct hn_rx_ring, hn_small_pkts), 3510 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3511 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3512 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3513 __offsetof(struct hn_rx_ring, hn_ack_failed), 3514 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3515 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3516 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3517 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3518 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3519 3520 return (0); 3521 } 3522 3523 static void 3524 hn_destroy_rx_data(struct hn_softc *sc) 3525 { 3526 int i; 3527 3528 if (sc->hn_rxbuf != NULL) { 3529 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3530 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3531 else 3532 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3533 sc->hn_rxbuf = NULL; 3534 } 3535 3536 if (sc->hn_rx_ring_cnt == 0) 3537 return; 3538 3539 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3540 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3541 3542 if (rxr->hn_br == NULL) 3543 continue; 3544 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3545 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3546 } else { 3547 device_printf(sc->hn_dev, 3548 "%dth channel bufring is referenced", i); 3549 } 3550 rxr->hn_br = NULL; 3551 3552 #if defined(INET) || defined(INET6) 3553 tcp_lro_free(&rxr->hn_lro); 3554 #endif 3555 free(rxr->hn_pktbuf, M_DEVBUF); 3556 } 3557 free(sc->hn_rx_ring, M_DEVBUF); 3558 sc->hn_rx_ring = NULL; 3559 3560 sc->hn_rx_ring_cnt = 0; 3561 sc->hn_rx_ring_inuse = 0; 3562 } 3563 3564 static int 3565 hn_tx_ring_create(struct hn_softc *sc, int id) 3566 { 3567 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3568 device_t dev = sc->hn_dev; 3569 bus_dma_tag_t parent_dtag; 3570 int error, i; 3571 3572 txr->hn_sc = sc; 3573 txr->hn_tx_idx = id; 3574 3575 #ifndef HN_USE_TXDESC_BUFRING 3576 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3577 #endif 3578 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3579 3580 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3581 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3582 M_DEVBUF, M_WAITOK | M_ZERO); 3583 #ifndef HN_USE_TXDESC_BUFRING 3584 SLIST_INIT(&txr->hn_txlist); 3585 #else 3586 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3587 M_WAITOK, &txr->hn_tx_lock); 3588 #endif 3589 3590 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3591 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3592 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3593 } else { 3594 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3595 } 3596 3597 #ifdef HN_IFSTART_SUPPORT 3598 if (hn_use_if_start) { 3599 txr->hn_txeof = hn_start_txeof; 3600 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3601 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3602 } else 3603 #endif 3604 { 3605 int br_depth; 3606 3607 txr->hn_txeof = hn_xmit_txeof; 3608 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3609 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3610 3611 br_depth = hn_get_txswq_depth(txr); 3612 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3613 M_WAITOK, &txr->hn_tx_lock); 3614 } 3615 3616 txr->hn_direct_tx_size = hn_direct_tx_size; 3617 3618 /* 3619 * Always schedule transmission instead of trying to do direct 3620 * transmission. This one gives the best performance so far. 3621 */ 3622 txr->hn_sched_tx = 1; 3623 3624 parent_dtag = bus_get_dma_tag(dev); 3625 3626 /* DMA tag for RNDIS packet messages. */ 3627 error = bus_dma_tag_create(parent_dtag, /* parent */ 3628 HN_RNDIS_PKT_ALIGN, /* alignment */ 3629 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3630 BUS_SPACE_MAXADDR, /* lowaddr */ 3631 BUS_SPACE_MAXADDR, /* highaddr */ 3632 NULL, NULL, /* filter, filterarg */ 3633 HN_RNDIS_PKT_LEN, /* maxsize */ 3634 1, /* nsegments */ 3635 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3636 0, /* flags */ 3637 NULL, /* lockfunc */ 3638 NULL, /* lockfuncarg */ 3639 &txr->hn_tx_rndis_dtag); 3640 if (error) { 3641 device_printf(dev, "failed to create rndis dmatag\n"); 3642 return error; 3643 } 3644 3645 /* DMA tag for data. */ 3646 error = bus_dma_tag_create(parent_dtag, /* parent */ 3647 1, /* alignment */ 3648 HN_TX_DATA_BOUNDARY, /* boundary */ 3649 BUS_SPACE_MAXADDR, /* lowaddr */ 3650 BUS_SPACE_MAXADDR, /* highaddr */ 3651 NULL, NULL, /* filter, filterarg */ 3652 HN_TX_DATA_MAXSIZE, /* maxsize */ 3653 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3654 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3655 0, /* flags */ 3656 NULL, /* lockfunc */ 3657 NULL, /* lockfuncarg */ 3658 &txr->hn_tx_data_dtag); 3659 if (error) { 3660 device_printf(dev, "failed to create data dmatag\n"); 3661 return error; 3662 } 3663 3664 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3665 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3666 3667 txd->txr = txr; 3668 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3669 STAILQ_INIT(&txd->agg_list); 3670 3671 /* 3672 * Allocate and load RNDIS packet message. 3673 */ 3674 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3675 (void **)&txd->rndis_pkt, 3676 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3677 &txd->rndis_pkt_dmap); 3678 if (error) { 3679 device_printf(dev, 3680 "failed to allocate rndis_packet_msg, %d\n", i); 3681 return error; 3682 } 3683 3684 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3685 txd->rndis_pkt_dmap, 3686 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3687 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3688 BUS_DMA_NOWAIT); 3689 if (error) { 3690 device_printf(dev, 3691 "failed to load rndis_packet_msg, %d\n", i); 3692 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3693 txd->rndis_pkt, txd->rndis_pkt_dmap); 3694 return error; 3695 } 3696 3697 /* DMA map for TX data. 
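 *
 * Unlike the RNDIS packet message above, which is allocated and
 * loaded once here, this map is only created now; it is loaded
 * and unloaded per packet at transmit time.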
*/ 3698 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3699 &txd->data_dmap); 3700 if (error) { 3701 device_printf(dev, 3702 "failed to allocate tx data dmamap\n"); 3703 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3704 txd->rndis_pkt_dmap); 3705 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3706 txd->rndis_pkt, txd->rndis_pkt_dmap); 3707 return error; 3708 } 3709 3710 /* All set, put it to list */ 3711 txd->flags |= HN_TXD_FLAG_ONLIST; 3712 #ifndef HN_USE_TXDESC_BUFRING 3713 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3714 #else 3715 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3716 #endif 3717 } 3718 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3719 3720 if (sc->hn_tx_sysctl_tree != NULL) { 3721 struct sysctl_oid_list *child; 3722 struct sysctl_ctx_list *ctx; 3723 char name[16]; 3724 3725 /* 3726 * Create per TX ring sysctl tree: 3727 * dev.hn.UNIT.tx.RINGID 3728 */ 3729 ctx = device_get_sysctl_ctx(dev); 3730 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3731 3732 snprintf(name, sizeof(name), "%d", id); 3733 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3734 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3735 3736 if (txr->hn_tx_sysctl_tree != NULL) { 3737 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3738 3739 #ifdef HN_DEBUG 3740 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3741 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3742 "# of available TX descs"); 3743 #endif 3744 #ifdef HN_IFSTART_SUPPORT 3745 if (!hn_use_if_start) 3746 #endif 3747 { 3748 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3749 CTLFLAG_RD, &txr->hn_oactive, 0, 3750 "over active"); 3751 } 3752 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3753 CTLFLAG_RW, &txr->hn_pkts, 3754 "# of packets transmitted"); 3755 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3756 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3757 } 3758 } 3759 3760 return 0; 3761 } 3762 3763 static void 3764 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3765 { 3766 struct hn_tx_ring *txr = txd->txr; 3767 3768 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3769 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3770 3771 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3772 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3773 txd->rndis_pkt_dmap); 3774 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3775 } 3776 3777 static void 3778 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3779 { 3780 3781 KASSERT(txd->refs == 0 || txd->refs == 1, 3782 ("invalid txd refs %d", txd->refs)); 3783 3784 /* Aggregated txds will be freed by their aggregating txd. */ 3785 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3786 int freed; 3787 3788 freed = hn_txdesc_put(txr, txd); 3789 KASSERT(freed, ("can't free txdesc")); 3790 } 3791 } 3792 3793 static void 3794 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3795 { 3796 int i; 3797 3798 if (txr->hn_txdesc == NULL) 3799 return; 3800 3801 /* 3802 * NOTE: 3803 * Because the freeing of aggregated txds will be deferred 3804 * to the aggregating txd, two passes are used here: 3805 * - The first pass GCes any pending txds. This GC is necessary, 3806 * since if the channels are revoked, hypervisor will not 3807 * deliver send-done for all pending txds. 3808 * - The second pass frees the busdma stuffs, i.e. after all txds 3809 * were freed. 
3810 */ 3811 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3812 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3813 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3814 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3815 3816 if (txr->hn_tx_data_dtag != NULL) 3817 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3818 if (txr->hn_tx_rndis_dtag != NULL) 3819 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3820 3821 #ifdef HN_USE_TXDESC_BUFRING 3822 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3823 #endif 3824 3825 free(txr->hn_txdesc, M_DEVBUF); 3826 txr->hn_txdesc = NULL; 3827 3828 if (txr->hn_mbuf_br != NULL) 3829 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3830 3831 #ifndef HN_USE_TXDESC_BUFRING 3832 mtx_destroy(&txr->hn_txlist_spin); 3833 #endif 3834 mtx_destroy(&txr->hn_tx_lock); 3835 } 3836 3837 static int 3838 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3839 { 3840 struct sysctl_oid_list *child; 3841 struct sysctl_ctx_list *ctx; 3842 int i; 3843 3844 /* 3845 * Create TXBUF for chimney sending. 3846 * 3847 * NOTE: It is shared by all channels. 3848 */ 3849 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3850 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3851 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3852 if (sc->hn_chim == NULL) { 3853 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3854 return (ENOMEM); 3855 } 3856 3857 sc->hn_tx_ring_cnt = ring_cnt; 3858 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3859 3860 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3861 M_DEVBUF, M_WAITOK | M_ZERO); 3862 3863 ctx = device_get_sysctl_ctx(sc->hn_dev); 3864 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3865 3866 /* Create dev.hn.UNIT.tx sysctl tree */ 3867 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3868 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3869 3870 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3871 int error; 3872 3873 error = hn_tx_ring_create(sc, i); 3874 if (error) 3875 return error; 3876 } 3877 3878 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3879 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3880 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3881 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3882 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3883 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3884 __offsetof(struct hn_tx_ring, hn_send_failed), 3885 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3886 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3887 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3888 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3889 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3890 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3891 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3892 __offsetof(struct hn_tx_ring, hn_flush_failed), 3893 hn_tx_stat_ulong_sysctl, "LU", 3894 "# of packet transmission aggregation flush failure"); 3895 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3896 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3897 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3898 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3899 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3900 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3901 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3902 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3903 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3904 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3905 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3906 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3907 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3908 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3909 "# of total TX descs"); 3910 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3911 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3912 "Chimney send packet size upper boundary"); 3913 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3914 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3915 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3916 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3917 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3918 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3919 hn_tx_conf_int_sysctl, "I", 3920 "Size of the packet for direct transmission"); 3921 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3922 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3923 __offsetof(struct hn_tx_ring, hn_sched_tx), 3924 hn_tx_conf_int_sysctl, "I", 3925 "Always schedule transmission " 3926 "instead of doing direct transmission"); 3927 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3928 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3929 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3930 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3931 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3932 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3933 "Applied packet transmission aggregation size"); 3934 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3935 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3936 hn_txagg_pktmax_sysctl, "I", 3937 "Applied packet transmission aggregation packets"); 3938 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3939 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3940 hn_txagg_align_sysctl, "I", 3941 "Applied packet transmission aggregation alignment"); 3942 3943 return 0; 3944 } 3945 3946 static void 3947 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3948 { 3949 int i; 3950 3951 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3952 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3953 } 3954 3955 static void 3956 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3957 { 3958 struct ifnet *ifp = sc->hn_ifp; 3959 int tso_minlen; 3960 3961 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3962 return; 3963 3964 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3965 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3966 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3967 3968 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3969 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3970 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3971 3972 if (tso_maxlen < tso_minlen) 3973 tso_maxlen = tso_minlen; 3974 else if (tso_maxlen > IP_MAXPACKET) 3975 tso_maxlen = IP_MAXPACKET; 3976 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3977 tso_maxlen = sc->hn_ndis_tso_szmax; 3978 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3979 if (bootverbose) 3980 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3981 } 3982 3983 static void 3984 hn_fixup_tx_data(struct hn_softc *sc) 3985 { 3986 uint64_t csum_assist; 3987 int i; 3988 3989 hn_set_chim_size(sc, sc->hn_chim_szmax); 3990 if (hn_tx_chimney_size > 0 && 3991 hn_tx_chimney_size < sc->hn_chim_szmax) 3992 hn_set_chim_size(sc, hn_tx_chimney_size); 3993 3994 csum_assist = 0; 3995 if (sc->hn_caps & HN_CAP_IPCS) 3996 csum_assist |= CSUM_IP; 3997 if (sc->hn_caps & HN_CAP_TCP4CS) 3998 csum_assist |= CSUM_IP_TCP; 3999 if (sc->hn_caps & HN_CAP_UDP4CS) 4000 
csum_assist |= CSUM_IP_UDP; 4001 if (sc->hn_caps & HN_CAP_TCP6CS) 4002 csum_assist |= CSUM_IP6_TCP; 4003 if (sc->hn_caps & HN_CAP_UDP6CS) 4004 csum_assist |= CSUM_IP6_UDP; 4005 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4006 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 4007 4008 if (sc->hn_caps & HN_CAP_HASHVAL) { 4009 /* 4010 * Support HASHVAL pktinfo on TX path. 4011 */ 4012 if (bootverbose) 4013 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 4014 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4015 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 4016 } 4017 } 4018 4019 static void 4020 hn_destroy_tx_data(struct hn_softc *sc) 4021 { 4022 int i; 4023 4024 if (sc->hn_chim != NULL) { 4025 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 4026 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 4027 } else { 4028 device_printf(sc->hn_dev, 4029 "chimney sending buffer is referenced"); 4030 } 4031 sc->hn_chim = NULL; 4032 } 4033 4034 if (sc->hn_tx_ring_cnt == 0) 4035 return; 4036 4037 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4038 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 4039 4040 free(sc->hn_tx_ring, M_DEVBUF); 4041 sc->hn_tx_ring = NULL; 4042 4043 sc->hn_tx_ring_cnt = 0; 4044 sc->hn_tx_ring_inuse = 0; 4045 } 4046 4047 #ifdef HN_IFSTART_SUPPORT 4048 4049 static void 4050 hn_start_taskfunc(void *xtxr, int pending __unused) 4051 { 4052 struct hn_tx_ring *txr = xtxr; 4053 4054 mtx_lock(&txr->hn_tx_lock); 4055 hn_start_locked(txr, 0); 4056 mtx_unlock(&txr->hn_tx_lock); 4057 } 4058 4059 static int 4060 hn_start_locked(struct hn_tx_ring *txr, int len) 4061 { 4062 struct hn_softc *sc = txr->hn_sc; 4063 struct ifnet *ifp = sc->hn_ifp; 4064 int sched = 0; 4065 4066 KASSERT(hn_use_if_start, 4067 ("hn_start_locked is called, when if_start is disabled")); 4068 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4069 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4070 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4071 4072 if (__predict_false(txr->hn_suspended)) 4073 return (0); 4074 4075 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 4076 IFF_DRV_RUNNING) 4077 return (0); 4078 4079 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 4080 struct hn_txdesc *txd; 4081 struct mbuf *m_head; 4082 int error; 4083 4084 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 4085 if (m_head == NULL) 4086 break; 4087 4088 if (len > 0 && m_head->m_pkthdr.len > len) { 4089 /* 4090 * This sending could be time consuming; let callers 4091 * dispatch this packet sending (and sending of any 4092 * following up packets) to tx taskqueue. 
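 * (The "len" argument is the direct-transmission threshold: hn_start() and
 * hn_start_txeof() pass hn_direct_tx_size, while the taskqueue handlers pass
 * 0, which disables this deferral.)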
4093 */ 4094 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4095 sched = 1; 4096 break; 4097 } 4098 4099 #if defined(INET6) || defined(INET) 4100 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 4101 m_head = hn_tso_fixup(m_head); 4102 if (__predict_false(m_head == NULL)) { 4103 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4104 continue; 4105 } 4106 } 4107 #endif 4108 4109 txd = hn_txdesc_get(txr); 4110 if (txd == NULL) { 4111 txr->hn_no_txdescs++; 4112 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4113 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4114 break; 4115 } 4116 4117 error = hn_encap(ifp, txr, txd, &m_head); 4118 if (error) { 4119 /* Both txd and m_head are freed */ 4120 KASSERT(txr->hn_agg_txd == NULL, 4121 ("encap failed w/ pending aggregating txdesc")); 4122 continue; 4123 } 4124 4125 if (txr->hn_agg_pktleft == 0) { 4126 if (txr->hn_agg_txd != NULL) { 4127 KASSERT(m_head == NULL, 4128 ("pending mbuf for aggregating txdesc")); 4129 error = hn_flush_txagg(ifp, txr); 4130 if (__predict_false(error)) { 4131 atomic_set_int(&ifp->if_drv_flags, 4132 IFF_DRV_OACTIVE); 4133 break; 4134 } 4135 } else { 4136 KASSERT(m_head != NULL, ("mbuf was freed")); 4137 error = hn_txpkt(ifp, txr, txd); 4138 if (__predict_false(error)) { 4139 /* txd is freed, but m_head is not */ 4140 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4141 atomic_set_int(&ifp->if_drv_flags, 4142 IFF_DRV_OACTIVE); 4143 break; 4144 } 4145 } 4146 } 4147 #ifdef INVARIANTS 4148 else { 4149 KASSERT(txr->hn_agg_txd != NULL, 4150 ("no aggregating txdesc")); 4151 KASSERT(m_head == NULL, 4152 ("pending mbuf for aggregating txdesc")); 4153 } 4154 #endif 4155 } 4156 4157 /* Flush pending aggerated transmission. */ 4158 if (txr->hn_agg_txd != NULL) 4159 hn_flush_txagg(ifp, txr); 4160 return (sched); 4161 } 4162 4163 static void 4164 hn_start(struct ifnet *ifp) 4165 { 4166 struct hn_softc *sc = ifp->if_softc; 4167 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 4168 4169 if (txr->hn_sched_tx) 4170 goto do_sched; 4171 4172 if (mtx_trylock(&txr->hn_tx_lock)) { 4173 int sched; 4174 4175 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4176 mtx_unlock(&txr->hn_tx_lock); 4177 if (!sched) 4178 return; 4179 } 4180 do_sched: 4181 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4182 } 4183 4184 static void 4185 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 4186 { 4187 struct hn_tx_ring *txr = xtxr; 4188 4189 mtx_lock(&txr->hn_tx_lock); 4190 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 4191 hn_start_locked(txr, 0); 4192 mtx_unlock(&txr->hn_tx_lock); 4193 } 4194 4195 static void 4196 hn_start_txeof(struct hn_tx_ring *txr) 4197 { 4198 struct hn_softc *sc = txr->hn_sc; 4199 struct ifnet *ifp = sc->hn_ifp; 4200 4201 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4202 4203 if (txr->hn_sched_tx) 4204 goto do_sched; 4205 4206 if (mtx_trylock(&txr->hn_tx_lock)) { 4207 int sched; 4208 4209 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4210 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4211 mtx_unlock(&txr->hn_tx_lock); 4212 if (sched) { 4213 taskqueue_enqueue(txr->hn_tx_taskq, 4214 &txr->hn_tx_task); 4215 } 4216 } else { 4217 do_sched: 4218 /* 4219 * Release the OACTIVE earlier, with the hope, that 4220 * others could catch up. The task will clear the 4221 * flag again with the hn_tx_lock to avoid possible 4222 * races. 
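 * The early clear simply lets hn_start() resume sooner; any race with a
 * concurrent setter is resolved when the txeof task re-clears the flag
 * under hn_tx_lock.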
4223 */ 4224 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4225 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4226 } 4227 } 4228 4229 #endif /* HN_IFSTART_SUPPORT */ 4230 4231 static int 4232 hn_xmit(struct hn_tx_ring *txr, int len) 4233 { 4234 struct hn_softc *sc = txr->hn_sc; 4235 struct ifnet *ifp = sc->hn_ifp; 4236 struct mbuf *m_head; 4237 int sched = 0; 4238 4239 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4240 #ifdef HN_IFSTART_SUPPORT 4241 KASSERT(hn_use_if_start == 0, 4242 ("hn_xmit is called, when if_start is enabled")); 4243 #endif 4244 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4245 4246 if (__predict_false(txr->hn_suspended)) 4247 return (0); 4248 4249 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4250 return (0); 4251 4252 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4253 struct hn_txdesc *txd; 4254 int error; 4255 4256 if (len > 0 && m_head->m_pkthdr.len > len) { 4257 /* 4258 * This sending could be time consuming; let callers 4259 * dispatch this packet sending (and sending of any 4260 * following up packets) to tx taskqueue. 4261 */ 4262 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4263 sched = 1; 4264 break; 4265 } 4266 4267 txd = hn_txdesc_get(txr); 4268 if (txd == NULL) { 4269 txr->hn_no_txdescs++; 4270 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4271 txr->hn_oactive = 1; 4272 break; 4273 } 4274 4275 error = hn_encap(ifp, txr, txd, &m_head); 4276 if (error) { 4277 /* Both txd and m_head are freed; discard */ 4278 KASSERT(txr->hn_agg_txd == NULL, 4279 ("encap failed w/ pending aggregating txdesc")); 4280 drbr_advance(ifp, txr->hn_mbuf_br); 4281 continue; 4282 } 4283 4284 if (txr->hn_agg_pktleft == 0) { 4285 if (txr->hn_agg_txd != NULL) { 4286 KASSERT(m_head == NULL, 4287 ("pending mbuf for aggregating txdesc")); 4288 error = hn_flush_txagg(ifp, txr); 4289 if (__predict_false(error)) { 4290 txr->hn_oactive = 1; 4291 break; 4292 } 4293 } else { 4294 KASSERT(m_head != NULL, ("mbuf was freed")); 4295 error = hn_txpkt(ifp, txr, txd); 4296 if (__predict_false(error)) { 4297 /* txd is freed, but m_head is not */ 4298 drbr_putback(ifp, txr->hn_mbuf_br, 4299 m_head); 4300 txr->hn_oactive = 1; 4301 break; 4302 } 4303 } 4304 } 4305 #ifdef INVARIANTS 4306 else { 4307 KASSERT(txr->hn_agg_txd != NULL, 4308 ("no aggregating txdesc")); 4309 KASSERT(m_head == NULL, 4310 ("pending mbuf for aggregating txdesc")); 4311 } 4312 #endif 4313 4314 /* Sent */ 4315 drbr_advance(ifp, txr->hn_mbuf_br); 4316 } 4317 4318 /* Flush pending aggerated transmission. */ 4319 if (txr->hn_agg_txd != NULL) 4320 hn_flush_txagg(ifp, txr); 4321 return (sched); 4322 } 4323 4324 static int 4325 hn_transmit(struct ifnet *ifp, struct mbuf *m) 4326 { 4327 struct hn_softc *sc = ifp->if_softc; 4328 struct hn_tx_ring *txr; 4329 int error, idx = 0; 4330 4331 #if defined(INET6) || defined(INET) 4332 /* 4333 * Perform TSO packet header fixup now, since the TSO 4334 * packet header should be cache-hot. 
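 * Deferring the fixup to hn_xmit() would likely run it on another CPU after
 * the mbuf has sat in the buf_ring, by which time the header would probably
 * no longer be cached.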
4335 */ 4336 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4337 m = hn_tso_fixup(m); 4338 if (__predict_false(m == NULL)) { 4339 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4340 return EIO; 4341 } 4342 } 4343 #endif 4344 4345 /* 4346 * Select the TX ring based on flowid 4347 */ 4348 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4349 #ifdef RSS 4350 uint32_t bid; 4351 4352 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4353 &bid) == 0) 4354 idx = bid % sc->hn_tx_ring_inuse; 4355 else 4356 #endif 4357 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4358 } 4359 txr = &sc->hn_tx_ring[idx]; 4360 4361 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4362 if (error) { 4363 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4364 return error; 4365 } 4366 4367 if (txr->hn_oactive) 4368 return 0; 4369 4370 if (txr->hn_sched_tx) 4371 goto do_sched; 4372 4373 if (mtx_trylock(&txr->hn_tx_lock)) { 4374 int sched; 4375 4376 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4377 mtx_unlock(&txr->hn_tx_lock); 4378 if (!sched) 4379 return 0; 4380 } 4381 do_sched: 4382 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4383 return 0; 4384 } 4385 4386 static void 4387 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4388 { 4389 struct mbuf *m; 4390 4391 mtx_lock(&txr->hn_tx_lock); 4392 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4393 m_freem(m); 4394 mtx_unlock(&txr->hn_tx_lock); 4395 } 4396 4397 static void 4398 hn_xmit_qflush(struct ifnet *ifp) 4399 { 4400 struct hn_softc *sc = ifp->if_softc; 4401 int i; 4402 4403 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4404 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4405 if_qflush(ifp); 4406 } 4407 4408 static void 4409 hn_xmit_txeof(struct hn_tx_ring *txr) 4410 { 4411 4412 if (txr->hn_sched_tx) 4413 goto do_sched; 4414 4415 if (mtx_trylock(&txr->hn_tx_lock)) { 4416 int sched; 4417 4418 txr->hn_oactive = 0; 4419 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4420 mtx_unlock(&txr->hn_tx_lock); 4421 if (sched) { 4422 taskqueue_enqueue(txr->hn_tx_taskq, 4423 &txr->hn_tx_task); 4424 } 4425 } else { 4426 do_sched: 4427 /* 4428 * Release the oactive earlier, with the hope, that 4429 * others could catch up. The task will clear the 4430 * oactive again with the hn_tx_lock to avoid possible 4431 * races. 4432 */ 4433 txr->hn_oactive = 0; 4434 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4435 } 4436 } 4437 4438 static void 4439 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4440 { 4441 struct hn_tx_ring *txr = xtxr; 4442 4443 mtx_lock(&txr->hn_tx_lock); 4444 hn_xmit(txr, 0); 4445 mtx_unlock(&txr->hn_tx_lock); 4446 } 4447 4448 static void 4449 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4450 { 4451 struct hn_tx_ring *txr = xtxr; 4452 4453 mtx_lock(&txr->hn_tx_lock); 4454 txr->hn_oactive = 0; 4455 hn_xmit(txr, 0); 4456 mtx_unlock(&txr->hn_tx_lock); 4457 } 4458 4459 static int 4460 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4461 { 4462 struct vmbus_chan_br cbr; 4463 struct hn_rx_ring *rxr; 4464 struct hn_tx_ring *txr = NULL; 4465 int idx, error; 4466 4467 idx = vmbus_chan_subidx(chan); 4468 4469 /* 4470 * Link this channel to RX/TX ring. 
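 * The channel sub-index maps directly to the ring index; the primary
 * channel has sub-index 0, so it is always linked to RX/TX ring 0.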
4471 */ 4472 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4473 ("invalid channel index %d, should be >= 0 and < %d", 4474 idx, sc->hn_rx_ring_inuse)); 4475 rxr = &sc->hn_rx_ring[idx]; 4476 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 4477 ("RX ring %d already attached", idx)); 4478 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 4479 rxr->hn_chan = chan; 4480 4481 if (bootverbose) { 4482 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 4483 idx, vmbus_chan_id(chan)); 4484 } 4485 4486 if (idx < sc->hn_tx_ring_inuse) { 4487 txr = &sc->hn_tx_ring[idx]; 4488 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 4489 ("TX ring %d already attached", idx)); 4490 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 4491 4492 txr->hn_chan = chan; 4493 if (bootverbose) { 4494 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 4495 idx, vmbus_chan_id(chan)); 4496 } 4497 } 4498 4499 /* Bind this channel to a proper CPU. */ 4500 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 4501 4502 /* 4503 * Open this channel. 4504 */ 4505 cbr.cbr = rxr->hn_br; 4506 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 4507 cbr.cbr_txsz = HN_TXBR_SIZE; 4508 cbr.cbr_rxsz = HN_RXBR_SIZE; 4509 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 4510 if (error) { 4511 if (error == EISCONN) { 4512 if_printf(sc->hn_ifp, "bufring is connected after " 4513 "chan%u open failure\n", vmbus_chan_id(chan)); 4514 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4515 } else { 4516 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 4517 vmbus_chan_id(chan), error); 4518 } 4519 } 4520 return (error); 4521 } 4522 4523 static void 4524 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 4525 { 4526 struct hn_rx_ring *rxr; 4527 int idx, error; 4528 4529 idx = vmbus_chan_subidx(chan); 4530 4531 /* 4532 * Unlink this channel from the RX/TX ring. 4533 */ 4534 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4535 ("invalid channel index %d, should be >= 0 and < %d", 4536 idx, sc->hn_rx_ring_inuse)); 4537 rxr = &sc->hn_rx_ring[idx]; 4538 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 4539 ("RX ring %d is not attached", idx)); 4540 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 4541 4542 if (idx < sc->hn_tx_ring_inuse) { 4543 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 4544 4545 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 4546 ("TX ring %d is not attached", idx)); 4547 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 4548 } 4549 4550 /* 4551 * Close this channel. 4552 * 4553 * NOTE: 4554 * Channel closing does _not_ destroy the target channel. 4555 */ 4556 error = vmbus_chan_close_direct(chan); 4557 if (error == EISCONN) { 4558 if_printf(sc->hn_ifp, "chan%u bufring is connected " 4559 "after being closed\n", vmbus_chan_id(chan)); 4560 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4561 } else if (error) { 4562 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 4563 vmbus_chan_id(chan), error); 4564 } 4565 } 4566 4567 static int 4568 hn_attach_subchans(struct hn_softc *sc) 4569 { 4570 struct vmbus_channel **subchans; 4571 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4572 int i, error = 0; 4573 4574 KASSERT(subchan_cnt > 0, ("no sub-channels")); 4575 4576 /* Attach the sub-channels. */ 4577 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4578 for (i = 0; i < subchan_cnt; ++i) { 4579 int error1; 4580 4581 error1 = hn_chan_attach(sc, subchans[i]); 4582 if (error1) { 4583 error = error1; 4584 /* Move on; all channels will be detached later.
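 * Remember the error, but keep attaching the remaining sub-channels;
 * the caller tears all channels down on failure.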
*/ 4585 } 4586 } 4587 vmbus_subchan_rel(subchans, subchan_cnt); 4588 4589 if (error) { 4590 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4591 } else { 4592 if (bootverbose) { 4593 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4594 subchan_cnt); 4595 } 4596 } 4597 return (error); 4598 } 4599 4600 static void 4601 hn_detach_allchans(struct hn_softc *sc) 4602 { 4603 struct vmbus_channel **subchans; 4604 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4605 int i; 4606 4607 if (subchan_cnt == 0) 4608 goto back; 4609 4610 /* Detach the sub-channels. */ 4611 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4612 for (i = 0; i < subchan_cnt; ++i) 4613 hn_chan_detach(sc, subchans[i]); 4614 vmbus_subchan_rel(subchans, subchan_cnt); 4615 4616 back: 4617 /* 4618 * Detach the primary channel, _after_ all sub-channels 4619 * are detached. 4620 */ 4621 hn_chan_detach(sc, sc->hn_prichan); 4622 4623 /* Wait for sub-channels to be destroyed, if any. */ 4624 vmbus_subchan_drain(sc->hn_prichan); 4625 4626 #ifdef INVARIANTS 4627 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4628 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4629 HN_RX_FLAG_ATTACHED) == 0, 4630 ("%dth RX ring is still attached", i)); 4631 } 4632 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4633 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4634 HN_TX_FLAG_ATTACHED) == 0, 4635 ("%dth TX ring is still attached", i)); 4636 } 4637 #endif 4638 } 4639 4640 static int 4641 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4642 { 4643 struct vmbus_channel **subchans; 4644 int nchan, rxr_cnt, error; 4645 4646 nchan = *nsubch + 1; 4647 if (nchan == 1) { 4648 /* 4649 * Multiple RX/TX rings are not requested. 4650 */ 4651 *nsubch = 0; 4652 return (0); 4653 } 4654 4655 /* 4656 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4657 * table entries. 4658 */ 4659 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4660 if (error) { 4661 /* No RSS; this is benign. */ 4662 *nsubch = 0; 4663 return (0); 4664 } 4665 if (bootverbose) { 4666 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4667 rxr_cnt, nchan); 4668 } 4669 4670 if (nchan > rxr_cnt) 4671 nchan = rxr_cnt; 4672 if (nchan == 1) { 4673 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4674 *nsubch = 0; 4675 return (0); 4676 } 4677 4678 /* 4679 * Allocate sub-channels from NVS. 4680 */ 4681 *nsubch = nchan - 1; 4682 error = hn_nvs_alloc_subchans(sc, nsubch); 4683 if (error || *nsubch == 0) { 4684 /* Failed to allocate sub-channels. */ 4685 *nsubch = 0; 4686 return (0); 4687 } 4688 4689 /* 4690 * Wait for all sub-channels to become ready before moving on. 
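 * vmbus_subchan_get() presumably blocks until the requested number of
 * sub-channels have been offered; the references are released right away
 * since only the wait itself is needed here.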
4691 */ 4692 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4693 vmbus_subchan_rel(subchans, *nsubch); 4694 return (0); 4695 } 4696 4697 static bool 4698 hn_synth_attachable(const struct hn_softc *sc) 4699 { 4700 int i; 4701 4702 if (sc->hn_flags & HN_FLAG_ERRORS) 4703 return (false); 4704 4705 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4706 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4707 4708 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4709 return (false); 4710 } 4711 return (true); 4712 } 4713 4714 static int 4715 hn_synth_attach(struct hn_softc *sc, int mtu) 4716 { 4717 #define ATTACHED_NVS 0x0002 4718 #define ATTACHED_RNDIS 0x0004 4719 4720 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4721 int error, nsubch, nchan, i; 4722 uint32_t old_caps, attached = 0; 4723 4724 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4725 ("synthetic parts were attached")); 4726 4727 if (!hn_synth_attachable(sc)) 4728 return (ENXIO); 4729 4730 /* Save capabilities for later verification. */ 4731 old_caps = sc->hn_caps; 4732 sc->hn_caps = 0; 4733 4734 /* Clear RSS stuffs. */ 4735 sc->hn_rss_ind_size = 0; 4736 sc->hn_rss_hash = 0; 4737 4738 /* 4739 * Attach the primary channel _before_ attaching NVS and RNDIS. 4740 */ 4741 error = hn_chan_attach(sc, sc->hn_prichan); 4742 if (error) 4743 goto failed; 4744 4745 /* 4746 * Attach NVS. 4747 */ 4748 error = hn_nvs_attach(sc, mtu); 4749 if (error) 4750 goto failed; 4751 attached |= ATTACHED_NVS; 4752 4753 /* 4754 * Attach RNDIS _after_ NVS is attached. 4755 */ 4756 error = hn_rndis_attach(sc, mtu); 4757 if (error) 4758 goto failed; 4759 attached |= ATTACHED_RNDIS; 4760 4761 /* 4762 * Make sure capabilities are not changed. 4763 */ 4764 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4765 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4766 old_caps, sc->hn_caps); 4767 error = ENXIO; 4768 goto failed; 4769 } 4770 4771 /* 4772 * Allocate sub-channels for multi-TX/RX rings. 4773 * 4774 * NOTE: 4775 * The # of RX rings that can be used is equivalent to the # of 4776 * channels to be requested. 4777 */ 4778 nsubch = sc->hn_rx_ring_cnt - 1; 4779 error = hn_synth_alloc_subchans(sc, &nsubch); 4780 if (error) 4781 goto failed; 4782 /* NOTE: _Full_ synthetic parts detach is required now. */ 4783 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4784 4785 /* 4786 * Set the # of TX/RX rings that could be used according to 4787 * the # of channels that NVS offered. 4788 */ 4789 nchan = nsubch + 1; 4790 hn_set_ring_inuse(sc, nchan); 4791 if (nchan == 1) { 4792 /* Only the primary channel can be used; done */ 4793 goto back; 4794 } 4795 4796 /* 4797 * Attach the sub-channels. 4798 * 4799 * NOTE: hn_set_ring_inuse() _must_ have been called. 4800 */ 4801 error = hn_attach_subchans(sc); 4802 if (error) 4803 goto failed; 4804 4805 /* 4806 * Configure RSS key and indirect table _after_ all sub-channels 4807 * are attached. 4808 */ 4809 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4810 /* 4811 * RSS key is not set yet; set it to the default RSS key. 4812 */ 4813 if (bootverbose) 4814 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4815 #ifdef RSS 4816 rss_getkey(rss->rss_key); 4817 #else 4818 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4819 #endif 4820 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4821 } 4822 4823 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4824 /* 4825 * RSS indirect table is not set yet; set it up in round- 4826 * robin fashion. 
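 * Each of the NDIS_HASH_INDCNT entries is filled with (index % nchan),
 * or with the RSS bucket mapping when the RSS option is compiled in.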
4827 */ 4828 if (bootverbose) { 4829 if_printf(sc->hn_ifp, "setup default RSS indirect " 4830 "table\n"); 4831 } 4832 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 4833 uint32_t subidx; 4834 4835 #ifdef RSS 4836 subidx = rss_get_indirection_to_bucket(i); 4837 #else 4838 subidx = i; 4839 #endif 4840 rss->rss_ind[i] = subidx % nchan; 4841 } 4842 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4843 } else { 4844 /* 4845 * # of usable channels may be changed, so we have to 4846 * make sure that all entries in RSS indirect table 4847 * are valid. 4848 * 4849 * NOTE: hn_set_ring_inuse() _must_ have been called. 4850 */ 4851 hn_rss_ind_fixup(sc); 4852 } 4853 4854 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 4855 if (error) 4856 goto failed; 4857 back: 4858 /* 4859 * Fixup transmission aggregation setup. 4860 */ 4861 hn_set_txagg(sc); 4862 return (0); 4863 4864 failed: 4865 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 4866 hn_synth_detach(sc); 4867 } else { 4868 if (attached & ATTACHED_RNDIS) 4869 hn_rndis_detach(sc); 4870 if (attached & ATTACHED_NVS) 4871 hn_nvs_detach(sc); 4872 hn_chan_detach(sc, sc->hn_prichan); 4873 /* Restore old capabilities. */ 4874 sc->hn_caps = old_caps; 4875 } 4876 return (error); 4877 4878 #undef ATTACHED_RNDIS 4879 #undef ATTACHED_NVS 4880 } 4881 4882 /* 4883 * NOTE: 4884 * The interface must have been suspended though hn_suspend(), before 4885 * this function get called. 4886 */ 4887 static void 4888 hn_synth_detach(struct hn_softc *sc) 4889 { 4890 4891 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4892 ("synthetic parts were not attached")); 4893 4894 /* Detach the RNDIS first. */ 4895 hn_rndis_detach(sc); 4896 4897 /* Detach NVS. */ 4898 hn_nvs_detach(sc); 4899 4900 /* Detach all of the channels. */ 4901 hn_detach_allchans(sc); 4902 4903 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 4904 } 4905 4906 static void 4907 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 4908 { 4909 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 4910 ("invalid ring count %d", ring_cnt)); 4911 4912 if (sc->hn_tx_ring_cnt > ring_cnt) 4913 sc->hn_tx_ring_inuse = ring_cnt; 4914 else 4915 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 4916 sc->hn_rx_ring_inuse = ring_cnt; 4917 4918 #ifdef RSS 4919 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 4920 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 4921 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 4922 rss_getnumbuckets()); 4923 } 4924 #endif 4925 4926 if (bootverbose) { 4927 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 4928 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 4929 } 4930 } 4931 4932 static void 4933 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 4934 { 4935 4936 /* 4937 * NOTE: 4938 * The TX bufring will not be drained by the hypervisor, 4939 * if the primary channel is revoked. 4940 */ 4941 while (!vmbus_chan_rx_empty(chan) || 4942 (!vmbus_chan_is_revoked(sc->hn_prichan) && 4943 !vmbus_chan_tx_empty(chan))) 4944 pause("waitch", 1); 4945 vmbus_chan_intr_drain(chan); 4946 } 4947 4948 static void 4949 hn_suspend_data(struct hn_softc *sc) 4950 { 4951 struct vmbus_channel **subch = NULL; 4952 struct hn_tx_ring *txr; 4953 int i, nsubch; 4954 4955 HN_LOCK_ASSERT(sc); 4956 4957 /* 4958 * Suspend TX. 4959 */ 4960 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4961 txr = &sc->hn_tx_ring[i]; 4962 4963 mtx_lock(&txr->hn_tx_lock); 4964 txr->hn_suspended = 1; 4965 mtx_unlock(&txr->hn_tx_lock); 4966 /* No one is able send more packets now. */ 4967 4968 /* 4969 * Wait for all pending sends to finish. 
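 * The ring was marked suspended under hn_tx_lock above, so no new
 * sends can be queued while we wait.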
4970 * 4971 * NOTE: 4972 * We will _not_ receive all pending send-done, if the 4973 * primary channel is revoked. 4974 */ 4975 while (hn_tx_ring_pending(txr) && 4976 !vmbus_chan_is_revoked(sc->hn_prichan)) 4977 pause("hnwtx", 1 /* 1 tick */); 4978 } 4979 4980 /* 4981 * Disable RX by clearing RX filter. 4982 */ 4983 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 4984 4985 /* 4986 * Give RNDIS enough time to flush all pending data packets. 4987 */ 4988 pause("waitrx", (200 * hz) / 1000); 4989 4990 /* 4991 * Drain RX/TX bufrings and interrupts. 4992 */ 4993 nsubch = sc->hn_rx_ring_inuse - 1; 4994 if (nsubch > 0) 4995 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4996 4997 if (subch != NULL) { 4998 for (i = 0; i < nsubch; ++i) 4999 hn_chan_drain(sc, subch[i]); 5000 } 5001 hn_chan_drain(sc, sc->hn_prichan); 5002 5003 if (subch != NULL) 5004 vmbus_subchan_rel(subch, nsubch); 5005 5006 /* 5007 * Drain any pending TX tasks. 5008 * 5009 * NOTE: 5010 * The above hn_chan_drain() can dispatch TX tasks, so the TX 5011 * tasks will have to be drained _after_ the above hn_chan_drain() 5012 * calls. 5013 */ 5014 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5015 txr = &sc->hn_tx_ring[i]; 5016 5017 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 5018 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 5019 } 5020 } 5021 5022 static void 5023 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 5024 { 5025 5026 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 5027 } 5028 5029 static void 5030 hn_suspend_mgmt(struct hn_softc *sc) 5031 { 5032 struct task task; 5033 5034 HN_LOCK_ASSERT(sc); 5035 5036 /* 5037 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 5038 * through hn_mgmt_taskq. 5039 */ 5040 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 5041 vmbus_chan_run_task(sc->hn_prichan, &task); 5042 5043 /* 5044 * Make sure that all pending management tasks are completed. 5045 */ 5046 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 5047 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 5048 taskqueue_drain_all(sc->hn_mgmt_taskq0); 5049 } 5050 5051 static void 5052 hn_suspend(struct hn_softc *sc) 5053 { 5054 5055 /* Disable polling. */ 5056 hn_polling(sc, 0); 5057 5058 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5059 (sc->hn_flags & HN_FLAG_VF)) 5060 hn_suspend_data(sc); 5061 hn_suspend_mgmt(sc); 5062 } 5063 5064 static void 5065 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 5066 { 5067 int i; 5068 5069 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 5070 ("invalid TX ring count %d", tx_ring_cnt)); 5071 5072 for (i = 0; i < tx_ring_cnt; ++i) { 5073 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5074 5075 mtx_lock(&txr->hn_tx_lock); 5076 txr->hn_suspended = 0; 5077 mtx_unlock(&txr->hn_tx_lock); 5078 } 5079 } 5080 5081 static void 5082 hn_resume_data(struct hn_softc *sc) 5083 { 5084 int i; 5085 5086 HN_LOCK_ASSERT(sc); 5087 5088 /* 5089 * Re-enable RX. 5090 */ 5091 hn_rxfilter_config(sc); 5092 5093 /* 5094 * Make sure to clear suspend status on "all" TX rings, 5095 * since hn_tx_ring_inuse can be changed after 5096 * hn_suspend_data(). 5097 */ 5098 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 5099 5100 #ifdef HN_IFSTART_SUPPORT 5101 if (!hn_use_if_start) 5102 #endif 5103 { 5104 /* 5105 * Flush unused drbrs, since hn_tx_ring_inuse may be 5106 * reduced. 5107 */ 5108 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 5109 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 5110 } 5111 5112 /* 5113 * Kick start TX. 
5114 */ 5115 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5116 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5117 5118 /* 5119 * Use txeof task, so that any pending oactive can be 5120 * cleared properly. 5121 */ 5122 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5123 } 5124 } 5125 5126 static void 5127 hn_resume_mgmt(struct hn_softc *sc) 5128 { 5129 5130 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 5131 5132 /* 5133 * Kick off network change detection, if it was pending. 5134 * If no network change was pending, start link status 5135 * checks, which is more lightweight than network change 5136 * detection. 5137 */ 5138 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 5139 hn_change_network(sc); 5140 else 5141 hn_update_link_status(sc); 5142 } 5143 5144 static void 5145 hn_resume(struct hn_softc *sc) 5146 { 5147 5148 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5149 (sc->hn_flags & HN_FLAG_VF)) 5150 hn_resume_data(sc); 5151 5152 /* 5153 * When the VF is activated, the synthetic interface is changed 5154 * to DOWN in hn_set_vf(). Here, if the VF is still active, we 5155 * don't call hn_resume_mgmt() until the VF is deactivated in 5156 * hn_set_vf(). 5157 */ 5158 if (!(sc->hn_flags & HN_FLAG_VF)) 5159 hn_resume_mgmt(sc); 5160 5161 /* 5162 * Re-enable polling if this interface is running and 5163 * the polling is requested. 5164 */ 5165 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 5166 hn_polling(sc, sc->hn_pollhz); 5167 } 5168 5169 static void 5170 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 5171 { 5172 const struct rndis_status_msg *msg; 5173 int ofs; 5174 5175 if (dlen < sizeof(*msg)) { 5176 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 5177 return; 5178 } 5179 msg = data; 5180 5181 switch (msg->rm_status) { 5182 case RNDIS_STATUS_MEDIA_CONNECT: 5183 case RNDIS_STATUS_MEDIA_DISCONNECT: 5184 hn_update_link_status(sc); 5185 break; 5186 5187 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 5188 /* Not really useful; ignore. 
*/ 5189 break; 5190 5191 case RNDIS_STATUS_NETWORK_CHANGE: 5192 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 5193 if (dlen < ofs + msg->rm_stbuflen || 5194 msg->rm_stbuflen < sizeof(uint32_t)) { 5195 if_printf(sc->hn_ifp, "network changed\n"); 5196 } else { 5197 uint32_t change; 5198 5199 memcpy(&change, ((const uint8_t *)msg) + ofs, 5200 sizeof(change)); 5201 if_printf(sc->hn_ifp, "network changed, change %u\n", 5202 change); 5203 } 5204 hn_change_network(sc); 5205 break; 5206 5207 default: 5208 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 5209 msg->rm_status); 5210 break; 5211 } 5212 } 5213 5214 static int 5215 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 5216 { 5217 const struct rndis_pktinfo *pi = info_data; 5218 uint32_t mask = 0; 5219 5220 while (info_dlen != 0) { 5221 const void *data; 5222 uint32_t dlen; 5223 5224 if (__predict_false(info_dlen < sizeof(*pi))) 5225 return (EINVAL); 5226 if (__predict_false(info_dlen < pi->rm_size)) 5227 return (EINVAL); 5228 info_dlen -= pi->rm_size; 5229 5230 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 5231 return (EINVAL); 5232 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 5233 return (EINVAL); 5234 dlen = pi->rm_size - pi->rm_pktinfooffset; 5235 data = pi->rm_data; 5236 5237 switch (pi->rm_type) { 5238 case NDIS_PKTINFO_TYPE_VLAN: 5239 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 5240 return (EINVAL); 5241 info->vlan_info = *((const uint32_t *)data); 5242 mask |= HN_RXINFO_VLAN; 5243 break; 5244 5245 case NDIS_PKTINFO_TYPE_CSUM: 5246 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 5247 return (EINVAL); 5248 info->csum_info = *((const uint32_t *)data); 5249 mask |= HN_RXINFO_CSUM; 5250 break; 5251 5252 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5253 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5254 return (EINVAL); 5255 info->hash_value = *((const uint32_t *)data); 5256 mask |= HN_RXINFO_HASHVAL; 5257 break; 5258 5259 case HN_NDIS_PKTINFO_TYPE_HASHINF: 5260 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 5261 return (EINVAL); 5262 info->hash_info = *((const uint32_t *)data); 5263 mask |= HN_RXINFO_HASHINF; 5264 break; 5265 5266 default: 5267 goto next; 5268 } 5269 5270 if (mask == HN_RXINFO_ALL) { 5271 /* All found; done */ 5272 break; 5273 } 5274 next: 5275 pi = (const struct rndis_pktinfo *) 5276 ((const uint8_t *)pi + pi->rm_size); 5277 } 5278 5279 /* 5280 * Final fixup. 5281 * - If there is no hash value, invalidate the hash info. 5282 */ 5283 if ((mask & HN_RXINFO_HASHVAL) == 0) 5284 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 5285 return (0); 5286 } 5287 5288 static __inline bool 5289 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 5290 { 5291 5292 if (off < check_off) { 5293 if (__predict_true(off + len <= check_off)) 5294 return (false); 5295 } else if (off > check_off) { 5296 if (__predict_true(check_off + check_len <= off)) 5297 return (false); 5298 } 5299 return (true); 5300 } 5301 5302 static void 5303 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5304 { 5305 const struct rndis_packet_msg *pkt; 5306 struct hn_rxinfo info; 5307 int data_off, pktinfo_off, data_len, pktinfo_len; 5308 5309 /* 5310 * Check length. 
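 * The buffer must hold at least the RNDIS packet header, the advertised
 * message length must fit within the buffer, the data/OOB/pktinfo sections
 * must fit within the message, and the data section must not be empty.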
5311 */ 5312 if (__predict_false(dlen < sizeof(*pkt))) { 5313 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 5314 return; 5315 } 5316 pkt = data; 5317 5318 if (__predict_false(dlen < pkt->rm_len)) { 5319 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 5320 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 5321 return; 5322 } 5323 if (__predict_false(pkt->rm_len < 5324 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 5325 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 5326 "msglen %u, data %u, oob %u, pktinfo %u\n", 5327 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 5328 pkt->rm_pktinfolen); 5329 return; 5330 } 5331 if (__predict_false(pkt->rm_datalen == 0)) { 5332 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 5333 return; 5334 } 5335 5336 /* 5337 * Check offsets. 5338 */ 5339 #define IS_OFFSET_INVALID(ofs) \ 5340 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 5341 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 5342 5343 /* XXX Hyper-V does not meet the data offset alignment requirement */ 5344 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 5345 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5346 "data offset %u\n", pkt->rm_dataoffset); 5347 return; 5348 } 5349 if (__predict_false(pkt->rm_oobdataoffset > 0 && 5350 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 5351 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5352 "oob offset %u\n", pkt->rm_oobdataoffset); 5353 return; 5354 } 5355 if (__predict_true(pkt->rm_pktinfooffset > 0) && 5356 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 5357 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5358 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 5359 return; 5360 } 5361 5362 #undef IS_OFFSET_INVALID 5363 5364 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 5365 data_len = pkt->rm_datalen; 5366 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 5367 pktinfo_len = pkt->rm_pktinfolen; 5368 5369 /* 5370 * Check OOB coverage. 5371 */ 5372 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5373 int oob_off, oob_len; 5374 5375 if_printf(rxr->hn_ifp, "got oobdata\n"); 5376 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5377 oob_len = pkt->rm_oobdatalen; 5378 5379 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5380 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5381 "oob overflow, msglen %u, oob abs %d len %d\n", 5382 pkt->rm_len, oob_off, oob_len); 5383 return; 5384 } 5385 5386 /* 5387 * Check against data. 5388 */ 5389 if (hn_rndis_check_overlap(oob_off, oob_len, 5390 data_off, data_len)) { 5391 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5392 "oob overlaps data, oob abs %d len %d, " 5393 "data abs %d len %d\n", 5394 oob_off, oob_len, data_off, data_len); 5395 return; 5396 } 5397 5398 /* 5399 * Check against pktinfo. 5400 */ 5401 if (pktinfo_len != 0 && 5402 hn_rndis_check_overlap(oob_off, oob_len, 5403 pktinfo_off, pktinfo_len)) { 5404 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5405 "oob overlaps pktinfo, oob abs %d len %d, " 5406 "pktinfo abs %d len %d\n", 5407 oob_off, oob_len, pktinfo_off, pktinfo_len); 5408 return; 5409 } 5410 } 5411 5412 /* 5413 * Check per-packet-info coverage and find useful per-packet-info.
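 * The pktinfo region must lie within the message and must not overlap the
 * data region; hn_rndis_rxinfo() then extracts the VLAN, checksum and hash
 * fields, with invalid defaults set up front.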
5414 */ 5415 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5416 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5417 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5418 if (__predict_true(pktinfo_len != 0)) { 5419 bool overlap; 5420 int error; 5421 5422 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5423 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5424 "pktinfo overflow, msglen %u, " 5425 "pktinfo abs %d len %d\n", 5426 pkt->rm_len, pktinfo_off, pktinfo_len); 5427 return; 5428 } 5429 5430 /* 5431 * Check packet info coverage. 5432 */ 5433 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5434 data_off, data_len); 5435 if (__predict_false(overlap)) { 5436 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5437 "pktinfo overlap data, pktinfo abs %d len %d, " 5438 "data abs %d len %d\n", 5439 pktinfo_off, pktinfo_len, data_off, data_len); 5440 return; 5441 } 5442 5443 /* 5444 * Find useful per-packet-info. 5445 */ 5446 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5447 pktinfo_len, &info); 5448 if (__predict_false(error)) { 5449 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5450 "pktinfo\n"); 5451 return; 5452 } 5453 } 5454 5455 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5456 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5457 "data overflow, msglen %u, data abs %d len %d\n", 5458 pkt->rm_len, data_off, data_len); 5459 return; 5460 } 5461 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5462 } 5463 5464 static __inline void 5465 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5466 { 5467 const struct rndis_msghdr *hdr; 5468 5469 if (__predict_false(dlen < sizeof(*hdr))) { 5470 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5471 return; 5472 } 5473 hdr = data; 5474 5475 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5476 /* Hot data path. */ 5477 hn_rndis_rx_data(rxr, data, dlen); 5478 /* Done! */ 5479 return; 5480 } 5481 5482 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5483 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5484 else 5485 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5486 } 5487 5488 static void 5489 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5490 { 5491 const struct hn_nvs_hdr *hdr; 5492 5493 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5494 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5495 return; 5496 } 5497 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5498 5499 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5500 /* Useless; ignore */ 5501 return; 5502 } 5503 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5504 } 5505 5506 static void 5507 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5508 const struct vmbus_chanpkt_hdr *pkt) 5509 { 5510 struct hn_nvs_sendctx *sndc; 5511 5512 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5513 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5514 VMBUS_CHANPKT_DATALEN(pkt)); 5515 /* 5516 * NOTE: 5517 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5518 * its callback. 
5519 */ 5520 } 5521 5522 static void 5523 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5524 const struct vmbus_chanpkt_hdr *pkthdr) 5525 { 5526 const struct vmbus_chanpkt_rxbuf *pkt; 5527 const struct hn_nvs_hdr *nvs_hdr; 5528 int count, i, hlen; 5529 5530 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5531 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5532 return; 5533 } 5534 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5535 5536 /* Make sure that this is a RNDIS message. */ 5537 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5538 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5539 nvs_hdr->nvs_type); 5540 return; 5541 } 5542 5543 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5544 if (__predict_false(hlen < sizeof(*pkt))) { 5545 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5546 return; 5547 } 5548 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5549 5550 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5551 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5552 pkt->cp_rxbuf_id); 5553 return; 5554 } 5555 5556 count = pkt->cp_rxbuf_cnt; 5557 if (__predict_false(hlen < 5558 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5559 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5560 return; 5561 } 5562 5563 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5564 for (i = 0; i < count; ++i) { 5565 int ofs, len; 5566 5567 ofs = pkt->cp_rxbuf[i].rb_ofs; 5568 len = pkt->cp_rxbuf[i].rb_len; 5569 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5570 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5571 "ofs %d, len %d\n", i, ofs, len); 5572 continue; 5573 } 5574 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5575 } 5576 5577 /* 5578 * Ack the consumed RXBUF associated w/ this channel packet, 5579 * so that this RXBUF can be recycled by the hypervisor. 5580 */ 5581 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5582 } 5583 5584 static void 5585 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5586 uint64_t tid) 5587 { 5588 struct hn_nvs_rndis_ack ack; 5589 int retries, error; 5590 5591 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5592 ack.nvs_status = HN_NVS_STATUS_OK; 5593 5594 retries = 0; 5595 again: 5596 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5597 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5598 if (__predict_false(error == EAGAIN)) { 5599 /* 5600 * NOTE: 5601 * This should _not_ happen in real world, since the 5602 * consumption of the TX bufring from the TX path is 5603 * controlled. 5604 */ 5605 if (rxr->hn_ack_failed == 0) 5606 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5607 rxr->hn_ack_failed++; 5608 retries++; 5609 if (retries < 10) { 5610 DELAY(100); 5611 goto again; 5612 } 5613 /* RXBUF leaks! */ 5614 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5615 } 5616 } 5617 5618 static void 5619 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5620 { 5621 struct hn_rx_ring *rxr = xrxr; 5622 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5623 5624 for (;;) { 5625 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5626 int error, pktlen; 5627 5628 pktlen = rxr->hn_pktbuf_len; 5629 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5630 if (__predict_false(error == ENOBUFS)) { 5631 void *nbuf; 5632 int nlen; 5633 5634 /* 5635 * Expand channel packet buffer. 5636 * 5637 * XXX 5638 * Use M_WAITOK here, since allocation failure 5639 * is fatal. 
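 * The buffer size is doubled until it can hold the length reported by
 * vmbus_chan_recv_pkt(), after which the receive is retried.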
5640 */ 5641 nlen = rxr->hn_pktbuf_len * 2; 5642 while (nlen < pktlen) 5643 nlen *= 2; 5644 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5645 5646 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5647 rxr->hn_pktbuf_len, nlen); 5648 5649 free(rxr->hn_pktbuf, M_DEVBUF); 5650 rxr->hn_pktbuf = nbuf; 5651 rxr->hn_pktbuf_len = nlen; 5652 /* Retry! */ 5653 continue; 5654 } else if (__predict_false(error == EAGAIN)) { 5655 /* No more channel packets; done! */ 5656 break; 5657 } 5658 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5659 5660 switch (pkt->cph_type) { 5661 case VMBUS_CHANPKT_TYPE_COMP: 5662 hn_nvs_handle_comp(sc, chan, pkt); 5663 break; 5664 5665 case VMBUS_CHANPKT_TYPE_RXBUF: 5666 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5667 break; 5668 5669 case VMBUS_CHANPKT_TYPE_INBAND: 5670 hn_nvs_handle_notify(sc, pkt); 5671 break; 5672 5673 default: 5674 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5675 pkt->cph_type); 5676 break; 5677 } 5678 } 5679 hn_chan_rollup(rxr, rxr->hn_txr); 5680 } 5681 5682 static void 5683 hn_tx_taskq_create(void *arg __unused) 5684 { 5685 int i; 5686 5687 /* 5688 * Fix the # of TX taskqueues. 5689 */ 5690 if (hn_tx_taskq_cnt <= 0) 5691 hn_tx_taskq_cnt = 1; 5692 else if (hn_tx_taskq_cnt > mp_ncpus) 5693 hn_tx_taskq_cnt = mp_ncpus; 5694 5695 /* 5696 * Fix the TX taskqueue mode. 5697 */ 5698 switch (hn_tx_taskq_mode) { 5699 case HN_TX_TASKQ_M_INDEP: 5700 case HN_TX_TASKQ_M_GLOBAL: 5701 case HN_TX_TASKQ_M_EVTTQ: 5702 break; 5703 default: 5704 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5705 break; 5706 } 5707 5708 if (vm_guest != VM_GUEST_HV) 5709 return; 5710 5711 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5712 return; 5713 5714 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5715 M_DEVBUF, M_WAITOK); 5716 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5717 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5718 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5719 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5720 "hn tx%d", i); 5721 } 5722 } 5723 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5724 hn_tx_taskq_create, NULL); 5725 5726 static void 5727 hn_tx_taskq_destroy(void *arg __unused) 5728 { 5729 5730 if (hn_tx_taskque != NULL) { 5731 int i; 5732 5733 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5734 taskqueue_free(hn_tx_taskque[i]); 5735 free(hn_tx_taskque, M_DEVBUF); 5736 } 5737 } 5738 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5739 hn_tx_taskq_destroy, NULL); 5740