/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
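
/*
 * For reference (assuming the standard ETHERMTU of 1500 bytes): the two
 * defaults above work out to roughly 37500 bytes of aggregated data per
 * LRO entry in the single RX ring case, and 18000 bytes when multiple
 * RX rings are in use.
 */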
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;		/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_update_vf {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *,
		    void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *,
		    struct ifmediareq *);

static int	hn_rndis_rxinfo(const void *, int,
		    struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *,
		    const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *,
		    const void *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *,
		    struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *,
		    struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *,
		    struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *,
		    int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *,
		    struct vmbus_channel *);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
#ifndef RSS
static int	hn_rss_reconfig(struct hn_softc *);
#endif
static void	hn_rss_ind_fixup(struct hn_softc *);
static int	hn_rxpkt(struct hn_rx_ring *, const void *,
		    int, const struct hn_rxinfo *);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *,
		    struct hn_tx_ring *, struct hn_txdesc *,
		    int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *,
		    struct hn_softc *, struct vmbus_channel *,
		    const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
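
/*
 * The hw.hn.* knobs in this file marked CTLFLAG_RDTUN are loader tunables;
 * for example, host side checksum trust can be turned off at boot time
 * from loader.conf(5):
 *
 *	hw.hn.trust_hosttcp=0
 *	hw.hn.trust_hostudp=0
 *	hw.hn.trust_hostip=0
 */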

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
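
/*
 * NOTE: For the two aggregation tunables above, a negative value (the
 * default) means "automatic", i.e. the effective limits are taken from
 * whatever the host's RNDIS implementation offers and from the chimney
 * sending buffer size, while 0 effectively disables transmission
 * aggregation; see hn_set_txagg() for the exact clamping.
 */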

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;/* shared TX taskqueues */

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
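
/*
 * Illustration of the chimney index <-> bitmap mapping used above:
 * chimney index N corresponds to bit (N % LONG_BIT) of bitmap word
 * (N / LONG_BIT), e.g. on LP64 (LONG_BIT == 64) index 70 is bit 6 of
 * hn_chim_bmap[1].  hn_chim_alloc() looks for a clear bit with ffsl()
 * and claims it with atomic_testandset_long(), so concurrent allocators
 * on different TX rings never hand out the same chimney slot;
 * hn_chim_free() simply clears the bit again.
 */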

#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_flags & IFF_PROMISC) ||
	    (sc->hn_flags & HN_FLAG_VF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
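
/*
 * In the common non-promiscuous, non-VF case the filter computed above is
 * NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST, with
 * NDIS_PACKET_TYPE_ALL_MULTICAST added as soon as any multicast address
 * is configured; a real multicast address list is not programmed yet
 * (see the TODO above).
 */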

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

#ifndef RSS
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
#endif	/* !RSS */

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
874 */ 875 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 876 if (rss->rss_ind[i] >= nchan) { 877 if_printf(sc->hn_ifp, 878 "RSS indirect table %d fixup: %u -> %d\n", 879 i, rss->rss_ind[i], nchan - 1); 880 rss->rss_ind[i] = nchan - 1; 881 } 882 } 883 } 884 885 static int 886 hn_ifmedia_upd(struct ifnet *ifp __unused) 887 { 888 889 return EOPNOTSUPP; 890 } 891 892 static void 893 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 894 { 895 struct hn_softc *sc = ifp->if_softc; 896 897 ifmr->ifm_status = IFM_AVALID; 898 ifmr->ifm_active = IFM_ETHER; 899 900 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 901 ifmr->ifm_active |= IFM_NONE; 902 return; 903 } 904 ifmr->ifm_status |= IFM_ACTIVE; 905 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 906 } 907 908 static void 909 hn_update_vf_task(void *arg, int pending __unused) 910 { 911 struct hn_update_vf *uv = arg; 912 913 uv->rxr->hn_vf = uv->vf; 914 } 915 916 static void 917 hn_update_vf(struct hn_softc *sc, struct ifnet *vf) 918 { 919 struct hn_rx_ring *rxr; 920 struct hn_update_vf uv; 921 struct task task; 922 int i; 923 924 HN_LOCK_ASSERT(sc); 925 926 TASK_INIT(&task, 0, hn_update_vf_task, &uv); 927 928 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 929 rxr = &sc->hn_rx_ring[i]; 930 931 if (i < sc->hn_rx_ring_inuse) { 932 uv.rxr = rxr; 933 uv.vf = vf; 934 vmbus_chan_run_task(rxr->hn_chan, &task); 935 } else { 936 rxr->hn_vf = vf; 937 } 938 } 939 } 940 941 static void 942 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf) 943 { 944 struct ifnet *hn_ifp; 945 946 HN_LOCK(sc); 947 948 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 949 goto out; 950 951 hn_ifp = sc->hn_ifp; 952 953 if (ifp == hn_ifp) 954 goto out; 955 956 if (ifp->if_alloctype != IFT_ETHER) 957 goto out; 958 959 /* Ignore lagg/vlan interfaces */ 960 if (strcmp(ifp->if_dname, "lagg") == 0 || 961 strcmp(ifp->if_dname, "vlan") == 0) 962 goto out; 963 964 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 965 goto out; 966 967 /* Now we're sure 'ifp' is a real VF device. */ 968 if (vf) { 969 if (sc->hn_flags & HN_FLAG_VF) 970 goto out; 971 972 sc->hn_flags |= HN_FLAG_VF; 973 hn_rxfilter_config(sc); 974 } else { 975 if (!(sc->hn_flags & HN_FLAG_VF)) 976 goto out; 977 978 sc->hn_flags &= ~HN_FLAG_VF; 979 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 980 hn_rxfilter_config(sc); 981 else 982 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 983 } 984 985 hn_nvs_set_datapath(sc, 986 vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC); 987 988 hn_update_vf(sc, vf ? ifp : NULL); 989 990 if (vf) { 991 hn_suspend_mgmt(sc); 992 sc->hn_link_flags &= 993 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 994 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 995 } else { 996 hn_resume_mgmt(sc); 997 } 998 999 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1000 vf ? "VF_UP" : "VF_DOWN", NULL); 1001 1002 if (bootverbose) 1003 if_printf(hn_ifp, "Data path is switched %s %s\n", 1004 vf ? 
"to" : "from", if_name(ifp)); 1005 out: 1006 HN_UNLOCK(sc); 1007 } 1008 1009 static void 1010 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1011 { 1012 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1013 return; 1014 1015 hn_set_vf(arg, ifp, event == IFNET_EVENT_UP); 1016 } 1017 1018 static void 1019 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1020 { 1021 hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP); 1022 } 1023 1024 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ 1025 static const struct hyperv_guid g_net_vsc_device_type = { 1026 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 1027 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} 1028 }; 1029 1030 static int 1031 hn_probe(device_t dev) 1032 { 1033 1034 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, 1035 &g_net_vsc_device_type) == 0) { 1036 device_set_desc(dev, "Hyper-V Network Interface"); 1037 return BUS_PROBE_DEFAULT; 1038 } 1039 return ENXIO; 1040 } 1041 1042 static int 1043 hn_attach(device_t dev) 1044 { 1045 struct hn_softc *sc = device_get_softc(dev); 1046 struct sysctl_oid_list *child; 1047 struct sysctl_ctx_list *ctx; 1048 uint8_t eaddr[ETHER_ADDR_LEN]; 1049 struct ifnet *ifp = NULL; 1050 int error, ring_cnt, tx_ring_cnt; 1051 1052 sc->hn_dev = dev; 1053 sc->hn_prichan = vmbus_get_channel(dev); 1054 HN_LOCK_INIT(sc); 1055 1056 /* 1057 * Initialize these tunables once. 1058 */ 1059 sc->hn_agg_size = hn_tx_agg_size; 1060 sc->hn_agg_pkts = hn_tx_agg_pkts; 1061 1062 /* 1063 * Setup taskqueue for transmission. 1064 */ 1065 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 1066 int i; 1067 1068 sc->hn_tx_taskqs = 1069 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 1070 M_DEVBUF, M_WAITOK); 1071 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 1072 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 1073 M_WAITOK, taskqueue_thread_enqueue, 1074 &sc->hn_tx_taskqs[i]); 1075 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 1076 "%s tx%d", device_get_nameunit(dev), i); 1077 } 1078 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 1079 sc->hn_tx_taskqs = hn_tx_taskque; 1080 } 1081 1082 /* 1083 * Setup taskqueue for mangement tasks, e.g. link status. 1084 */ 1085 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 1086 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 1087 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 1088 device_get_nameunit(dev)); 1089 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 1090 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 1091 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 1092 hn_netchg_status_taskfunc, sc); 1093 1094 /* 1095 * Allocate ifnet and setup its name earlier, so that if_printf 1096 * can be used by functions, which will be called after 1097 * ether_ifattach(). 1098 */ 1099 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 1100 ifp->if_softc = sc; 1101 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 1102 1103 /* 1104 * Initialize ifmedia earlier so that it can be unconditionally 1105 * destroyed, if error happened later on. 1106 */ 1107 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 1108 1109 /* 1110 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 1111 * to use (tx_ring_cnt). 1112 * 1113 * NOTE: 1114 * The # of RX rings to use is same as the # of channels to use. 
1115 */ 1116 ring_cnt = hn_chan_cnt; 1117 if (ring_cnt <= 0) { 1118 /* Default */ 1119 ring_cnt = mp_ncpus; 1120 if (ring_cnt > HN_RING_CNT_DEF_MAX) 1121 ring_cnt = HN_RING_CNT_DEF_MAX; 1122 } else if (ring_cnt > mp_ncpus) { 1123 ring_cnt = mp_ncpus; 1124 } 1125 #ifdef RSS 1126 if (ring_cnt > rss_getnumbuckets()) 1127 ring_cnt = rss_getnumbuckets(); 1128 #endif 1129 1130 tx_ring_cnt = hn_tx_ring_cnt; 1131 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 1132 tx_ring_cnt = ring_cnt; 1133 #ifdef HN_IFSTART_SUPPORT 1134 if (hn_use_if_start) { 1135 /* ifnet.if_start only needs one TX ring. */ 1136 tx_ring_cnt = 1; 1137 } 1138 #endif 1139 1140 /* 1141 * Set the leader CPU for channels. 1142 */ 1143 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 1144 1145 /* 1146 * Create enough TX/RX rings, even if only limited number of 1147 * channels can be allocated. 1148 */ 1149 error = hn_create_tx_data(sc, tx_ring_cnt); 1150 if (error) 1151 goto failed; 1152 error = hn_create_rx_data(sc, ring_cnt); 1153 if (error) 1154 goto failed; 1155 1156 /* 1157 * Create transaction context for NVS and RNDIS transactions. 1158 */ 1159 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 1160 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 1161 if (sc->hn_xact == NULL) { 1162 error = ENXIO; 1163 goto failed; 1164 } 1165 1166 /* 1167 * Install orphan handler for the revocation of this device's 1168 * primary channel. 1169 * 1170 * NOTE: 1171 * The processing order is critical here: 1172 * Install the orphan handler, _before_ testing whether this 1173 * device's primary channel has been revoked or not. 1174 */ 1175 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 1176 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 1177 error = ENXIO; 1178 goto failed; 1179 } 1180 1181 /* 1182 * Attach the synthetic parts, i.e. NVS and RNDIS. 1183 */ 1184 error = hn_synth_attach(sc, ETHERMTU); 1185 if (error) 1186 goto failed; 1187 1188 error = hn_rndis_get_eaddr(sc, eaddr); 1189 if (error) 1190 goto failed; 1191 1192 #if __FreeBSD_version >= 1100099 1193 if (sc->hn_rx_ring_inuse > 1) { 1194 /* 1195 * Reduce TCP segment aggregation limit for multiple 1196 * RX rings to increase ACK timeliness. 1197 */ 1198 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 1199 } 1200 #endif 1201 1202 /* 1203 * Fixup TX stuffs after synthetic parts are attached. 
1204 */ 1205 hn_fixup_tx_data(sc); 1206 1207 ctx = device_get_sysctl_ctx(dev); 1208 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 1209 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 1210 &sc->hn_nvs_ver, 0, "NVS version"); 1211 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 1212 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1213 hn_ndis_version_sysctl, "A", "NDIS version"); 1214 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 1215 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1216 hn_caps_sysctl, "A", "capabilities"); 1217 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 1218 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1219 hn_hwassist_sysctl, "A", "hwassist"); 1220 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 1221 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1222 hn_rxfilter_sysctl, "A", "rxfilter"); 1223 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 1224 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1225 hn_rss_hash_sysctl, "A", "RSS hash"); 1226 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 1227 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 1228 #ifndef RSS 1229 /* 1230 * Don't allow RSS key/indirect table changes, if RSS is defined. 1231 */ 1232 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 1233 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1234 hn_rss_key_sysctl, "IU", "RSS key"); 1235 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 1236 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1237 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 1238 #endif 1239 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 1240 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 1241 "RNDIS offered packet transmission aggregation size limit"); 1242 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 1243 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 1244 "RNDIS offered packet transmission aggregation count limit"); 1245 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 1246 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 1247 "RNDIS packet transmission aggregation alignment"); 1248 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 1249 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1250 hn_txagg_size_sysctl, "I", 1251 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 1252 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 1253 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1254 hn_txagg_pkts_sysctl, "I", 1255 "Packet transmission aggregation packets, " 1256 "0 -- disable, -1 -- auto"); 1257 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 1258 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1259 hn_polling_sysctl, "I", 1260 "Polling frequency: [100,1000000], 0 disable polling"); 1261 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 1262 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1263 hn_vf_sysctl, "A", "Virtual Function's name"); 1264 1265 /* 1266 * Setup the ifmedia, which has been initialized earlier. 1267 */ 1268 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 1269 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 1270 /* XXX ifmedia_set really should do this for us */ 1271 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 1272 1273 /* 1274 * Setup the ifnet for this interface. 
1275 */ 1276 1277 ifp->if_baudrate = IF_Gbps(10); 1278 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 1279 ifp->if_ioctl = hn_ioctl; 1280 ifp->if_init = hn_init; 1281 #ifdef HN_IFSTART_SUPPORT 1282 if (hn_use_if_start) { 1283 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 1284 1285 ifp->if_start = hn_start; 1286 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 1287 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 1288 IFQ_SET_READY(&ifp->if_snd); 1289 } else 1290 #endif 1291 { 1292 ifp->if_transmit = hn_transmit; 1293 ifp->if_qflush = hn_xmit_qflush; 1294 } 1295 1296 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO; 1297 #ifdef foo 1298 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 1299 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 1300 #endif 1301 if (sc->hn_caps & HN_CAP_VLAN) { 1302 /* XXX not sure about VLAN_MTU. */ 1303 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 1304 } 1305 1306 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 1307 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 1308 ifp->if_capabilities |= IFCAP_TXCSUM; 1309 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 1310 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 1311 if (sc->hn_caps & HN_CAP_TSO4) { 1312 ifp->if_capabilities |= IFCAP_TSO4; 1313 ifp->if_hwassist |= CSUM_IP_TSO; 1314 } 1315 if (sc->hn_caps & HN_CAP_TSO6) { 1316 ifp->if_capabilities |= IFCAP_TSO6; 1317 ifp->if_hwassist |= CSUM_IP6_TSO; 1318 } 1319 1320 /* Enable all available capabilities by default. */ 1321 ifp->if_capenable = ifp->if_capabilities; 1322 1323 /* 1324 * Disable IPv6 TSO and TXCSUM by default, they still can 1325 * be enabled through SIOCSIFCAP. 1326 */ 1327 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 1328 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 1329 1330 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 1331 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 1332 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 1333 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 1334 } 1335 1336 ether_ifattach(ifp, eaddr); 1337 1338 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 1339 if_printf(ifp, "TSO segcnt %u segsz %u\n", 1340 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 1341 } 1342 1343 /* Inform the upper layer about the long frame support. */ 1344 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 1345 1346 /* 1347 * Kick off link status check. 1348 */ 1349 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 1350 hn_update_link_status(sc); 1351 1352 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 1353 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 1354 1355 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 1356 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 1357 1358 return (0); 1359 failed: 1360 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 1361 hn_synth_detach(sc); 1362 hn_detach(dev); 1363 return (error); 1364 } 1365 1366 static int 1367 hn_detach(device_t dev) 1368 { 1369 struct hn_softc *sc = device_get_softc(dev); 1370 struct ifnet *ifp = sc->hn_ifp; 1371 1372 if (sc->hn_ifaddr_evthand != NULL) 1373 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 1374 if (sc->hn_ifnet_evthand != NULL) 1375 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 1376 1377 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 1378 /* 1379 * In case that the vmbus missed the orphan handler 1380 * installation. 
1381 */ 1382 vmbus_xact_ctx_orphan(sc->hn_xact); 1383 } 1384 1385 if (device_is_attached(dev)) { 1386 HN_LOCK(sc); 1387 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 1388 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 1389 hn_stop(sc, true); 1390 /* 1391 * NOTE: 1392 * hn_stop() only suspends data, so managment 1393 * stuffs have to be suspended manually here. 1394 */ 1395 hn_suspend_mgmt(sc); 1396 hn_synth_detach(sc); 1397 } 1398 HN_UNLOCK(sc); 1399 ether_ifdetach(ifp); 1400 } 1401 1402 ifmedia_removeall(&sc->hn_media); 1403 hn_destroy_rx_data(sc); 1404 hn_destroy_tx_data(sc); 1405 1406 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 1407 int i; 1408 1409 for (i = 0; i < hn_tx_taskq_cnt; ++i) 1410 taskqueue_free(sc->hn_tx_taskqs[i]); 1411 free(sc->hn_tx_taskqs, M_DEVBUF); 1412 } 1413 taskqueue_free(sc->hn_mgmt_taskq0); 1414 1415 if (sc->hn_xact != NULL) { 1416 /* 1417 * Uninstall the orphan handler _before_ the xact is 1418 * destructed. 1419 */ 1420 vmbus_chan_unset_orphan(sc->hn_prichan); 1421 vmbus_xact_ctx_destroy(sc->hn_xact); 1422 } 1423 1424 if_free(ifp); 1425 1426 HN_LOCK_DESTROY(sc); 1427 return (0); 1428 } 1429 1430 static int 1431 hn_shutdown(device_t dev) 1432 { 1433 1434 return (0); 1435 } 1436 1437 static void 1438 hn_link_status(struct hn_softc *sc) 1439 { 1440 uint32_t link_status; 1441 int error; 1442 1443 error = hn_rndis_get_linkstatus(sc, &link_status); 1444 if (error) { 1445 /* XXX what to do? */ 1446 return; 1447 } 1448 1449 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 1450 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 1451 else 1452 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1453 if_link_state_change(sc->hn_ifp, 1454 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 1455 LINK_STATE_UP : LINK_STATE_DOWN); 1456 } 1457 1458 static void 1459 hn_link_taskfunc(void *xsc, int pending __unused) 1460 { 1461 struct hn_softc *sc = xsc; 1462 1463 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 1464 return; 1465 hn_link_status(sc); 1466 } 1467 1468 static void 1469 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 1470 { 1471 struct hn_softc *sc = xsc; 1472 1473 /* Prevent any link status checks from running. */ 1474 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 1475 1476 /* 1477 * Fake up a [link down --> link up] state change; 5 seconds 1478 * delay is used, which closely simulates miibus reaction 1479 * upon link down event. 1480 */ 1481 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1482 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 1483 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 1484 &sc->hn_netchg_status, 5 * hz); 1485 } 1486 1487 static void 1488 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 1489 { 1490 struct hn_softc *sc = xsc; 1491 1492 /* Re-allow link status checks. 
*/ 1493 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 1494 hn_link_status(sc); 1495 } 1496 1497 static void 1498 hn_update_link_status(struct hn_softc *sc) 1499 { 1500 1501 if (sc->hn_mgmt_taskq != NULL) 1502 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 1503 } 1504 1505 static void 1506 hn_change_network(struct hn_softc *sc) 1507 { 1508 1509 if (sc->hn_mgmt_taskq != NULL) 1510 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 1511 } 1512 1513 static __inline int 1514 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 1515 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 1516 { 1517 struct mbuf *m = *m_head; 1518 int error; 1519 1520 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 1521 1522 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 1523 m, segs, nsegs, BUS_DMA_NOWAIT); 1524 if (error == EFBIG) { 1525 struct mbuf *m_new; 1526 1527 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 1528 if (m_new == NULL) 1529 return ENOBUFS; 1530 else 1531 *m_head = m = m_new; 1532 txr->hn_tx_collapsed++; 1533 1534 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 1535 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 1536 } 1537 if (!error) { 1538 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 1539 BUS_DMASYNC_PREWRITE); 1540 txd->flags |= HN_TXD_FLAG_DMAMAP; 1541 } 1542 return error; 1543 } 1544 1545 static __inline int 1546 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 1547 { 1548 1549 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 1550 ("put an onlist txd %#x", txd->flags)); 1551 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1552 ("put an onagg txd %#x", txd->flags)); 1553 1554 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 1555 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 1556 return 0; 1557 1558 if (!STAILQ_EMPTY(&txd->agg_list)) { 1559 struct hn_txdesc *tmp_txd; 1560 1561 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 1562 int freed; 1563 1564 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 1565 ("resursive aggregation on aggregated txdesc")); 1566 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 1567 ("not aggregated txdesc")); 1568 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 1569 ("aggregated txdesc uses dmamap")); 1570 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 1571 ("aggregated txdesc consumes " 1572 "chimney sending buffer")); 1573 KASSERT(tmp_txd->chim_size == 0, 1574 ("aggregated txdesc has non-zero " 1575 "chimney sending size")); 1576 1577 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 1578 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 1579 freed = hn_txdesc_put(txr, tmp_txd); 1580 KASSERT(freed, ("failed to free aggregated txdesc")); 1581 } 1582 } 1583 1584 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 1585 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 1586 ("chim txd uses dmamap")); 1587 hn_chim_free(txr->hn_sc, txd->chim_index); 1588 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 1589 txd->chim_size = 0; 1590 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 1591 bus_dmamap_sync(txr->hn_tx_data_dtag, 1592 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 1593 bus_dmamap_unload(txr->hn_tx_data_dtag, 1594 txd->data_dmap); 1595 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 1596 } 1597 1598 if (txd->m != NULL) { 1599 m_freem(txd->m); 1600 txd->m = NULL; 1601 } 1602 1603 txd->flags |= HN_TXD_FLAG_ONLIST; 1604 #ifndef HN_USE_TXDESC_BUFRING 1605 mtx_lock_spin(&txr->hn_txlist_spin); 1606 KASSERT(txr->hn_txdesc_avail >= 0 && 1607 txr->hn_txdesc_avail < 
txr->hn_txdesc_cnt, 1608 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 1609 txr->hn_txdesc_avail++; 1610 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 1611 mtx_unlock_spin(&txr->hn_txlist_spin); 1612 #else /* HN_USE_TXDESC_BUFRING */ 1613 #ifdef HN_DEBUG 1614 atomic_add_int(&txr->hn_txdesc_avail, 1); 1615 #endif 1616 buf_ring_enqueue(txr->hn_txdesc_br, txd); 1617 #endif /* !HN_USE_TXDESC_BUFRING */ 1618 1619 return 1; 1620 } 1621 1622 static __inline struct hn_txdesc * 1623 hn_txdesc_get(struct hn_tx_ring *txr) 1624 { 1625 struct hn_txdesc *txd; 1626 1627 #ifndef HN_USE_TXDESC_BUFRING 1628 mtx_lock_spin(&txr->hn_txlist_spin); 1629 txd = SLIST_FIRST(&txr->hn_txlist); 1630 if (txd != NULL) { 1631 KASSERT(txr->hn_txdesc_avail > 0, 1632 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 1633 txr->hn_txdesc_avail--; 1634 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 1635 } 1636 mtx_unlock_spin(&txr->hn_txlist_spin); 1637 #else 1638 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 1639 #endif 1640 1641 if (txd != NULL) { 1642 #ifdef HN_USE_TXDESC_BUFRING 1643 #ifdef HN_DEBUG 1644 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 1645 #endif 1646 #endif /* HN_USE_TXDESC_BUFRING */ 1647 KASSERT(txd->m == NULL && txd->refs == 0 && 1648 STAILQ_EMPTY(&txd->agg_list) && 1649 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 1650 txd->chim_size == 0 && 1651 (txd->flags & HN_TXD_FLAG_ONLIST) && 1652 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 1653 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 1654 txd->flags &= ~HN_TXD_FLAG_ONLIST; 1655 txd->refs = 1; 1656 } 1657 return txd; 1658 } 1659 1660 static __inline void 1661 hn_txdesc_hold(struct hn_txdesc *txd) 1662 { 1663 1664 /* 0->1 transition will never work */ 1665 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 1666 atomic_add_int(&txd->refs, 1); 1667 } 1668 1669 static __inline void 1670 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 1671 { 1672 1673 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1674 ("recursive aggregation on aggregating txdesc")); 1675 1676 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1677 ("already aggregated")); 1678 KASSERT(STAILQ_EMPTY(&txd->agg_list), 1679 ("recursive aggregation on to-be-aggregated txdesc")); 1680 1681 txd->flags |= HN_TXD_FLAG_ONAGG; 1682 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 1683 } 1684 1685 static bool 1686 hn_tx_ring_pending(struct hn_tx_ring *txr) 1687 { 1688 bool pending = false; 1689 1690 #ifndef HN_USE_TXDESC_BUFRING 1691 mtx_lock_spin(&txr->hn_txlist_spin); 1692 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 1693 pending = true; 1694 mtx_unlock_spin(&txr->hn_txlist_spin); 1695 #else 1696 if (!buf_ring_full(txr->hn_txdesc_br)) 1697 pending = true; 1698 #endif 1699 return (pending); 1700 } 1701 1702 static __inline void 1703 hn_txeof(struct hn_tx_ring *txr) 1704 { 1705 txr->hn_has_txeof = 0; 1706 txr->hn_txeof(txr); 1707 } 1708 1709 static void 1710 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 1711 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 1712 { 1713 struct hn_txdesc *txd = sndc->hn_cbarg; 1714 struct hn_tx_ring *txr; 1715 1716 txr = txd->txr; 1717 KASSERT(txr->hn_chan == chan, 1718 ("channel mismatch, on chan%u, should be chan%u", 1719 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 1720 1721 txr->hn_has_txeof = 1; 1722 hn_txdesc_put(txr, txd); 1723 1724 ++txr->hn_txdone_cnt; 1725 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 1726 txr->hn_txdone_cnt = 0; 1727 if 
(txr->hn_oactive) 1728 hn_txeof(txr); 1729 } 1730 } 1731 1732 static void 1733 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 1734 { 1735 #if defined(INET) || defined(INET6) 1736 tcp_lro_flush_all(&rxr->hn_lro); 1737 #endif 1738 1739 /* 1740 * NOTE: 1741 * 'txr' could be NULL, if multiple channels and 1742 * ifnet.if_start method are enabled. 1743 */ 1744 if (txr == NULL || !txr->hn_has_txeof) 1745 return; 1746 1747 txr->hn_txdone_cnt = 0; 1748 hn_txeof(txr); 1749 } 1750 1751 static __inline uint32_t 1752 hn_rndis_pktmsg_offset(uint32_t ofs) 1753 { 1754 1755 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 1756 ("invalid RNDIS packet msg offset %u", ofs)); 1757 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 1758 } 1759 1760 static __inline void * 1761 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 1762 size_t pi_dlen, uint32_t pi_type) 1763 { 1764 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 1765 struct rndis_pktinfo *pi; 1766 1767 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 1768 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 1769 1770 /* 1771 * Per-packet-info does not move; it only grows. 1772 * 1773 * NOTE: 1774 * rm_pktinfooffset in this phase counts from the beginning 1775 * of rndis_packet_msg. 1776 */ 1777 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 1778 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 1779 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 1780 pkt->rm_pktinfolen); 1781 pkt->rm_pktinfolen += pi_size; 1782 1783 pi->rm_size = pi_size; 1784 pi->rm_type = pi_type; 1785 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 1786 1787 /* Update RNDIS packet msg length */ 1788 pkt->rm_len += pi_size; 1789 1790 return (pi->rm_data); 1791 } 1792 1793 static __inline int 1794 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 1795 { 1796 struct hn_txdesc *txd; 1797 struct mbuf *m; 1798 int error, pkts; 1799 1800 txd = txr->hn_agg_txd; 1801 KASSERT(txd != NULL, ("no aggregate txdesc")); 1802 1803 /* 1804 * Since hn_txpkt() will reset this temporary stat, save 1805 * it now, so that oerrors can be updated properly, if 1806 * hn_txpkt() ever fails. 1807 */ 1808 pkts = txr->hn_stat_pkts; 1809 1810 /* 1811 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 1812 * failure, save it for later freeing, if hn_txpkt() ever 1813 * fails. 1814 */ 1815 m = txd->m; 1816 error = hn_txpkt(ifp, txr, txd); 1817 if (__predict_false(error)) { 1818 /* txd is freed, but m is not. */ 1819 m_freem(m); 1820 1821 txr->hn_flush_failed++; 1822 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 1823 } 1824 1825 /* Reset all aggregation states. */ 1826 txr->hn_agg_txd = NULL; 1827 txr->hn_agg_szleft = 0; 1828 txr->hn_agg_pktleft = 0; 1829 txr->hn_agg_prevpkt = NULL; 1830 1831 return (error); 1832 } 1833 1834 static void * 1835 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1836 int pktsize) 1837 { 1838 void *chim; 1839 1840 if (txr->hn_agg_txd != NULL) { 1841 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 1842 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 1843 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 1844 int olen; 1845 1846 /* 1847 * Update the previous RNDIS packet's total length, 1848 * it can be increased due to the mandatory alignment 1849 * padding for this RNDIS packet. And update the 1850 * aggregating txdesc's chimney sending buffer size 1851 * accordingly. 
1852 * 1853 * XXX 1854 * Zero-out the padding, as required by the RNDIS spec. 1855 */ 1856 olen = pkt->rm_len; 1857 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 1858 agg_txd->chim_size += pkt->rm_len - olen; 1859 1860 /* Link this txdesc to the parent. */ 1861 hn_txdesc_agg(agg_txd, txd); 1862 1863 chim = (uint8_t *)pkt + pkt->rm_len; 1864 /* Save the current packet for later fixup. */ 1865 txr->hn_agg_prevpkt = chim; 1866 1867 txr->hn_agg_pktleft--; 1868 txr->hn_agg_szleft -= pktsize; 1869 if (txr->hn_agg_szleft <= 1870 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1871 /* 1872 * Probably can't aggregate more packets, 1873 * flush this aggregating txdesc proactively. 1874 */ 1875 txr->hn_agg_pktleft = 0; 1876 } 1877 /* Done! */ 1878 return (chim); 1879 } 1880 hn_flush_txagg(ifp, txr); 1881 } 1882 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 1883 1884 txr->hn_tx_chimney_tried++; 1885 txd->chim_index = hn_chim_alloc(txr->hn_sc); 1886 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 1887 return (NULL); 1888 txr->hn_tx_chimney++; 1889 1890 chim = txr->hn_sc->hn_chim + 1891 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 1892 1893 if (txr->hn_agg_pktmax > 1 && 1894 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1895 txr->hn_agg_txd = txd; 1896 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 1897 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 1898 txr->hn_agg_prevpkt = chim; 1899 } 1900 return (chim); 1901 } 1902 1903 /* 1904 * NOTE: 1905 * If this function fails, then both txd and m_head0 will be freed. 1906 */ 1907 static int 1908 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1909 struct mbuf **m_head0) 1910 { 1911 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 1912 int error, nsegs, i; 1913 struct mbuf *m_head = *m_head0; 1914 struct rndis_packet_msg *pkt; 1915 uint32_t *pi_data; 1916 void *chim = NULL; 1917 int pkt_hlen, pkt_size; 1918 1919 pkt = txd->rndis_pkt; 1920 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 1921 if (pkt_size < txr->hn_chim_size) { 1922 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 1923 if (chim != NULL) 1924 pkt = chim; 1925 } else { 1926 if (txr->hn_agg_txd != NULL) 1927 hn_flush_txagg(ifp, txr); 1928 } 1929 1930 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 1931 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; 1932 pkt->rm_dataoffset = 0; 1933 pkt->rm_datalen = m_head->m_pkthdr.len; 1934 pkt->rm_oobdataoffset = 0; 1935 pkt->rm_oobdatalen = 0; 1936 pkt->rm_oobdataelements = 0; 1937 pkt->rm_pktinfooffset = sizeof(*pkt); 1938 pkt->rm_pktinfolen = 0; 1939 pkt->rm_vchandle = 0; 1940 pkt->rm_reserved = 0; 1941 1942 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 1943 /* 1944 * Set the hash value for this packet, so that the host could 1945 * dispatch the TX done event for this packet back to this TX 1946 * ring's channel. 
1947 */ 1948 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1949 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 1950 *pi_data = txr->hn_tx_idx; 1951 } 1952 1953 if (m_head->m_flags & M_VLANTAG) { 1954 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1955 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 1956 *pi_data = NDIS_VLAN_INFO_MAKE( 1957 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 1958 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 1959 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 1960 } 1961 1962 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 1963 #if defined(INET6) || defined(INET) 1964 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1965 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 1966 #ifdef INET 1967 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 1968 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, 1969 m_head->m_pkthdr.tso_segsz); 1970 } 1971 #endif 1972 #if defined(INET6) && defined(INET) 1973 else 1974 #endif 1975 #ifdef INET6 1976 { 1977 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, 1978 m_head->m_pkthdr.tso_segsz); 1979 } 1980 #endif 1981 #endif /* INET6 || INET */ 1982 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 1983 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1984 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 1985 if (m_head->m_pkthdr.csum_flags & 1986 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 1987 *pi_data = NDIS_TXCSUM_INFO_IPV6; 1988 } else { 1989 *pi_data = NDIS_TXCSUM_INFO_IPV4; 1990 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 1991 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 1992 } 1993 1994 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) 1995 *pi_data |= NDIS_TXCSUM_INFO_TCPCS; 1996 else if (m_head->m_pkthdr.csum_flags & 1997 (CSUM_IP_UDP | CSUM_IP6_UDP)) 1998 *pi_data |= NDIS_TXCSUM_INFO_UDPCS; 1999 } 2000 2001 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 2002 /* Convert RNDIS packet message offsets */ 2003 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 2004 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 2005 2006 /* 2007 * Fast path: Chimney sending. 2008 */ 2009 if (chim != NULL) { 2010 struct hn_txdesc *tgt_txd = txd; 2011 2012 if (txr->hn_agg_txd != NULL) { 2013 tgt_txd = txr->hn_agg_txd; 2014 #ifdef INVARIANTS 2015 *m_head0 = NULL; 2016 #endif 2017 } 2018 2019 KASSERT(pkt == chim, 2020 ("RNDIS pkt not in chimney sending buffer")); 2021 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 2022 ("chimney sending buffer is not used")); 2023 tgt_txd->chim_size += pkt->rm_len; 2024 2025 m_copydata(m_head, 0, m_head->m_pkthdr.len, 2026 ((uint8_t *)chim) + pkt_hlen); 2027 2028 txr->hn_gpa_cnt = 0; 2029 txr->hn_sendpkt = hn_txpkt_chim; 2030 goto done; 2031 } 2032 2033 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 2034 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2035 ("chimney buffer is used")); 2036 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 2037 2038 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 2039 if (__predict_false(error)) { 2040 int freed; 2041 2042 /* 2043 * This mbuf is not linked w/ the txd yet, so free it now. 
2044 */ 2045 m_freem(m_head); 2046 *m_head0 = NULL; 2047 2048 freed = hn_txdesc_put(txr, txd); 2049 KASSERT(freed != 0, 2050 ("fail to free txd upon txdma error")); 2051 2052 txr->hn_txdma_failed++; 2053 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 2054 return error; 2055 } 2056 *m_head0 = m_head; 2057 2058 /* +1 RNDIS packet message */ 2059 txr->hn_gpa_cnt = nsegs + 1; 2060 2061 /* send packet with page buffer */ 2062 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 2063 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 2064 txr->hn_gpa[0].gpa_len = pkt_hlen; 2065 2066 /* 2067 * Fill the page buffers with mbuf info after the page 2068 * buffer for RNDIS packet message. 2069 */ 2070 for (i = 0; i < nsegs; ++i) { 2071 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 2072 2073 gpa->gpa_page = atop(segs[i].ds_addr); 2074 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 2075 gpa->gpa_len = segs[i].ds_len; 2076 } 2077 2078 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2079 txd->chim_size = 0; 2080 txr->hn_sendpkt = hn_txpkt_sglist; 2081 done: 2082 txd->m = m_head; 2083 2084 /* Set the completion routine */ 2085 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 2086 2087 /* Update temporary stats for later use. */ 2088 txr->hn_stat_pkts++; 2089 txr->hn_stat_size += m_head->m_pkthdr.len; 2090 if (m_head->m_flags & M_MCAST) 2091 txr->hn_stat_mcasts++; 2092 2093 return 0; 2094 } 2095 2096 /* 2097 * NOTE: 2098 * If this function fails, then txd will be freed, but the mbuf 2099 * associated w/ the txd will _not_ be freed. 2100 */ 2101 static int 2102 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 2103 { 2104 int error, send_failed = 0, has_bpf; 2105 2106 again: 2107 has_bpf = bpf_peers_present(ifp->if_bpf); 2108 if (has_bpf) { 2109 /* 2110 * Make sure that this txd and any aggregated txds are not 2111 * freed before ETHER_BPF_MTAP. 2112 */ 2113 hn_txdesc_hold(txd); 2114 } 2115 error = txr->hn_sendpkt(txr, txd); 2116 if (!error) { 2117 if (has_bpf) { 2118 const struct hn_txdesc *tmp_txd; 2119 2120 ETHER_BPF_MTAP(ifp, txd->m); 2121 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 2122 ETHER_BPF_MTAP(ifp, tmp_txd->m); 2123 } 2124 2125 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 2126 #ifdef HN_IFSTART_SUPPORT 2127 if (!hn_use_if_start) 2128 #endif 2129 { 2130 if_inc_counter(ifp, IFCOUNTER_OBYTES, 2131 txr->hn_stat_size); 2132 if (txr->hn_stat_mcasts != 0) { 2133 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 2134 txr->hn_stat_mcasts); 2135 } 2136 } 2137 txr->hn_pkts += txr->hn_stat_pkts; 2138 txr->hn_sends++; 2139 } 2140 if (has_bpf) 2141 hn_txdesc_put(txr, txd); 2142 2143 if (__predict_false(error)) { 2144 int freed; 2145 2146 /* 2147 * This should "really rarely" happen. 2148 * 2149 * XXX Too many RX to be acked or too many sideband 2150 * commands to run? Ask netvsc_channel_rollup() 2151 * to kick start later. 2152 */ 2153 txr->hn_has_txeof = 1; 2154 if (!send_failed) { 2155 txr->hn_send_failed++; 2156 send_failed = 1; 2157 /* 2158 * Try sending again after set hn_has_txeof; 2159 * in case that we missed the last 2160 * netvsc_channel_rollup(). 2161 */ 2162 goto again; 2163 } 2164 if_printf(ifp, "send failed\n"); 2165 2166 /* 2167 * Caller will perform further processing on the 2168 * associated mbuf, so don't free it in hn_txdesc_put(); 2169 * only unload it from the DMA map in hn_txdesc_put(), 2170 * if it was loaded. 
2171 */ 2172 txd->m = NULL; 2173 freed = hn_txdesc_put(txr, txd); 2174 KASSERT(freed != 0, 2175 ("fail to free txd upon send error")); 2176 2177 txr->hn_send_failed++; 2178 } 2179 2180 /* Reset temporary stats, after this sending is done. */ 2181 txr->hn_stat_size = 0; 2182 txr->hn_stat_pkts = 0; 2183 txr->hn_stat_mcasts = 0; 2184 2185 return (error); 2186 } 2187 2188 /* 2189 * Append the specified data to the indicated mbuf chain, 2190 * Extend the mbuf chain if the new data does not fit in 2191 * existing space. 2192 * 2193 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2194 * There should be an equivalent in the kernel mbuf code, 2195 * but there does not appear to be one yet. 2196 * 2197 * Differs from m_append() in that additional mbufs are 2198 * allocated with cluster size MJUMPAGESIZE, and filled 2199 * accordingly. 2200 * 2201 * Return 1 if able to complete the job; otherwise 0. 2202 */ 2203 static int 2204 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2205 { 2206 struct mbuf *m, *n; 2207 int remainder, space; 2208 2209 for (m = m0; m->m_next != NULL; m = m->m_next) 2210 ; 2211 remainder = len; 2212 space = M_TRAILINGSPACE(m); 2213 if (space > 0) { 2214 /* 2215 * Copy into available space. 2216 */ 2217 if (space > remainder) 2218 space = remainder; 2219 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2220 m->m_len += space; 2221 cp += space; 2222 remainder -= space; 2223 } 2224 while (remainder > 0) { 2225 /* 2226 * Allocate a new mbuf; could check space 2227 * and allocate a cluster instead. 2228 */ 2229 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2230 if (n == NULL) 2231 break; 2232 n->m_len = min(MJUMPAGESIZE, remainder); 2233 bcopy(cp, mtod(n, caddr_t), n->m_len); 2234 cp += n->m_len; 2235 remainder -= n->m_len; 2236 m->m_next = n; 2237 m = n; 2238 } 2239 if (m0->m_flags & M_PKTHDR) 2240 m0->m_pkthdr.len += len - remainder; 2241 2242 return (remainder == 0); 2243 } 2244 2245 #if defined(INET) || defined(INET6) 2246 static __inline int 2247 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2248 { 2249 #if __FreeBSD_version >= 1100095 2250 if (hn_lro_mbufq_depth) { 2251 tcp_lro_queue_mbuf(lc, m); 2252 return 0; 2253 } 2254 #endif 2255 return tcp_lro_rx(lc, m, 0); 2256 } 2257 #endif 2258 2259 static int 2260 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2261 const struct hn_rxinfo *info) 2262 { 2263 struct ifnet *ifp; 2264 struct mbuf *m_new; 2265 int size, do_lro = 0, do_csum = 1; 2266 int hash_type; 2267 2268 /* If the VF is active, inject the packet through the VF */ 2269 ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp; 2270 2271 if (dlen <= MHLEN) { 2272 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2273 if (m_new == NULL) { 2274 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2275 return (0); 2276 } 2277 memcpy(mtod(m_new, void *), data, dlen); 2278 m_new->m_pkthdr.len = m_new->m_len = dlen; 2279 rxr->hn_small_pkts++; 2280 } else { 2281 /* 2282 * Get an mbuf with a cluster. For packets 2K or less, 2283 * get a standard 2K cluster. For anything larger, get a 2284 * 4K cluster. Any buffers larger than 4K can cause problems 2285 * if looped around to the Hyper-V TX channel, so avoid them. 
2286 */ 2287 size = MCLBYTES; 2288 if (dlen > MCLBYTES) { 2289 /* 4096 */ 2290 size = MJUMPAGESIZE; 2291 } 2292 2293 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2294 if (m_new == NULL) { 2295 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2296 return (0); 2297 } 2298 2299 hv_m_append(m_new, dlen, data); 2300 } 2301 m_new->m_pkthdr.rcvif = ifp; 2302 2303 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2304 do_csum = 0; 2305 2306 /* receive side checksum offload */ 2307 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2308 /* IP csum offload */ 2309 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2310 m_new->m_pkthdr.csum_flags |= 2311 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2312 rxr->hn_csum_ip++; 2313 } 2314 2315 /* TCP/UDP csum offload */ 2316 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2317 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2318 m_new->m_pkthdr.csum_flags |= 2319 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2320 m_new->m_pkthdr.csum_data = 0xffff; 2321 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2322 rxr->hn_csum_tcp++; 2323 else 2324 rxr->hn_csum_udp++; 2325 } 2326 2327 /* 2328 * XXX 2329 * As of this write (Oct 28th, 2016), host side will turn 2330 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2331 * the do_lro setting here is actually _not_ accurate. We 2332 * depend on the RSS hash type check to reset do_lro. 2333 */ 2334 if ((info->csum_info & 2335 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2336 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2337 do_lro = 1; 2338 } else { 2339 const struct ether_header *eh; 2340 uint16_t etype; 2341 int hoff; 2342 2343 hoff = sizeof(*eh); 2344 if (m_new->m_len < hoff) 2345 goto skip; 2346 eh = mtod(m_new, struct ether_header *); 2347 etype = ntohs(eh->ether_type); 2348 if (etype == ETHERTYPE_VLAN) { 2349 const struct ether_vlan_header *evl; 2350 2351 hoff = sizeof(*evl); 2352 if (m_new->m_len < hoff) 2353 goto skip; 2354 evl = mtod(m_new, struct ether_vlan_header *); 2355 etype = ntohs(evl->evl_proto); 2356 } 2357 2358 if (etype == ETHERTYPE_IP) { 2359 int pr; 2360 2361 pr = hn_check_iplen(m_new, hoff); 2362 if (pr == IPPROTO_TCP) { 2363 if (do_csum && 2364 (rxr->hn_trust_hcsum & 2365 HN_TRUST_HCSUM_TCP)) { 2366 rxr->hn_csum_trusted++; 2367 m_new->m_pkthdr.csum_flags |= 2368 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2369 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2370 m_new->m_pkthdr.csum_data = 0xffff; 2371 } 2372 do_lro = 1; 2373 } else if (pr == IPPROTO_UDP) { 2374 if (do_csum && 2375 (rxr->hn_trust_hcsum & 2376 HN_TRUST_HCSUM_UDP)) { 2377 rxr->hn_csum_trusted++; 2378 m_new->m_pkthdr.csum_flags |= 2379 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2380 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2381 m_new->m_pkthdr.csum_data = 0xffff; 2382 } 2383 } else if (pr != IPPROTO_DONE && do_csum && 2384 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2385 rxr->hn_csum_trusted++; 2386 m_new->m_pkthdr.csum_flags |= 2387 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2388 } 2389 } 2390 } 2391 skip: 2392 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2393 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2394 NDIS_VLAN_INFO_ID(info->vlan_info), 2395 NDIS_VLAN_INFO_PRI(info->vlan_info), 2396 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2397 m_new->m_flags |= M_VLANTAG; 2398 } 2399 2400 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2401 rxr->hn_rss_pkts++; 2402 m_new->m_pkthdr.flowid = info->hash_value; 2403 hash_type = M_HASHTYPE_OPAQUE_HASH; 2404 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2405 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2406 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2407 2408 /* 2409 * NOTE: 2410 * do_lro is resetted, if the hash types are not TCP 2411 * related. See the comment in the above csum_flags 2412 * setup section. 2413 */ 2414 switch (type) { 2415 case NDIS_HASH_IPV4: 2416 hash_type = M_HASHTYPE_RSS_IPV4; 2417 do_lro = 0; 2418 break; 2419 2420 case NDIS_HASH_TCP_IPV4: 2421 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2422 break; 2423 2424 case NDIS_HASH_IPV6: 2425 hash_type = M_HASHTYPE_RSS_IPV6; 2426 do_lro = 0; 2427 break; 2428 2429 case NDIS_HASH_IPV6_EX: 2430 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2431 do_lro = 0; 2432 break; 2433 2434 case NDIS_HASH_TCP_IPV6: 2435 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2436 break; 2437 2438 case NDIS_HASH_TCP_IPV6_EX: 2439 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2440 break; 2441 } 2442 } 2443 } else { 2444 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2445 hash_type = M_HASHTYPE_OPAQUE; 2446 } 2447 M_HASHTYPE_SET(m_new, hash_type); 2448 2449 /* 2450 * Note: Moved RX completion back to hv_nv_on_receive() so all 2451 * messages (not just data messages) will trigger a response. 2452 */ 2453 2454 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2455 rxr->hn_pkts++; 2456 2457 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2458 #if defined(INET) || defined(INET6) 2459 struct lro_ctrl *lro = &rxr->hn_lro; 2460 2461 if (lro->lro_cnt) { 2462 rxr->hn_lro_tried++; 2463 if (hn_lro_rx(lro, m_new) == 0) { 2464 /* DONE! */ 2465 return 0; 2466 } 2467 } 2468 #endif 2469 } 2470 2471 /* We're not holding the lock here, so don't release it */ 2472 (*ifp->if_input)(ifp, m_new); 2473 2474 return (0); 2475 } 2476 2477 static int 2478 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2479 { 2480 struct hn_softc *sc = ifp->if_softc; 2481 struct ifreq *ifr = (struct ifreq *)data; 2482 int mask, error = 0; 2483 2484 switch (cmd) { 2485 case SIOCSIFMTU: 2486 if (ifr->ifr_mtu > HN_MTU_MAX) { 2487 error = EINVAL; 2488 break; 2489 } 2490 2491 HN_LOCK(sc); 2492 2493 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2494 HN_UNLOCK(sc); 2495 break; 2496 } 2497 2498 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2499 /* Can't change MTU */ 2500 HN_UNLOCK(sc); 2501 error = EOPNOTSUPP; 2502 break; 2503 } 2504 2505 if (ifp->if_mtu == ifr->ifr_mtu) { 2506 HN_UNLOCK(sc); 2507 break; 2508 } 2509 2510 /* 2511 * Suspend this interface before the synthetic parts 2512 * are ripped. 2513 */ 2514 hn_suspend(sc); 2515 2516 /* 2517 * Detach the synthetics parts, i.e. NVS and RNDIS. 2518 */ 2519 hn_synth_detach(sc); 2520 2521 /* 2522 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2523 * with the new MTU setting. 2524 */ 2525 error = hn_synth_attach(sc, ifr->ifr_mtu); 2526 if (error) { 2527 HN_UNLOCK(sc); 2528 break; 2529 } 2530 2531 /* 2532 * Commit the requested MTU, after the synthetic parts 2533 * have been successfully attached. 2534 */ 2535 ifp->if_mtu = ifr->ifr_mtu; 2536 2537 /* 2538 * Make sure that various parameters based on MTU are 2539 * still valid, after the MTU change. 2540 */ 2541 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2542 hn_set_chim_size(sc, sc->hn_chim_szmax); 2543 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2544 #if __FreeBSD_version >= 1100099 2545 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2546 HN_LRO_LENLIM_MIN(ifp)) 2547 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2548 #endif 2549 2550 /* 2551 * All done! Resume the interface now. 
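		 *
		 * Usage sketch (interface name and MTU value assumed, not
		 * taken from this file): `ifconfig hn0 mtu 4096` drives this
		 * whole sequence: suspend, NVS/RNDIS re-attach with the new
		 * MTU, chimney/TSO/LRO limit fixups, then resume.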
2552 */ 2553 hn_resume(sc); 2554 2555 HN_UNLOCK(sc); 2556 break; 2557 2558 case SIOCSIFFLAGS: 2559 HN_LOCK(sc); 2560 2561 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2562 HN_UNLOCK(sc); 2563 break; 2564 } 2565 2566 if (ifp->if_flags & IFF_UP) { 2567 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2568 /* 2569 * Caller meight hold mutex, e.g. 2570 * bpf; use busy-wait for the RNDIS 2571 * reply. 2572 */ 2573 HN_NO_SLEEPING(sc); 2574 hn_rxfilter_config(sc); 2575 HN_SLEEPING_OK(sc); 2576 } else { 2577 hn_init_locked(sc); 2578 } 2579 } else { 2580 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2581 hn_stop(sc, false); 2582 } 2583 sc->hn_if_flags = ifp->if_flags; 2584 2585 HN_UNLOCK(sc); 2586 break; 2587 2588 case SIOCSIFCAP: 2589 HN_LOCK(sc); 2590 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2591 2592 if (mask & IFCAP_TXCSUM) { 2593 ifp->if_capenable ^= IFCAP_TXCSUM; 2594 if (ifp->if_capenable & IFCAP_TXCSUM) 2595 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2596 else 2597 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2598 } 2599 if (mask & IFCAP_TXCSUM_IPV6) { 2600 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2601 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2602 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2603 else 2604 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2605 } 2606 2607 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2608 if (mask & IFCAP_RXCSUM) 2609 ifp->if_capenable ^= IFCAP_RXCSUM; 2610 #ifdef foo 2611 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2612 if (mask & IFCAP_RXCSUM_IPV6) 2613 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2614 #endif 2615 2616 if (mask & IFCAP_LRO) 2617 ifp->if_capenable ^= IFCAP_LRO; 2618 2619 if (mask & IFCAP_TSO4) { 2620 ifp->if_capenable ^= IFCAP_TSO4; 2621 if (ifp->if_capenable & IFCAP_TSO4) 2622 ifp->if_hwassist |= CSUM_IP_TSO; 2623 else 2624 ifp->if_hwassist &= ~CSUM_IP_TSO; 2625 } 2626 if (mask & IFCAP_TSO6) { 2627 ifp->if_capenable ^= IFCAP_TSO6; 2628 if (ifp->if_capenable & IFCAP_TSO6) 2629 ifp->if_hwassist |= CSUM_IP6_TSO; 2630 else 2631 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2632 } 2633 2634 HN_UNLOCK(sc); 2635 break; 2636 2637 case SIOCADDMULTI: 2638 case SIOCDELMULTI: 2639 HN_LOCK(sc); 2640 2641 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2642 HN_UNLOCK(sc); 2643 break; 2644 } 2645 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2646 /* 2647 * Multicast uses mutex; use busy-wait for 2648 * the RNDIS reply. 2649 */ 2650 HN_NO_SLEEPING(sc); 2651 hn_rxfilter_config(sc); 2652 HN_SLEEPING_OK(sc); 2653 } 2654 2655 HN_UNLOCK(sc); 2656 break; 2657 2658 case SIOCSIFMEDIA: 2659 case SIOCGIFMEDIA: 2660 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2661 break; 2662 2663 default: 2664 error = ether_ioctl(ifp, cmd, data); 2665 break; 2666 } 2667 return (error); 2668 } 2669 2670 static void 2671 hn_stop(struct hn_softc *sc, bool detaching) 2672 { 2673 struct ifnet *ifp = sc->hn_ifp; 2674 int i; 2675 2676 HN_LOCK_ASSERT(sc); 2677 2678 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2679 ("synthetic parts were not attached")); 2680 2681 /* Disable polling. */ 2682 hn_polling(sc, 0); 2683 2684 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2685 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2686 hn_suspend_data(sc); 2687 2688 /* Clear OACTIVE bit. */ 2689 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2690 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2691 sc->hn_tx_ring[i].hn_oactive = 0; 2692 2693 /* 2694 * If the VF is active, make sure the filter is not 0, even if 2695 * the synthetic NIC is down. 
2696 */ 2697 if (!detaching && (sc->hn_flags & HN_FLAG_VF)) 2698 hn_rxfilter_config(sc); 2699 } 2700 2701 static void 2702 hn_init_locked(struct hn_softc *sc) 2703 { 2704 struct ifnet *ifp = sc->hn_ifp; 2705 int i; 2706 2707 HN_LOCK_ASSERT(sc); 2708 2709 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2710 return; 2711 2712 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2713 return; 2714 2715 /* Configure RX filter */ 2716 hn_rxfilter_config(sc); 2717 2718 /* Clear OACTIVE bit. */ 2719 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2720 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2721 sc->hn_tx_ring[i].hn_oactive = 0; 2722 2723 /* Clear TX 'suspended' bit. */ 2724 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2725 2726 /* Everything is ready; unleash! */ 2727 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2728 2729 /* Re-enable polling if requested. */ 2730 if (sc->hn_pollhz > 0) 2731 hn_polling(sc, sc->hn_pollhz); 2732 } 2733 2734 static void 2735 hn_init(void *xsc) 2736 { 2737 struct hn_softc *sc = xsc; 2738 2739 HN_LOCK(sc); 2740 hn_init_locked(sc); 2741 HN_UNLOCK(sc); 2742 } 2743 2744 #if __FreeBSD_version >= 1100099 2745 2746 static int 2747 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2748 { 2749 struct hn_softc *sc = arg1; 2750 unsigned int lenlim; 2751 int error; 2752 2753 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2754 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2755 if (error || req->newptr == NULL) 2756 return error; 2757 2758 HN_LOCK(sc); 2759 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2760 lenlim > TCP_LRO_LENGTH_MAX) { 2761 HN_UNLOCK(sc); 2762 return EINVAL; 2763 } 2764 hn_set_lro_lenlim(sc, lenlim); 2765 HN_UNLOCK(sc); 2766 2767 return 0; 2768 } 2769 2770 static int 2771 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2772 { 2773 struct hn_softc *sc = arg1; 2774 int ackcnt, error, i; 2775 2776 /* 2777 * lro_ackcnt_lim is append count limit, 2778 * +1 to turn it into aggregation limit. 2779 */ 2780 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2781 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2782 if (error || req->newptr == NULL) 2783 return error; 2784 2785 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2786 return EINVAL; 2787 2788 /* 2789 * Convert aggregation limit back to append 2790 * count limit. 
2791 */ 2792 --ackcnt; 2793 HN_LOCK(sc); 2794 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2795 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2796 HN_UNLOCK(sc); 2797 return 0; 2798 } 2799 2800 #endif 2801 2802 static int 2803 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2804 { 2805 struct hn_softc *sc = arg1; 2806 int hcsum = arg2; 2807 int on, error, i; 2808 2809 on = 0; 2810 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2811 on = 1; 2812 2813 error = sysctl_handle_int(oidp, &on, 0, req); 2814 if (error || req->newptr == NULL) 2815 return error; 2816 2817 HN_LOCK(sc); 2818 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2819 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2820 2821 if (on) 2822 rxr->hn_trust_hcsum |= hcsum; 2823 else 2824 rxr->hn_trust_hcsum &= ~hcsum; 2825 } 2826 HN_UNLOCK(sc); 2827 return 0; 2828 } 2829 2830 static int 2831 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2832 { 2833 struct hn_softc *sc = arg1; 2834 int chim_size, error; 2835 2836 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2837 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2838 if (error || req->newptr == NULL) 2839 return error; 2840 2841 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2842 return EINVAL; 2843 2844 HN_LOCK(sc); 2845 hn_set_chim_size(sc, chim_size); 2846 HN_UNLOCK(sc); 2847 return 0; 2848 } 2849 2850 #if __FreeBSD_version < 1100095 2851 static int 2852 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2853 { 2854 struct hn_softc *sc = arg1; 2855 int ofs = arg2, i, error; 2856 struct hn_rx_ring *rxr; 2857 uint64_t stat; 2858 2859 stat = 0; 2860 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2861 rxr = &sc->hn_rx_ring[i]; 2862 stat += *((int *)((uint8_t *)rxr + ofs)); 2863 } 2864 2865 error = sysctl_handle_64(oidp, &stat, 0, req); 2866 if (error || req->newptr == NULL) 2867 return error; 2868 2869 /* Zero out this stat. */ 2870 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2871 rxr = &sc->hn_rx_ring[i]; 2872 *((int *)((uint8_t *)rxr + ofs)) = 0; 2873 } 2874 return 0; 2875 } 2876 #else 2877 static int 2878 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2879 { 2880 struct hn_softc *sc = arg1; 2881 int ofs = arg2, i, error; 2882 struct hn_rx_ring *rxr; 2883 uint64_t stat; 2884 2885 stat = 0; 2886 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2887 rxr = &sc->hn_rx_ring[i]; 2888 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2889 } 2890 2891 error = sysctl_handle_64(oidp, &stat, 0, req); 2892 if (error || req->newptr == NULL) 2893 return error; 2894 2895 /* Zero out this stat. */ 2896 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2897 rxr = &sc->hn_rx_ring[i]; 2898 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2899 } 2900 return 0; 2901 } 2902 2903 #endif 2904 2905 static int 2906 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2907 { 2908 struct hn_softc *sc = arg1; 2909 int ofs = arg2, i, error; 2910 struct hn_rx_ring *rxr; 2911 u_long stat; 2912 2913 stat = 0; 2914 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2915 rxr = &sc->hn_rx_ring[i]; 2916 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2917 } 2918 2919 error = sysctl_handle_long(oidp, &stat, 0, req); 2920 if (error || req->newptr == NULL) 2921 return error; 2922 2923 /* Zero out this stat. 
*/ 2924 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2925 rxr = &sc->hn_rx_ring[i]; 2926 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2927 } 2928 return 0; 2929 } 2930 2931 static int 2932 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2933 { 2934 struct hn_softc *sc = arg1; 2935 int ofs = arg2, i, error; 2936 struct hn_tx_ring *txr; 2937 u_long stat; 2938 2939 stat = 0; 2940 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2941 txr = &sc->hn_tx_ring[i]; 2942 stat += *((u_long *)((uint8_t *)txr + ofs)); 2943 } 2944 2945 error = sysctl_handle_long(oidp, &stat, 0, req); 2946 if (error || req->newptr == NULL) 2947 return error; 2948 2949 /* Zero out this stat. */ 2950 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2951 txr = &sc->hn_tx_ring[i]; 2952 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2953 } 2954 return 0; 2955 } 2956 2957 static int 2958 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2959 { 2960 struct hn_softc *sc = arg1; 2961 int ofs = arg2, i, error, conf; 2962 struct hn_tx_ring *txr; 2963 2964 txr = &sc->hn_tx_ring[0]; 2965 conf = *((int *)((uint8_t *)txr + ofs)); 2966 2967 error = sysctl_handle_int(oidp, &conf, 0, req); 2968 if (error || req->newptr == NULL) 2969 return error; 2970 2971 HN_LOCK(sc); 2972 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2973 txr = &sc->hn_tx_ring[i]; 2974 *((int *)((uint8_t *)txr + ofs)) = conf; 2975 } 2976 HN_UNLOCK(sc); 2977 2978 return 0; 2979 } 2980 2981 static int 2982 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2983 { 2984 struct hn_softc *sc = arg1; 2985 int error, size; 2986 2987 size = sc->hn_agg_size; 2988 error = sysctl_handle_int(oidp, &size, 0, req); 2989 if (error || req->newptr == NULL) 2990 return (error); 2991 2992 HN_LOCK(sc); 2993 sc->hn_agg_size = size; 2994 hn_set_txagg(sc); 2995 HN_UNLOCK(sc); 2996 2997 return (0); 2998 } 2999 3000 static int 3001 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 3002 { 3003 struct hn_softc *sc = arg1; 3004 int error, pkts; 3005 3006 pkts = sc->hn_agg_pkts; 3007 error = sysctl_handle_int(oidp, &pkts, 0, req); 3008 if (error || req->newptr == NULL) 3009 return (error); 3010 3011 HN_LOCK(sc); 3012 sc->hn_agg_pkts = pkts; 3013 hn_set_txagg(sc); 3014 HN_UNLOCK(sc); 3015 3016 return (0); 3017 } 3018 3019 static int 3020 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 3021 { 3022 struct hn_softc *sc = arg1; 3023 int pkts; 3024 3025 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 3026 return (sysctl_handle_int(oidp, &pkts, 0, req)); 3027 } 3028 3029 static int 3030 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 3031 { 3032 struct hn_softc *sc = arg1; 3033 int align; 3034 3035 align = sc->hn_tx_ring[0].hn_agg_align; 3036 return (sysctl_handle_int(oidp, &align, 0, req)); 3037 } 3038 3039 static void 3040 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 3041 { 3042 if (pollhz == 0) 3043 vmbus_chan_poll_disable(chan); 3044 else 3045 vmbus_chan_poll_enable(chan, pollhz); 3046 } 3047 3048 static void 3049 hn_polling(struct hn_softc *sc, u_int pollhz) 3050 { 3051 int nsubch = sc->hn_rx_ring_inuse - 1; 3052 3053 HN_LOCK_ASSERT(sc); 3054 3055 if (nsubch > 0) { 3056 struct vmbus_channel **subch; 3057 int i; 3058 3059 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 3060 for (i = 0; i < nsubch; ++i) 3061 hn_chan_polling(subch[i], pollhz); 3062 vmbus_subchan_rel(subch, nsubch); 3063 } 3064 hn_chan_polling(sc->hn_prichan, pollhz); 3065 } 3066 3067 static int 3068 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 3069 { 3070 struct hn_softc *sc = arg1; 3071 int pollhz, error; 3072 3073 pollhz = sc->hn_pollhz; 3074 error = sysctl_handle_int(oidp, &pollhz, 0, req); 3075 
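	/*
	 * Usage sketch (sysctl OID name assumed, not taken from this
	 * file): `sysctl dev.hn.0.polling=10000` lands here with pollhz
	 * set to 10000, which must fall within VMBUS_CHAN_POLLHZ_MIN and
	 * VMBUS_CHAN_POLLHZ_MAX before the channels are actually switched
	 * to polling mode below.
	 */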
if (error || req->newptr == NULL) 3076 return (error); 3077 3078 if (pollhz != 0 && 3079 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 3080 return (EINVAL); 3081 3082 HN_LOCK(sc); 3083 if (sc->hn_pollhz != pollhz) { 3084 sc->hn_pollhz = pollhz; 3085 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 3086 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 3087 hn_polling(sc, sc->hn_pollhz); 3088 } 3089 HN_UNLOCK(sc); 3090 3091 return (0); 3092 } 3093 3094 static int 3095 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 3096 { 3097 struct hn_softc *sc = arg1; 3098 char verstr[16]; 3099 3100 snprintf(verstr, sizeof(verstr), "%u.%u", 3101 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 3102 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 3103 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 3104 } 3105 3106 static int 3107 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 3108 { 3109 struct hn_softc *sc = arg1; 3110 char caps_str[128]; 3111 uint32_t caps; 3112 3113 HN_LOCK(sc); 3114 caps = sc->hn_caps; 3115 HN_UNLOCK(sc); 3116 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 3117 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 3118 } 3119 3120 static int 3121 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 3122 { 3123 struct hn_softc *sc = arg1; 3124 char assist_str[128]; 3125 uint32_t hwassist; 3126 3127 HN_LOCK(sc); 3128 hwassist = sc->hn_ifp->if_hwassist; 3129 HN_UNLOCK(sc); 3130 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 3131 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 3132 } 3133 3134 static int 3135 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 3136 { 3137 struct hn_softc *sc = arg1; 3138 char filter_str[128]; 3139 uint32_t filter; 3140 3141 HN_LOCK(sc); 3142 filter = sc->hn_rx_filter; 3143 HN_UNLOCK(sc); 3144 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3145 NDIS_PACKET_TYPES); 3146 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3147 } 3148 3149 #ifndef RSS 3150 3151 static int 3152 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3153 { 3154 struct hn_softc *sc = arg1; 3155 int error; 3156 3157 HN_LOCK(sc); 3158 3159 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3160 if (error || req->newptr == NULL) 3161 goto back; 3162 3163 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3164 if (error) 3165 goto back; 3166 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 3167 3168 if (sc->hn_rx_ring_inuse > 1) { 3169 error = hn_rss_reconfig(sc); 3170 } else { 3171 /* Not RSS capable, at least for now; just save the RSS key. */ 3172 error = 0; 3173 } 3174 back: 3175 HN_UNLOCK(sc); 3176 return (error); 3177 } 3178 3179 static int 3180 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 3181 { 3182 struct hn_softc *sc = arg1; 3183 int error; 3184 3185 HN_LOCK(sc); 3186 3187 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3188 if (error || req->newptr == NULL) 3189 goto back; 3190 3191 /* 3192 * Don't allow RSS indirect table change, if this interface is not 3193 * RSS capable currently. 
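	 *
	 * (Sketch, based on the calls below.) The value written here is an
	 * array of RX ring indices; hn_rss_ind_fixup() remaps entries that
	 * point past the number of rings currently in use before the RSS
	 * configuration is pushed to the host via hn_rss_reconfig().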
	 */
	if (sc->hn_rx_ring_inuse == 1) {
		error = EOPNOTSUPP;
		goto back;
	}

	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
	if (error)
		goto back;
	sc->hn_flags |= HN_FLAG_HAS_RSSIND;

	hn_rss_ind_fixup(sc);
	error = hn_rss_reconfig(sc);
back:
	HN_UNLOCK(sc);
	return (error);
}

#endif /* !RSS */

static int
hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct hn_softc *sc = arg1;
	char hash_str[128];
	uint32_t hash;

	HN_LOCK(sc);
	hash = sc->hn_rss_hash;
	HN_UNLOCK(sc);
	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
}

static int
hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct hn_softc *sc = arg1;
	char vf_name[128];
	struct ifnet *vf;

	HN_LOCK(sc);
	vf_name[0] = '\0';
	vf = sc->hn_rx_ring[0].hn_vf;
	if (vf != NULL)
		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
	HN_UNLOCK(sc);
	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
}

static int
hn_check_iplen(const struct mbuf *m, int hoff)
{
	const struct ip *ip;
	int len, iphlen, iplen;
	const struct tcphdr *th;
	int thoff;	/* TCP data offset */

	len = hoff + sizeof(struct ip);

	/* The packet must be at least the size of an IP header. */
	if (m->m_pkthdr.len < len)
		return IPPROTO_DONE;

	/* The fixed IP header must reside completely in the first mbuf. */
	if (m->m_len < len)
		return IPPROTO_DONE;

	ip = mtodo(m, hoff);

	/* Bound check the packet's stated IP header length. */
	iphlen = ip->ip_hl << 2;
	if (iphlen < sizeof(struct ip))		/* minimum header length */
		return IPPROTO_DONE;

	/* The full IP header must reside completely in the one mbuf. */
	if (m->m_len < hoff + iphlen)
		return IPPROTO_DONE;

	iplen = ntohs(ip->ip_len);

	/*
	 * Check that the amount of data in the buffers is at least
	 * as much as the IP header would have us expect.
	 */
	if (m->m_pkthdr.len < hoff + iplen)
		return IPPROTO_DONE;

	/*
	 * Ignore IP fragments.
	 */
	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
		return IPPROTO_DONE;

	/*
	 * The TCP/IP or UDP/IP header must be entirely contained within
	 * the first fragment of a packet.
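	 *
	 * For example, a datagram with a 20-byte IP header and a 20-byte
	 * TCP header is only classified as TCP below when the leading mbuf
	 * holds at least hoff + 40 bytes; anything shorter falls back to
	 * IPPROTO_DONE, so the caller skips the host-checksum-trust and
	 * LRO shortcuts.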
3291 */ 3292 switch (ip->ip_p) { 3293 case IPPROTO_TCP: 3294 if (iplen < iphlen + sizeof(struct tcphdr)) 3295 return IPPROTO_DONE; 3296 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3297 return IPPROTO_DONE; 3298 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3299 thoff = th->th_off << 2; 3300 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3301 return IPPROTO_DONE; 3302 if (m->m_len < hoff + iphlen + thoff) 3303 return IPPROTO_DONE; 3304 break; 3305 case IPPROTO_UDP: 3306 if (iplen < iphlen + sizeof(struct udphdr)) 3307 return IPPROTO_DONE; 3308 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3309 return IPPROTO_DONE; 3310 break; 3311 default: 3312 if (iplen < iphlen) 3313 return IPPROTO_DONE; 3314 break; 3315 } 3316 return ip->ip_p; 3317 } 3318 3319 static int 3320 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3321 { 3322 struct sysctl_oid_list *child; 3323 struct sysctl_ctx_list *ctx; 3324 device_t dev = sc->hn_dev; 3325 #if defined(INET) || defined(INET6) 3326 #if __FreeBSD_version >= 1100095 3327 int lroent_cnt; 3328 #endif 3329 #endif 3330 int i; 3331 3332 /* 3333 * Create RXBUF for reception. 3334 * 3335 * NOTE: 3336 * - It is shared by all channels. 3337 * - A large enough buffer is allocated, certain version of NVSes 3338 * may further limit the usable space. 3339 */ 3340 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3341 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3342 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3343 if (sc->hn_rxbuf == NULL) { 3344 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3345 return (ENOMEM); 3346 } 3347 3348 sc->hn_rx_ring_cnt = ring_cnt; 3349 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3350 3351 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3352 M_DEVBUF, M_WAITOK | M_ZERO); 3353 3354 #if defined(INET) || defined(INET6) 3355 #if __FreeBSD_version >= 1100095 3356 lroent_cnt = hn_lro_entry_count; 3357 if (lroent_cnt < TCP_LRO_ENTRIES) 3358 lroent_cnt = TCP_LRO_ENTRIES; 3359 if (bootverbose) 3360 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3361 #endif 3362 #endif /* INET || INET6 */ 3363 3364 ctx = device_get_sysctl_ctx(dev); 3365 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3366 3367 /* Create dev.hn.UNIT.rx sysctl tree */ 3368 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3369 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3370 3371 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3372 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3373 3374 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3375 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3376 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3377 if (rxr->hn_br == NULL) { 3378 device_printf(dev, "allocate bufring failed\n"); 3379 return (ENOMEM); 3380 } 3381 3382 if (hn_trust_hosttcp) 3383 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3384 if (hn_trust_hostudp) 3385 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3386 if (hn_trust_hostip) 3387 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3388 rxr->hn_ifp = sc->hn_ifp; 3389 if (i < sc->hn_tx_ring_cnt) 3390 rxr->hn_txr = &sc->hn_tx_ring[i]; 3391 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3392 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3393 rxr->hn_rx_idx = i; 3394 rxr->hn_rxbuf = sc->hn_rxbuf; 3395 3396 /* 3397 * Initialize LRO. 
3398 */ 3399 #if defined(INET) || defined(INET6) 3400 #if __FreeBSD_version >= 1100095 3401 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3402 hn_lro_mbufq_depth); 3403 #else 3404 tcp_lro_init(&rxr->hn_lro); 3405 rxr->hn_lro.ifp = sc->hn_ifp; 3406 #endif 3407 #if __FreeBSD_version >= 1100099 3408 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3409 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3410 #endif 3411 #endif /* INET || INET6 */ 3412 3413 if (sc->hn_rx_sysctl_tree != NULL) { 3414 char name[16]; 3415 3416 /* 3417 * Create per RX ring sysctl tree: 3418 * dev.hn.UNIT.rx.RINGID 3419 */ 3420 snprintf(name, sizeof(name), "%d", i); 3421 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3422 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3423 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3424 3425 if (rxr->hn_rx_sysctl_tree != NULL) { 3426 SYSCTL_ADD_ULONG(ctx, 3427 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3428 OID_AUTO, "packets", CTLFLAG_RW, 3429 &rxr->hn_pkts, "# of packets received"); 3430 SYSCTL_ADD_ULONG(ctx, 3431 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3432 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3433 &rxr->hn_rss_pkts, 3434 "# of packets w/ RSS info received"); 3435 SYSCTL_ADD_INT(ctx, 3436 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3437 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3438 &rxr->hn_pktbuf_len, 0, 3439 "Temporary channel packet buffer length"); 3440 } 3441 } 3442 } 3443 3444 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3445 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3446 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3447 #if __FreeBSD_version < 1100095 3448 hn_rx_stat_int_sysctl, 3449 #else 3450 hn_rx_stat_u64_sysctl, 3451 #endif 3452 "LU", "LRO queued"); 3453 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3454 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3455 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3456 #if __FreeBSD_version < 1100095 3457 hn_rx_stat_int_sysctl, 3458 #else 3459 hn_rx_stat_u64_sysctl, 3460 #endif 3461 "LU", "LRO flushed"); 3462 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3463 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3464 __offsetof(struct hn_rx_ring, hn_lro_tried), 3465 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3466 #if __FreeBSD_version >= 1100099 3467 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3468 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3469 hn_lro_lenlim_sysctl, "IU", 3470 "Max # of data bytes to be aggregated by LRO"); 3471 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3472 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3473 hn_lro_ackcnt_sysctl, "I", 3474 "Max # of ACKs to be aggregated by LRO"); 3475 #endif 3476 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3477 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3478 hn_trust_hcsum_sysctl, "I", 3479 "Trust tcp segement verification on host side, " 3480 "when csum info is missing"); 3481 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3482 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3483 hn_trust_hcsum_sysctl, "I", 3484 "Trust udp datagram verification on host side, " 3485 "when csum info is missing"); 3486 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3487 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3488 hn_trust_hcsum_sysctl, "I", 3489 "Trust ip packet verification on host side, " 3490 "when csum info is missing"); 3491 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3492 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3493 
__offsetof(struct hn_rx_ring, hn_csum_ip), 3494 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3495 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3496 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3497 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3498 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3499 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3500 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3501 __offsetof(struct hn_rx_ring, hn_csum_udp), 3502 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3503 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3504 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3505 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3506 hn_rx_stat_ulong_sysctl, "LU", 3507 "# of packets that we trust host's csum verification"); 3508 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3509 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3510 __offsetof(struct hn_rx_ring, hn_small_pkts), 3511 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3512 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3513 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3514 __offsetof(struct hn_rx_ring, hn_ack_failed), 3515 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3516 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3517 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3518 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3519 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3520 3521 return (0); 3522 } 3523 3524 static void 3525 hn_destroy_rx_data(struct hn_softc *sc) 3526 { 3527 int i; 3528 3529 if (sc->hn_rxbuf != NULL) { 3530 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3531 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3532 else 3533 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3534 sc->hn_rxbuf = NULL; 3535 } 3536 3537 if (sc->hn_rx_ring_cnt == 0) 3538 return; 3539 3540 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3541 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3542 3543 if (rxr->hn_br == NULL) 3544 continue; 3545 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3546 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3547 } else { 3548 device_printf(sc->hn_dev, 3549 "%dth channel bufring is referenced", i); 3550 } 3551 rxr->hn_br = NULL; 3552 3553 #if defined(INET) || defined(INET6) 3554 tcp_lro_free(&rxr->hn_lro); 3555 #endif 3556 free(rxr->hn_pktbuf, M_DEVBUF); 3557 } 3558 free(sc->hn_rx_ring, M_DEVBUF); 3559 sc->hn_rx_ring = NULL; 3560 3561 sc->hn_rx_ring_cnt = 0; 3562 sc->hn_rx_ring_inuse = 0; 3563 } 3564 3565 static int 3566 hn_tx_ring_create(struct hn_softc *sc, int id) 3567 { 3568 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3569 device_t dev = sc->hn_dev; 3570 bus_dma_tag_t parent_dtag; 3571 int error, i; 3572 3573 txr->hn_sc = sc; 3574 txr->hn_tx_idx = id; 3575 3576 #ifndef HN_USE_TXDESC_BUFRING 3577 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3578 #endif 3579 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3580 3581 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3582 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3583 M_DEVBUF, M_WAITOK | M_ZERO); 3584 #ifndef HN_USE_TXDESC_BUFRING 3585 SLIST_INIT(&txr->hn_txlist); 3586 #else 3587 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3588 M_WAITOK, &txr->hn_tx_lock); 3589 #endif 3590 3591 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3592 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3593 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3594 } else { 3595 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3596 } 3597 3598 #ifdef HN_IFSTART_SUPPORT 3599 if (hn_use_if_start) { 3600 txr->hn_txeof = hn_start_txeof; 3601 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3602 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3603 } else 3604 #endif 3605 { 3606 int br_depth; 3607 3608 txr->hn_txeof = hn_xmit_txeof; 3609 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3610 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3611 3612 br_depth = hn_get_txswq_depth(txr); 3613 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3614 M_WAITOK, &txr->hn_tx_lock); 3615 } 3616 3617 txr->hn_direct_tx_size = hn_direct_tx_size; 3618 3619 /* 3620 * Always schedule transmission instead of trying to do direct 3621 * transmission. This one gives the best performance so far. 3622 */ 3623 txr->hn_sched_tx = 1; 3624 3625 parent_dtag = bus_get_dma_tag(dev); 3626 3627 /* DMA tag for RNDIS packet messages. */ 3628 error = bus_dma_tag_create(parent_dtag, /* parent */ 3629 HN_RNDIS_PKT_ALIGN, /* alignment */ 3630 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3631 BUS_SPACE_MAXADDR, /* lowaddr */ 3632 BUS_SPACE_MAXADDR, /* highaddr */ 3633 NULL, NULL, /* filter, filterarg */ 3634 HN_RNDIS_PKT_LEN, /* maxsize */ 3635 1, /* nsegments */ 3636 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3637 0, /* flags */ 3638 NULL, /* lockfunc */ 3639 NULL, /* lockfuncarg */ 3640 &txr->hn_tx_rndis_dtag); 3641 if (error) { 3642 device_printf(dev, "failed to create rndis dmatag\n"); 3643 return error; 3644 } 3645 3646 /* DMA tag for data. */ 3647 error = bus_dma_tag_create(parent_dtag, /* parent */ 3648 1, /* alignment */ 3649 HN_TX_DATA_BOUNDARY, /* boundary */ 3650 BUS_SPACE_MAXADDR, /* lowaddr */ 3651 BUS_SPACE_MAXADDR, /* highaddr */ 3652 NULL, NULL, /* filter, filterarg */ 3653 HN_TX_DATA_MAXSIZE, /* maxsize */ 3654 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3655 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3656 0, /* flags */ 3657 NULL, /* lockfunc */ 3658 NULL, /* lockfuncarg */ 3659 &txr->hn_tx_data_dtag); 3660 if (error) { 3661 device_printf(dev, "failed to create data dmatag\n"); 3662 return error; 3663 } 3664 3665 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3666 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3667 3668 txd->txr = txr; 3669 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3670 STAILQ_INIT(&txd->agg_list); 3671 3672 /* 3673 * Allocate and load RNDIS packet message. 3674 */ 3675 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3676 (void **)&txd->rndis_pkt, 3677 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3678 &txd->rndis_pkt_dmap); 3679 if (error) { 3680 device_printf(dev, 3681 "failed to allocate rndis_packet_msg, %d\n", i); 3682 return error; 3683 } 3684 3685 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3686 txd->rndis_pkt_dmap, 3687 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3688 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3689 BUS_DMA_NOWAIT); 3690 if (error) { 3691 device_printf(dev, 3692 "failed to load rndis_packet_msg, %d\n", i); 3693 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3694 txd->rndis_pkt, txd->rndis_pkt_dmap); 3695 return error; 3696 } 3697 3698 /* DMA map for TX data. 
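		 * One map is created per txdesc here; it is loaded on
		 * demand by hn_txdesc_dmamap_load() for sglist sends and
		 * torn down again in hn_txdesc_dmamap_destroy().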
*/ 3699 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3700 &txd->data_dmap); 3701 if (error) { 3702 device_printf(dev, 3703 "failed to allocate tx data dmamap\n"); 3704 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3705 txd->rndis_pkt_dmap); 3706 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3707 txd->rndis_pkt, txd->rndis_pkt_dmap); 3708 return error; 3709 } 3710 3711 /* All set, put it to list */ 3712 txd->flags |= HN_TXD_FLAG_ONLIST; 3713 #ifndef HN_USE_TXDESC_BUFRING 3714 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3715 #else 3716 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3717 #endif 3718 } 3719 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3720 3721 if (sc->hn_tx_sysctl_tree != NULL) { 3722 struct sysctl_oid_list *child; 3723 struct sysctl_ctx_list *ctx; 3724 char name[16]; 3725 3726 /* 3727 * Create per TX ring sysctl tree: 3728 * dev.hn.UNIT.tx.RINGID 3729 */ 3730 ctx = device_get_sysctl_ctx(dev); 3731 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3732 3733 snprintf(name, sizeof(name), "%d", id); 3734 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3735 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3736 3737 if (txr->hn_tx_sysctl_tree != NULL) { 3738 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3739 3740 #ifdef HN_DEBUG 3741 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3742 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3743 "# of available TX descs"); 3744 #endif 3745 #ifdef HN_IFSTART_SUPPORT 3746 if (!hn_use_if_start) 3747 #endif 3748 { 3749 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3750 CTLFLAG_RD, &txr->hn_oactive, 0, 3751 "over active"); 3752 } 3753 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3754 CTLFLAG_RW, &txr->hn_pkts, 3755 "# of packets transmitted"); 3756 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3757 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3758 } 3759 } 3760 3761 return 0; 3762 } 3763 3764 static void 3765 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3766 { 3767 struct hn_tx_ring *txr = txd->txr; 3768 3769 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3770 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3771 3772 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3773 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3774 txd->rndis_pkt_dmap); 3775 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3776 } 3777 3778 static void 3779 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3780 { 3781 3782 KASSERT(txd->refs == 0 || txd->refs == 1, 3783 ("invalid txd refs %d", txd->refs)); 3784 3785 /* Aggregated txds will be freed by their aggregating txd. */ 3786 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3787 int freed; 3788 3789 freed = hn_txdesc_put(txr, txd); 3790 KASSERT(freed, ("can't free txdesc")); 3791 } 3792 } 3793 3794 static void 3795 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3796 { 3797 int i; 3798 3799 if (txr->hn_txdesc == NULL) 3800 return; 3801 3802 /* 3803 * NOTE: 3804 * Because the freeing of aggregated txds will be deferred 3805 * to the aggregating txd, two passes are used here: 3806 * - The first pass GCes any pending txds. This GC is necessary, 3807 * since if the channels are revoked, hypervisor will not 3808 * deliver send-done for all pending txds. 3809 * - The second pass frees the busdma stuffs, i.e. after all txds 3810 * were freed. 
3811 */ 3812 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3813 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3814 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3815 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3816 3817 if (txr->hn_tx_data_dtag != NULL) 3818 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3819 if (txr->hn_tx_rndis_dtag != NULL) 3820 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3821 3822 #ifdef HN_USE_TXDESC_BUFRING 3823 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3824 #endif 3825 3826 free(txr->hn_txdesc, M_DEVBUF); 3827 txr->hn_txdesc = NULL; 3828 3829 if (txr->hn_mbuf_br != NULL) 3830 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3831 3832 #ifndef HN_USE_TXDESC_BUFRING 3833 mtx_destroy(&txr->hn_txlist_spin); 3834 #endif 3835 mtx_destroy(&txr->hn_tx_lock); 3836 } 3837 3838 static int 3839 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3840 { 3841 struct sysctl_oid_list *child; 3842 struct sysctl_ctx_list *ctx; 3843 int i; 3844 3845 /* 3846 * Create TXBUF for chimney sending. 3847 * 3848 * NOTE: It is shared by all channels. 3849 */ 3850 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3851 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3852 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3853 if (sc->hn_chim == NULL) { 3854 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3855 return (ENOMEM); 3856 } 3857 3858 sc->hn_tx_ring_cnt = ring_cnt; 3859 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3860 3861 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3862 M_DEVBUF, M_WAITOK | M_ZERO); 3863 3864 ctx = device_get_sysctl_ctx(sc->hn_dev); 3865 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3866 3867 /* Create dev.hn.UNIT.tx sysctl tree */ 3868 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3869 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3870 3871 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3872 int error; 3873 3874 error = hn_tx_ring_create(sc, i); 3875 if (error) 3876 return error; 3877 } 3878 3879 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3880 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3881 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3882 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3883 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3884 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3885 __offsetof(struct hn_tx_ring, hn_send_failed), 3886 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3887 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3888 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3889 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3890 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3891 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3892 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3893 __offsetof(struct hn_tx_ring, hn_flush_failed), 3894 hn_tx_stat_ulong_sysctl, "LU", 3895 "# of packet transmission aggregation flush failure"); 3896 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3897 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3898 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3899 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3900 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3901 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3902 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3903 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3904 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3905 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3906 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3907 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3908 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3909 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3910 "# of total TX descs"); 3911 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3912 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3913 "Chimney send packet size upper boundary"); 3914 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3915 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3916 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3917 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3918 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3919 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3920 hn_tx_conf_int_sysctl, "I", 3921 "Size of the packet for direct transmission"); 3922 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3923 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3924 __offsetof(struct hn_tx_ring, hn_sched_tx), 3925 hn_tx_conf_int_sysctl, "I", 3926 "Always schedule transmission " 3927 "instead of doing direct transmission"); 3928 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3929 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3930 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3931 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3932 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3933 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3934 "Applied packet transmission aggregation size"); 3935 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3936 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3937 hn_txagg_pktmax_sysctl, "I", 3938 "Applied packet transmission aggregation packets"); 3939 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3940 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3941 hn_txagg_align_sysctl, "I", 3942 "Applied packet transmission aggregation alignment"); 3943 3944 return 0; 3945 } 3946 3947 static void 3948 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3949 { 3950 int i; 3951 3952 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3953 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3954 } 3955 3956 static void 3957 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3958 { 3959 struct ifnet *ifp = sc->hn_ifp; 3960 int tso_minlen; 3961 3962 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3963 return; 3964 3965 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3966 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3967 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3968 3969 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3970 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3971 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3972 3973 if (tso_maxlen < tso_minlen) 3974 tso_maxlen = tso_minlen; 3975 else if (tso_maxlen > IP_MAXPACKET) 3976 tso_maxlen = IP_MAXPACKET; 3977 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3978 tso_maxlen = sc->hn_ndis_tso_szmax; 3979 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3980 if (bootverbose) 3981 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3982 } 3983 3984 static void 3985 hn_fixup_tx_data(struct hn_softc *sc) 3986 { 3987 uint64_t csum_assist; 3988 int i; 3989 3990 hn_set_chim_size(sc, sc->hn_chim_szmax); 3991 if (hn_tx_chimney_size > 0 && 3992 hn_tx_chimney_size < sc->hn_chim_szmax) 3993 hn_set_chim_size(sc, hn_tx_chimney_size); 3994 3995 csum_assist = 0; 3996 if (sc->hn_caps & HN_CAP_IPCS) 3997 csum_assist |= CSUM_IP; 3998 if (sc->hn_caps & HN_CAP_TCP4CS) 3999 csum_assist |= CSUM_IP_TCP; 4000 if (sc->hn_caps & HN_CAP_UDP4CS) 4001 
csum_assist |= CSUM_IP_UDP; 4002 if (sc->hn_caps & HN_CAP_TCP6CS) 4003 csum_assist |= CSUM_IP6_TCP; 4004 if (sc->hn_caps & HN_CAP_UDP6CS) 4005 csum_assist |= CSUM_IP6_UDP; 4006 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4007 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 4008 4009 if (sc->hn_caps & HN_CAP_HASHVAL) { 4010 /* 4011 * Support HASHVAL pktinfo on TX path. 4012 */ 4013 if (bootverbose) 4014 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 4015 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4016 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 4017 } 4018 } 4019 4020 static void 4021 hn_destroy_tx_data(struct hn_softc *sc) 4022 { 4023 int i; 4024 4025 if (sc->hn_chim != NULL) { 4026 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 4027 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 4028 } else { 4029 device_printf(sc->hn_dev, 4030 "chimney sending buffer is referenced"); 4031 } 4032 sc->hn_chim = NULL; 4033 } 4034 4035 if (sc->hn_tx_ring_cnt == 0) 4036 return; 4037 4038 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4039 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 4040 4041 free(sc->hn_tx_ring, M_DEVBUF); 4042 sc->hn_tx_ring = NULL; 4043 4044 sc->hn_tx_ring_cnt = 0; 4045 sc->hn_tx_ring_inuse = 0; 4046 } 4047 4048 #ifdef HN_IFSTART_SUPPORT 4049 4050 static void 4051 hn_start_taskfunc(void *xtxr, int pending __unused) 4052 { 4053 struct hn_tx_ring *txr = xtxr; 4054 4055 mtx_lock(&txr->hn_tx_lock); 4056 hn_start_locked(txr, 0); 4057 mtx_unlock(&txr->hn_tx_lock); 4058 } 4059 4060 static int 4061 hn_start_locked(struct hn_tx_ring *txr, int len) 4062 { 4063 struct hn_softc *sc = txr->hn_sc; 4064 struct ifnet *ifp = sc->hn_ifp; 4065 int sched = 0; 4066 4067 KASSERT(hn_use_if_start, 4068 ("hn_start_locked is called, when if_start is disabled")); 4069 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4070 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4071 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4072 4073 if (__predict_false(txr->hn_suspended)) 4074 return (0); 4075 4076 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 4077 IFF_DRV_RUNNING) 4078 return (0); 4079 4080 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 4081 struct hn_txdesc *txd; 4082 struct mbuf *m_head; 4083 int error; 4084 4085 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 4086 if (m_head == NULL) 4087 break; 4088 4089 if (len > 0 && m_head->m_pkthdr.len > len) { 4090 /* 4091 * This sending could be time consuming; let callers 4092 * dispatch this packet sending (and sending of any 4093 * following up packets) to tx taskqueue. 
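 * When called from hn_start() or hn_start_txeof(), 'len' is the ring's
 * hn_direct_tx_size; the taskqueue handlers pass 0, which disables this
 * check so that the if_snd queue is drained completely.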
4094 */ 4095 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4096 sched = 1; 4097 break; 4098 } 4099 4100 #if defined(INET6) || defined(INET) 4101 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 4102 m_head = hn_tso_fixup(m_head); 4103 if (__predict_false(m_head == NULL)) { 4104 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4105 continue; 4106 } 4107 } 4108 #endif 4109 4110 txd = hn_txdesc_get(txr); 4111 if (txd == NULL) { 4112 txr->hn_no_txdescs++; 4113 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4114 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4115 break; 4116 } 4117 4118 error = hn_encap(ifp, txr, txd, &m_head); 4119 if (error) { 4120 /* Both txd and m_head are freed */ 4121 KASSERT(txr->hn_agg_txd == NULL, 4122 ("encap failed w/ pending aggregating txdesc")); 4123 continue; 4124 } 4125 4126 if (txr->hn_agg_pktleft == 0) { 4127 if (txr->hn_agg_txd != NULL) { 4128 KASSERT(m_head == NULL, 4129 ("pending mbuf for aggregating txdesc")); 4130 error = hn_flush_txagg(ifp, txr); 4131 if (__predict_false(error)) { 4132 atomic_set_int(&ifp->if_drv_flags, 4133 IFF_DRV_OACTIVE); 4134 break; 4135 } 4136 } else { 4137 KASSERT(m_head != NULL, ("mbuf was freed")); 4138 error = hn_txpkt(ifp, txr, txd); 4139 if (__predict_false(error)) { 4140 /* txd is freed, but m_head is not */ 4141 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4142 atomic_set_int(&ifp->if_drv_flags, 4143 IFF_DRV_OACTIVE); 4144 break; 4145 } 4146 } 4147 } 4148 #ifdef INVARIANTS 4149 else { 4150 KASSERT(txr->hn_agg_txd != NULL, 4151 ("no aggregating txdesc")); 4152 KASSERT(m_head == NULL, 4153 ("pending mbuf for aggregating txdesc")); 4154 } 4155 #endif 4156 } 4157 4158 /* Flush pending aggerated transmission. */ 4159 if (txr->hn_agg_txd != NULL) 4160 hn_flush_txagg(ifp, txr); 4161 return (sched); 4162 } 4163 4164 static void 4165 hn_start(struct ifnet *ifp) 4166 { 4167 struct hn_softc *sc = ifp->if_softc; 4168 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 4169 4170 if (txr->hn_sched_tx) 4171 goto do_sched; 4172 4173 if (mtx_trylock(&txr->hn_tx_lock)) { 4174 int sched; 4175 4176 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4177 mtx_unlock(&txr->hn_tx_lock); 4178 if (!sched) 4179 return; 4180 } 4181 do_sched: 4182 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4183 } 4184 4185 static void 4186 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 4187 { 4188 struct hn_tx_ring *txr = xtxr; 4189 4190 mtx_lock(&txr->hn_tx_lock); 4191 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 4192 hn_start_locked(txr, 0); 4193 mtx_unlock(&txr->hn_tx_lock); 4194 } 4195 4196 static void 4197 hn_start_txeof(struct hn_tx_ring *txr) 4198 { 4199 struct hn_softc *sc = txr->hn_sc; 4200 struct ifnet *ifp = sc->hn_ifp; 4201 4202 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4203 4204 if (txr->hn_sched_tx) 4205 goto do_sched; 4206 4207 if (mtx_trylock(&txr->hn_tx_lock)) { 4208 int sched; 4209 4210 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4211 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4212 mtx_unlock(&txr->hn_tx_lock); 4213 if (sched) { 4214 taskqueue_enqueue(txr->hn_tx_taskq, 4215 &txr->hn_tx_task); 4216 } 4217 } else { 4218 do_sched: 4219 /* 4220 * Release the OACTIVE earlier, with the hope, that 4221 * others could catch up. The task will clear the 4222 * flag again with the hn_tx_lock to avoid possible 4223 * races. 
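 * (See hn_start_txeof_taskfunc() above, which re-clears IFF_DRV_OACTIVE
 * while holding hn_tx_lock before restarting transmission.)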
4224 */ 4225 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4226 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4227 } 4228 } 4229 4230 #endif /* HN_IFSTART_SUPPORT */ 4231 4232 static int 4233 hn_xmit(struct hn_tx_ring *txr, int len) 4234 { 4235 struct hn_softc *sc = txr->hn_sc; 4236 struct ifnet *ifp = sc->hn_ifp; 4237 struct mbuf *m_head; 4238 int sched = 0; 4239 4240 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4241 #ifdef HN_IFSTART_SUPPORT 4242 KASSERT(hn_use_if_start == 0, 4243 ("hn_xmit is called, when if_start is enabled")); 4244 #endif 4245 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4246 4247 if (__predict_false(txr->hn_suspended)) 4248 return (0); 4249 4250 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4251 return (0); 4252 4253 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4254 struct hn_txdesc *txd; 4255 int error; 4256 4257 if (len > 0 && m_head->m_pkthdr.len > len) { 4258 /* 4259 * This sending could be time consuming; let callers 4260 * dispatch this packet sending (and sending of any 4261 * following up packets) to tx taskqueue. 4262 */ 4263 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4264 sched = 1; 4265 break; 4266 } 4267 4268 txd = hn_txdesc_get(txr); 4269 if (txd == NULL) { 4270 txr->hn_no_txdescs++; 4271 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4272 txr->hn_oactive = 1; 4273 break; 4274 } 4275 4276 error = hn_encap(ifp, txr, txd, &m_head); 4277 if (error) { 4278 /* Both txd and m_head are freed; discard */ 4279 KASSERT(txr->hn_agg_txd == NULL, 4280 ("encap failed w/ pending aggregating txdesc")); 4281 drbr_advance(ifp, txr->hn_mbuf_br); 4282 continue; 4283 } 4284 4285 if (txr->hn_agg_pktleft == 0) { 4286 if (txr->hn_agg_txd != NULL) { 4287 KASSERT(m_head == NULL, 4288 ("pending mbuf for aggregating txdesc")); 4289 error = hn_flush_txagg(ifp, txr); 4290 if (__predict_false(error)) { 4291 txr->hn_oactive = 1; 4292 break; 4293 } 4294 } else { 4295 KASSERT(m_head != NULL, ("mbuf was freed")); 4296 error = hn_txpkt(ifp, txr, txd); 4297 if (__predict_false(error)) { 4298 /* txd is freed, but m_head is not */ 4299 drbr_putback(ifp, txr->hn_mbuf_br, 4300 m_head); 4301 txr->hn_oactive = 1; 4302 break; 4303 } 4304 } 4305 } 4306 #ifdef INVARIANTS 4307 else { 4308 KASSERT(txr->hn_agg_txd != NULL, 4309 ("no aggregating txdesc")); 4310 KASSERT(m_head == NULL, 4311 ("pending mbuf for aggregating txdesc")); 4312 } 4313 #endif 4314 4315 /* Sent */ 4316 drbr_advance(ifp, txr->hn_mbuf_br); 4317 } 4318 4319 /* Flush pending aggerated transmission. */ 4320 if (txr->hn_agg_txd != NULL) 4321 hn_flush_txagg(ifp, txr); 4322 return (sched); 4323 } 4324 4325 static int 4326 hn_transmit(struct ifnet *ifp, struct mbuf *m) 4327 { 4328 struct hn_softc *sc = ifp->if_softc; 4329 struct hn_tx_ring *txr; 4330 int error, idx = 0; 4331 4332 #if defined(INET6) || defined(INET) 4333 /* 4334 * Perform TSO packet header fixup now, since the TSO 4335 * packet header should be cache-hot. 
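 * The mbuf may sit on the per-ring buf_ring before hn_xmit() picks it
 * up, possibly from a taskqueue thread, so deferring the fixup could
 * mean touching a cache-cold header later.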
4336 */ 4337 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4338 m = hn_tso_fixup(m); 4339 if (__predict_false(m == NULL)) { 4340 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4341 return EIO; 4342 } 4343 } 4344 #endif 4345 4346 /* 4347 * Select the TX ring based on flowid 4348 */ 4349 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4350 #ifdef RSS 4351 uint32_t bid; 4352 4353 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4354 &bid) == 0) 4355 idx = bid % sc->hn_tx_ring_inuse; 4356 else 4357 #endif 4358 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4359 } 4360 txr = &sc->hn_tx_ring[idx]; 4361 4362 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4363 if (error) { 4364 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4365 return error; 4366 } 4367 4368 if (txr->hn_oactive) 4369 return 0; 4370 4371 if (txr->hn_sched_tx) 4372 goto do_sched; 4373 4374 if (mtx_trylock(&txr->hn_tx_lock)) { 4375 int sched; 4376 4377 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4378 mtx_unlock(&txr->hn_tx_lock); 4379 if (!sched) 4380 return 0; 4381 } 4382 do_sched: 4383 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4384 return 0; 4385 } 4386 4387 static void 4388 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4389 { 4390 struct mbuf *m; 4391 4392 mtx_lock(&txr->hn_tx_lock); 4393 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4394 m_freem(m); 4395 mtx_unlock(&txr->hn_tx_lock); 4396 } 4397 4398 static void 4399 hn_xmit_qflush(struct ifnet *ifp) 4400 { 4401 struct hn_softc *sc = ifp->if_softc; 4402 int i; 4403 4404 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4405 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4406 if_qflush(ifp); 4407 } 4408 4409 static void 4410 hn_xmit_txeof(struct hn_tx_ring *txr) 4411 { 4412 4413 if (txr->hn_sched_tx) 4414 goto do_sched; 4415 4416 if (mtx_trylock(&txr->hn_tx_lock)) { 4417 int sched; 4418 4419 txr->hn_oactive = 0; 4420 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4421 mtx_unlock(&txr->hn_tx_lock); 4422 if (sched) { 4423 taskqueue_enqueue(txr->hn_tx_taskq, 4424 &txr->hn_tx_task); 4425 } 4426 } else { 4427 do_sched: 4428 /* 4429 * Release the oactive earlier, with the hope, that 4430 * others could catch up. The task will clear the 4431 * oactive again with the hn_tx_lock to avoid possible 4432 * races. 4433 */ 4434 txr->hn_oactive = 0; 4435 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4436 } 4437 } 4438 4439 static void 4440 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4441 { 4442 struct hn_tx_ring *txr = xtxr; 4443 4444 mtx_lock(&txr->hn_tx_lock); 4445 hn_xmit(txr, 0); 4446 mtx_unlock(&txr->hn_tx_lock); 4447 } 4448 4449 static void 4450 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4451 { 4452 struct hn_tx_ring *txr = xtxr; 4453 4454 mtx_lock(&txr->hn_tx_lock); 4455 txr->hn_oactive = 0; 4456 hn_xmit(txr, 0); 4457 mtx_unlock(&txr->hn_tx_lock); 4458 } 4459 4460 static int 4461 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4462 { 4463 struct vmbus_chan_br cbr; 4464 struct hn_rx_ring *rxr; 4465 struct hn_tx_ring *txr = NULL; 4466 int idx, error; 4467 4468 idx = vmbus_chan_subidx(chan); 4469 4470 /* 4471 * Link this channel to RX/TX ring. 
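 * The channel sub-index doubles as the ring index: the primary channel
 * (sub-index 0) serves ring 0, and sub-channel N serves RX ring N, plus
 * TX ring N when N < hn_tx_ring_inuse.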
4472 */ 4473 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4474 ("invalid channel index %d, should > 0 && < %d", 4475 idx, sc->hn_rx_ring_inuse)); 4476 rxr = &sc->hn_rx_ring[idx]; 4477 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 4478 ("RX ring %d already attached", idx)); 4479 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 4480 rxr->hn_chan = chan; 4481 4482 if (bootverbose) { 4483 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 4484 idx, vmbus_chan_id(chan)); 4485 } 4486 4487 if (idx < sc->hn_tx_ring_inuse) { 4488 txr = &sc->hn_tx_ring[idx]; 4489 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 4490 ("TX ring %d already attached", idx)); 4491 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 4492 4493 txr->hn_chan = chan; 4494 if (bootverbose) { 4495 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 4496 idx, vmbus_chan_id(chan)); 4497 } 4498 } 4499 4500 /* Bind this channel to a proper CPU. */ 4501 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 4502 4503 /* 4504 * Open this channel 4505 */ 4506 cbr.cbr = rxr->hn_br; 4507 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 4508 cbr.cbr_txsz = HN_TXBR_SIZE; 4509 cbr.cbr_rxsz = HN_RXBR_SIZE; 4510 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 4511 if (error) { 4512 if (error == EISCONN) { 4513 if_printf(sc->hn_ifp, "bufring is connected after " 4514 "chan%u open failure\n", vmbus_chan_id(chan)); 4515 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4516 } else { 4517 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 4518 vmbus_chan_id(chan), error); 4519 } 4520 } 4521 return (error); 4522 } 4523 4524 static void 4525 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 4526 { 4527 struct hn_rx_ring *rxr; 4528 int idx, error; 4529 4530 idx = vmbus_chan_subidx(chan); 4531 4532 /* 4533 * Link this channel to RX/TX ring. 4534 */ 4535 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4536 ("invalid channel index %d, should > 0 && < %d", 4537 idx, sc->hn_rx_ring_inuse)); 4538 rxr = &sc->hn_rx_ring[idx]; 4539 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 4540 ("RX ring %d is not attached", idx)); 4541 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 4542 4543 if (idx < sc->hn_tx_ring_inuse) { 4544 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 4545 4546 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 4547 ("TX ring %d is not attached attached", idx)); 4548 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 4549 } 4550 4551 /* 4552 * Close this channel. 4553 * 4554 * NOTE: 4555 * Channel closing does _not_ destroy the target channel. 4556 */ 4557 error = vmbus_chan_close_direct(chan); 4558 if (error == EISCONN) { 4559 if_printf(sc->hn_ifp, "chan%u bufring is connected " 4560 "after being closed\n", vmbus_chan_id(chan)); 4561 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4562 } else if (error) { 4563 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 4564 vmbus_chan_id(chan), error); 4565 } 4566 } 4567 4568 static int 4569 hn_attach_subchans(struct hn_softc *sc) 4570 { 4571 struct vmbus_channel **subchans; 4572 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4573 int i, error = 0; 4574 4575 KASSERT(subchan_cnt > 0, ("no sub-channels")); 4576 4577 /* Attach the sub-channels. */ 4578 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4579 for (i = 0; i < subchan_cnt; ++i) { 4580 int error1; 4581 4582 error1 = hn_chan_attach(sc, subchans[i]); 4583 if (error1) { 4584 error = error1; 4585 /* Move on; all channels will be detached later. 
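 * Any failure is remembered in 'error' and reported once the loop
 * completes; the caller then detaches the synthetic parts, which
 * detaches every channel.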
*/ 4586 } 4587 } 4588 vmbus_subchan_rel(subchans, subchan_cnt); 4589 4590 if (error) { 4591 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4592 } else { 4593 if (bootverbose) { 4594 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4595 subchan_cnt); 4596 } 4597 } 4598 return (error); 4599 } 4600 4601 static void 4602 hn_detach_allchans(struct hn_softc *sc) 4603 { 4604 struct vmbus_channel **subchans; 4605 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4606 int i; 4607 4608 if (subchan_cnt == 0) 4609 goto back; 4610 4611 /* Detach the sub-channels. */ 4612 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4613 for (i = 0; i < subchan_cnt; ++i) 4614 hn_chan_detach(sc, subchans[i]); 4615 vmbus_subchan_rel(subchans, subchan_cnt); 4616 4617 back: 4618 /* 4619 * Detach the primary channel, _after_ all sub-channels 4620 * are detached. 4621 */ 4622 hn_chan_detach(sc, sc->hn_prichan); 4623 4624 /* Wait for sub-channels to be destroyed, if any. */ 4625 vmbus_subchan_drain(sc->hn_prichan); 4626 4627 #ifdef INVARIANTS 4628 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4629 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4630 HN_RX_FLAG_ATTACHED) == 0, 4631 ("%dth RX ring is still attached", i)); 4632 } 4633 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4634 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4635 HN_TX_FLAG_ATTACHED) == 0, 4636 ("%dth TX ring is still attached", i)); 4637 } 4638 #endif 4639 } 4640 4641 static int 4642 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4643 { 4644 struct vmbus_channel **subchans; 4645 int nchan, rxr_cnt, error; 4646 4647 nchan = *nsubch + 1; 4648 if (nchan == 1) { 4649 /* 4650 * Multiple RX/TX rings are not requested. 4651 */ 4652 *nsubch = 0; 4653 return (0); 4654 } 4655 4656 /* 4657 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4658 * table entries. 4659 */ 4660 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4661 if (error) { 4662 /* No RSS; this is benign. */ 4663 *nsubch = 0; 4664 return (0); 4665 } 4666 if (bootverbose) { 4667 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4668 rxr_cnt, nchan); 4669 } 4670 4671 if (nchan > rxr_cnt) 4672 nchan = rxr_cnt; 4673 if (nchan == 1) { 4674 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4675 *nsubch = 0; 4676 return (0); 4677 } 4678 4679 /* 4680 * Allocate sub-channels from NVS. 4681 */ 4682 *nsubch = nchan - 1; 4683 error = hn_nvs_alloc_subchans(sc, nsubch); 4684 if (error || *nsubch == 0) { 4685 /* Failed to allocate sub-channels. */ 4686 *nsubch = 0; 4687 return (0); 4688 } 4689 4690 /* 4691 * Wait for all sub-channels to become ready before moving on. 
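 * The vmbus_subchan_get()/vmbus_subchan_rel() pair below is used only
 * for this waiting side effect; the sub-channels are actually attached
 * later by hn_attach_subchans().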
4692 */ 4693 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4694 vmbus_subchan_rel(subchans, *nsubch); 4695 return (0); 4696 } 4697 4698 static bool 4699 hn_synth_attachable(const struct hn_softc *sc) 4700 { 4701 int i; 4702 4703 if (sc->hn_flags & HN_FLAG_ERRORS) 4704 return (false); 4705 4706 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4707 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4708 4709 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4710 return (false); 4711 } 4712 return (true); 4713 } 4714 4715 static int 4716 hn_synth_attach(struct hn_softc *sc, int mtu) 4717 { 4718 #define ATTACHED_NVS 0x0002 4719 #define ATTACHED_RNDIS 0x0004 4720 4721 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4722 int error, nsubch, nchan, i; 4723 uint32_t old_caps, attached = 0; 4724 4725 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4726 ("synthetic parts were attached")); 4727 4728 if (!hn_synth_attachable(sc)) 4729 return (ENXIO); 4730 4731 /* Save capabilities for later verification. */ 4732 old_caps = sc->hn_caps; 4733 sc->hn_caps = 0; 4734 4735 /* Clear RSS stuffs. */ 4736 sc->hn_rss_ind_size = 0; 4737 sc->hn_rss_hash = 0; 4738 4739 /* 4740 * Attach the primary channel _before_ attaching NVS and RNDIS. 4741 */ 4742 error = hn_chan_attach(sc, sc->hn_prichan); 4743 if (error) 4744 goto failed; 4745 4746 /* 4747 * Attach NVS. 4748 */ 4749 error = hn_nvs_attach(sc, mtu); 4750 if (error) 4751 goto failed; 4752 attached |= ATTACHED_NVS; 4753 4754 /* 4755 * Attach RNDIS _after_ NVS is attached. 4756 */ 4757 error = hn_rndis_attach(sc, mtu); 4758 if (error) 4759 goto failed; 4760 attached |= ATTACHED_RNDIS; 4761 4762 /* 4763 * Make sure capabilities are not changed. 4764 */ 4765 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4766 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4767 old_caps, sc->hn_caps); 4768 error = ENXIO; 4769 goto failed; 4770 } 4771 4772 /* 4773 * Allocate sub-channels for multi-TX/RX rings. 4774 * 4775 * NOTE: 4776 * The # of RX rings that can be used is equivalent to the # of 4777 * channels to be requested. 4778 */ 4779 nsubch = sc->hn_rx_ring_cnt - 1; 4780 error = hn_synth_alloc_subchans(sc, &nsubch); 4781 if (error) 4782 goto failed; 4783 /* NOTE: _Full_ synthetic parts detach is required now. */ 4784 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4785 4786 /* 4787 * Set the # of TX/RX rings that could be used according to 4788 * the # of channels that NVS offered. 4789 */ 4790 nchan = nsubch + 1; 4791 hn_set_ring_inuse(sc, nchan); 4792 if (nchan == 1) { 4793 /* Only the primary channel can be used; done */ 4794 goto back; 4795 } 4796 4797 /* 4798 * Attach the sub-channels. 4799 * 4800 * NOTE: hn_set_ring_inuse() _must_ have been called. 4801 */ 4802 error = hn_attach_subchans(sc); 4803 if (error) 4804 goto failed; 4805 4806 /* 4807 * Configure RSS key and indirect table _after_ all sub-channels 4808 * are attached. 4809 */ 4810 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4811 /* 4812 * RSS key is not set yet; set it to the default RSS key. 4813 */ 4814 if (bootverbose) 4815 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4816 #ifdef RSS 4817 rss_getkey(rss->rss_key); 4818 #else 4819 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4820 #endif 4821 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4822 } 4823 4824 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4825 /* 4826 * RSS indirect table is not set yet; set it up in round- 4827 * robin fashion. 
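 * Without the RSS option each entry i simply becomes i % nchan; with
 * RSS the kernel's indirection-to-bucket mapping supplies the index,
 * again wrapped to the number of usable channels.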
4828 */ 4829 if (bootverbose) { 4830 if_printf(sc->hn_ifp, "setup default RSS indirect " 4831 "table\n"); 4832 } 4833 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 4834 uint32_t subidx; 4835 4836 #ifdef RSS 4837 subidx = rss_get_indirection_to_bucket(i); 4838 #else 4839 subidx = i; 4840 #endif 4841 rss->rss_ind[i] = subidx % nchan; 4842 } 4843 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4844 } else { 4845 /* 4846 * # of usable channels may be changed, so we have to 4847 * make sure that all entries in RSS indirect table 4848 * are valid. 4849 * 4850 * NOTE: hn_set_ring_inuse() _must_ have been called. 4851 */ 4852 hn_rss_ind_fixup(sc); 4853 } 4854 4855 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 4856 if (error) 4857 goto failed; 4858 back: 4859 /* 4860 * Fixup transmission aggregation setup. 4861 */ 4862 hn_set_txagg(sc); 4863 return (0); 4864 4865 failed: 4866 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 4867 hn_synth_detach(sc); 4868 } else { 4869 if (attached & ATTACHED_RNDIS) 4870 hn_rndis_detach(sc); 4871 if (attached & ATTACHED_NVS) 4872 hn_nvs_detach(sc); 4873 hn_chan_detach(sc, sc->hn_prichan); 4874 /* Restore old capabilities. */ 4875 sc->hn_caps = old_caps; 4876 } 4877 return (error); 4878 4879 #undef ATTACHED_RNDIS 4880 #undef ATTACHED_NVS 4881 } 4882 4883 /* 4884 * NOTE: 4885 * The interface must have been suspended though hn_suspend(), before 4886 * this function get called. 4887 */ 4888 static void 4889 hn_synth_detach(struct hn_softc *sc) 4890 { 4891 4892 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4893 ("synthetic parts were not attached")); 4894 4895 /* Detach the RNDIS first. */ 4896 hn_rndis_detach(sc); 4897 4898 /* Detach NVS. */ 4899 hn_nvs_detach(sc); 4900 4901 /* Detach all of the channels. */ 4902 hn_detach_allchans(sc); 4903 4904 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 4905 } 4906 4907 static void 4908 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 4909 { 4910 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 4911 ("invalid ring count %d", ring_cnt)); 4912 4913 if (sc->hn_tx_ring_cnt > ring_cnt) 4914 sc->hn_tx_ring_inuse = ring_cnt; 4915 else 4916 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 4917 sc->hn_rx_ring_inuse = ring_cnt; 4918 4919 #ifdef RSS 4920 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 4921 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 4922 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 4923 rss_getnumbuckets()); 4924 } 4925 #endif 4926 4927 if (bootverbose) { 4928 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 4929 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 4930 } 4931 } 4932 4933 static void 4934 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 4935 { 4936 4937 /* 4938 * NOTE: 4939 * The TX bufring will not be drained by the hypervisor, 4940 * if the primary channel is revoked. 4941 */ 4942 while (!vmbus_chan_rx_empty(chan) || 4943 (!vmbus_chan_is_revoked(sc->hn_prichan) && 4944 !vmbus_chan_tx_empty(chan))) 4945 pause("waitch", 1); 4946 vmbus_chan_intr_drain(chan); 4947 } 4948 4949 static void 4950 hn_suspend_data(struct hn_softc *sc) 4951 { 4952 struct vmbus_channel **subch = NULL; 4953 struct hn_tx_ring *txr; 4954 int i, nsubch; 4955 4956 HN_LOCK_ASSERT(sc); 4957 4958 /* 4959 * Suspend TX. 4960 */ 4961 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4962 txr = &sc->hn_tx_ring[i]; 4963 4964 mtx_lock(&txr->hn_tx_lock); 4965 txr->hn_suspended = 1; 4966 mtx_unlock(&txr->hn_tx_lock); 4967 /* No one is able send more packets now. */ 4968 4969 /* 4970 * Wait for all pending sends to finish. 
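 * Completion is detected by polling hn_tx_ring_pending() with a
 * one-tick pause() between checks.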
4971 * 4972 * NOTE: 4973 * We will _not_ receive all pending send-done, if the 4974 * primary channel is revoked. 4975 */ 4976 while (hn_tx_ring_pending(txr) && 4977 !vmbus_chan_is_revoked(sc->hn_prichan)) 4978 pause("hnwtx", 1 /* 1 tick */); 4979 } 4980 4981 /* 4982 * Disable RX by clearing RX filter. 4983 */ 4984 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 4985 4986 /* 4987 * Give RNDIS enough time to flush all pending data packets. 4988 */ 4989 pause("waitrx", (200 * hz) / 1000); 4990 4991 /* 4992 * Drain RX/TX bufrings and interrupts. 4993 */ 4994 nsubch = sc->hn_rx_ring_inuse - 1; 4995 if (nsubch > 0) 4996 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4997 4998 if (subch != NULL) { 4999 for (i = 0; i < nsubch; ++i) 5000 hn_chan_drain(sc, subch[i]); 5001 } 5002 hn_chan_drain(sc, sc->hn_prichan); 5003 5004 if (subch != NULL) 5005 vmbus_subchan_rel(subch, nsubch); 5006 5007 /* 5008 * Drain any pending TX tasks. 5009 * 5010 * NOTE: 5011 * The above hn_chan_drain() can dispatch TX tasks, so the TX 5012 * tasks will have to be drained _after_ the above hn_chan_drain() 5013 * calls. 5014 */ 5015 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5016 txr = &sc->hn_tx_ring[i]; 5017 5018 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 5019 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 5020 } 5021 } 5022 5023 static void 5024 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 5025 { 5026 5027 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 5028 } 5029 5030 static void 5031 hn_suspend_mgmt(struct hn_softc *sc) 5032 { 5033 struct task task; 5034 5035 HN_LOCK_ASSERT(sc); 5036 5037 /* 5038 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 5039 * through hn_mgmt_taskq. 5040 */ 5041 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 5042 vmbus_chan_run_task(sc->hn_prichan, &task); 5043 5044 /* 5045 * Make sure that all pending management tasks are completed. 5046 */ 5047 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 5048 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 5049 taskqueue_drain_all(sc->hn_mgmt_taskq0); 5050 } 5051 5052 static void 5053 hn_suspend(struct hn_softc *sc) 5054 { 5055 5056 /* Disable polling. */ 5057 hn_polling(sc, 0); 5058 5059 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5060 (sc->hn_flags & HN_FLAG_VF)) 5061 hn_suspend_data(sc); 5062 hn_suspend_mgmt(sc); 5063 } 5064 5065 static void 5066 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 5067 { 5068 int i; 5069 5070 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 5071 ("invalid TX ring count %d", tx_ring_cnt)); 5072 5073 for (i = 0; i < tx_ring_cnt; ++i) { 5074 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5075 5076 mtx_lock(&txr->hn_tx_lock); 5077 txr->hn_suspended = 0; 5078 mtx_unlock(&txr->hn_tx_lock); 5079 } 5080 } 5081 5082 static void 5083 hn_resume_data(struct hn_softc *sc) 5084 { 5085 int i; 5086 5087 HN_LOCK_ASSERT(sc); 5088 5089 /* 5090 * Re-enable RX. 5091 */ 5092 hn_rxfilter_config(sc); 5093 5094 /* 5095 * Make sure to clear suspend status on "all" TX rings, 5096 * since hn_tx_ring_inuse can be changed after 5097 * hn_suspend_data(). 5098 */ 5099 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 5100 5101 #ifdef HN_IFSTART_SUPPORT 5102 if (!hn_use_if_start) 5103 #endif 5104 { 5105 /* 5106 * Flush unused drbrs, since hn_tx_ring_inuse may be 5107 * reduced. 5108 */ 5109 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 5110 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 5111 } 5112 5113 /* 5114 * Kick start TX. 
5115 */ 5116 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5117 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5118 5119 /* 5120 * Use txeof task, so that any pending oactive can be 5121 * cleared properly. 5122 */ 5123 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5124 } 5125 } 5126 5127 static void 5128 hn_resume_mgmt(struct hn_softc *sc) 5129 { 5130 5131 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 5132 5133 /* 5134 * Kick off network change detection, if it was pending. 5135 * If no network change was pending, start link status 5136 * checks, which is more lightweight than network change 5137 * detection. 5138 */ 5139 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 5140 hn_change_network(sc); 5141 else 5142 hn_update_link_status(sc); 5143 } 5144 5145 static void 5146 hn_resume(struct hn_softc *sc) 5147 { 5148 5149 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5150 (sc->hn_flags & HN_FLAG_VF)) 5151 hn_resume_data(sc); 5152 5153 /* 5154 * When the VF is activated, the synthetic interface is changed 5155 * to DOWN in hn_set_vf(). Here, if the VF is still active, we 5156 * don't call hn_resume_mgmt() until the VF is deactivated in 5157 * hn_set_vf(). 5158 */ 5159 if (!(sc->hn_flags & HN_FLAG_VF)) 5160 hn_resume_mgmt(sc); 5161 5162 /* 5163 * Re-enable polling if this interface is running and 5164 * the polling is requested. 5165 */ 5166 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 5167 hn_polling(sc, sc->hn_pollhz); 5168 } 5169 5170 static void 5171 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 5172 { 5173 const struct rndis_status_msg *msg; 5174 int ofs; 5175 5176 if (dlen < sizeof(*msg)) { 5177 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 5178 return; 5179 } 5180 msg = data; 5181 5182 switch (msg->rm_status) { 5183 case RNDIS_STATUS_MEDIA_CONNECT: 5184 case RNDIS_STATUS_MEDIA_DISCONNECT: 5185 hn_update_link_status(sc); 5186 break; 5187 5188 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 5189 /* Not really useful; ignore. 
*/ 5190 break; 5191 5192 case RNDIS_STATUS_NETWORK_CHANGE: 5193 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 5194 if (dlen < ofs + msg->rm_stbuflen || 5195 msg->rm_stbuflen < sizeof(uint32_t)) { 5196 if_printf(sc->hn_ifp, "network changed\n"); 5197 } else { 5198 uint32_t change; 5199 5200 memcpy(&change, ((const uint8_t *)msg) + ofs, 5201 sizeof(change)); 5202 if_printf(sc->hn_ifp, "network changed, change %u\n", 5203 change); 5204 } 5205 hn_change_network(sc); 5206 break; 5207 5208 default: 5209 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 5210 msg->rm_status); 5211 break; 5212 } 5213 } 5214 5215 static int 5216 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 5217 { 5218 const struct rndis_pktinfo *pi = info_data; 5219 uint32_t mask = 0; 5220 5221 while (info_dlen != 0) { 5222 const void *data; 5223 uint32_t dlen; 5224 5225 if (__predict_false(info_dlen < sizeof(*pi))) 5226 return (EINVAL); 5227 if (__predict_false(info_dlen < pi->rm_size)) 5228 return (EINVAL); 5229 info_dlen -= pi->rm_size; 5230 5231 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 5232 return (EINVAL); 5233 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 5234 return (EINVAL); 5235 dlen = pi->rm_size - pi->rm_pktinfooffset; 5236 data = pi->rm_data; 5237 5238 switch (pi->rm_type) { 5239 case NDIS_PKTINFO_TYPE_VLAN: 5240 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 5241 return (EINVAL); 5242 info->vlan_info = *((const uint32_t *)data); 5243 mask |= HN_RXINFO_VLAN; 5244 break; 5245 5246 case NDIS_PKTINFO_TYPE_CSUM: 5247 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 5248 return (EINVAL); 5249 info->csum_info = *((const uint32_t *)data); 5250 mask |= HN_RXINFO_CSUM; 5251 break; 5252 5253 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5254 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5255 return (EINVAL); 5256 info->hash_value = *((const uint32_t *)data); 5257 mask |= HN_RXINFO_HASHVAL; 5258 break; 5259 5260 case HN_NDIS_PKTINFO_TYPE_HASHINF: 5261 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 5262 return (EINVAL); 5263 info->hash_info = *((const uint32_t *)data); 5264 mask |= HN_RXINFO_HASHINF; 5265 break; 5266 5267 default: 5268 goto next; 5269 } 5270 5271 if (mask == HN_RXINFO_ALL) { 5272 /* All found; done */ 5273 break; 5274 } 5275 next: 5276 pi = (const struct rndis_pktinfo *) 5277 ((const uint8_t *)pi + pi->rm_size); 5278 } 5279 5280 /* 5281 * Final fixup. 5282 * - If there is no hash value, invalidate the hash info. 5283 */ 5284 if ((mask & HN_RXINFO_HASHVAL) == 0) 5285 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 5286 return (0); 5287 } 5288 5289 static __inline bool 5290 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 5291 { 5292 5293 if (off < check_off) { 5294 if (__predict_true(off + len <= check_off)) 5295 return (false); 5296 } else if (off > check_off) { 5297 if (__predict_true(check_off + check_len <= off)) 5298 return (false); 5299 } 5300 return (true); 5301 } 5302 5303 static void 5304 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5305 { 5306 const struct rndis_packet_msg *pkt; 5307 struct hn_rxinfo info; 5308 int data_off, pktinfo_off, data_len, pktinfo_len; 5309 5310 /* 5311 * Check length. 
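 * The message must be at least sizeof(*pkt) bytes, rm_len must not
 * exceed the received length, the data/OOB/pktinfo lengths must fit
 * inside rm_len, and an empty data section is rejected.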
5312 */ 5313 if (__predict_false(dlen < sizeof(*pkt))) { 5314 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 5315 return; 5316 } 5317 pkt = data; 5318 5319 if (__predict_false(dlen < pkt->rm_len)) { 5320 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 5321 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 5322 return; 5323 } 5324 if (__predict_false(pkt->rm_len < 5325 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 5326 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 5327 "msglen %u, data %u, oob %u, pktinfo %u\n", 5328 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 5329 pkt->rm_pktinfolen); 5330 return; 5331 } 5332 if (__predict_false(pkt->rm_datalen == 0)) { 5333 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 5334 return; 5335 } 5336 5337 /* 5338 * Check offests. 5339 */ 5340 #define IS_OFFSET_INVALID(ofs) \ 5341 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 5342 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 5343 5344 /* XXX Hyper-V does not meet data offset alignment requirement */ 5345 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 5346 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5347 "data offset %u\n", pkt->rm_dataoffset); 5348 return; 5349 } 5350 if (__predict_false(pkt->rm_oobdataoffset > 0 && 5351 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 5352 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5353 "oob offset %u\n", pkt->rm_oobdataoffset); 5354 return; 5355 } 5356 if (__predict_true(pkt->rm_pktinfooffset > 0) && 5357 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 5358 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5359 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 5360 return; 5361 } 5362 5363 #undef IS_OFFSET_INVALID 5364 5365 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 5366 data_len = pkt->rm_datalen; 5367 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 5368 pktinfo_len = pkt->rm_pktinfolen; 5369 5370 /* 5371 * Check OOB coverage. 5372 */ 5373 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5374 int oob_off, oob_len; 5375 5376 if_printf(rxr->hn_ifp, "got oobdata\n"); 5377 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5378 oob_len = pkt->rm_oobdatalen; 5379 5380 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5381 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5382 "oob overflow, msglen %u, oob abs %d len %d\n", 5383 pkt->rm_len, oob_off, oob_len); 5384 return; 5385 } 5386 5387 /* 5388 * Check against data. 5389 */ 5390 if (hn_rndis_check_overlap(oob_off, oob_len, 5391 data_off, data_len)) { 5392 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5393 "oob overlaps data, oob abs %d len %d, " 5394 "data abs %d len %d\n", 5395 oob_off, oob_len, data_off, data_len); 5396 return; 5397 } 5398 5399 /* 5400 * Check against pktinfo. 5401 */ 5402 if (pktinfo_len != 0 && 5403 hn_rndis_check_overlap(oob_off, oob_len, 5404 pktinfo_off, pktinfo_len)) { 5405 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5406 "oob overlaps pktinfo, oob abs %d len %d, " 5407 "pktinfo abs %d len %d\n", 5408 oob_off, oob_len, pktinfo_off, pktinfo_len); 5409 return; 5410 } 5411 } 5412 5413 /* 5414 * Check per-packet-info coverage and find useful per-packet-info. 
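 * The pktinfo area is bounds-checked against rm_len and checked for
 * overlap with the data section; hn_rndis_rxinfo() then walks it to
 * extract the VLAN, RX checksum and hash value/info elements.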
5415 */ 5416 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5417 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5418 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5419 if (__predict_true(pktinfo_len != 0)) { 5420 bool overlap; 5421 int error; 5422 5423 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5424 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5425 "pktinfo overflow, msglen %u, " 5426 "pktinfo abs %d len %d\n", 5427 pkt->rm_len, pktinfo_off, pktinfo_len); 5428 return; 5429 } 5430 5431 /* 5432 * Check packet info coverage. 5433 */ 5434 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5435 data_off, data_len); 5436 if (__predict_false(overlap)) { 5437 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5438 "pktinfo overlap data, pktinfo abs %d len %d, " 5439 "data abs %d len %d\n", 5440 pktinfo_off, pktinfo_len, data_off, data_len); 5441 return; 5442 } 5443 5444 /* 5445 * Find useful per-packet-info. 5446 */ 5447 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5448 pktinfo_len, &info); 5449 if (__predict_false(error)) { 5450 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5451 "pktinfo\n"); 5452 return; 5453 } 5454 } 5455 5456 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5457 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5458 "data overflow, msglen %u, data abs %d len %d\n", 5459 pkt->rm_len, data_off, data_len); 5460 return; 5461 } 5462 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5463 } 5464 5465 static __inline void 5466 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5467 { 5468 const struct rndis_msghdr *hdr; 5469 5470 if (__predict_false(dlen < sizeof(*hdr))) { 5471 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5472 return; 5473 } 5474 hdr = data; 5475 5476 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5477 /* Hot data path. */ 5478 hn_rndis_rx_data(rxr, data, dlen); 5479 /* Done! */ 5480 return; 5481 } 5482 5483 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5484 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5485 else 5486 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5487 } 5488 5489 static void 5490 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5491 { 5492 const struct hn_nvs_hdr *hdr; 5493 5494 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5495 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5496 return; 5497 } 5498 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5499 5500 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5501 /* Useless; ignore */ 5502 return; 5503 } 5504 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5505 } 5506 5507 static void 5508 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5509 const struct vmbus_chanpkt_hdr *pkt) 5510 { 5511 struct hn_nvs_sendctx *sndc; 5512 5513 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5514 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5515 VMBUS_CHANPKT_DATALEN(pkt)); 5516 /* 5517 * NOTE: 5518 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5519 * its callback. 
5520 */ 5521 } 5522 5523 static void 5524 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5525 const struct vmbus_chanpkt_hdr *pkthdr) 5526 { 5527 const struct vmbus_chanpkt_rxbuf *pkt; 5528 const struct hn_nvs_hdr *nvs_hdr; 5529 int count, i, hlen; 5530 5531 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5532 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5533 return; 5534 } 5535 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5536 5537 /* Make sure that this is a RNDIS message. */ 5538 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5539 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5540 nvs_hdr->nvs_type); 5541 return; 5542 } 5543 5544 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5545 if (__predict_false(hlen < sizeof(*pkt))) { 5546 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5547 return; 5548 } 5549 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5550 5551 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5552 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5553 pkt->cp_rxbuf_id); 5554 return; 5555 } 5556 5557 count = pkt->cp_rxbuf_cnt; 5558 if (__predict_false(hlen < 5559 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5560 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5561 return; 5562 } 5563 5564 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5565 for (i = 0; i < count; ++i) { 5566 int ofs, len; 5567 5568 ofs = pkt->cp_rxbuf[i].rb_ofs; 5569 len = pkt->cp_rxbuf[i].rb_len; 5570 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5571 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5572 "ofs %d, len %d\n", i, ofs, len); 5573 continue; 5574 } 5575 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5576 } 5577 5578 /* 5579 * Ack the consumed RXBUF associated w/ this channel packet, 5580 * so that this RXBUF can be recycled by the hypervisor. 5581 */ 5582 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5583 } 5584 5585 static void 5586 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5587 uint64_t tid) 5588 { 5589 struct hn_nvs_rndis_ack ack; 5590 int retries, error; 5591 5592 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5593 ack.nvs_status = HN_NVS_STATUS_OK; 5594 5595 retries = 0; 5596 again: 5597 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5598 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5599 if (__predict_false(error == EAGAIN)) { 5600 /* 5601 * NOTE: 5602 * This should _not_ happen in real world, since the 5603 * consumption of the TX bufring from the TX path is 5604 * controlled. 5605 */ 5606 if (rxr->hn_ack_failed == 0) 5607 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5608 rxr->hn_ack_failed++; 5609 retries++; 5610 if (retries < 10) { 5611 DELAY(100); 5612 goto again; 5613 } 5614 /* RXBUF leaks! */ 5615 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5616 } 5617 } 5618 5619 static void 5620 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5621 { 5622 struct hn_rx_ring *rxr = xrxr; 5623 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5624 5625 for (;;) { 5626 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5627 int error, pktlen; 5628 5629 pktlen = rxr->hn_pktbuf_len; 5630 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5631 if (__predict_false(error == ENOBUFS)) { 5632 void *nbuf; 5633 int nlen; 5634 5635 /* 5636 * Expand channel packet buffer. 5637 * 5638 * XXX 5639 * Use M_WAITOK here, since allocation failure 5640 * is fatal. 
5641 */ 5642 nlen = rxr->hn_pktbuf_len * 2; 5643 while (nlen < pktlen) 5644 nlen *= 2; 5645 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5646 5647 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5648 rxr->hn_pktbuf_len, nlen); 5649 5650 free(rxr->hn_pktbuf, M_DEVBUF); 5651 rxr->hn_pktbuf = nbuf; 5652 rxr->hn_pktbuf_len = nlen; 5653 /* Retry! */ 5654 continue; 5655 } else if (__predict_false(error == EAGAIN)) { 5656 /* No more channel packets; done! */ 5657 break; 5658 } 5659 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5660 5661 switch (pkt->cph_type) { 5662 case VMBUS_CHANPKT_TYPE_COMP: 5663 hn_nvs_handle_comp(sc, chan, pkt); 5664 break; 5665 5666 case VMBUS_CHANPKT_TYPE_RXBUF: 5667 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5668 break; 5669 5670 case VMBUS_CHANPKT_TYPE_INBAND: 5671 hn_nvs_handle_notify(sc, pkt); 5672 break; 5673 5674 default: 5675 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5676 pkt->cph_type); 5677 break; 5678 } 5679 } 5680 hn_chan_rollup(rxr, rxr->hn_txr); 5681 } 5682 5683 static void 5684 hn_tx_taskq_create(void *arg __unused) 5685 { 5686 int i; 5687 5688 /* 5689 * Fix the # of TX taskqueues. 5690 */ 5691 if (hn_tx_taskq_cnt <= 0) 5692 hn_tx_taskq_cnt = 1; 5693 else if (hn_tx_taskq_cnt > mp_ncpus) 5694 hn_tx_taskq_cnt = mp_ncpus; 5695 5696 /* 5697 * Fix the TX taskqueue mode. 5698 */ 5699 switch (hn_tx_taskq_mode) { 5700 case HN_TX_TASKQ_M_INDEP: 5701 case HN_TX_TASKQ_M_GLOBAL: 5702 case HN_TX_TASKQ_M_EVTTQ: 5703 break; 5704 default: 5705 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5706 break; 5707 } 5708 5709 if (vm_guest != VM_GUEST_HV) 5710 return; 5711 5712 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5713 return; 5714 5715 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5716 M_DEVBUF, M_WAITOK); 5717 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5718 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5719 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5720 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5721 "hn tx%d", i); 5722 } 5723 } 5724 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5725 hn_tx_taskq_create, NULL); 5726 5727 static void 5728 hn_tx_taskq_destroy(void *arg __unused) 5729 { 5730 5731 if (hn_tx_taskque != NULL) { 5732 int i; 5733 5734 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5735 taskqueue_free(hn_tx_taskque[i]); 5736 free(hn_tx_taskque, M_DEVBUF); 5737 } 5738 } 5739 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5740 hn_tx_taskq_destroy, NULL); 5741
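/*
 * Illustrative sketch (not the driver's actual code): the aggregate TX
 * statistics sysctls registered in hn_create_tx_data() pass the softc as
 * arg1 and an __offsetof(struct hn_tx_ring, ...) value as arg2, and share
 * a single handler.  Assuming the conventional SYSCTL_HANDLER_ARGS
 * signature, such an offset-based handler can sum the per-ring counter on
 * read and propagate a written value to every ring, roughly as follows.
 * The real hn_tx_stat_ulong_sysctl is defined earlier in this file and may
 * differ in detail.
 *
 *	static int
 *	example_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 *	{
 *		struct hn_softc *sc = arg1;
 *		u_long stat = 0;
 *		int error, i;
 *
 *		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 *			stat += *((u_long *)
 *			    ((uint8_t *)&sc->hn_tx_ring[i] + arg2));
 *
 *		error = sysctl_handle_long(oidp, &stat, 0, req);
 *		if (error || req->newptr == NULL)
 *			return (error);
 *
 *		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 *			*((u_long *)
 *			    ((uint8_t *)&sc->hn_tx_ring[i] + arg2)) = stat;
 *		return (0);
 *	}
 *
 * A read of e.g. dev.hn.0.no_txdescs then reports the sum across all TX
 * rings, while a write resets the counter on every ring.
 */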