1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/bus.h> 65 #include <sys/kernel.h> 66 #include <sys/limits.h> 67 #include <sys/malloc.h> 68 #include <sys/mbuf.h> 69 #include <sys/module.h> 70 #include <sys/queue.h> 71 #include <sys/lock.h> 72 #include <sys/smp.h> 73 #include <sys/socket.h> 74 #include <sys/sockio.h> 75 #include <sys/sx.h> 76 #include <sys/sysctl.h> 77 #include <sys/systm.h> 78 #include <sys/taskqueue.h> 79 #include <sys/buf_ring.h> 80 #include <sys/eventhandler.h> 81 82 #include <machine/atomic.h> 83 #include <machine/in_cksum.h> 84 85 #include <net/bpf.h> 86 #include <net/ethernet.h> 87 #include <net/if.h> 88 #include <net/if_dl.h> 89 #include <net/if_media.h> 90 #include <net/if_types.h> 91 #include <net/if_var.h> 92 #include <net/rndis.h> 93 #ifdef RSS 94 #include <net/rss_config.h> 95 #endif 96 97 #include <netinet/in_systm.h> 98 #include <netinet/in.h> 99 #include <netinet/ip.h> 100 #include <netinet/ip6.h> 101 #include <netinet/tcp.h> 102 #include <netinet/tcp_lro.h> 103 #include <netinet/udp.h> 104 105 #include <dev/hyperv/include/hyperv.h> 106 #include <dev/hyperv/include/hyperv_busdma.h> 107 #include <dev/hyperv/include/vmbus.h> 108 #include <dev/hyperv/include/vmbus_xact.h> 109 110 #include <dev/hyperv/netvsc/ndis.h> 111 #include <dev/hyperv/netvsc/if_hnreg.h> 112 #include <dev/hyperv/netvsc/if_hnvar.h> 113 #include <dev/hyperv/netvsc/hn_nvs.h> 114 #include <dev/hyperv/netvsc/hn_rndis.h> 115 116 #include "vmbus_if.h" 117 118 #define HN_IFSTART_SUPPORT 119 120 #define HN_RING_CNT_DEF_MAX 8 121 122 /* YYY should get it from the underlying channel */ 123 #define HN_TX_DESC_CNT 512 124 125 #define HN_RNDIS_PKT_LEN \ 126 (sizeof(struct rndis_packet_msg) + \ 127 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 128 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 129 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 130 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 131 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 132 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 133 134 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 135 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 136 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 137 /* -1 for RNDIS packet message */ 138 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 139 140 #define HN_DIRECT_TX_SIZE_DEF 128 141 142 #define HN_EARLY_TXEOF_THRESH 8 143 144 #define HN_PKTBUF_LEN_DEF (16 * 1024) 145 146 #define HN_LROENT_CNT_DEF 128 147 148 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 149 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 150 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 151 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 152 153 #define HN_LRO_ACKCNT_DEF 1 154 155 #define HN_LOCK_INIT(sc) \ 156 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 157 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 158 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 159 #define HN_LOCK(sc) \ 160 do { \ 161 while (sx_try_xlock(&(sc)->hn_lock) == 0) \ 162 DELAY(1000); \ 163 } while (0) 164 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 165 166 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 167 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 168 #define HN_CSUM_IP_HWASSIST(sc) \ 169 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 170 #define HN_CSUM_IP6_HWASSIST(sc) \ 171 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 172 173 #define HN_PKTSIZE_MIN(align) \ 174 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 175 HN_RNDIS_PKT_LEN, (align)) 176 #define HN_PKTSIZE(m, align) \ 177 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 178 179 #ifdef RSS 180 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 181 #else 182 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 183 #endif 184 185 struct hn_txdesc { 186 #ifndef HN_USE_TXDESC_BUFRING 187 SLIST_ENTRY(hn_txdesc) link; 188 #endif 189 STAILQ_ENTRY(hn_txdesc) agg_link; 190 191 /* Aggregated txdescs, in sending order. */ 192 STAILQ_HEAD(, hn_txdesc) agg_list; 193 194 /* The oldest packet, if transmission aggregation happens. */ 195 struct mbuf *m; 196 struct hn_tx_ring *txr; 197 int refs; 198 uint32_t flags; /* HN_TXD_FLAG_ */ 199 struct hn_nvs_sendctx send_ctx; 200 uint32_t chim_index; 201 int chim_size; 202 203 bus_dmamap_t data_dmap; 204 205 bus_addr_t rndis_pkt_paddr; 206 struct rndis_packet_msg *rndis_pkt; 207 bus_dmamap_t rndis_pkt_dmap; 208 }; 209 210 #define HN_TXD_FLAG_ONLIST 0x0001 211 #define HN_TXD_FLAG_DMAMAP 0x0002 212 #define HN_TXD_FLAG_ONAGG 0x0004 213 214 struct hn_rxinfo { 215 uint32_t vlan_info; 216 uint32_t csum_info; 217 uint32_t hash_info; 218 uint32_t hash_value; 219 }; 220 221 struct hn_update_vf { 222 struct hn_rx_ring *rxr; 223 struct ifnet *vf; 224 }; 225 226 #define HN_RXINFO_VLAN 0x0001 227 #define HN_RXINFO_CSUM 0x0002 228 #define HN_RXINFO_HASHINF 0x0004 229 #define HN_RXINFO_HASHVAL 0x0008 230 #define HN_RXINFO_ALL \ 231 (HN_RXINFO_VLAN | \ 232 HN_RXINFO_CSUM | \ 233 HN_RXINFO_HASHINF | \ 234 HN_RXINFO_HASHVAL) 235 236 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff 237 #define HN_NDIS_RXCSUM_INFO_INVALID 0 238 #define HN_NDIS_HASH_INFO_INVALID 0 239 240 static int hn_probe(device_t); 241 static int hn_attach(device_t); 242 static int hn_detach(device_t); 243 static int hn_shutdown(device_t); 244 static void hn_chan_callback(struct vmbus_channel *, 245 void *); 246 247 static void hn_init(void *); 248 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 249 #ifdef HN_IFSTART_SUPPORT 250 static void hn_start(struct ifnet *); 251 #endif 252 static int hn_transmit(struct ifnet *, struct mbuf *); 253 static void hn_xmit_qflush(struct ifnet *); 254 static int hn_ifmedia_upd(struct ifnet *); 255 static void hn_ifmedia_sts(struct ifnet *, 256 struct ifmediareq *); 257 258 static int hn_rndis_rxinfo(const void *, int, 259 struct hn_rxinfo *); 260 static void hn_rndis_rx_data(struct hn_rx_ring *, 261 const void *, int); 262 static void hn_rndis_rx_status(struct hn_softc *, 263 const void *, int); 264 265 static void hn_nvs_handle_notify(struct hn_softc *, 266 const struct 
vmbus_chanpkt_hdr *); 267 static void hn_nvs_handle_comp(struct hn_softc *, 268 struct vmbus_channel *, 269 const struct vmbus_chanpkt_hdr *); 270 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 271 struct vmbus_channel *, 272 const struct vmbus_chanpkt_hdr *); 273 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 274 struct vmbus_channel *, uint64_t); 275 276 #if __FreeBSD_version >= 1100099 277 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 278 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 279 #endif 280 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 281 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 282 #if __FreeBSD_version < 1100095 283 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 284 #else 285 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 286 #endif 287 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 288 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 289 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 290 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 291 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 292 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 293 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 294 #ifndef RSS 295 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 296 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 297 #endif 298 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 299 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 300 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 301 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 302 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 303 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 304 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 305 306 static void hn_stop(struct hn_softc *, bool); 307 static void hn_init_locked(struct hn_softc *); 308 static int hn_chan_attach(struct hn_softc *, 309 struct vmbus_channel *); 310 static void hn_chan_detach(struct hn_softc *, 311 struct vmbus_channel *); 312 static int hn_attach_subchans(struct hn_softc *); 313 static void hn_detach_allchans(struct hn_softc *); 314 static void hn_chan_rollup(struct hn_rx_ring *, 315 struct hn_tx_ring *); 316 static void hn_set_ring_inuse(struct hn_softc *, int); 317 static int hn_synth_attach(struct hn_softc *, int); 318 static void hn_synth_detach(struct hn_softc *); 319 static int hn_synth_alloc_subchans(struct hn_softc *, 320 int *); 321 static bool hn_synth_attachable(const struct hn_softc *); 322 static void hn_suspend(struct hn_softc *); 323 static void hn_suspend_data(struct hn_softc *); 324 static void hn_suspend_mgmt(struct hn_softc *); 325 static void hn_resume(struct hn_softc *); 326 static void hn_resume_data(struct hn_softc *); 327 static void hn_resume_mgmt(struct hn_softc *); 328 static void hn_suspend_mgmt_taskfunc(void *, int); 329 static void hn_chan_drain(struct hn_softc *, 330 struct vmbus_channel *); 331 static void hn_polling(struct hn_softc *, u_int); 332 static void hn_chan_polling(struct vmbus_channel *, u_int); 333 334 static void hn_update_link_status(struct hn_softc *); 335 static void hn_change_network(struct hn_softc *); 336 static void hn_link_taskfunc(void *, int); 337 static void hn_netchg_init_taskfunc(void *, int); 338 static void hn_netchg_status_taskfunc(void *, int); 339 static void hn_link_status(struct hn_softc *); 340 341 static int hn_create_rx_data(struct hn_softc *, int); 342 static void hn_destroy_rx_data(struct hn_softc *); 343 static int hn_check_iplen(const struct mbuf *, int); 
344 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 345 static int hn_rxfilter_config(struct hn_softc *); 346 #ifndef RSS 347 static int hn_rss_reconfig(struct hn_softc *); 348 #endif 349 static void hn_rss_ind_fixup(struct hn_softc *); 350 static int hn_rxpkt(struct hn_rx_ring *, const void *, 351 int, const struct hn_rxinfo *); 352 353 static int hn_tx_ring_create(struct hn_softc *, int); 354 static void hn_tx_ring_destroy(struct hn_tx_ring *); 355 static int hn_create_tx_data(struct hn_softc *, int); 356 static void hn_fixup_tx_data(struct hn_softc *); 357 static void hn_destroy_tx_data(struct hn_softc *); 358 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 359 static void hn_txdesc_gc(struct hn_tx_ring *, 360 struct hn_txdesc *); 361 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 362 struct hn_txdesc *, struct mbuf **); 363 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 364 struct hn_txdesc *); 365 static void hn_set_chim_size(struct hn_softc *, int); 366 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 367 static bool hn_tx_ring_pending(struct hn_tx_ring *); 368 static void hn_tx_ring_qflush(struct hn_tx_ring *); 369 static void hn_resume_tx(struct hn_softc *, int); 370 static void hn_set_txagg(struct hn_softc *); 371 static void *hn_try_txagg(struct ifnet *, 372 struct hn_tx_ring *, struct hn_txdesc *, 373 int); 374 static int hn_get_txswq_depth(const struct hn_tx_ring *); 375 static void hn_txpkt_done(struct hn_nvs_sendctx *, 376 struct hn_softc *, struct vmbus_channel *, 377 const void *, int); 378 static int hn_txpkt_sglist(struct hn_tx_ring *, 379 struct hn_txdesc *); 380 static int hn_txpkt_chim(struct hn_tx_ring *, 381 struct hn_txdesc *); 382 static int hn_xmit(struct hn_tx_ring *, int); 383 static void hn_xmit_taskfunc(void *, int); 384 static void hn_xmit_txeof(struct hn_tx_ring *); 385 static void hn_xmit_txeof_taskfunc(void *, int); 386 #ifdef HN_IFSTART_SUPPORT 387 static int hn_start_locked(struct hn_tx_ring *, int); 388 static void hn_start_taskfunc(void *, int); 389 static void hn_start_txeof(struct hn_tx_ring *); 390 static void hn_start_txeof_taskfunc(void *, int); 391 #endif 392 393 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 394 "Hyper-V network interface"); 395 396 /* Trust tcp segements verification on host side. */ 397 static int hn_trust_hosttcp = 1; 398 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 399 &hn_trust_hosttcp, 0, 400 "Trust tcp segement verification on host side, " 401 "when csum info is missing (global setting)"); 402 403 /* Trust udp datagrams verification on host side. */ 404 static int hn_trust_hostudp = 1; 405 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 406 &hn_trust_hostudp, 0, 407 "Trust udp datagram verification on host side, " 408 "when csum info is missing (global setting)"); 409 410 /* Trust ip packets verification on host side. 
*/ 411 static int hn_trust_hostip = 1; 412 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 413 &hn_trust_hostip, 0, 414 "Trust ip packet verification on host side, " 415 "when csum info is missing (global setting)"); 416 417 /* Limit TSO burst size */ 418 static int hn_tso_maxlen = IP_MAXPACKET; 419 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 420 &hn_tso_maxlen, 0, "TSO burst limit"); 421 422 /* Limit chimney send size */ 423 static int hn_tx_chimney_size = 0; 424 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 425 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 426 427 /* Limit the size of packet for direct transmission */ 428 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 429 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 430 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 431 432 /* # of LRO entries per RX ring */ 433 #if defined(INET) || defined(INET6) 434 #if __FreeBSD_version >= 1100095 435 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 436 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 437 &hn_lro_entry_count, 0, "LRO entry count"); 438 #endif 439 #endif 440 441 static int hn_tx_taskq_cnt = 1; 442 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 443 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 444 445 #define HN_TX_TASKQ_M_INDEP 0 446 #define HN_TX_TASKQ_M_GLOBAL 1 447 #define HN_TX_TASKQ_M_EVTTQ 2 448 449 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 450 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 451 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 452 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 453 454 #ifndef HN_USE_TXDESC_BUFRING 455 static int hn_use_txdesc_bufring = 0; 456 #else 457 static int hn_use_txdesc_bufring = 1; 458 #endif 459 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 460 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 461 462 #ifdef HN_IFSTART_SUPPORT 463 /* Use ifnet.if_start instead of ifnet.if_transmit */ 464 static int hn_use_if_start = 0; 465 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 466 &hn_use_if_start, 0, "Use if_start TX method"); 467 #endif 468 469 /* # of channels to use */ 470 static int hn_chan_cnt = 0; 471 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 472 &hn_chan_cnt, 0, 473 "# of channels to use; each channel has one RX ring and one TX ring"); 474 475 /* # of transmit rings to use */ 476 static int hn_tx_ring_cnt = 0; 477 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 478 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 479 480 /* Software TX ring deptch */ 481 static int hn_tx_swq_depth = 0; 482 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 483 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 484 485 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 486 #if __FreeBSD_version >= 1100095 487 static u_int hn_lro_mbufq_depth = 0; 488 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 489 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 490 #endif 491 492 /* Packet transmission aggregation size limit */ 493 static int hn_tx_agg_size = -1; 494 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 495 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 496 497 /* Packet transmission aggregation count limit */ 498 static int hn_tx_agg_pkts = -1; 499 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 500 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 
501 502 static u_int hn_cpu_index; /* next CPU for channel */ 503 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 504 505 #ifndef RSS 506 static const uint8_t 507 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 508 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 509 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 510 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 511 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 512 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 513 }; 514 #endif /* !RSS */ 515 516 static device_method_t hn_methods[] = { 517 /* Device interface */ 518 DEVMETHOD(device_probe, hn_probe), 519 DEVMETHOD(device_attach, hn_attach), 520 DEVMETHOD(device_detach, hn_detach), 521 DEVMETHOD(device_shutdown, hn_shutdown), 522 DEVMETHOD_END 523 }; 524 525 static driver_t hn_driver = { 526 "hn", 527 hn_methods, 528 sizeof(struct hn_softc) 529 }; 530 531 static devclass_t hn_devclass; 532 533 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); 534 MODULE_VERSION(hn, 1); 535 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 536 537 #if __FreeBSD_version >= 1100099 538 static void 539 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 540 { 541 int i; 542 543 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 544 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 545 } 546 #endif 547 548 static int 549 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 550 { 551 552 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 553 txd->chim_size == 0, ("invalid rndis sglist txd")); 554 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 555 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 556 } 557 558 static int 559 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 560 { 561 struct hn_nvs_rndis rndis; 562 563 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 564 txd->chim_size > 0, ("invalid rndis chim txd")); 565 566 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 567 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 568 rndis.nvs_chim_idx = txd->chim_index; 569 rndis.nvs_chim_sz = txd->chim_size; 570 571 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 572 &rndis, sizeof(rndis), &txd->send_ctx)); 573 } 574 575 static __inline uint32_t 576 hn_chim_alloc(struct hn_softc *sc) 577 { 578 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 579 u_long *bmap = sc->hn_chim_bmap; 580 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 581 582 for (i = 0; i < bmap_cnt; ++i) { 583 int idx; 584 585 idx = ffsl(~bmap[i]); 586 if (idx == 0) 587 continue; 588 589 --idx; /* ffsl is 1-based */ 590 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 591 ("invalid i %d and idx %d", i, idx)); 592 593 if (atomic_testandset_long(&bmap[i], idx)) 594 continue; 595 596 ret = i * LONG_BIT + idx; 597 break; 598 } 599 return (ret); 600 } 601 602 static __inline void 603 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 604 { 605 u_long mask; 606 uint32_t idx; 607 608 idx = chim_idx / LONG_BIT; 609 KASSERT(idx < sc->hn_chim_bmap_cnt, 610 ("invalid chimney index 0x%x", chim_idx)); 611 612 mask = 1UL << (chim_idx % LONG_BIT); 613 KASSERT(sc->hn_chim_bmap[idx] & mask, 614 ("index bitmap 0x%lx, chimney index %u, " 615 "bitmap idx %d, bitmask 0x%lx", 616 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 617 618 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 619 } 620 621 #if defined(INET6) || defined(INET) 622 /* 623 * NOTE: If this function failed, the m_head would be freed. 
624 */ 625 static __inline struct mbuf * 626 hn_tso_fixup(struct mbuf *m_head) 627 { 628 struct ether_vlan_header *evl; 629 struct tcphdr *th; 630 int ehlen; 631 632 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 633 634 #define PULLUP_HDR(m, len) \ 635 do { \ 636 if (__predict_false((m)->m_len < (len))) { \ 637 (m) = m_pullup((m), (len)); \ 638 if ((m) == NULL) \ 639 return (NULL); \ 640 } \ 641 } while (0) 642 643 PULLUP_HDR(m_head, sizeof(*evl)); 644 evl = mtod(m_head, struct ether_vlan_header *); 645 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 646 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 647 else 648 ehlen = ETHER_HDR_LEN; 649 650 #ifdef INET 651 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 652 struct ip *ip; 653 int iphlen; 654 655 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 656 ip = mtodo(m_head, ehlen); 657 iphlen = ip->ip_hl << 2; 658 659 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 660 th = mtodo(m_head, ehlen + iphlen); 661 662 ip->ip_len = 0; 663 ip->ip_sum = 0; 664 th->th_sum = in_pseudo(ip->ip_src.s_addr, 665 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 666 } 667 #endif 668 #if defined(INET6) && defined(INET) 669 else 670 #endif 671 #ifdef INET6 672 { 673 struct ip6_hdr *ip6; 674 675 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 676 ip6 = mtodo(m_head, ehlen); 677 if (ip6->ip6_nxt != IPPROTO_TCP) { 678 m_freem(m_head); 679 return (NULL); 680 } 681 682 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 683 th = mtodo(m_head, ehlen + sizeof(*ip6)); 684 685 ip6->ip6_plen = 0; 686 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 687 } 688 #endif 689 return (m_head); 690 691 #undef PULLUP_HDR 692 } 693 #endif /* INET6 || INET */ 694 695 static int 696 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 697 { 698 int error = 0; 699 700 HN_LOCK_ASSERT(sc); 701 702 if (sc->hn_rx_filter != filter) { 703 error = hn_rndis_set_rxfilter(sc, filter); 704 if (!error) 705 sc->hn_rx_filter = filter; 706 } 707 return (error); 708 } 709 710 static int 711 hn_rxfilter_config(struct hn_softc *sc) 712 { 713 struct ifnet *ifp = sc->hn_ifp; 714 uint32_t filter; 715 716 HN_LOCK_ASSERT(sc); 717 718 if ((ifp->if_flags & IFF_PROMISC) || 719 (sc->hn_flags & HN_FLAG_VF)) { 720 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 721 } else { 722 filter = NDIS_PACKET_TYPE_DIRECTED; 723 if (ifp->if_flags & IFF_BROADCAST) 724 filter |= NDIS_PACKET_TYPE_BROADCAST; 725 /* TODO: support multicast list */ 726 if ((ifp->if_flags & IFF_ALLMULTI) || 727 !TAILQ_EMPTY(&ifp->if_multiaddrs)) 728 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 729 } 730 return (hn_set_rxfilter(sc, filter)); 731 } 732 733 static void 734 hn_set_txagg(struct hn_softc *sc) 735 { 736 uint32_t size, pkts; 737 int i; 738 739 /* 740 * Setup aggregation size. 741 */ 742 if (sc->hn_agg_size < 0) 743 size = UINT32_MAX; 744 else 745 size = sc->hn_agg_size; 746 747 if (sc->hn_rndis_agg_size < size) 748 size = sc->hn_rndis_agg_size; 749 750 /* NOTE: We only aggregate packets using chimney sending buffers. */ 751 if (size > (uint32_t)sc->hn_chim_szmax) 752 size = sc->hn_chim_szmax; 753 754 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 755 /* Disable */ 756 size = 0; 757 pkts = 0; 758 goto done; 759 } 760 761 /* NOTE: Type of the per TX ring setting is 'int'. */ 762 if (size > INT_MAX) 763 size = INT_MAX; 764 765 /* 766 * Setup aggregation packet count. 
767 */ 768 if (sc->hn_agg_pkts < 0) 769 pkts = UINT32_MAX; 770 else 771 pkts = sc->hn_agg_pkts; 772 773 if (sc->hn_rndis_agg_pkts < pkts) 774 pkts = sc->hn_rndis_agg_pkts; 775 776 if (pkts <= 1) { 777 /* Disable */ 778 size = 0; 779 pkts = 0; 780 goto done; 781 } 782 783 /* NOTE: Type of the per TX ring setting is 'short'. */ 784 if (pkts > SHRT_MAX) 785 pkts = SHRT_MAX; 786 787 done: 788 /* NOTE: Type of the per TX ring setting is 'short'. */ 789 if (sc->hn_rndis_agg_align > SHRT_MAX) { 790 /* Disable */ 791 size = 0; 792 pkts = 0; 793 } 794 795 if (bootverbose) { 796 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 797 size, pkts, sc->hn_rndis_agg_align); 798 } 799 800 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 801 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 802 803 mtx_lock(&txr->hn_tx_lock); 804 txr->hn_agg_szmax = size; 805 txr->hn_agg_pktmax = pkts; 806 txr->hn_agg_align = sc->hn_rndis_agg_align; 807 mtx_unlock(&txr->hn_tx_lock); 808 } 809 } 810 811 static int 812 hn_get_txswq_depth(const struct hn_tx_ring *txr) 813 { 814 815 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 816 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 817 return txr->hn_txdesc_cnt; 818 return hn_tx_swq_depth; 819 } 820 821 #ifndef RSS 822 static int 823 hn_rss_reconfig(struct hn_softc *sc) 824 { 825 int error; 826 827 HN_LOCK_ASSERT(sc); 828 829 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 830 return (ENXIO); 831 832 /* 833 * Disable RSS first. 834 * 835 * NOTE: 836 * Direct reconfiguration by setting the UNCHG flags does 837 * _not_ work properly. 838 */ 839 if (bootverbose) 840 if_printf(sc->hn_ifp, "disable RSS\n"); 841 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 842 if (error) { 843 if_printf(sc->hn_ifp, "RSS disable failed\n"); 844 return (error); 845 } 846 847 /* 848 * Reenable the RSS w/ the updated RSS key or indirect 849 * table. 850 */ 851 if (bootverbose) 852 if_printf(sc->hn_ifp, "reconfig RSS\n"); 853 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 854 if (error) { 855 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 856 return (error); 857 } 858 return (0); 859 } 860 #endif /* !RSS */ 861 862 static void 863 hn_rss_ind_fixup(struct hn_softc *sc) 864 { 865 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 866 int i, nchan; 867 868 nchan = sc->hn_rx_ring_inuse; 869 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 870 871 /* 872 * Check indirect table to make sure that all channels in it 873 * can be used. 
874 */ 875 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 876 if (rss->rss_ind[i] >= nchan) { 877 if_printf(sc->hn_ifp, 878 "RSS indirect table %d fixup: %u -> %d\n", 879 i, rss->rss_ind[i], nchan - 1); 880 rss->rss_ind[i] = nchan - 1; 881 } 882 } 883 } 884 885 static int 886 hn_ifmedia_upd(struct ifnet *ifp __unused) 887 { 888 889 return EOPNOTSUPP; 890 } 891 892 static void 893 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 894 { 895 struct hn_softc *sc = ifp->if_softc; 896 897 ifmr->ifm_status = IFM_AVALID; 898 ifmr->ifm_active = IFM_ETHER; 899 900 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 901 ifmr->ifm_active |= IFM_NONE; 902 return; 903 } 904 ifmr->ifm_status |= IFM_ACTIVE; 905 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 906 } 907 908 static void 909 hn_update_vf_task(void *arg, int pending __unused) 910 { 911 struct hn_update_vf *uv = arg; 912 913 uv->rxr->hn_vf = uv->vf; 914 } 915 916 static void 917 hn_update_vf(struct hn_softc *sc, struct ifnet *vf) 918 { 919 struct hn_rx_ring *rxr; 920 struct hn_update_vf uv; 921 struct task task; 922 int i; 923 924 HN_LOCK_ASSERT(sc); 925 926 TASK_INIT(&task, 0, hn_update_vf_task, &uv); 927 928 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 929 rxr = &sc->hn_rx_ring[i]; 930 931 if (i < sc->hn_rx_ring_inuse) { 932 uv.rxr = rxr; 933 uv.vf = vf; 934 vmbus_chan_run_task(rxr->hn_chan, &task); 935 } else { 936 rxr->hn_vf = vf; 937 } 938 } 939 } 940 941 static void 942 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf) 943 { 944 struct ifnet *hn_ifp; 945 946 HN_LOCK(sc); 947 948 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 949 goto out; 950 951 hn_ifp = sc->hn_ifp; 952 953 if (ifp == hn_ifp) 954 goto out; 955 956 if (ifp->if_alloctype != IFT_ETHER) 957 goto out; 958 959 /* Ignore lagg/vlan interfaces */ 960 if (strcmp(ifp->if_dname, "lagg") == 0 || 961 strcmp(ifp->if_dname, "vlan") == 0) 962 goto out; 963 964 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 965 goto out; 966 967 /* Now we're sure 'ifp' is a real VF device. */ 968 if (vf) { 969 if (sc->hn_flags & HN_FLAG_VF) 970 goto out; 971 972 sc->hn_flags |= HN_FLAG_VF; 973 hn_rxfilter_config(sc); 974 } else { 975 if (!(sc->hn_flags & HN_FLAG_VF)) 976 goto out; 977 978 sc->hn_flags &= ~HN_FLAG_VF; 979 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 980 hn_rxfilter_config(sc); 981 else 982 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 983 } 984 985 hn_nvs_set_datapath(sc, 986 vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC); 987 988 hn_update_vf(sc, vf ? ifp : NULL); 989 990 if (vf) { 991 hn_suspend_mgmt(sc); 992 sc->hn_link_flags &= 993 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 994 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 995 } else { 996 hn_resume_mgmt(sc); 997 } 998 999 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1000 vf ? "VF_UP" : "VF_DOWN", NULL); 1001 1002 if (bootverbose) 1003 if_printf(hn_ifp, "Data path is switched %s %s\n", 1004 vf ? 
"to" : "from", if_name(ifp)); 1005 out: 1006 HN_UNLOCK(sc); 1007 } 1008 1009 static void 1010 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1011 { 1012 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1013 return; 1014 1015 hn_set_vf(arg, ifp, event == IFNET_EVENT_UP); 1016 } 1017 1018 static void 1019 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1020 { 1021 hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP); 1022 } 1023 1024 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ 1025 static const struct hyperv_guid g_net_vsc_device_type = { 1026 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 1027 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} 1028 }; 1029 1030 static int 1031 hn_probe(device_t dev) 1032 { 1033 1034 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, 1035 &g_net_vsc_device_type) == 0) { 1036 device_set_desc(dev, "Hyper-V Network Interface"); 1037 return BUS_PROBE_DEFAULT; 1038 } 1039 return ENXIO; 1040 } 1041 1042 static int 1043 hn_attach(device_t dev) 1044 { 1045 struct hn_softc *sc = device_get_softc(dev); 1046 struct sysctl_oid_list *child; 1047 struct sysctl_ctx_list *ctx; 1048 uint8_t eaddr[ETHER_ADDR_LEN]; 1049 struct ifnet *ifp = NULL; 1050 int error, ring_cnt, tx_ring_cnt; 1051 1052 sc->hn_dev = dev; 1053 sc->hn_prichan = vmbus_get_channel(dev); 1054 HN_LOCK_INIT(sc); 1055 1056 /* 1057 * Initialize these tunables once. 1058 */ 1059 sc->hn_agg_size = hn_tx_agg_size; 1060 sc->hn_agg_pkts = hn_tx_agg_pkts; 1061 1062 /* 1063 * Setup taskqueue for transmission. 1064 */ 1065 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 1066 int i; 1067 1068 sc->hn_tx_taskqs = 1069 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 1070 M_DEVBUF, M_WAITOK); 1071 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 1072 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 1073 M_WAITOK, taskqueue_thread_enqueue, 1074 &sc->hn_tx_taskqs[i]); 1075 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 1076 "%s tx%d", device_get_nameunit(dev), i); 1077 } 1078 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 1079 sc->hn_tx_taskqs = hn_tx_taskque; 1080 } 1081 1082 /* 1083 * Setup taskqueue for mangement tasks, e.g. link status. 1084 */ 1085 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 1086 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 1087 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 1088 device_get_nameunit(dev)); 1089 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 1090 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 1091 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 1092 hn_netchg_status_taskfunc, sc); 1093 1094 /* 1095 * Allocate ifnet and setup its name earlier, so that if_printf 1096 * can be used by functions, which will be called after 1097 * ether_ifattach(). 1098 */ 1099 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 1100 ifp->if_softc = sc; 1101 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 1102 1103 /* 1104 * Initialize ifmedia earlier so that it can be unconditionally 1105 * destroyed, if error happened later on. 1106 */ 1107 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 1108 1109 /* 1110 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 1111 * to use (tx_ring_cnt). 1112 * 1113 * NOTE: 1114 * The # of RX rings to use is same as the # of channels to use. 
1115 */ 1116 ring_cnt = hn_chan_cnt; 1117 if (ring_cnt <= 0) { 1118 /* Default */ 1119 ring_cnt = mp_ncpus; 1120 if (ring_cnt > HN_RING_CNT_DEF_MAX) 1121 ring_cnt = HN_RING_CNT_DEF_MAX; 1122 } else if (ring_cnt > mp_ncpus) { 1123 ring_cnt = mp_ncpus; 1124 } 1125 #ifdef RSS 1126 if (ring_cnt > rss_getnumbuckets()) 1127 ring_cnt = rss_getnumbuckets(); 1128 #endif 1129 1130 tx_ring_cnt = hn_tx_ring_cnt; 1131 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 1132 tx_ring_cnt = ring_cnt; 1133 #ifdef HN_IFSTART_SUPPORT 1134 if (hn_use_if_start) { 1135 /* ifnet.if_start only needs one TX ring. */ 1136 tx_ring_cnt = 1; 1137 } 1138 #endif 1139 1140 /* 1141 * Set the leader CPU for channels. 1142 */ 1143 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 1144 1145 /* 1146 * Create enough TX/RX rings, even if only limited number of 1147 * channels can be allocated. 1148 */ 1149 error = hn_create_tx_data(sc, tx_ring_cnt); 1150 if (error) 1151 goto failed; 1152 error = hn_create_rx_data(sc, ring_cnt); 1153 if (error) 1154 goto failed; 1155 1156 /* 1157 * Create transaction context for NVS and RNDIS transactions. 1158 */ 1159 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 1160 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 1161 if (sc->hn_xact == NULL) { 1162 error = ENXIO; 1163 goto failed; 1164 } 1165 1166 /* 1167 * Install orphan handler for the revocation of this device's 1168 * primary channel. 1169 * 1170 * NOTE: 1171 * The processing order is critical here: 1172 * Install the orphan handler, _before_ testing whether this 1173 * device's primary channel has been revoked or not. 1174 */ 1175 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 1176 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 1177 error = ENXIO; 1178 goto failed; 1179 } 1180 1181 /* 1182 * Attach the synthetic parts, i.e. NVS and RNDIS. 1183 */ 1184 error = hn_synth_attach(sc, ETHERMTU); 1185 if (error) 1186 goto failed; 1187 1188 error = hn_rndis_get_eaddr(sc, eaddr); 1189 if (error) 1190 goto failed; 1191 1192 #if __FreeBSD_version >= 1100099 1193 if (sc->hn_rx_ring_inuse > 1) { 1194 /* 1195 * Reduce TCP segment aggregation limit for multiple 1196 * RX rings to increase ACK timeliness. 1197 */ 1198 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 1199 } 1200 #endif 1201 1202 /* 1203 * Fixup TX stuffs after synthetic parts are attached. 
1204 */ 1205 hn_fixup_tx_data(sc); 1206 1207 ctx = device_get_sysctl_ctx(dev); 1208 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 1209 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 1210 &sc->hn_nvs_ver, 0, "NVS version"); 1211 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 1212 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1213 hn_ndis_version_sysctl, "A", "NDIS version"); 1214 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 1215 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1216 hn_caps_sysctl, "A", "capabilities"); 1217 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 1218 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1219 hn_hwassist_sysctl, "A", "hwassist"); 1220 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 1221 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1222 hn_rxfilter_sysctl, "A", "rxfilter"); 1223 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 1224 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1225 hn_rss_hash_sysctl, "A", "RSS hash"); 1226 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 1227 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 1228 #ifndef RSS 1229 /* 1230 * Don't allow RSS key/indirect table changes, if RSS is defined. 1231 */ 1232 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 1233 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1234 hn_rss_key_sysctl, "IU", "RSS key"); 1235 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 1236 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1237 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 1238 #endif 1239 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 1240 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 1241 "RNDIS offered packet transmission aggregation size limit"); 1242 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 1243 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 1244 "RNDIS offered packet transmission aggregation count limit"); 1245 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 1246 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 1247 "RNDIS packet transmission aggregation alignment"); 1248 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 1249 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1250 hn_txagg_size_sysctl, "I", 1251 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 1252 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 1253 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1254 hn_txagg_pkts_sysctl, "I", 1255 "Packet transmission aggregation packets, " 1256 "0 -- disable, -1 -- auto"); 1257 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 1258 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1259 hn_polling_sysctl, "I", 1260 "Polling frequency: [100,1000000], 0 disable polling"); 1261 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 1262 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1263 hn_vf_sysctl, "A", "Virtual Function's name"); 1264 1265 /* 1266 * Setup the ifmedia, which has been initialized earlier. 1267 */ 1268 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 1269 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 1270 /* XXX ifmedia_set really should do this for us */ 1271 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 1272 1273 /* 1274 * Setup the ifnet for this interface. 
1275 */ 1276 1277 ifp->if_baudrate = IF_Gbps(10); 1278 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 1279 ifp->if_ioctl = hn_ioctl; 1280 ifp->if_init = hn_init; 1281 #ifdef HN_IFSTART_SUPPORT 1282 if (hn_use_if_start) { 1283 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 1284 1285 ifp->if_start = hn_start; 1286 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 1287 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 1288 IFQ_SET_READY(&ifp->if_snd); 1289 } else 1290 #endif 1291 { 1292 ifp->if_transmit = hn_transmit; 1293 ifp->if_qflush = hn_xmit_qflush; 1294 } 1295 1296 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO; 1297 #ifdef foo 1298 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 1299 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 1300 #endif 1301 if (sc->hn_caps & HN_CAP_VLAN) { 1302 /* XXX not sure about VLAN_MTU. */ 1303 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 1304 } 1305 1306 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 1307 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 1308 ifp->if_capabilities |= IFCAP_TXCSUM; 1309 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 1310 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 1311 if (sc->hn_caps & HN_CAP_TSO4) { 1312 ifp->if_capabilities |= IFCAP_TSO4; 1313 ifp->if_hwassist |= CSUM_IP_TSO; 1314 } 1315 if (sc->hn_caps & HN_CAP_TSO6) { 1316 ifp->if_capabilities |= IFCAP_TSO6; 1317 ifp->if_hwassist |= CSUM_IP6_TSO; 1318 } 1319 1320 /* Enable all available capabilities by default. */ 1321 ifp->if_capenable = ifp->if_capabilities; 1322 1323 /* 1324 * Disable IPv6 TSO and TXCSUM by default, they still can 1325 * be enabled through SIOCSIFCAP. 1326 */ 1327 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 1328 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 1329 1330 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 1331 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 1332 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 1333 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 1334 } 1335 1336 ether_ifattach(ifp, eaddr); 1337 1338 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 1339 if_printf(ifp, "TSO segcnt %u segsz %u\n", 1340 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 1341 } 1342 1343 /* Inform the upper layer about the long frame support. */ 1344 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 1345 1346 /* 1347 * Kick off link status check. 1348 */ 1349 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 1350 hn_update_link_status(sc); 1351 1352 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 1353 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 1354 1355 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 1356 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 1357 1358 return (0); 1359 failed: 1360 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 1361 hn_synth_detach(sc); 1362 hn_detach(dev); 1363 return (error); 1364 } 1365 1366 static int 1367 hn_detach(device_t dev) 1368 { 1369 struct hn_softc *sc = device_get_softc(dev); 1370 struct ifnet *ifp = sc->hn_ifp; 1371 1372 if (sc->hn_ifaddr_evthand != NULL) 1373 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 1374 if (sc->hn_ifnet_evthand != NULL) 1375 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 1376 1377 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 1378 /* 1379 * In case that the vmbus missed the orphan handler 1380 * installation. 
1381 */ 1382 vmbus_xact_ctx_orphan(sc->hn_xact); 1383 } 1384 1385 if (device_is_attached(dev)) { 1386 HN_LOCK(sc); 1387 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 1388 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 1389 hn_stop(sc, true); 1390 /* 1391 * NOTE: 1392 * hn_stop() only suspends data, so managment 1393 * stuffs have to be suspended manually here. 1394 */ 1395 hn_suspend_mgmt(sc); 1396 hn_synth_detach(sc); 1397 } 1398 HN_UNLOCK(sc); 1399 ether_ifdetach(ifp); 1400 } 1401 1402 ifmedia_removeall(&sc->hn_media); 1403 hn_destroy_rx_data(sc); 1404 hn_destroy_tx_data(sc); 1405 1406 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 1407 int i; 1408 1409 for (i = 0; i < hn_tx_taskq_cnt; ++i) 1410 taskqueue_free(sc->hn_tx_taskqs[i]); 1411 free(sc->hn_tx_taskqs, M_DEVBUF); 1412 } 1413 taskqueue_free(sc->hn_mgmt_taskq0); 1414 1415 if (sc->hn_xact != NULL) { 1416 /* 1417 * Uninstall the orphan handler _before_ the xact is 1418 * destructed. 1419 */ 1420 vmbus_chan_unset_orphan(sc->hn_prichan); 1421 vmbus_xact_ctx_destroy(sc->hn_xact); 1422 } 1423 1424 if_free(ifp); 1425 1426 HN_LOCK_DESTROY(sc); 1427 return (0); 1428 } 1429 1430 static int 1431 hn_shutdown(device_t dev) 1432 { 1433 1434 return (0); 1435 } 1436 1437 static void 1438 hn_link_status(struct hn_softc *sc) 1439 { 1440 uint32_t link_status; 1441 int error; 1442 1443 error = hn_rndis_get_linkstatus(sc, &link_status); 1444 if (error) { 1445 /* XXX what to do? */ 1446 return; 1447 } 1448 1449 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 1450 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 1451 else 1452 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1453 if_link_state_change(sc->hn_ifp, 1454 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 1455 LINK_STATE_UP : LINK_STATE_DOWN); 1456 } 1457 1458 static void 1459 hn_link_taskfunc(void *xsc, int pending __unused) 1460 { 1461 struct hn_softc *sc = xsc; 1462 1463 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 1464 return; 1465 hn_link_status(sc); 1466 } 1467 1468 static void 1469 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 1470 { 1471 struct hn_softc *sc = xsc; 1472 1473 /* Prevent any link status checks from running. */ 1474 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 1475 1476 /* 1477 * Fake up a [link down --> link up] state change; 5 seconds 1478 * delay is used, which closely simulates miibus reaction 1479 * upon link down event. 1480 */ 1481 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1482 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 1483 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 1484 &sc->hn_netchg_status, 5 * hz); 1485 } 1486 1487 static void 1488 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 1489 { 1490 struct hn_softc *sc = xsc; 1491 1492 /* Re-allow link status checks. 
*/ 1493 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 1494 hn_link_status(sc); 1495 } 1496 1497 static void 1498 hn_update_link_status(struct hn_softc *sc) 1499 { 1500 1501 if (sc->hn_mgmt_taskq != NULL) 1502 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 1503 } 1504 1505 static void 1506 hn_change_network(struct hn_softc *sc) 1507 { 1508 1509 if (sc->hn_mgmt_taskq != NULL) 1510 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 1511 } 1512 1513 static __inline int 1514 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 1515 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 1516 { 1517 struct mbuf *m = *m_head; 1518 int error; 1519 1520 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 1521 1522 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 1523 m, segs, nsegs, BUS_DMA_NOWAIT); 1524 if (error == EFBIG) { 1525 struct mbuf *m_new; 1526 1527 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 1528 if (m_new == NULL) 1529 return ENOBUFS; 1530 else 1531 *m_head = m = m_new; 1532 txr->hn_tx_collapsed++; 1533 1534 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 1535 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 1536 } 1537 if (!error) { 1538 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 1539 BUS_DMASYNC_PREWRITE); 1540 txd->flags |= HN_TXD_FLAG_DMAMAP; 1541 } 1542 return error; 1543 } 1544 1545 static __inline int 1546 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 1547 { 1548 1549 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 1550 ("put an onlist txd %#x", txd->flags)); 1551 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1552 ("put an onagg txd %#x", txd->flags)); 1553 1554 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 1555 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 1556 return 0; 1557 1558 if (!STAILQ_EMPTY(&txd->agg_list)) { 1559 struct hn_txdesc *tmp_txd; 1560 1561 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 1562 int freed; 1563 1564 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 1565 ("resursive aggregation on aggregated txdesc")); 1566 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 1567 ("not aggregated txdesc")); 1568 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 1569 ("aggregated txdesc uses dmamap")); 1570 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 1571 ("aggregated txdesc consumes " 1572 "chimney sending buffer")); 1573 KASSERT(tmp_txd->chim_size == 0, 1574 ("aggregated txdesc has non-zero " 1575 "chimney sending size")); 1576 1577 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 1578 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 1579 freed = hn_txdesc_put(txr, tmp_txd); 1580 KASSERT(freed, ("failed to free aggregated txdesc")); 1581 } 1582 } 1583 1584 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 1585 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 1586 ("chim txd uses dmamap")); 1587 hn_chim_free(txr->hn_sc, txd->chim_index); 1588 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 1589 txd->chim_size = 0; 1590 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 1591 bus_dmamap_sync(txr->hn_tx_data_dtag, 1592 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 1593 bus_dmamap_unload(txr->hn_tx_data_dtag, 1594 txd->data_dmap); 1595 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 1596 } 1597 1598 if (txd->m != NULL) { 1599 m_freem(txd->m); 1600 txd->m = NULL; 1601 } 1602 1603 txd->flags |= HN_TXD_FLAG_ONLIST; 1604 #ifndef HN_USE_TXDESC_BUFRING 1605 mtx_lock_spin(&txr->hn_txlist_spin); 1606 KASSERT(txr->hn_txdesc_avail >= 0 && 1607 txr->hn_txdesc_avail < 
txr->hn_txdesc_cnt, 1608 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 1609 txr->hn_txdesc_avail++; 1610 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 1611 mtx_unlock_spin(&txr->hn_txlist_spin); 1612 #else /* HN_USE_TXDESC_BUFRING */ 1613 #ifdef HN_DEBUG 1614 atomic_add_int(&txr->hn_txdesc_avail, 1); 1615 #endif 1616 buf_ring_enqueue(txr->hn_txdesc_br, txd); 1617 #endif /* !HN_USE_TXDESC_BUFRING */ 1618 1619 return 1; 1620 } 1621 1622 static __inline struct hn_txdesc * 1623 hn_txdesc_get(struct hn_tx_ring *txr) 1624 { 1625 struct hn_txdesc *txd; 1626 1627 #ifndef HN_USE_TXDESC_BUFRING 1628 mtx_lock_spin(&txr->hn_txlist_spin); 1629 txd = SLIST_FIRST(&txr->hn_txlist); 1630 if (txd != NULL) { 1631 KASSERT(txr->hn_txdesc_avail > 0, 1632 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 1633 txr->hn_txdesc_avail--; 1634 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 1635 } 1636 mtx_unlock_spin(&txr->hn_txlist_spin); 1637 #else 1638 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 1639 #endif 1640 1641 if (txd != NULL) { 1642 #ifdef HN_USE_TXDESC_BUFRING 1643 #ifdef HN_DEBUG 1644 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 1645 #endif 1646 #endif /* HN_USE_TXDESC_BUFRING */ 1647 KASSERT(txd->m == NULL && txd->refs == 0 && 1648 STAILQ_EMPTY(&txd->agg_list) && 1649 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 1650 txd->chim_size == 0 && 1651 (txd->flags & HN_TXD_FLAG_ONLIST) && 1652 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 1653 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 1654 txd->flags &= ~HN_TXD_FLAG_ONLIST; 1655 txd->refs = 1; 1656 } 1657 return txd; 1658 } 1659 1660 static __inline void 1661 hn_txdesc_hold(struct hn_txdesc *txd) 1662 { 1663 1664 /* 0->1 transition will never work */ 1665 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 1666 atomic_add_int(&txd->refs, 1); 1667 } 1668 1669 static __inline void 1670 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 1671 { 1672 1673 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1674 ("recursive aggregation on aggregating txdesc")); 1675 1676 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1677 ("already aggregated")); 1678 KASSERT(STAILQ_EMPTY(&txd->agg_list), 1679 ("recursive aggregation on to-be-aggregated txdesc")); 1680 1681 txd->flags |= HN_TXD_FLAG_ONAGG; 1682 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 1683 } 1684 1685 static bool 1686 hn_tx_ring_pending(struct hn_tx_ring *txr) 1687 { 1688 bool pending = false; 1689 1690 #ifndef HN_USE_TXDESC_BUFRING 1691 mtx_lock_spin(&txr->hn_txlist_spin); 1692 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 1693 pending = true; 1694 mtx_unlock_spin(&txr->hn_txlist_spin); 1695 #else 1696 if (!buf_ring_full(txr->hn_txdesc_br)) 1697 pending = true; 1698 #endif 1699 return (pending); 1700 } 1701 1702 static __inline void 1703 hn_txeof(struct hn_tx_ring *txr) 1704 { 1705 txr->hn_has_txeof = 0; 1706 txr->hn_txeof(txr); 1707 } 1708 1709 static void 1710 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 1711 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 1712 { 1713 struct hn_txdesc *txd = sndc->hn_cbarg; 1714 struct hn_tx_ring *txr; 1715 1716 txr = txd->txr; 1717 KASSERT(txr->hn_chan == chan, 1718 ("channel mismatch, on chan%u, should be chan%u", 1719 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 1720 1721 txr->hn_has_txeof = 1; 1722 hn_txdesc_put(txr, txd); 1723 1724 ++txr->hn_txdone_cnt; 1725 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 1726 txr->hn_txdone_cnt = 0; 1727 if 
(txr->hn_oactive) 1728 hn_txeof(txr); 1729 } 1730 } 1731 1732 static void 1733 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 1734 { 1735 #if defined(INET) || defined(INET6) 1736 tcp_lro_flush_all(&rxr->hn_lro); 1737 #endif 1738 1739 /* 1740 * NOTE: 1741 * 'txr' could be NULL, if multiple channels and 1742 * ifnet.if_start method are enabled. 1743 */ 1744 if (txr == NULL || !txr->hn_has_txeof) 1745 return; 1746 1747 txr->hn_txdone_cnt = 0; 1748 hn_txeof(txr); 1749 } 1750 1751 static __inline uint32_t 1752 hn_rndis_pktmsg_offset(uint32_t ofs) 1753 { 1754 1755 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 1756 ("invalid RNDIS packet msg offset %u", ofs)); 1757 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 1758 } 1759 1760 static __inline void * 1761 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 1762 size_t pi_dlen, uint32_t pi_type) 1763 { 1764 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 1765 struct rndis_pktinfo *pi; 1766 1767 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 1768 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 1769 1770 /* 1771 * Per-packet-info does not move; it only grows. 1772 * 1773 * NOTE: 1774 * rm_pktinfooffset in this phase counts from the beginning 1775 * of rndis_packet_msg. 1776 */ 1777 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 1778 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 1779 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 1780 pkt->rm_pktinfolen); 1781 pkt->rm_pktinfolen += pi_size; 1782 1783 pi->rm_size = pi_size; 1784 pi->rm_type = pi_type; 1785 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 1786 1787 /* Data immediately follow per-packet-info. */ 1788 pkt->rm_dataoffset += pi_size; 1789 1790 /* Update RNDIS packet msg length */ 1791 pkt->rm_len += pi_size; 1792 1793 return (pi->rm_data); 1794 } 1795 1796 static __inline int 1797 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 1798 { 1799 struct hn_txdesc *txd; 1800 struct mbuf *m; 1801 int error, pkts; 1802 1803 txd = txr->hn_agg_txd; 1804 KASSERT(txd != NULL, ("no aggregate txdesc")); 1805 1806 /* 1807 * Since hn_txpkt() will reset this temporary stat, save 1808 * it now, so that oerrors can be updated properly, if 1809 * hn_txpkt() ever fails. 1810 */ 1811 pkts = txr->hn_stat_pkts; 1812 1813 /* 1814 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 1815 * failure, save it for later freeing, if hn_txpkt() ever 1816 * fails. 1817 */ 1818 m = txd->m; 1819 error = hn_txpkt(ifp, txr, txd); 1820 if (__predict_false(error)) { 1821 /* txd is freed, but m is not. */ 1822 m_freem(m); 1823 1824 txr->hn_flush_failed++; 1825 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 1826 } 1827 1828 /* Reset all aggregation states. */ 1829 txr->hn_agg_txd = NULL; 1830 txr->hn_agg_szleft = 0; 1831 txr->hn_agg_pktleft = 0; 1832 txr->hn_agg_prevpkt = NULL; 1833 1834 return (error); 1835 } 1836 1837 static void * 1838 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1839 int pktsize) 1840 { 1841 void *chim; 1842 1843 if (txr->hn_agg_txd != NULL) { 1844 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 1845 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 1846 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 1847 int olen; 1848 1849 /* 1850 * Update the previous RNDIS packet's total length, 1851 * it can be increased due to the mandatory alignment 1852 * padding for this RNDIS packet. 
And update the 1853 * aggregating txdesc's chimney sending buffer size 1854 * accordingly. 1855 * 1856 * XXX 1857 * Zero-out the padding, as required by the RNDIS spec. 1858 */ 1859 olen = pkt->rm_len; 1860 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 1861 agg_txd->chim_size += pkt->rm_len - olen; 1862 1863 /* Link this txdesc to the parent. */ 1864 hn_txdesc_agg(agg_txd, txd); 1865 1866 chim = (uint8_t *)pkt + pkt->rm_len; 1867 /* Save the current packet for later fixup. */ 1868 txr->hn_agg_prevpkt = chim; 1869 1870 txr->hn_agg_pktleft--; 1871 txr->hn_agg_szleft -= pktsize; 1872 if (txr->hn_agg_szleft <= 1873 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1874 /* 1875 * Probably can't aggregate more packets, 1876 * flush this aggregating txdesc proactively. 1877 */ 1878 txr->hn_agg_pktleft = 0; 1879 } 1880 /* Done! */ 1881 return (chim); 1882 } 1883 hn_flush_txagg(ifp, txr); 1884 } 1885 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 1886 1887 txr->hn_tx_chimney_tried++; 1888 txd->chim_index = hn_chim_alloc(txr->hn_sc); 1889 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 1890 return (NULL); 1891 txr->hn_tx_chimney++; 1892 1893 chim = txr->hn_sc->hn_chim + 1894 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 1895 1896 if (txr->hn_agg_pktmax > 1 && 1897 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1898 txr->hn_agg_txd = txd; 1899 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 1900 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 1901 txr->hn_agg_prevpkt = chim; 1902 } 1903 return (chim); 1904 } 1905 1906 /* 1907 * NOTE: 1908 * If this function fails, then both txd and m_head0 will be freed. 1909 */ 1910 static int 1911 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1912 struct mbuf **m_head0) 1913 { 1914 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 1915 int error, nsegs, i; 1916 struct mbuf *m_head = *m_head0; 1917 struct rndis_packet_msg *pkt; 1918 uint32_t *pi_data; 1919 void *chim = NULL; 1920 int pkt_hlen, pkt_size; 1921 1922 pkt = txd->rndis_pkt; 1923 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 1924 if (pkt_size < txr->hn_chim_size) { 1925 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 1926 if (chim != NULL) 1927 pkt = chim; 1928 } else { 1929 if (txr->hn_agg_txd != NULL) 1930 hn_flush_txagg(ifp, txr); 1931 } 1932 1933 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 1934 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; 1935 pkt->rm_dataoffset = sizeof(*pkt); 1936 pkt->rm_datalen = m_head->m_pkthdr.len; 1937 pkt->rm_oobdataoffset = 0; 1938 pkt->rm_oobdatalen = 0; 1939 pkt->rm_oobdataelements = 0; 1940 pkt->rm_pktinfooffset = sizeof(*pkt); 1941 pkt->rm_pktinfolen = 0; 1942 pkt->rm_vchandle = 0; 1943 pkt->rm_reserved = 0; 1944 1945 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 1946 /* 1947 * Set the hash value for this packet, so that the host could 1948 * dispatch the TX done event for this packet back to this TX 1949 * ring's channel. 
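* Note that the value stashed in the HASHVAL pktinfo below is simply this TX ring's index (hn_tx_idx), rather than a real RSS hash.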
1950 */ 1951 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1952 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 1953 *pi_data = txr->hn_tx_idx; 1954 } 1955 1956 if (m_head->m_flags & M_VLANTAG) { 1957 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1958 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 1959 *pi_data = NDIS_VLAN_INFO_MAKE( 1960 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 1961 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 1962 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 1963 } 1964 1965 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 1966 #if defined(INET6) || defined(INET) 1967 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1968 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 1969 #ifdef INET 1970 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 1971 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, 1972 m_head->m_pkthdr.tso_segsz); 1973 } 1974 #endif 1975 #if defined(INET6) && defined(INET) 1976 else 1977 #endif 1978 #ifdef INET6 1979 { 1980 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, 1981 m_head->m_pkthdr.tso_segsz); 1982 } 1983 #endif 1984 #endif /* INET6 || INET */ 1985 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 1986 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1987 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 1988 if (m_head->m_pkthdr.csum_flags & 1989 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 1990 *pi_data = NDIS_TXCSUM_INFO_IPV6; 1991 } else { 1992 *pi_data = NDIS_TXCSUM_INFO_IPV4; 1993 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 1994 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 1995 } 1996 1997 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) 1998 *pi_data |= NDIS_TXCSUM_INFO_TCPCS; 1999 else if (m_head->m_pkthdr.csum_flags & 2000 (CSUM_IP_UDP | CSUM_IP6_UDP)) 2001 *pi_data |= NDIS_TXCSUM_INFO_UDPCS; 2002 } 2003 2004 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 2005 /* Convert RNDIS packet message offsets */ 2006 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset); 2007 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 2008 2009 /* 2010 * Fast path: Chimney sending. 2011 */ 2012 if (chim != NULL) { 2013 struct hn_txdesc *tgt_txd = txd; 2014 2015 if (txr->hn_agg_txd != NULL) { 2016 tgt_txd = txr->hn_agg_txd; 2017 #ifdef INVARIANTS 2018 *m_head0 = NULL; 2019 #endif 2020 } 2021 2022 KASSERT(pkt == chim, 2023 ("RNDIS pkt not in chimney sending buffer")); 2024 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 2025 ("chimney sending buffer is not used")); 2026 tgt_txd->chim_size += pkt->rm_len; 2027 2028 m_copydata(m_head, 0, m_head->m_pkthdr.len, 2029 ((uint8_t *)chim) + pkt_hlen); 2030 2031 txr->hn_gpa_cnt = 0; 2032 txr->hn_sendpkt = hn_txpkt_chim; 2033 goto done; 2034 } 2035 2036 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 2037 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2038 ("chimney buffer is used")); 2039 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 2040 2041 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 2042 if (__predict_false(error)) { 2043 int freed; 2044 2045 /* 2046 * This mbuf is not linked w/ the txd yet, so free it now. 
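* *m_head0 is cleared below as well, so the caller knows the mbuf has already been reclaimed.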
2047 */ 2048 m_freem(m_head); 2049 *m_head0 = NULL; 2050 2051 freed = hn_txdesc_put(txr, txd); 2052 KASSERT(freed != 0, 2053 ("fail to free txd upon txdma error")); 2054 2055 txr->hn_txdma_failed++; 2056 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 2057 return error; 2058 } 2059 *m_head0 = m_head; 2060 2061 /* +1 RNDIS packet message */ 2062 txr->hn_gpa_cnt = nsegs + 1; 2063 2064 /* send packet with page buffer */ 2065 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 2066 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 2067 txr->hn_gpa[0].gpa_len = pkt_hlen; 2068 2069 /* 2070 * Fill the page buffers with mbuf info after the page 2071 * buffer for RNDIS packet message. 2072 */ 2073 for (i = 0; i < nsegs; ++i) { 2074 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 2075 2076 gpa->gpa_page = atop(segs[i].ds_addr); 2077 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 2078 gpa->gpa_len = segs[i].ds_len; 2079 } 2080 2081 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2082 txd->chim_size = 0; 2083 txr->hn_sendpkt = hn_txpkt_sglist; 2084 done: 2085 txd->m = m_head; 2086 2087 /* Set the completion routine */ 2088 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 2089 2090 /* Update temporary stats for later use. */ 2091 txr->hn_stat_pkts++; 2092 txr->hn_stat_size += m_head->m_pkthdr.len; 2093 if (m_head->m_flags & M_MCAST) 2094 txr->hn_stat_mcasts++; 2095 2096 return 0; 2097 } 2098 2099 /* 2100 * NOTE: 2101 * If this function fails, then txd will be freed, but the mbuf 2102 * associated w/ the txd will _not_ be freed. 2103 */ 2104 static int 2105 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 2106 { 2107 int error, send_failed = 0, has_bpf; 2108 2109 again: 2110 has_bpf = bpf_peers_present(ifp->if_bpf); 2111 if (has_bpf) { 2112 /* 2113 * Make sure that this txd and any aggregated txds are not 2114 * freed before ETHER_BPF_MTAP. 2115 */ 2116 hn_txdesc_hold(txd); 2117 } 2118 error = txr->hn_sendpkt(txr, txd); 2119 if (!error) { 2120 if (has_bpf) { 2121 const struct hn_txdesc *tmp_txd; 2122 2123 ETHER_BPF_MTAP(ifp, txd->m); 2124 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 2125 ETHER_BPF_MTAP(ifp, tmp_txd->m); 2126 } 2127 2128 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 2129 #ifdef HN_IFSTART_SUPPORT 2130 if (!hn_use_if_start) 2131 #endif 2132 { 2133 if_inc_counter(ifp, IFCOUNTER_OBYTES, 2134 txr->hn_stat_size); 2135 if (txr->hn_stat_mcasts != 0) { 2136 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 2137 txr->hn_stat_mcasts); 2138 } 2139 } 2140 txr->hn_pkts += txr->hn_stat_pkts; 2141 txr->hn_sends++; 2142 } 2143 if (has_bpf) 2144 hn_txdesc_put(txr, txd); 2145 2146 if (__predict_false(error)) { 2147 int freed; 2148 2149 /* 2150 * This should "really rarely" happen. 2151 * 2152 * XXX Too many RX to be acked or too many sideband 2153 * commands to run? Ask netvsc_channel_rollup() 2154 * to kick start later. 2155 */ 2156 txr->hn_has_txeof = 1; 2157 if (!send_failed) { 2158 txr->hn_send_failed++; 2159 send_failed = 1; 2160 /* 2161 * Try sending again after set hn_has_txeof; 2162 * in case that we missed the last 2163 * netvsc_channel_rollup(). 2164 */ 2165 goto again; 2166 } 2167 if_printf(ifp, "send failed\n"); 2168 2169 /* 2170 * Caller will perform further processing on the 2171 * associated mbuf, so don't free it in hn_txdesc_put(); 2172 * only unload it from the DMA map in hn_txdesc_put(), 2173 * if it was loaded. 
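* Clearing txd->m below is what keeps hn_txdesc_put() from freeing the mbuf here.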
2174 */ 2175 txd->m = NULL; 2176 freed = hn_txdesc_put(txr, txd); 2177 KASSERT(freed != 0, 2178 ("fail to free txd upon send error")); 2179 2180 txr->hn_send_failed++; 2181 } 2182 2183 /* Reset temporary stats, after this sending is done. */ 2184 txr->hn_stat_size = 0; 2185 txr->hn_stat_pkts = 0; 2186 txr->hn_stat_mcasts = 0; 2187 2188 return (error); 2189 } 2190 2191 /* 2192 * Append the specified data to the indicated mbuf chain, 2193 * Extend the mbuf chain if the new data does not fit in 2194 * existing space. 2195 * 2196 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2197 * There should be an equivalent in the kernel mbuf code, 2198 * but there does not appear to be one yet. 2199 * 2200 * Differs from m_append() in that additional mbufs are 2201 * allocated with cluster size MJUMPAGESIZE, and filled 2202 * accordingly. 2203 * 2204 * Return 1 if able to complete the job; otherwise 0. 2205 */ 2206 static int 2207 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2208 { 2209 struct mbuf *m, *n; 2210 int remainder, space; 2211 2212 for (m = m0; m->m_next != NULL; m = m->m_next) 2213 ; 2214 remainder = len; 2215 space = M_TRAILINGSPACE(m); 2216 if (space > 0) { 2217 /* 2218 * Copy into available space. 2219 */ 2220 if (space > remainder) 2221 space = remainder; 2222 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2223 m->m_len += space; 2224 cp += space; 2225 remainder -= space; 2226 } 2227 while (remainder > 0) { 2228 /* 2229 * Allocate a new mbuf; could check space 2230 * and allocate a cluster instead. 2231 */ 2232 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2233 if (n == NULL) 2234 break; 2235 n->m_len = min(MJUMPAGESIZE, remainder); 2236 bcopy(cp, mtod(n, caddr_t), n->m_len); 2237 cp += n->m_len; 2238 remainder -= n->m_len; 2239 m->m_next = n; 2240 m = n; 2241 } 2242 if (m0->m_flags & M_PKTHDR) 2243 m0->m_pkthdr.len += len - remainder; 2244 2245 return (remainder == 0); 2246 } 2247 2248 #if defined(INET) || defined(INET6) 2249 static __inline int 2250 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2251 { 2252 #if __FreeBSD_version >= 1100095 2253 if (hn_lro_mbufq_depth) { 2254 tcp_lro_queue_mbuf(lc, m); 2255 return 0; 2256 } 2257 #endif 2258 return tcp_lro_rx(lc, m, 0); 2259 } 2260 #endif 2261 2262 static int 2263 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2264 const struct hn_rxinfo *info) 2265 { 2266 struct ifnet *ifp; 2267 struct mbuf *m_new; 2268 int size, do_lro = 0, do_csum = 1; 2269 int hash_type; 2270 2271 /* If the VF is active, inject the packet through the VF */ 2272 ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp; 2273 2274 if (dlen <= MHLEN) { 2275 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2276 if (m_new == NULL) { 2277 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2278 return (0); 2279 } 2280 memcpy(mtod(m_new, void *), data, dlen); 2281 m_new->m_pkthdr.len = m_new->m_len = dlen; 2282 rxr->hn_small_pkts++; 2283 } else { 2284 /* 2285 * Get an mbuf with a cluster. For packets 2K or less, 2286 * get a standard 2K cluster. For anything larger, get a 2287 * 4K cluster. Any buffers larger than 4K can cause problems 2288 * if looped around to the Hyper-V TX channel, so avoid them. 
2289 */ 2290 size = MCLBYTES; 2291 if (dlen > MCLBYTES) { 2292 /* 4096 */ 2293 size = MJUMPAGESIZE; 2294 } 2295 2296 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2297 if (m_new == NULL) { 2298 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2299 return (0); 2300 } 2301 2302 hv_m_append(m_new, dlen, data); 2303 } 2304 m_new->m_pkthdr.rcvif = ifp; 2305 2306 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2307 do_csum = 0; 2308 2309 /* receive side checksum offload */ 2310 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2311 /* IP csum offload */ 2312 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2313 m_new->m_pkthdr.csum_flags |= 2314 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2315 rxr->hn_csum_ip++; 2316 } 2317 2318 /* TCP/UDP csum offload */ 2319 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2320 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2321 m_new->m_pkthdr.csum_flags |= 2322 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2323 m_new->m_pkthdr.csum_data = 0xffff; 2324 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2325 rxr->hn_csum_tcp++; 2326 else 2327 rxr->hn_csum_udp++; 2328 } 2329 2330 /* 2331 * XXX 2332 * As of this write (Oct 28th, 2016), host side will turn 2333 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2334 * the do_lro setting here is actually _not_ accurate. We 2335 * depend on the RSS hash type check to reset do_lro. 2336 */ 2337 if ((info->csum_info & 2338 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2339 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2340 do_lro = 1; 2341 } else { 2342 const struct ether_header *eh; 2343 uint16_t etype; 2344 int hoff; 2345 2346 hoff = sizeof(*eh); 2347 if (m_new->m_len < hoff) 2348 goto skip; 2349 eh = mtod(m_new, struct ether_header *); 2350 etype = ntohs(eh->ether_type); 2351 if (etype == ETHERTYPE_VLAN) { 2352 const struct ether_vlan_header *evl; 2353 2354 hoff = sizeof(*evl); 2355 if (m_new->m_len < hoff) 2356 goto skip; 2357 evl = mtod(m_new, struct ether_vlan_header *); 2358 etype = ntohs(evl->evl_proto); 2359 } 2360 2361 if (etype == ETHERTYPE_IP) { 2362 int pr; 2363 2364 pr = hn_check_iplen(m_new, hoff); 2365 if (pr == IPPROTO_TCP) { 2366 if (do_csum && 2367 (rxr->hn_trust_hcsum & 2368 HN_TRUST_HCSUM_TCP)) { 2369 rxr->hn_csum_trusted++; 2370 m_new->m_pkthdr.csum_flags |= 2371 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2372 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2373 m_new->m_pkthdr.csum_data = 0xffff; 2374 } 2375 do_lro = 1; 2376 } else if (pr == IPPROTO_UDP) { 2377 if (do_csum && 2378 (rxr->hn_trust_hcsum & 2379 HN_TRUST_HCSUM_UDP)) { 2380 rxr->hn_csum_trusted++; 2381 m_new->m_pkthdr.csum_flags |= 2382 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2383 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2384 m_new->m_pkthdr.csum_data = 0xffff; 2385 } 2386 } else if (pr != IPPROTO_DONE && do_csum && 2387 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2388 rxr->hn_csum_trusted++; 2389 m_new->m_pkthdr.csum_flags |= 2390 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2391 } 2392 } 2393 } 2394 skip: 2395 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2396 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2397 NDIS_VLAN_INFO_ID(info->vlan_info), 2398 NDIS_VLAN_INFO_PRI(info->vlan_info), 2399 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2400 m_new->m_flags |= M_VLANTAG; 2401 } 2402 2403 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2404 rxr->hn_rss_pkts++; 2405 m_new->m_pkthdr.flowid = info->hash_value; 2406 hash_type = M_HASHTYPE_OPAQUE_HASH; 2407 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2408 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2409 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2410 2411 /* 2412 * NOTE: 2413 * do_lro is resetted, if the hash types are not TCP 2414 * related. See the comment in the above csum_flags 2415 * setup section. 2416 */ 2417 switch (type) { 2418 case NDIS_HASH_IPV4: 2419 hash_type = M_HASHTYPE_RSS_IPV4; 2420 do_lro = 0; 2421 break; 2422 2423 case NDIS_HASH_TCP_IPV4: 2424 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2425 break; 2426 2427 case NDIS_HASH_IPV6: 2428 hash_type = M_HASHTYPE_RSS_IPV6; 2429 do_lro = 0; 2430 break; 2431 2432 case NDIS_HASH_IPV6_EX: 2433 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2434 do_lro = 0; 2435 break; 2436 2437 case NDIS_HASH_TCP_IPV6: 2438 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2439 break; 2440 2441 case NDIS_HASH_TCP_IPV6_EX: 2442 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2443 break; 2444 } 2445 } 2446 } else { 2447 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2448 hash_type = M_HASHTYPE_OPAQUE; 2449 } 2450 M_HASHTYPE_SET(m_new, hash_type); 2451 2452 /* 2453 * Note: Moved RX completion back to hv_nv_on_receive() so all 2454 * messages (not just data messages) will trigger a response. 2455 */ 2456 2457 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2458 rxr->hn_pkts++; 2459 2460 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2461 #if defined(INET) || defined(INET6) 2462 struct lro_ctrl *lro = &rxr->hn_lro; 2463 2464 if (lro->lro_cnt) { 2465 rxr->hn_lro_tried++; 2466 if (hn_lro_rx(lro, m_new) == 0) { 2467 /* DONE! */ 2468 return 0; 2469 } 2470 } 2471 #endif 2472 } 2473 2474 /* We're not holding the lock here, so don't release it */ 2475 (*ifp->if_input)(ifp, m_new); 2476 2477 return (0); 2478 } 2479 2480 static int 2481 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2482 { 2483 struct hn_softc *sc = ifp->if_softc; 2484 struct ifreq *ifr = (struct ifreq *)data; 2485 int mask, error = 0; 2486 2487 switch (cmd) { 2488 case SIOCSIFMTU: 2489 if (ifr->ifr_mtu > HN_MTU_MAX) { 2490 error = EINVAL; 2491 break; 2492 } 2493 2494 HN_LOCK(sc); 2495 2496 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2497 HN_UNLOCK(sc); 2498 break; 2499 } 2500 2501 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2502 /* Can't change MTU */ 2503 HN_UNLOCK(sc); 2504 error = EOPNOTSUPP; 2505 break; 2506 } 2507 2508 if (ifp->if_mtu == ifr->ifr_mtu) { 2509 HN_UNLOCK(sc); 2510 break; 2511 } 2512 2513 /* 2514 * Suspend this interface before the synthetic parts 2515 * are ripped. 2516 */ 2517 hn_suspend(sc); 2518 2519 /* 2520 * Detach the synthetics parts, i.e. NVS and RNDIS. 2521 */ 2522 hn_synth_detach(sc); 2523 2524 /* 2525 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2526 * with the new MTU setting. 2527 */ 2528 error = hn_synth_attach(sc, ifr->ifr_mtu); 2529 if (error) { 2530 HN_UNLOCK(sc); 2531 break; 2532 } 2533 2534 /* 2535 * Commit the requested MTU, after the synthetic parts 2536 * have been successfully attached. 2537 */ 2538 ifp->if_mtu = ifr->ifr_mtu; 2539 2540 /* 2541 * Make sure that various parameters based on MTU are 2542 * still valid, after the MTU change. 2543 */ 2544 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2545 hn_set_chim_size(sc, sc->hn_chim_szmax); 2546 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2547 #if __FreeBSD_version >= 1100099 2548 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2549 HN_LRO_LENLIM_MIN(ifp)) 2550 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2551 #endif 2552 2553 /* 2554 * All done! Resume the interface now. 
2555 */ 2556 hn_resume(sc); 2557 2558 HN_UNLOCK(sc); 2559 break; 2560 2561 case SIOCSIFFLAGS: 2562 HN_LOCK(sc); 2563 2564 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2565 HN_UNLOCK(sc); 2566 break; 2567 } 2568 2569 if (ifp->if_flags & IFF_UP) { 2570 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2571 /* 2572 * Caller meight hold mutex, e.g. 2573 * bpf; use busy-wait for the RNDIS 2574 * reply. 2575 */ 2576 HN_NO_SLEEPING(sc); 2577 hn_rxfilter_config(sc); 2578 HN_SLEEPING_OK(sc); 2579 } else { 2580 hn_init_locked(sc); 2581 } 2582 } else { 2583 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2584 hn_stop(sc, false); 2585 } 2586 sc->hn_if_flags = ifp->if_flags; 2587 2588 HN_UNLOCK(sc); 2589 break; 2590 2591 case SIOCSIFCAP: 2592 HN_LOCK(sc); 2593 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2594 2595 if (mask & IFCAP_TXCSUM) { 2596 ifp->if_capenable ^= IFCAP_TXCSUM; 2597 if (ifp->if_capenable & IFCAP_TXCSUM) 2598 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2599 else 2600 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2601 } 2602 if (mask & IFCAP_TXCSUM_IPV6) { 2603 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2604 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2605 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2606 else 2607 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2608 } 2609 2610 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2611 if (mask & IFCAP_RXCSUM) 2612 ifp->if_capenable ^= IFCAP_RXCSUM; 2613 #ifdef foo 2614 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2615 if (mask & IFCAP_RXCSUM_IPV6) 2616 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2617 #endif 2618 2619 if (mask & IFCAP_LRO) 2620 ifp->if_capenable ^= IFCAP_LRO; 2621 2622 if (mask & IFCAP_TSO4) { 2623 ifp->if_capenable ^= IFCAP_TSO4; 2624 if (ifp->if_capenable & IFCAP_TSO4) 2625 ifp->if_hwassist |= CSUM_IP_TSO; 2626 else 2627 ifp->if_hwassist &= ~CSUM_IP_TSO; 2628 } 2629 if (mask & IFCAP_TSO6) { 2630 ifp->if_capenable ^= IFCAP_TSO6; 2631 if (ifp->if_capenable & IFCAP_TSO6) 2632 ifp->if_hwassist |= CSUM_IP6_TSO; 2633 else 2634 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2635 } 2636 2637 HN_UNLOCK(sc); 2638 break; 2639 2640 case SIOCADDMULTI: 2641 case SIOCDELMULTI: 2642 HN_LOCK(sc); 2643 2644 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2645 HN_UNLOCK(sc); 2646 break; 2647 } 2648 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2649 /* 2650 * Multicast uses mutex; use busy-wait for 2651 * the RNDIS reply. 2652 */ 2653 HN_NO_SLEEPING(sc); 2654 hn_rxfilter_config(sc); 2655 HN_SLEEPING_OK(sc); 2656 } 2657 2658 HN_UNLOCK(sc); 2659 break; 2660 2661 case SIOCSIFMEDIA: 2662 case SIOCGIFMEDIA: 2663 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2664 break; 2665 2666 default: 2667 error = ether_ioctl(ifp, cmd, data); 2668 break; 2669 } 2670 return (error); 2671 } 2672 2673 static void 2674 hn_stop(struct hn_softc *sc, bool detaching) 2675 { 2676 struct ifnet *ifp = sc->hn_ifp; 2677 int i; 2678 2679 HN_LOCK_ASSERT(sc); 2680 2681 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2682 ("synthetic parts were not attached")); 2683 2684 /* Disable polling. */ 2685 hn_polling(sc, 0); 2686 2687 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2688 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2689 hn_suspend_data(sc); 2690 2691 /* Clear OACTIVE bit. */ 2692 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2693 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2694 sc->hn_tx_ring[i].hn_oactive = 0; 2695 2696 /* 2697 * If the VF is active, make sure the filter is not 0, even if 2698 * the synthetic NIC is down. 
2699 */ 2700 if (!detaching && (sc->hn_flags & HN_FLAG_VF)) 2701 hn_rxfilter_config(sc); 2702 } 2703 2704 static void 2705 hn_init_locked(struct hn_softc *sc) 2706 { 2707 struct ifnet *ifp = sc->hn_ifp; 2708 int i; 2709 2710 HN_LOCK_ASSERT(sc); 2711 2712 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2713 return; 2714 2715 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2716 return; 2717 2718 /* Configure RX filter */ 2719 hn_rxfilter_config(sc); 2720 2721 /* Clear OACTIVE bit. */ 2722 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2723 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2724 sc->hn_tx_ring[i].hn_oactive = 0; 2725 2726 /* Clear TX 'suspended' bit. */ 2727 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2728 2729 /* Everything is ready; unleash! */ 2730 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2731 2732 /* Re-enable polling if requested. */ 2733 if (sc->hn_pollhz > 0) 2734 hn_polling(sc, sc->hn_pollhz); 2735 } 2736 2737 static void 2738 hn_init(void *xsc) 2739 { 2740 struct hn_softc *sc = xsc; 2741 2742 HN_LOCK(sc); 2743 hn_init_locked(sc); 2744 HN_UNLOCK(sc); 2745 } 2746 2747 #if __FreeBSD_version >= 1100099 2748 2749 static int 2750 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2751 { 2752 struct hn_softc *sc = arg1; 2753 unsigned int lenlim; 2754 int error; 2755 2756 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2757 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2758 if (error || req->newptr == NULL) 2759 return error; 2760 2761 HN_LOCK(sc); 2762 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2763 lenlim > TCP_LRO_LENGTH_MAX) { 2764 HN_UNLOCK(sc); 2765 return EINVAL; 2766 } 2767 hn_set_lro_lenlim(sc, lenlim); 2768 HN_UNLOCK(sc); 2769 2770 return 0; 2771 } 2772 2773 static int 2774 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2775 { 2776 struct hn_softc *sc = arg1; 2777 int ackcnt, error, i; 2778 2779 /* 2780 * lro_ackcnt_lim is append count limit, 2781 * +1 to turn it into aggregation limit. 2782 */ 2783 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2784 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2785 if (error || req->newptr == NULL) 2786 return error; 2787 2788 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2789 return EINVAL; 2790 2791 /* 2792 * Convert aggregation limit back to append 2793 * count limit. 
2794 */ 2795 --ackcnt; 2796 HN_LOCK(sc); 2797 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2798 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2799 HN_UNLOCK(sc); 2800 return 0; 2801 } 2802 2803 #endif 2804 2805 static int 2806 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2807 { 2808 struct hn_softc *sc = arg1; 2809 int hcsum = arg2; 2810 int on, error, i; 2811 2812 on = 0; 2813 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2814 on = 1; 2815 2816 error = sysctl_handle_int(oidp, &on, 0, req); 2817 if (error || req->newptr == NULL) 2818 return error; 2819 2820 HN_LOCK(sc); 2821 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2822 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2823 2824 if (on) 2825 rxr->hn_trust_hcsum |= hcsum; 2826 else 2827 rxr->hn_trust_hcsum &= ~hcsum; 2828 } 2829 HN_UNLOCK(sc); 2830 return 0; 2831 } 2832 2833 static int 2834 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2835 { 2836 struct hn_softc *sc = arg1; 2837 int chim_size, error; 2838 2839 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2840 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2841 if (error || req->newptr == NULL) 2842 return error; 2843 2844 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2845 return EINVAL; 2846 2847 HN_LOCK(sc); 2848 hn_set_chim_size(sc, chim_size); 2849 HN_UNLOCK(sc); 2850 return 0; 2851 } 2852 2853 #if __FreeBSD_version < 1100095 2854 static int 2855 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2856 { 2857 struct hn_softc *sc = arg1; 2858 int ofs = arg2, i, error; 2859 struct hn_rx_ring *rxr; 2860 uint64_t stat; 2861 2862 stat = 0; 2863 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2864 rxr = &sc->hn_rx_ring[i]; 2865 stat += *((int *)((uint8_t *)rxr + ofs)); 2866 } 2867 2868 error = sysctl_handle_64(oidp, &stat, 0, req); 2869 if (error || req->newptr == NULL) 2870 return error; 2871 2872 /* Zero out this stat. */ 2873 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2874 rxr = &sc->hn_rx_ring[i]; 2875 *((int *)((uint8_t *)rxr + ofs)) = 0; 2876 } 2877 return 0; 2878 } 2879 #else 2880 static int 2881 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2882 { 2883 struct hn_softc *sc = arg1; 2884 int ofs = arg2, i, error; 2885 struct hn_rx_ring *rxr; 2886 uint64_t stat; 2887 2888 stat = 0; 2889 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2890 rxr = &sc->hn_rx_ring[i]; 2891 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2892 } 2893 2894 error = sysctl_handle_64(oidp, &stat, 0, req); 2895 if (error || req->newptr == NULL) 2896 return error; 2897 2898 /* Zero out this stat. */ 2899 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2900 rxr = &sc->hn_rx_ring[i]; 2901 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2902 } 2903 return 0; 2904 } 2905 2906 #endif 2907 2908 static int 2909 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2910 { 2911 struct hn_softc *sc = arg1; 2912 int ofs = arg2, i, error; 2913 struct hn_rx_ring *rxr; 2914 u_long stat; 2915 2916 stat = 0; 2917 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2918 rxr = &sc->hn_rx_ring[i]; 2919 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2920 } 2921 2922 error = sysctl_handle_long(oidp, &stat, 0, req); 2923 if (error || req->newptr == NULL) 2924 return error; 2925 2926 /* Zero out this stat. 
*/ 2927 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2928 rxr = &sc->hn_rx_ring[i]; 2929 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2930 } 2931 return 0; 2932 } 2933 2934 static int 2935 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2936 { 2937 struct hn_softc *sc = arg1; 2938 int ofs = arg2, i, error; 2939 struct hn_tx_ring *txr; 2940 u_long stat; 2941 2942 stat = 0; 2943 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2944 txr = &sc->hn_tx_ring[i]; 2945 stat += *((u_long *)((uint8_t *)txr + ofs)); 2946 } 2947 2948 error = sysctl_handle_long(oidp, &stat, 0, req); 2949 if (error || req->newptr == NULL) 2950 return error; 2951 2952 /* Zero out this stat. */ 2953 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2954 txr = &sc->hn_tx_ring[i]; 2955 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2956 } 2957 return 0; 2958 } 2959 2960 static int 2961 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2962 { 2963 struct hn_softc *sc = arg1; 2964 int ofs = arg2, i, error, conf; 2965 struct hn_tx_ring *txr; 2966 2967 txr = &sc->hn_tx_ring[0]; 2968 conf = *((int *)((uint8_t *)txr + ofs)); 2969 2970 error = sysctl_handle_int(oidp, &conf, 0, req); 2971 if (error || req->newptr == NULL) 2972 return error; 2973 2974 HN_LOCK(sc); 2975 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2976 txr = &sc->hn_tx_ring[i]; 2977 *((int *)((uint8_t *)txr + ofs)) = conf; 2978 } 2979 HN_UNLOCK(sc); 2980 2981 return 0; 2982 } 2983 2984 static int 2985 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2986 { 2987 struct hn_softc *sc = arg1; 2988 int error, size; 2989 2990 size = sc->hn_agg_size; 2991 error = sysctl_handle_int(oidp, &size, 0, req); 2992 if (error || req->newptr == NULL) 2993 return (error); 2994 2995 HN_LOCK(sc); 2996 sc->hn_agg_size = size; 2997 hn_set_txagg(sc); 2998 HN_UNLOCK(sc); 2999 3000 return (0); 3001 } 3002 3003 static int 3004 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 3005 { 3006 struct hn_softc *sc = arg1; 3007 int error, pkts; 3008 3009 pkts = sc->hn_agg_pkts; 3010 error = sysctl_handle_int(oidp, &pkts, 0, req); 3011 if (error || req->newptr == NULL) 3012 return (error); 3013 3014 HN_LOCK(sc); 3015 sc->hn_agg_pkts = pkts; 3016 hn_set_txagg(sc); 3017 HN_UNLOCK(sc); 3018 3019 return (0); 3020 } 3021 3022 static int 3023 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 3024 { 3025 struct hn_softc *sc = arg1; 3026 int pkts; 3027 3028 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 3029 return (sysctl_handle_int(oidp, &pkts, 0, req)); 3030 } 3031 3032 static int 3033 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 3034 { 3035 struct hn_softc *sc = arg1; 3036 int align; 3037 3038 align = sc->hn_tx_ring[0].hn_agg_align; 3039 return (sysctl_handle_int(oidp, &align, 0, req)); 3040 } 3041 3042 static void 3043 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 3044 { 3045 if (pollhz == 0) 3046 vmbus_chan_poll_disable(chan); 3047 else 3048 vmbus_chan_poll_enable(chan, pollhz); 3049 } 3050 3051 static void 3052 hn_polling(struct hn_softc *sc, u_int pollhz) 3053 { 3054 int nsubch = sc->hn_rx_ring_inuse - 1; 3055 3056 HN_LOCK_ASSERT(sc); 3057 3058 if (nsubch > 0) { 3059 struct vmbus_channel **subch; 3060 int i; 3061 3062 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 3063 for (i = 0; i < nsubch; ++i) 3064 hn_chan_polling(subch[i], pollhz); 3065 vmbus_subchan_rel(subch, nsubch); 3066 } 3067 hn_chan_polling(sc->hn_prichan, pollhz); 3068 } 3069 3070 static int 3071 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 3072 { 3073 struct hn_softc *sc = arg1; 3074 int pollhz, error; 3075 3076 pollhz = sc->hn_pollhz; 3077 error = sysctl_handle_int(oidp, &pollhz, 0, req); 3078 
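/* A new polling frequency is applied only on writes; it must be 0 (polling disabled) or fall within [VMBUS_CHAN_POLLHZ_MIN, VMBUS_CHAN_POLLHZ_MAX], as checked below. */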
if (error || req->newptr == NULL) 3079 return (error); 3080 3081 if (pollhz != 0 && 3082 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 3083 return (EINVAL); 3084 3085 HN_LOCK(sc); 3086 if (sc->hn_pollhz != pollhz) { 3087 sc->hn_pollhz = pollhz; 3088 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 3089 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 3090 hn_polling(sc, sc->hn_pollhz); 3091 } 3092 HN_UNLOCK(sc); 3093 3094 return (0); 3095 } 3096 3097 static int 3098 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 3099 { 3100 struct hn_softc *sc = arg1; 3101 char verstr[16]; 3102 3103 snprintf(verstr, sizeof(verstr), "%u.%u", 3104 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 3105 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 3106 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 3107 } 3108 3109 static int 3110 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 3111 { 3112 struct hn_softc *sc = arg1; 3113 char caps_str[128]; 3114 uint32_t caps; 3115 3116 HN_LOCK(sc); 3117 caps = sc->hn_caps; 3118 HN_UNLOCK(sc); 3119 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 3120 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 3121 } 3122 3123 static int 3124 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 3125 { 3126 struct hn_softc *sc = arg1; 3127 char assist_str[128]; 3128 uint32_t hwassist; 3129 3130 HN_LOCK(sc); 3131 hwassist = sc->hn_ifp->if_hwassist; 3132 HN_UNLOCK(sc); 3133 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 3134 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 3135 } 3136 3137 static int 3138 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 3139 { 3140 struct hn_softc *sc = arg1; 3141 char filter_str[128]; 3142 uint32_t filter; 3143 3144 HN_LOCK(sc); 3145 filter = sc->hn_rx_filter; 3146 HN_UNLOCK(sc); 3147 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3148 NDIS_PACKET_TYPES); 3149 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3150 } 3151 3152 #ifndef RSS 3153 3154 static int 3155 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3156 { 3157 struct hn_softc *sc = arg1; 3158 int error; 3159 3160 HN_LOCK(sc); 3161 3162 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3163 if (error || req->newptr == NULL) 3164 goto back; 3165 3166 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3167 if (error) 3168 goto back; 3169 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 3170 3171 if (sc->hn_rx_ring_inuse > 1) { 3172 error = hn_rss_reconfig(sc); 3173 } else { 3174 /* Not RSS capable, at least for now; just save the RSS key. */ 3175 error = 0; 3176 } 3177 back: 3178 HN_UNLOCK(sc); 3179 return (error); 3180 } 3181 3182 static int 3183 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 3184 { 3185 struct hn_softc *sc = arg1; 3186 int error; 3187 3188 HN_LOCK(sc); 3189 3190 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3191 if (error || req->newptr == NULL) 3192 goto back; 3193 3194 /* 3195 * Don't allow RSS indirect table change, if this interface is not 3196 * RSS capable currently. 
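* Only one RX ring in use means RSS is effectively inactive, so the request is rejected with EOPNOTSUPP below.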
3197 */ 3198 if (sc->hn_rx_ring_inuse == 1) { 3199 error = EOPNOTSUPP; 3200 goto back; 3201 } 3202 3203 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3204 if (error) 3205 goto back; 3206 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 3207 3208 hn_rss_ind_fixup(sc); 3209 error = hn_rss_reconfig(sc); 3210 back: 3211 HN_UNLOCK(sc); 3212 return (error); 3213 } 3214 3215 #endif /* !RSS */ 3216 3217 static int 3218 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 3219 { 3220 struct hn_softc *sc = arg1; 3221 char hash_str[128]; 3222 uint32_t hash; 3223 3224 HN_LOCK(sc); 3225 hash = sc->hn_rss_hash; 3226 HN_UNLOCK(sc); 3227 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 3228 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 3229 } 3230 3231 static int 3232 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 3233 { 3234 struct hn_softc *sc = arg1; 3235 char vf_name[128]; 3236 struct ifnet *vf; 3237 3238 HN_LOCK(sc); 3239 vf_name[0] = '\0'; 3240 vf = sc->hn_rx_ring[0].hn_vf; 3241 if (vf != NULL) 3242 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf)); 3243 HN_UNLOCK(sc); 3244 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 3245 } 3246 3247 static int 3248 hn_check_iplen(const struct mbuf *m, int hoff) 3249 { 3250 const struct ip *ip; 3251 int len, iphlen, iplen; 3252 const struct tcphdr *th; 3253 int thoff; /* TCP data offset */ 3254 3255 len = hoff + sizeof(struct ip); 3256 3257 /* The packet must be at least the size of an IP header. */ 3258 if (m->m_pkthdr.len < len) 3259 return IPPROTO_DONE; 3260 3261 /* The fixed IP header must reside completely in the first mbuf. */ 3262 if (m->m_len < len) 3263 return IPPROTO_DONE; 3264 3265 ip = mtodo(m, hoff); 3266 3267 /* Bound check the packet's stated IP header length. */ 3268 iphlen = ip->ip_hl << 2; 3269 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3270 return IPPROTO_DONE; 3271 3272 /* The full IP header must reside completely in the one mbuf. */ 3273 if (m->m_len < hoff + iphlen) 3274 return IPPROTO_DONE; 3275 3276 iplen = ntohs(ip->ip_len); 3277 3278 /* 3279 * Check that the amount of data in the buffers is as 3280 * at least much as the IP header would have us expect. 3281 */ 3282 if (m->m_pkthdr.len < hoff + iplen) 3283 return IPPROTO_DONE; 3284 3285 /* 3286 * Ignore IP fragments. 3287 */ 3288 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3289 return IPPROTO_DONE; 3290 3291 /* 3292 * The TCP/IP or UDP/IP header must be entirely contained within 3293 * the first fragment of a packet. 
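* If any of these checks fail, IPPROTO_DONE is returned, and the caller then skips the host-checksum trust heuristics for this packet.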
3294 */ 3295 switch (ip->ip_p) { 3296 case IPPROTO_TCP: 3297 if (iplen < iphlen + sizeof(struct tcphdr)) 3298 return IPPROTO_DONE; 3299 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3300 return IPPROTO_DONE; 3301 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3302 thoff = th->th_off << 2; 3303 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3304 return IPPROTO_DONE; 3305 if (m->m_len < hoff + iphlen + thoff) 3306 return IPPROTO_DONE; 3307 break; 3308 case IPPROTO_UDP: 3309 if (iplen < iphlen + sizeof(struct udphdr)) 3310 return IPPROTO_DONE; 3311 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3312 return IPPROTO_DONE; 3313 break; 3314 default: 3315 if (iplen < iphlen) 3316 return IPPROTO_DONE; 3317 break; 3318 } 3319 return ip->ip_p; 3320 } 3321 3322 static int 3323 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3324 { 3325 struct sysctl_oid_list *child; 3326 struct sysctl_ctx_list *ctx; 3327 device_t dev = sc->hn_dev; 3328 #if defined(INET) || defined(INET6) 3329 #if __FreeBSD_version >= 1100095 3330 int lroent_cnt; 3331 #endif 3332 #endif 3333 int i; 3334 3335 /* 3336 * Create RXBUF for reception. 3337 * 3338 * NOTE: 3339 * - It is shared by all channels. 3340 * - A large enough buffer is allocated, certain version of NVSes 3341 * may further limit the usable space. 3342 */ 3343 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3344 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3345 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3346 if (sc->hn_rxbuf == NULL) { 3347 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3348 return (ENOMEM); 3349 } 3350 3351 sc->hn_rx_ring_cnt = ring_cnt; 3352 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3353 3354 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3355 M_DEVBUF, M_WAITOK | M_ZERO); 3356 3357 #if defined(INET) || defined(INET6) 3358 #if __FreeBSD_version >= 1100095 3359 lroent_cnt = hn_lro_entry_count; 3360 if (lroent_cnt < TCP_LRO_ENTRIES) 3361 lroent_cnt = TCP_LRO_ENTRIES; 3362 if (bootverbose) 3363 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3364 #endif 3365 #endif /* INET || INET6 */ 3366 3367 ctx = device_get_sysctl_ctx(dev); 3368 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3369 3370 /* Create dev.hn.UNIT.rx sysctl tree */ 3371 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3372 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3373 3374 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3375 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3376 3377 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3378 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3379 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3380 if (rxr->hn_br == NULL) { 3381 device_printf(dev, "allocate bufring failed\n"); 3382 return (ENOMEM); 3383 } 3384 3385 if (hn_trust_hosttcp) 3386 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3387 if (hn_trust_hostudp) 3388 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3389 if (hn_trust_hostip) 3390 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3391 rxr->hn_ifp = sc->hn_ifp; 3392 if (i < sc->hn_tx_ring_cnt) 3393 rxr->hn_txr = &sc->hn_tx_ring[i]; 3394 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3395 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3396 rxr->hn_rx_idx = i; 3397 rxr->hn_rxbuf = sc->hn_rxbuf; 3398 3399 /* 3400 * Initialize LRO. 
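* tcp_lro_init_args() is used on kernels that support it, so the LRO entry count and mbuf queue depth can be tuned; older kernels fall back to tcp_lro_init().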
3401 */ 3402 #if defined(INET) || defined(INET6) 3403 #if __FreeBSD_version >= 1100095 3404 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3405 hn_lro_mbufq_depth); 3406 #else 3407 tcp_lro_init(&rxr->hn_lro); 3408 rxr->hn_lro.ifp = sc->hn_ifp; 3409 #endif 3410 #if __FreeBSD_version >= 1100099 3411 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3412 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3413 #endif 3414 #endif /* INET || INET6 */ 3415 3416 if (sc->hn_rx_sysctl_tree != NULL) { 3417 char name[16]; 3418 3419 /* 3420 * Create per RX ring sysctl tree: 3421 * dev.hn.UNIT.rx.RINGID 3422 */ 3423 snprintf(name, sizeof(name), "%d", i); 3424 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3425 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3426 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3427 3428 if (rxr->hn_rx_sysctl_tree != NULL) { 3429 SYSCTL_ADD_ULONG(ctx, 3430 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3431 OID_AUTO, "packets", CTLFLAG_RW, 3432 &rxr->hn_pkts, "# of packets received"); 3433 SYSCTL_ADD_ULONG(ctx, 3434 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3435 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3436 &rxr->hn_rss_pkts, 3437 "# of packets w/ RSS info received"); 3438 SYSCTL_ADD_INT(ctx, 3439 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3440 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3441 &rxr->hn_pktbuf_len, 0, 3442 "Temporary channel packet buffer length"); 3443 } 3444 } 3445 } 3446 3447 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3448 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3449 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3450 #if __FreeBSD_version < 1100095 3451 hn_rx_stat_int_sysctl, 3452 #else 3453 hn_rx_stat_u64_sysctl, 3454 #endif 3455 "LU", "LRO queued"); 3456 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3457 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3458 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3459 #if __FreeBSD_version < 1100095 3460 hn_rx_stat_int_sysctl, 3461 #else 3462 hn_rx_stat_u64_sysctl, 3463 #endif 3464 "LU", "LRO flushed"); 3465 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3466 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3467 __offsetof(struct hn_rx_ring, hn_lro_tried), 3468 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3469 #if __FreeBSD_version >= 1100099 3470 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3471 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3472 hn_lro_lenlim_sysctl, "IU", 3473 "Max # of data bytes to be aggregated by LRO"); 3474 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3475 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3476 hn_lro_ackcnt_sysctl, "I", 3477 "Max # of ACKs to be aggregated by LRO"); 3478 #endif 3479 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3480 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3481 hn_trust_hcsum_sysctl, "I", 3482 "Trust tcp segement verification on host side, " 3483 "when csum info is missing"); 3484 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3485 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3486 hn_trust_hcsum_sysctl, "I", 3487 "Trust udp datagram verification on host side, " 3488 "when csum info is missing"); 3489 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3490 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3491 hn_trust_hcsum_sysctl, "I", 3492 "Trust ip packet verification on host side, " 3493 "when csum info is missing"); 3494 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3495 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3496 
__offsetof(struct hn_rx_ring, hn_csum_ip), 3497 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3498 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3499 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3500 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3501 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3502 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3503 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3504 __offsetof(struct hn_rx_ring, hn_csum_udp), 3505 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3506 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3507 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3508 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3509 hn_rx_stat_ulong_sysctl, "LU", 3510 "# of packets that we trust host's csum verification"); 3511 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3512 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3513 __offsetof(struct hn_rx_ring, hn_small_pkts), 3514 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3515 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3516 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3517 __offsetof(struct hn_rx_ring, hn_ack_failed), 3518 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3519 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3520 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3521 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3522 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3523 3524 return (0); 3525 } 3526 3527 static void 3528 hn_destroy_rx_data(struct hn_softc *sc) 3529 { 3530 int i; 3531 3532 if (sc->hn_rxbuf != NULL) { 3533 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3534 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3535 else 3536 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3537 sc->hn_rxbuf = NULL; 3538 } 3539 3540 if (sc->hn_rx_ring_cnt == 0) 3541 return; 3542 3543 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3544 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3545 3546 if (rxr->hn_br == NULL) 3547 continue; 3548 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3549 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3550 } else { 3551 device_printf(sc->hn_dev, 3552 "%dth channel bufring is referenced", i); 3553 } 3554 rxr->hn_br = NULL; 3555 3556 #if defined(INET) || defined(INET6) 3557 tcp_lro_free(&rxr->hn_lro); 3558 #endif 3559 free(rxr->hn_pktbuf, M_DEVBUF); 3560 } 3561 free(sc->hn_rx_ring, M_DEVBUF); 3562 sc->hn_rx_ring = NULL; 3563 3564 sc->hn_rx_ring_cnt = 0; 3565 sc->hn_rx_ring_inuse = 0; 3566 } 3567 3568 static int 3569 hn_tx_ring_create(struct hn_softc *sc, int id) 3570 { 3571 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3572 device_t dev = sc->hn_dev; 3573 bus_dma_tag_t parent_dtag; 3574 int error, i; 3575 3576 txr->hn_sc = sc; 3577 txr->hn_tx_idx = id; 3578 3579 #ifndef HN_USE_TXDESC_BUFRING 3580 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3581 #endif 3582 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3583 3584 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3585 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3586 M_DEVBUF, M_WAITOK | M_ZERO); 3587 #ifndef HN_USE_TXDESC_BUFRING 3588 SLIST_INIT(&txr->hn_txlist); 3589 #else 3590 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3591 M_WAITOK, &txr->hn_tx_lock); 3592 #endif 3593 3594 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3595 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3596 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3597 } else { 3598 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3599 } 3600 3601 #ifdef HN_IFSTART_SUPPORT 3602 if (hn_use_if_start) { 3603 txr->hn_txeof = hn_start_txeof; 3604 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3605 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3606 } else 3607 #endif 3608 { 3609 int br_depth; 3610 3611 txr->hn_txeof = hn_xmit_txeof; 3612 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3613 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3614 3615 br_depth = hn_get_txswq_depth(txr); 3616 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3617 M_WAITOK, &txr->hn_tx_lock); 3618 } 3619 3620 txr->hn_direct_tx_size = hn_direct_tx_size; 3621 3622 /* 3623 * Always schedule transmission instead of trying to do direct 3624 * transmission. This one gives the best performance so far. 3625 */ 3626 txr->hn_sched_tx = 1; 3627 3628 parent_dtag = bus_get_dma_tag(dev); 3629 3630 /* DMA tag for RNDIS packet messages. */ 3631 error = bus_dma_tag_create(parent_dtag, /* parent */ 3632 HN_RNDIS_PKT_ALIGN, /* alignment */ 3633 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3634 BUS_SPACE_MAXADDR, /* lowaddr */ 3635 BUS_SPACE_MAXADDR, /* highaddr */ 3636 NULL, NULL, /* filter, filterarg */ 3637 HN_RNDIS_PKT_LEN, /* maxsize */ 3638 1, /* nsegments */ 3639 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3640 0, /* flags */ 3641 NULL, /* lockfunc */ 3642 NULL, /* lockfuncarg */ 3643 &txr->hn_tx_rndis_dtag); 3644 if (error) { 3645 device_printf(dev, "failed to create rndis dmatag\n"); 3646 return error; 3647 } 3648 3649 /* DMA tag for data. */ 3650 error = bus_dma_tag_create(parent_dtag, /* parent */ 3651 1, /* alignment */ 3652 HN_TX_DATA_BOUNDARY, /* boundary */ 3653 BUS_SPACE_MAXADDR, /* lowaddr */ 3654 BUS_SPACE_MAXADDR, /* highaddr */ 3655 NULL, NULL, /* filter, filterarg */ 3656 HN_TX_DATA_MAXSIZE, /* maxsize */ 3657 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3658 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3659 0, /* flags */ 3660 NULL, /* lockfunc */ 3661 NULL, /* lockfuncarg */ 3662 &txr->hn_tx_data_dtag); 3663 if (error) { 3664 device_printf(dev, "failed to create data dmatag\n"); 3665 return error; 3666 } 3667 3668 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3669 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3670 3671 txd->txr = txr; 3672 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3673 STAILQ_INIT(&txd->agg_list); 3674 3675 /* 3676 * Allocate and load RNDIS packet message. 3677 */ 3678 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3679 (void **)&txd->rndis_pkt, 3680 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3681 &txd->rndis_pkt_dmap); 3682 if (error) { 3683 device_printf(dev, 3684 "failed to allocate rndis_packet_msg, %d\n", i); 3685 return error; 3686 } 3687 3688 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3689 txd->rndis_pkt_dmap, 3690 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3691 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3692 BUS_DMA_NOWAIT); 3693 if (error) { 3694 device_printf(dev, 3695 "failed to load rndis_packet_msg, %d\n", i); 3696 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3697 txd->rndis_pkt, txd->rndis_pkt_dmap); 3698 return error; 3699 } 3700 3701 /* DMA map for TX data. 
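* One map is created per txdesc; it is loaded on demand by hn_encap() for the sglist (non-chimney) transmission path.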
*/ 3702 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3703 &txd->data_dmap); 3704 if (error) { 3705 device_printf(dev, 3706 "failed to allocate tx data dmamap\n"); 3707 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3708 txd->rndis_pkt_dmap); 3709 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3710 txd->rndis_pkt, txd->rndis_pkt_dmap); 3711 return error; 3712 } 3713 3714 /* All set, put it to list */ 3715 txd->flags |= HN_TXD_FLAG_ONLIST; 3716 #ifndef HN_USE_TXDESC_BUFRING 3717 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3718 #else 3719 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3720 #endif 3721 } 3722 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3723 3724 if (sc->hn_tx_sysctl_tree != NULL) { 3725 struct sysctl_oid_list *child; 3726 struct sysctl_ctx_list *ctx; 3727 char name[16]; 3728 3729 /* 3730 * Create per TX ring sysctl tree: 3731 * dev.hn.UNIT.tx.RINGID 3732 */ 3733 ctx = device_get_sysctl_ctx(dev); 3734 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3735 3736 snprintf(name, sizeof(name), "%d", id); 3737 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3738 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3739 3740 if (txr->hn_tx_sysctl_tree != NULL) { 3741 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3742 3743 #ifdef HN_DEBUG 3744 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3745 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3746 "# of available TX descs"); 3747 #endif 3748 #ifdef HN_IFSTART_SUPPORT 3749 if (!hn_use_if_start) 3750 #endif 3751 { 3752 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3753 CTLFLAG_RD, &txr->hn_oactive, 0, 3754 "over active"); 3755 } 3756 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3757 CTLFLAG_RW, &txr->hn_pkts, 3758 "# of packets transmitted"); 3759 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3760 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3761 } 3762 } 3763 3764 return 0; 3765 } 3766 3767 static void 3768 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3769 { 3770 struct hn_tx_ring *txr = txd->txr; 3771 3772 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3773 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3774 3775 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3776 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3777 txd->rndis_pkt_dmap); 3778 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3779 } 3780 3781 static void 3782 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3783 { 3784 3785 KASSERT(txd->refs == 0 || txd->refs == 1, 3786 ("invalid txd refs %d", txd->refs)); 3787 3788 /* Aggregated txds will be freed by their aggregating txd. */ 3789 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3790 int freed; 3791 3792 freed = hn_txdesc_put(txr, txd); 3793 KASSERT(freed, ("can't free txdesc")); 3794 } 3795 } 3796 3797 static void 3798 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3799 { 3800 int i; 3801 3802 if (txr->hn_txdesc == NULL) 3803 return; 3804 3805 /* 3806 * NOTE: 3807 * Because the freeing of aggregated txds will be deferred 3808 * to the aggregating txd, two passes are used here: 3809 * - The first pass GCes any pending txds. This GC is necessary, 3810 * since if the channels are revoked, hypervisor will not 3811 * deliver send-done for all pending txds. 3812 * - The second pass frees the busdma stuffs, i.e. after all txds 3813 * were freed. 
3814 */ 3815 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3816 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3817 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3818 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3819 3820 if (txr->hn_tx_data_dtag != NULL) 3821 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3822 if (txr->hn_tx_rndis_dtag != NULL) 3823 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3824 3825 #ifdef HN_USE_TXDESC_BUFRING 3826 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3827 #endif 3828 3829 free(txr->hn_txdesc, M_DEVBUF); 3830 txr->hn_txdesc = NULL; 3831 3832 if (txr->hn_mbuf_br != NULL) 3833 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3834 3835 #ifndef HN_USE_TXDESC_BUFRING 3836 mtx_destroy(&txr->hn_txlist_spin); 3837 #endif 3838 mtx_destroy(&txr->hn_tx_lock); 3839 } 3840 3841 static int 3842 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3843 { 3844 struct sysctl_oid_list *child; 3845 struct sysctl_ctx_list *ctx; 3846 int i; 3847 3848 /* 3849 * Create TXBUF for chimney sending. 3850 * 3851 * NOTE: It is shared by all channels. 3852 */ 3853 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3854 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3855 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3856 if (sc->hn_chim == NULL) { 3857 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3858 return (ENOMEM); 3859 } 3860 3861 sc->hn_tx_ring_cnt = ring_cnt; 3862 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3863 3864 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3865 M_DEVBUF, M_WAITOK | M_ZERO); 3866 3867 ctx = device_get_sysctl_ctx(sc->hn_dev); 3868 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3869 3870 /* Create dev.hn.UNIT.tx sysctl tree */ 3871 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3872 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3873 3874 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3875 int error; 3876 3877 error = hn_tx_ring_create(sc, i); 3878 if (error) 3879 return error; 3880 } 3881 3882 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3883 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3884 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3885 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3886 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3887 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3888 __offsetof(struct hn_tx_ring, hn_send_failed), 3889 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3890 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3891 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3892 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3893 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3894 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3895 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3896 __offsetof(struct hn_tx_ring, hn_flush_failed), 3897 hn_tx_stat_ulong_sysctl, "LU", 3898 "# of packet transmission aggregation flush failure"); 3899 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3900 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3901 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3902 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3903 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3904 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3905 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3906 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3907 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3908 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3909 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3910 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3911 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3912 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3913 "# of total TX descs"); 3914 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3915 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3916 "Chimney send packet size upper boundary"); 3917 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3918 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3919 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3920 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3921 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3922 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3923 hn_tx_conf_int_sysctl, "I", 3924 "Size of the packet for direct transmission"); 3925 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3926 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3927 __offsetof(struct hn_tx_ring, hn_sched_tx), 3928 hn_tx_conf_int_sysctl, "I", 3929 "Always schedule transmission " 3930 "instead of doing direct transmission"); 3931 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3932 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3933 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3934 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3935 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3936 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3937 "Applied packet transmission aggregation size"); 3938 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3939 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3940 hn_txagg_pktmax_sysctl, "I", 3941 "Applied packet transmission aggregation packets"); 3942 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3943 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3944 hn_txagg_align_sysctl, "I", 3945 "Applied packet transmission aggregation alignment"); 3946 3947 return 0; 3948 } 3949 3950 static void 3951 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3952 { 3953 int i; 3954 3955 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3956 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3957 } 3958 3959 static void 3960 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3961 { 3962 struct ifnet *ifp = sc->hn_ifp; 3963 int tso_minlen; 3964 3965 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3966 return; 3967 3968 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3969 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3970 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3971 3972 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3973 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3974 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3975 3976 if (tso_maxlen < tso_minlen) 3977 tso_maxlen = tso_minlen; 3978 else if (tso_maxlen > IP_MAXPACKET) 3979 tso_maxlen = IP_MAXPACKET; 3980 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3981 tso_maxlen = sc->hn_ndis_tso_szmax; 3982 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3983 if (bootverbose) 3984 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3985 } 3986 3987 static void 3988 hn_fixup_tx_data(struct hn_softc *sc) 3989 { 3990 uint64_t csum_assist; 3991 int i; 3992 3993 hn_set_chim_size(sc, sc->hn_chim_szmax); 3994 if (hn_tx_chimney_size > 0 && 3995 hn_tx_chimney_size < sc->hn_chim_szmax) 3996 hn_set_chim_size(sc, hn_tx_chimney_size); 3997 3998 csum_assist = 0; 3999 if (sc->hn_caps & HN_CAP_IPCS) 4000 csum_assist |= CSUM_IP; 4001 if (sc->hn_caps & HN_CAP_TCP4CS) 4002 csum_assist |= CSUM_IP_TCP; 4003 if (sc->hn_caps & HN_CAP_UDP4CS) 4004 
csum_assist |= CSUM_IP_UDP; 4005 if (sc->hn_caps & HN_CAP_TCP6CS) 4006 csum_assist |= CSUM_IP6_TCP; 4007 if (sc->hn_caps & HN_CAP_UDP6CS) 4008 csum_assist |= CSUM_IP6_UDP; 4009 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4010 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 4011 4012 if (sc->hn_caps & HN_CAP_HASHVAL) { 4013 /* 4014 * Support HASHVAL pktinfo on TX path. 4015 */ 4016 if (bootverbose) 4017 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 4018 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4019 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 4020 } 4021 } 4022 4023 static void 4024 hn_destroy_tx_data(struct hn_softc *sc) 4025 { 4026 int i; 4027 4028 if (sc->hn_chim != NULL) { 4029 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 4030 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 4031 } else { 4032 device_printf(sc->hn_dev, 4033 "chimney sending buffer is referenced"); 4034 } 4035 sc->hn_chim = NULL; 4036 } 4037 4038 if (sc->hn_tx_ring_cnt == 0) 4039 return; 4040 4041 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4042 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 4043 4044 free(sc->hn_tx_ring, M_DEVBUF); 4045 sc->hn_tx_ring = NULL; 4046 4047 sc->hn_tx_ring_cnt = 0; 4048 sc->hn_tx_ring_inuse = 0; 4049 } 4050 4051 #ifdef HN_IFSTART_SUPPORT 4052 4053 static void 4054 hn_start_taskfunc(void *xtxr, int pending __unused) 4055 { 4056 struct hn_tx_ring *txr = xtxr; 4057 4058 mtx_lock(&txr->hn_tx_lock); 4059 hn_start_locked(txr, 0); 4060 mtx_unlock(&txr->hn_tx_lock); 4061 } 4062 4063 static int 4064 hn_start_locked(struct hn_tx_ring *txr, int len) 4065 { 4066 struct hn_softc *sc = txr->hn_sc; 4067 struct ifnet *ifp = sc->hn_ifp; 4068 int sched = 0; 4069 4070 KASSERT(hn_use_if_start, 4071 ("hn_start_locked is called, when if_start is disabled")); 4072 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4073 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4074 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4075 4076 if (__predict_false(txr->hn_suspended)) 4077 return (0); 4078 4079 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 4080 IFF_DRV_RUNNING) 4081 return (0); 4082 4083 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 4084 struct hn_txdesc *txd; 4085 struct mbuf *m_head; 4086 int error; 4087 4088 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 4089 if (m_head == NULL) 4090 break; 4091 4092 if (len > 0 && m_head->m_pkthdr.len > len) { 4093 /* 4094 * This sending could be time consuming; let callers 4095 * dispatch this packet sending (and sending of any 4096 * following up packets) to tx taskqueue. 
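			 *
			 * (When called from hn_start(), "len" is
			 * hn_direct_tx_size; hn_start_taskfunc() passes 0,
			 * so the taskqueue path never defers.)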
4097 */ 4098 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4099 sched = 1; 4100 break; 4101 } 4102 4103 #if defined(INET6) || defined(INET) 4104 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 4105 m_head = hn_tso_fixup(m_head); 4106 if (__predict_false(m_head == NULL)) { 4107 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4108 continue; 4109 } 4110 } 4111 #endif 4112 4113 txd = hn_txdesc_get(txr); 4114 if (txd == NULL) { 4115 txr->hn_no_txdescs++; 4116 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4117 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4118 break; 4119 } 4120 4121 error = hn_encap(ifp, txr, txd, &m_head); 4122 if (error) { 4123 /* Both txd and m_head are freed */ 4124 KASSERT(txr->hn_agg_txd == NULL, 4125 ("encap failed w/ pending aggregating txdesc")); 4126 continue; 4127 } 4128 4129 if (txr->hn_agg_pktleft == 0) { 4130 if (txr->hn_agg_txd != NULL) { 4131 KASSERT(m_head == NULL, 4132 ("pending mbuf for aggregating txdesc")); 4133 error = hn_flush_txagg(ifp, txr); 4134 if (__predict_false(error)) { 4135 atomic_set_int(&ifp->if_drv_flags, 4136 IFF_DRV_OACTIVE); 4137 break; 4138 } 4139 } else { 4140 KASSERT(m_head != NULL, ("mbuf was freed")); 4141 error = hn_txpkt(ifp, txr, txd); 4142 if (__predict_false(error)) { 4143 /* txd is freed, but m_head is not */ 4144 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 4145 atomic_set_int(&ifp->if_drv_flags, 4146 IFF_DRV_OACTIVE); 4147 break; 4148 } 4149 } 4150 } 4151 #ifdef INVARIANTS 4152 else { 4153 KASSERT(txr->hn_agg_txd != NULL, 4154 ("no aggregating txdesc")); 4155 KASSERT(m_head == NULL, 4156 ("pending mbuf for aggregating txdesc")); 4157 } 4158 #endif 4159 } 4160 4161 /* Flush pending aggerated transmission. */ 4162 if (txr->hn_agg_txd != NULL) 4163 hn_flush_txagg(ifp, txr); 4164 return (sched); 4165 } 4166 4167 static void 4168 hn_start(struct ifnet *ifp) 4169 { 4170 struct hn_softc *sc = ifp->if_softc; 4171 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 4172 4173 if (txr->hn_sched_tx) 4174 goto do_sched; 4175 4176 if (mtx_trylock(&txr->hn_tx_lock)) { 4177 int sched; 4178 4179 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4180 mtx_unlock(&txr->hn_tx_lock); 4181 if (!sched) 4182 return; 4183 } 4184 do_sched: 4185 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4186 } 4187 4188 static void 4189 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 4190 { 4191 struct hn_tx_ring *txr = xtxr; 4192 4193 mtx_lock(&txr->hn_tx_lock); 4194 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 4195 hn_start_locked(txr, 0); 4196 mtx_unlock(&txr->hn_tx_lock); 4197 } 4198 4199 static void 4200 hn_start_txeof(struct hn_tx_ring *txr) 4201 { 4202 struct hn_softc *sc = txr->hn_sc; 4203 struct ifnet *ifp = sc->hn_ifp; 4204 4205 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4206 4207 if (txr->hn_sched_tx) 4208 goto do_sched; 4209 4210 if (mtx_trylock(&txr->hn_tx_lock)) { 4211 int sched; 4212 4213 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4214 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4215 mtx_unlock(&txr->hn_tx_lock); 4216 if (sched) { 4217 taskqueue_enqueue(txr->hn_tx_taskq, 4218 &txr->hn_tx_task); 4219 } 4220 } else { 4221 do_sched: 4222 /* 4223 * Release the OACTIVE earlier, with the hope, that 4224 * others could catch up. The task will clear the 4225 * flag again with the hn_tx_lock to avoid possible 4226 * races. 
		 */
		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

#endif	/* HN_IFSTART_SUPPORT */

static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called while if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This send could be time-consuming; let callers
			 * dispatch this packet send (and the sending of any
			 * follow-up packets) to the tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}

static int
hn_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct hn_softc *sc = ifp->if_softc;
	struct hn_tx_ring *txr;
	int error, idx = 0;

#if defined(INET6) || defined(INET)
	/*
	 * Perform TSO packet header fixup now, since the TSO
	 * packet header should be cache-hot.
4339 */ 4340 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4341 m = hn_tso_fixup(m); 4342 if (__predict_false(m == NULL)) { 4343 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4344 return EIO; 4345 } 4346 } 4347 #endif 4348 4349 /* 4350 * Select the TX ring based on flowid 4351 */ 4352 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4353 #ifdef RSS 4354 uint32_t bid; 4355 4356 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4357 &bid) == 0) 4358 idx = bid % sc->hn_tx_ring_inuse; 4359 else 4360 #endif 4361 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4362 } 4363 txr = &sc->hn_tx_ring[idx]; 4364 4365 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4366 if (error) { 4367 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4368 return error; 4369 } 4370 4371 if (txr->hn_oactive) 4372 return 0; 4373 4374 if (txr->hn_sched_tx) 4375 goto do_sched; 4376 4377 if (mtx_trylock(&txr->hn_tx_lock)) { 4378 int sched; 4379 4380 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4381 mtx_unlock(&txr->hn_tx_lock); 4382 if (!sched) 4383 return 0; 4384 } 4385 do_sched: 4386 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4387 return 0; 4388 } 4389 4390 static void 4391 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4392 { 4393 struct mbuf *m; 4394 4395 mtx_lock(&txr->hn_tx_lock); 4396 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4397 m_freem(m); 4398 mtx_unlock(&txr->hn_tx_lock); 4399 } 4400 4401 static void 4402 hn_xmit_qflush(struct ifnet *ifp) 4403 { 4404 struct hn_softc *sc = ifp->if_softc; 4405 int i; 4406 4407 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4408 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4409 if_qflush(ifp); 4410 } 4411 4412 static void 4413 hn_xmit_txeof(struct hn_tx_ring *txr) 4414 { 4415 4416 if (txr->hn_sched_tx) 4417 goto do_sched; 4418 4419 if (mtx_trylock(&txr->hn_tx_lock)) { 4420 int sched; 4421 4422 txr->hn_oactive = 0; 4423 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4424 mtx_unlock(&txr->hn_tx_lock); 4425 if (sched) { 4426 taskqueue_enqueue(txr->hn_tx_taskq, 4427 &txr->hn_tx_task); 4428 } 4429 } else { 4430 do_sched: 4431 /* 4432 * Release the oactive earlier, with the hope, that 4433 * others could catch up. The task will clear the 4434 * oactive again with the hn_tx_lock to avoid possible 4435 * races. 4436 */ 4437 txr->hn_oactive = 0; 4438 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4439 } 4440 } 4441 4442 static void 4443 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4444 { 4445 struct hn_tx_ring *txr = xtxr; 4446 4447 mtx_lock(&txr->hn_tx_lock); 4448 hn_xmit(txr, 0); 4449 mtx_unlock(&txr->hn_tx_lock); 4450 } 4451 4452 static void 4453 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4454 { 4455 struct hn_tx_ring *txr = xtxr; 4456 4457 mtx_lock(&txr->hn_tx_lock); 4458 txr->hn_oactive = 0; 4459 hn_xmit(txr, 0); 4460 mtx_unlock(&txr->hn_tx_lock); 4461 } 4462 4463 static int 4464 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4465 { 4466 struct vmbus_chan_br cbr; 4467 struct hn_rx_ring *rxr; 4468 struct hn_tx_ring *txr = NULL; 4469 int idx, error; 4470 4471 idx = vmbus_chan_subidx(chan); 4472 4473 /* 4474 * Link this channel to RX/TX ring. 
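	 *
	 * The channel sub-index picks the ring pair: every channel gets
	 * an RX ring, and also a TX ring if the sub-index is below
	 * hn_tx_ring_inuse.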
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
	rxr->hn_chan = chan;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];
		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));

	/*
	 * Open this channel.
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = HN_TXBR_SIZE;
	cbr.cbr_rxsz = HN_RXBR_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if (error == EISCONN) {
			if_printf(sc->hn_ifp, "bufring is connected after "
			    "chan%u open failure\n", vmbus_chan_id(chan));
			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
		} else {
			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
			    vmbus_chan_id(chan), error);
		}
	}
	return (error);
}

static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Unlink this channel from the RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later.
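			 * (The error propagates to hn_synth_attach(),
			 * which then tears everything down through
			 * hn_synth_detach().)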
*/ 4589 } 4590 } 4591 vmbus_subchan_rel(subchans, subchan_cnt); 4592 4593 if (error) { 4594 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4595 } else { 4596 if (bootverbose) { 4597 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4598 subchan_cnt); 4599 } 4600 } 4601 return (error); 4602 } 4603 4604 static void 4605 hn_detach_allchans(struct hn_softc *sc) 4606 { 4607 struct vmbus_channel **subchans; 4608 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4609 int i; 4610 4611 if (subchan_cnt == 0) 4612 goto back; 4613 4614 /* Detach the sub-channels. */ 4615 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4616 for (i = 0; i < subchan_cnt; ++i) 4617 hn_chan_detach(sc, subchans[i]); 4618 vmbus_subchan_rel(subchans, subchan_cnt); 4619 4620 back: 4621 /* 4622 * Detach the primary channel, _after_ all sub-channels 4623 * are detached. 4624 */ 4625 hn_chan_detach(sc, sc->hn_prichan); 4626 4627 /* Wait for sub-channels to be destroyed, if any. */ 4628 vmbus_subchan_drain(sc->hn_prichan); 4629 4630 #ifdef INVARIANTS 4631 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4632 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4633 HN_RX_FLAG_ATTACHED) == 0, 4634 ("%dth RX ring is still attached", i)); 4635 } 4636 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4637 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4638 HN_TX_FLAG_ATTACHED) == 0, 4639 ("%dth TX ring is still attached", i)); 4640 } 4641 #endif 4642 } 4643 4644 static int 4645 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4646 { 4647 struct vmbus_channel **subchans; 4648 int nchan, rxr_cnt, error; 4649 4650 nchan = *nsubch + 1; 4651 if (nchan == 1) { 4652 /* 4653 * Multiple RX/TX rings are not requested. 4654 */ 4655 *nsubch = 0; 4656 return (0); 4657 } 4658 4659 /* 4660 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4661 * table entries. 4662 */ 4663 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4664 if (error) { 4665 /* No RSS; this is benign. */ 4666 *nsubch = 0; 4667 return (0); 4668 } 4669 if (bootverbose) { 4670 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4671 rxr_cnt, nchan); 4672 } 4673 4674 if (nchan > rxr_cnt) 4675 nchan = rxr_cnt; 4676 if (nchan == 1) { 4677 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4678 *nsubch = 0; 4679 return (0); 4680 } 4681 4682 /* 4683 * Allocate sub-channels from NVS. 4684 */ 4685 *nsubch = nchan - 1; 4686 error = hn_nvs_alloc_subchans(sc, nsubch); 4687 if (error || *nsubch == 0) { 4688 /* Failed to allocate sub-channels. */ 4689 *nsubch = 0; 4690 return (0); 4691 } 4692 4693 /* 4694 * Wait for all sub-channels to become ready before moving on. 
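	 *
	 * The vmbus_subchan_get()/vmbus_subchan_rel() pair below exists
	 * only to perform this wait; the sub-channel array is released
	 * again right away.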
4695 */ 4696 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4697 vmbus_subchan_rel(subchans, *nsubch); 4698 return (0); 4699 } 4700 4701 static bool 4702 hn_synth_attachable(const struct hn_softc *sc) 4703 { 4704 int i; 4705 4706 if (sc->hn_flags & HN_FLAG_ERRORS) 4707 return (false); 4708 4709 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4710 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4711 4712 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4713 return (false); 4714 } 4715 return (true); 4716 } 4717 4718 static int 4719 hn_synth_attach(struct hn_softc *sc, int mtu) 4720 { 4721 #define ATTACHED_NVS 0x0002 4722 #define ATTACHED_RNDIS 0x0004 4723 4724 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4725 int error, nsubch, nchan, i; 4726 uint32_t old_caps, attached = 0; 4727 4728 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4729 ("synthetic parts were attached")); 4730 4731 if (!hn_synth_attachable(sc)) 4732 return (ENXIO); 4733 4734 /* Save capabilities for later verification. */ 4735 old_caps = sc->hn_caps; 4736 sc->hn_caps = 0; 4737 4738 /* Clear RSS stuffs. */ 4739 sc->hn_rss_ind_size = 0; 4740 sc->hn_rss_hash = 0; 4741 4742 /* 4743 * Attach the primary channel _before_ attaching NVS and RNDIS. 4744 */ 4745 error = hn_chan_attach(sc, sc->hn_prichan); 4746 if (error) 4747 goto failed; 4748 4749 /* 4750 * Attach NVS. 4751 */ 4752 error = hn_nvs_attach(sc, mtu); 4753 if (error) 4754 goto failed; 4755 attached |= ATTACHED_NVS; 4756 4757 /* 4758 * Attach RNDIS _after_ NVS is attached. 4759 */ 4760 error = hn_rndis_attach(sc, mtu); 4761 if (error) 4762 goto failed; 4763 attached |= ATTACHED_RNDIS; 4764 4765 /* 4766 * Make sure capabilities are not changed. 4767 */ 4768 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4769 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4770 old_caps, sc->hn_caps); 4771 error = ENXIO; 4772 goto failed; 4773 } 4774 4775 /* 4776 * Allocate sub-channels for multi-TX/RX rings. 4777 * 4778 * NOTE: 4779 * The # of RX rings that can be used is equivalent to the # of 4780 * channels to be requested. 4781 */ 4782 nsubch = sc->hn_rx_ring_cnt - 1; 4783 error = hn_synth_alloc_subchans(sc, &nsubch); 4784 if (error) 4785 goto failed; 4786 /* NOTE: _Full_ synthetic parts detach is required now. */ 4787 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4788 4789 /* 4790 * Set the # of TX/RX rings that could be used according to 4791 * the # of channels that NVS offered. 4792 */ 4793 nchan = nsubch + 1; 4794 hn_set_ring_inuse(sc, nchan); 4795 if (nchan == 1) { 4796 /* Only the primary channel can be used; done */ 4797 goto back; 4798 } 4799 4800 /* 4801 * Attach the sub-channels. 4802 * 4803 * NOTE: hn_set_ring_inuse() _must_ have been called. 4804 */ 4805 error = hn_attach_subchans(sc); 4806 if (error) 4807 goto failed; 4808 4809 /* 4810 * Configure RSS key and indirect table _after_ all sub-channels 4811 * are attached. 4812 */ 4813 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4814 /* 4815 * RSS key is not set yet; set it to the default RSS key. 4816 */ 4817 if (bootverbose) 4818 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4819 #ifdef RSS 4820 rss_getkey(rss->rss_key); 4821 #else 4822 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4823 #endif 4824 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4825 } 4826 4827 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4828 /* 4829 * RSS indirect table is not set yet; set it up in round- 4830 * robin fashion. 
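		 *
		 * For example, with nchan == 4 (and subidx == i in the
		 * non-RSS case), the NDIS_HASH_INDCNT entries become
		 * 0, 1, 2, 3, 0, 1, 2, 3, ...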
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
			uint32_t subidx;

#ifdef RSS
			subidx = rss_get_indirection_to_bucket(i);
#else
			subidx = i;
#endif
			rss->rss_ind[i] = subidx % nchan;
		}
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * The # of usable channels may have changed, so we have to
		 * make sure that all entries in the RSS indirect table
		 * are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fix up the transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_synth_detach(sc);
	} else {
		if (attached & ATTACHED_RNDIS)
			hn_rndis_detach(sc);
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}

/*
 * NOTE:
 * The interface must have been suspended through hn_suspend(), before
 * this function gets called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{

	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("synthetic parts were not attached"));

	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS. */
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);

	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
}

static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

#ifdef RSS
	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
		    rss_getnumbuckets());
	}
#endif

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}

static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}

static void
hn_suspend_data(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	struct hn_tx_ring *txr;
	int i, nsubch;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
4974 * 4975 * NOTE: 4976 * We will _not_ receive all pending send-done, if the 4977 * primary channel is revoked. 4978 */ 4979 while (hn_tx_ring_pending(txr) && 4980 !vmbus_chan_is_revoked(sc->hn_prichan)) 4981 pause("hnwtx", 1 /* 1 tick */); 4982 } 4983 4984 /* 4985 * Disable RX by clearing RX filter. 4986 */ 4987 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 4988 4989 /* 4990 * Give RNDIS enough time to flush all pending data packets. 4991 */ 4992 pause("waitrx", (200 * hz) / 1000); 4993 4994 /* 4995 * Drain RX/TX bufrings and interrupts. 4996 */ 4997 nsubch = sc->hn_rx_ring_inuse - 1; 4998 if (nsubch > 0) 4999 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 5000 5001 if (subch != NULL) { 5002 for (i = 0; i < nsubch; ++i) 5003 hn_chan_drain(sc, subch[i]); 5004 } 5005 hn_chan_drain(sc, sc->hn_prichan); 5006 5007 if (subch != NULL) 5008 vmbus_subchan_rel(subch, nsubch); 5009 5010 /* 5011 * Drain any pending TX tasks. 5012 * 5013 * NOTE: 5014 * The above hn_chan_drain() can dispatch TX tasks, so the TX 5015 * tasks will have to be drained _after_ the above hn_chan_drain() 5016 * calls. 5017 */ 5018 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5019 txr = &sc->hn_tx_ring[i]; 5020 5021 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 5022 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 5023 } 5024 } 5025 5026 static void 5027 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 5028 { 5029 5030 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 5031 } 5032 5033 static void 5034 hn_suspend_mgmt(struct hn_softc *sc) 5035 { 5036 struct task task; 5037 5038 HN_LOCK_ASSERT(sc); 5039 5040 /* 5041 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 5042 * through hn_mgmt_taskq. 5043 */ 5044 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 5045 vmbus_chan_run_task(sc->hn_prichan, &task); 5046 5047 /* 5048 * Make sure that all pending management tasks are completed. 5049 */ 5050 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 5051 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 5052 taskqueue_drain_all(sc->hn_mgmt_taskq0); 5053 } 5054 5055 static void 5056 hn_suspend(struct hn_softc *sc) 5057 { 5058 5059 /* Disable polling. */ 5060 hn_polling(sc, 0); 5061 5062 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5063 (sc->hn_flags & HN_FLAG_VF)) 5064 hn_suspend_data(sc); 5065 hn_suspend_mgmt(sc); 5066 } 5067 5068 static void 5069 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 5070 { 5071 int i; 5072 5073 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 5074 ("invalid TX ring count %d", tx_ring_cnt)); 5075 5076 for (i = 0; i < tx_ring_cnt; ++i) { 5077 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5078 5079 mtx_lock(&txr->hn_tx_lock); 5080 txr->hn_suspended = 0; 5081 mtx_unlock(&txr->hn_tx_lock); 5082 } 5083 } 5084 5085 static void 5086 hn_resume_data(struct hn_softc *sc) 5087 { 5088 int i; 5089 5090 HN_LOCK_ASSERT(sc); 5091 5092 /* 5093 * Re-enable RX. 5094 */ 5095 hn_rxfilter_config(sc); 5096 5097 /* 5098 * Make sure to clear suspend status on "all" TX rings, 5099 * since hn_tx_ring_inuse can be changed after 5100 * hn_suspend_data(). 5101 */ 5102 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 5103 5104 #ifdef HN_IFSTART_SUPPORT 5105 if (!hn_use_if_start) 5106 #endif 5107 { 5108 /* 5109 * Flush unused drbrs, since hn_tx_ring_inuse may be 5110 * reduced. 5111 */ 5112 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 5113 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 5114 } 5115 5116 /* 5117 * Kick start TX. 
5118 */ 5119 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 5120 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 5121 5122 /* 5123 * Use txeof task, so that any pending oactive can be 5124 * cleared properly. 5125 */ 5126 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5127 } 5128 } 5129 5130 static void 5131 hn_resume_mgmt(struct hn_softc *sc) 5132 { 5133 5134 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 5135 5136 /* 5137 * Kick off network change detection, if it was pending. 5138 * If no network change was pending, start link status 5139 * checks, which is more lightweight than network change 5140 * detection. 5141 */ 5142 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 5143 hn_change_network(sc); 5144 else 5145 hn_update_link_status(sc); 5146 } 5147 5148 static void 5149 hn_resume(struct hn_softc *sc) 5150 { 5151 5152 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 5153 (sc->hn_flags & HN_FLAG_VF)) 5154 hn_resume_data(sc); 5155 5156 /* 5157 * When the VF is activated, the synthetic interface is changed 5158 * to DOWN in hn_set_vf(). Here, if the VF is still active, we 5159 * don't call hn_resume_mgmt() until the VF is deactivated in 5160 * hn_set_vf(). 5161 */ 5162 if (!(sc->hn_flags & HN_FLAG_VF)) 5163 hn_resume_mgmt(sc); 5164 5165 /* 5166 * Re-enable polling if this interface is running and 5167 * the polling is requested. 5168 */ 5169 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 5170 hn_polling(sc, sc->hn_pollhz); 5171 } 5172 5173 static void 5174 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 5175 { 5176 const struct rndis_status_msg *msg; 5177 int ofs; 5178 5179 if (dlen < sizeof(*msg)) { 5180 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 5181 return; 5182 } 5183 msg = data; 5184 5185 switch (msg->rm_status) { 5186 case RNDIS_STATUS_MEDIA_CONNECT: 5187 case RNDIS_STATUS_MEDIA_DISCONNECT: 5188 hn_update_link_status(sc); 5189 break; 5190 5191 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 5192 /* Not really useful; ignore. 
*/ 5193 break; 5194 5195 case RNDIS_STATUS_NETWORK_CHANGE: 5196 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 5197 if (dlen < ofs + msg->rm_stbuflen || 5198 msg->rm_stbuflen < sizeof(uint32_t)) { 5199 if_printf(sc->hn_ifp, "network changed\n"); 5200 } else { 5201 uint32_t change; 5202 5203 memcpy(&change, ((const uint8_t *)msg) + ofs, 5204 sizeof(change)); 5205 if_printf(sc->hn_ifp, "network changed, change %u\n", 5206 change); 5207 } 5208 hn_change_network(sc); 5209 break; 5210 5211 default: 5212 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 5213 msg->rm_status); 5214 break; 5215 } 5216 } 5217 5218 static int 5219 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 5220 { 5221 const struct rndis_pktinfo *pi = info_data; 5222 uint32_t mask = 0; 5223 5224 while (info_dlen != 0) { 5225 const void *data; 5226 uint32_t dlen; 5227 5228 if (__predict_false(info_dlen < sizeof(*pi))) 5229 return (EINVAL); 5230 if (__predict_false(info_dlen < pi->rm_size)) 5231 return (EINVAL); 5232 info_dlen -= pi->rm_size; 5233 5234 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 5235 return (EINVAL); 5236 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 5237 return (EINVAL); 5238 dlen = pi->rm_size - pi->rm_pktinfooffset; 5239 data = pi->rm_data; 5240 5241 switch (pi->rm_type) { 5242 case NDIS_PKTINFO_TYPE_VLAN: 5243 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 5244 return (EINVAL); 5245 info->vlan_info = *((const uint32_t *)data); 5246 mask |= HN_RXINFO_VLAN; 5247 break; 5248 5249 case NDIS_PKTINFO_TYPE_CSUM: 5250 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 5251 return (EINVAL); 5252 info->csum_info = *((const uint32_t *)data); 5253 mask |= HN_RXINFO_CSUM; 5254 break; 5255 5256 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5257 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5258 return (EINVAL); 5259 info->hash_value = *((const uint32_t *)data); 5260 mask |= HN_RXINFO_HASHVAL; 5261 break; 5262 5263 case HN_NDIS_PKTINFO_TYPE_HASHINF: 5264 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 5265 return (EINVAL); 5266 info->hash_info = *((const uint32_t *)data); 5267 mask |= HN_RXINFO_HASHINF; 5268 break; 5269 5270 default: 5271 goto next; 5272 } 5273 5274 if (mask == HN_RXINFO_ALL) { 5275 /* All found; done */ 5276 break; 5277 } 5278 next: 5279 pi = (const struct rndis_pktinfo *) 5280 ((const uint8_t *)pi + pi->rm_size); 5281 } 5282 5283 /* 5284 * Final fixup. 5285 * - If there is no hash value, invalidate the hash info. 5286 */ 5287 if ((mask & HN_RXINFO_HASHVAL) == 0) 5288 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 5289 return (0); 5290 } 5291 5292 static __inline bool 5293 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 5294 { 5295 5296 if (off < check_off) { 5297 if (__predict_true(off + len <= check_off)) 5298 return (false); 5299 } else if (off > check_off) { 5300 if (__predict_true(check_off + check_len <= off)) 5301 return (false); 5302 } 5303 return (true); 5304 } 5305 5306 static void 5307 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5308 { 5309 const struct rndis_packet_msg *pkt; 5310 struct hn_rxinfo info; 5311 int data_off, pktinfo_off, data_len, pktinfo_len; 5312 5313 /* 5314 * Check length. 
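	 *
	 * An RNDIS packet message carries up to three regions within
	 * rm_len: the data, optional OOB data, and optional per-packet
	 * info.  The checks below ensure that each region stays inside
	 * rm_len and that the regions do not overlap one another.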
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
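	 * (VLAN, checksum, hash value and hash info; parsed by
	 * hn_rndis_rxinfo().)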
5418 */ 5419 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5420 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5421 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5422 if (__predict_true(pktinfo_len != 0)) { 5423 bool overlap; 5424 int error; 5425 5426 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5427 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5428 "pktinfo overflow, msglen %u, " 5429 "pktinfo abs %d len %d\n", 5430 pkt->rm_len, pktinfo_off, pktinfo_len); 5431 return; 5432 } 5433 5434 /* 5435 * Check packet info coverage. 5436 */ 5437 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5438 data_off, data_len); 5439 if (__predict_false(overlap)) { 5440 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5441 "pktinfo overlap data, pktinfo abs %d len %d, " 5442 "data abs %d len %d\n", 5443 pktinfo_off, pktinfo_len, data_off, data_len); 5444 return; 5445 } 5446 5447 /* 5448 * Find useful per-packet-info. 5449 */ 5450 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5451 pktinfo_len, &info); 5452 if (__predict_false(error)) { 5453 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5454 "pktinfo\n"); 5455 return; 5456 } 5457 } 5458 5459 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5460 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5461 "data overflow, msglen %u, data abs %d len %d\n", 5462 pkt->rm_len, data_off, data_len); 5463 return; 5464 } 5465 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5466 } 5467 5468 static __inline void 5469 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5470 { 5471 const struct rndis_msghdr *hdr; 5472 5473 if (__predict_false(dlen < sizeof(*hdr))) { 5474 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5475 return; 5476 } 5477 hdr = data; 5478 5479 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5480 /* Hot data path. */ 5481 hn_rndis_rx_data(rxr, data, dlen); 5482 /* Done! */ 5483 return; 5484 } 5485 5486 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5487 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5488 else 5489 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5490 } 5491 5492 static void 5493 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5494 { 5495 const struct hn_nvs_hdr *hdr; 5496 5497 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5498 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5499 return; 5500 } 5501 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5502 5503 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5504 /* Useless; ignore */ 5505 return; 5506 } 5507 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5508 } 5509 5510 static void 5511 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5512 const struct vmbus_chanpkt_hdr *pkt) 5513 { 5514 struct hn_nvs_sendctx *sndc; 5515 5516 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5517 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5518 VMBUS_CHANPKT_DATALEN(pkt)); 5519 /* 5520 * NOTE: 5521 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5522 * its callback. 
5523 */ 5524 } 5525 5526 static void 5527 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5528 const struct vmbus_chanpkt_hdr *pkthdr) 5529 { 5530 const struct vmbus_chanpkt_rxbuf *pkt; 5531 const struct hn_nvs_hdr *nvs_hdr; 5532 int count, i, hlen; 5533 5534 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5535 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5536 return; 5537 } 5538 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5539 5540 /* Make sure that this is a RNDIS message. */ 5541 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5542 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5543 nvs_hdr->nvs_type); 5544 return; 5545 } 5546 5547 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5548 if (__predict_false(hlen < sizeof(*pkt))) { 5549 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5550 return; 5551 } 5552 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5553 5554 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5555 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5556 pkt->cp_rxbuf_id); 5557 return; 5558 } 5559 5560 count = pkt->cp_rxbuf_cnt; 5561 if (__predict_false(hlen < 5562 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5563 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5564 return; 5565 } 5566 5567 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5568 for (i = 0; i < count; ++i) { 5569 int ofs, len; 5570 5571 ofs = pkt->cp_rxbuf[i].rb_ofs; 5572 len = pkt->cp_rxbuf[i].rb_len; 5573 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5574 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5575 "ofs %d, len %d\n", i, ofs, len); 5576 continue; 5577 } 5578 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5579 } 5580 5581 /* 5582 * Ack the consumed RXBUF associated w/ this channel packet, 5583 * so that this RXBUF can be recycled by the hypervisor. 5584 */ 5585 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5586 } 5587 5588 static void 5589 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5590 uint64_t tid) 5591 { 5592 struct hn_nvs_rndis_ack ack; 5593 int retries, error; 5594 5595 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5596 ack.nvs_status = HN_NVS_STATUS_OK; 5597 5598 retries = 0; 5599 again: 5600 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5601 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5602 if (__predict_false(error == EAGAIN)) { 5603 /* 5604 * NOTE: 5605 * This should _not_ happen in real world, since the 5606 * consumption of the TX bufring from the TX path is 5607 * controlled. 5608 */ 5609 if (rxr->hn_ack_failed == 0) 5610 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5611 rxr->hn_ack_failed++; 5612 retries++; 5613 if (retries < 10) { 5614 DELAY(100); 5615 goto again; 5616 } 5617 /* RXBUF leaks! */ 5618 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5619 } 5620 } 5621 5622 static void 5623 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5624 { 5625 struct hn_rx_ring *rxr = xrxr; 5626 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5627 5628 for (;;) { 5629 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5630 int error, pktlen; 5631 5632 pktlen = rxr->hn_pktbuf_len; 5633 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5634 if (__predict_false(error == ENOBUFS)) { 5635 void *nbuf; 5636 int nlen; 5637 5638 /* 5639 * Expand channel packet buffer. 5640 * 5641 * XXX 5642 * Use M_WAITOK here, since allocation failure 5643 * is fatal. 
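			 *
			 * (The buffer is at least doubled, and doubled
			 * again until it can hold pktlen.)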
5644 */ 5645 nlen = rxr->hn_pktbuf_len * 2; 5646 while (nlen < pktlen) 5647 nlen *= 2; 5648 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5649 5650 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5651 rxr->hn_pktbuf_len, nlen); 5652 5653 free(rxr->hn_pktbuf, M_DEVBUF); 5654 rxr->hn_pktbuf = nbuf; 5655 rxr->hn_pktbuf_len = nlen; 5656 /* Retry! */ 5657 continue; 5658 } else if (__predict_false(error == EAGAIN)) { 5659 /* No more channel packets; done! */ 5660 break; 5661 } 5662 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5663 5664 switch (pkt->cph_type) { 5665 case VMBUS_CHANPKT_TYPE_COMP: 5666 hn_nvs_handle_comp(sc, chan, pkt); 5667 break; 5668 5669 case VMBUS_CHANPKT_TYPE_RXBUF: 5670 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5671 break; 5672 5673 case VMBUS_CHANPKT_TYPE_INBAND: 5674 hn_nvs_handle_notify(sc, pkt); 5675 break; 5676 5677 default: 5678 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5679 pkt->cph_type); 5680 break; 5681 } 5682 } 5683 hn_chan_rollup(rxr, rxr->hn_txr); 5684 } 5685 5686 static void 5687 hn_tx_taskq_create(void *arg __unused) 5688 { 5689 int i; 5690 5691 /* 5692 * Fix the # of TX taskqueues. 5693 */ 5694 if (hn_tx_taskq_cnt <= 0) 5695 hn_tx_taskq_cnt = 1; 5696 else if (hn_tx_taskq_cnt > mp_ncpus) 5697 hn_tx_taskq_cnt = mp_ncpus; 5698 5699 /* 5700 * Fix the TX taskqueue mode. 5701 */ 5702 switch (hn_tx_taskq_mode) { 5703 case HN_TX_TASKQ_M_INDEP: 5704 case HN_TX_TASKQ_M_GLOBAL: 5705 case HN_TX_TASKQ_M_EVTTQ: 5706 break; 5707 default: 5708 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5709 break; 5710 } 5711 5712 if (vm_guest != VM_GUEST_HV) 5713 return; 5714 5715 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5716 return; 5717 5718 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5719 M_DEVBUF, M_WAITOK); 5720 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5721 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5722 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5723 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5724 "hn tx%d", i); 5725 } 5726 } 5727 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5728 hn_tx_taskq_create, NULL); 5729 5730 static void 5731 hn_tx_taskq_destroy(void *arg __unused) 5732 { 5733 5734 if (hn_tx_taskque != NULL) { 5735 int i; 5736 5737 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5738 taskqueue_free(hn_tx_taskque[i]); 5739 free(hn_tx_taskque, M_DEVBUF); 5740 } 5741 } 5742 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5743 hn_tx_taskq_destroy, NULL); 5744