/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
    (sizeof(struct rndis_packet_msg) +				\
     HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
    sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
    while (sx_try_xlock(&(sc)->hn_lock) == 0)			\
        DELAY(1000);						\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)					\
    ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)				\
    ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)					\
    roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
        HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)					\
    roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
    SLIST_ENTRY(hn_txdesc)	link;
#endif
    STAILQ_ENTRY(hn_txdesc)	agg_link;

    /* Aggregated txdescs, in sending order. */
    STAILQ_HEAD(, hn_txdesc)	agg_list;

    /* The oldest packet, if transmission aggregation happens. */
    struct mbuf			*m;
    struct hn_tx_ring		*txr;
    int				refs;
    uint32_t			flags;	/* HN_TXD_FLAG_ */
    struct hn_nvs_sendctx	send_ctx;
    uint32_t			chim_index;
    int				chim_size;

    bus_dmamap_t		data_dmap;

    bus_addr_t			rndis_pkt_paddr;
    struct rndis_packet_msg	*rndis_pkt;
    bus_dmamap_t		rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
    uint32_t			vlan_info;
    uint32_t			csum_info;
    uint32_t			hash_info;
    uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
    (HN_RXINFO_VLAN |			\
     HN_RXINFO_CSUM |			\
     HN_RXINFO_HASHINF |		\
     HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *, void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *, struct ifmediareq *);

static int	hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *, const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *, const void *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
		    uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *, int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *, struct vmbus_channel *);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *);
#ifndef RSS
static int	hn_rss_reconfig(struct hn_softc *);
#endif
static void	hn_rss_ind_fixup(struct hn_softc *);
static int	hn_rxpkt(struct hn_rx_ring *, const void *, int,
		    const struct hn_rxinfo *);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *,
		    struct vmbus_channel *, const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;			/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;	/* shared TX taskqueues */

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
    0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
    0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
    0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
    0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
    0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static device_method_t hn_methods[] = {
    /* Device interface */
    DEVMETHOD(device_probe,	hn_probe),
    DEVMETHOD(device_attach,	hn_attach),
    DEVMETHOD(device_detach,	hn_detach),
    DEVMETHOD(device_shutdown,	hn_shutdown),
    DEVMETHOD_END
};

static driver_t hn_driver = {
    "hn",
    hn_methods,
    sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
    int i;

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
        txd->chim_size == 0, ("invalid rndis sglist txd"));
    return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
        &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    struct hn_nvs_rndis rndis;

    KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
        txd->chim_size > 0, ("invalid rndis chim txd"));

    rndis.nvs_type = HN_NVS_TYPE_RNDIS;
    rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
    rndis.nvs_chim_idx = txd->chim_index;
    rndis.nvs_chim_sz = txd->chim_size;

    return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
        &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
    int i, bmap_cnt = sc->hn_chim_bmap_cnt;
    u_long *bmap = sc->hn_chim_bmap;
    uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

    for (i = 0; i < bmap_cnt; ++i) {
        int idx;

        idx = ffsl(~bmap[i]);
        if (idx == 0)
            continue;

        --idx; /* ffsl is 1-based */
        KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
            ("invalid i %d and idx %d", i, idx));

        if (atomic_testandset_long(&bmap[i], idx))
            continue;

        ret = i * LONG_BIT + idx;
        break;
    }
    return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
    u_long mask;
    uint32_t idx;

    idx = chim_idx / LONG_BIT;
    KASSERT(idx < sc->hn_chim_bmap_cnt,
        ("invalid chimney index 0x%x", chim_idx));

    mask = 1UL << (chim_idx % LONG_BIT);
    KASSERT(sc->hn_chim_bmap[idx] & mask,
        ("index bitmap 0x%lx, chimney index %u, "
         "bitmap idx %d, bitmask 0x%lx",
         sc->hn_chim_bmap[idx], chim_idx, idx, mask));

    atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
    struct ether_vlan_header *evl;
    struct tcphdr *th;
    int ehlen;

    KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
    if (__predict_false((m)->m_len < (len))) {		\
        (m) = m_pullup((m), (len));			\
        if ((m) == NULL)				\
            return (NULL);				\
    }							\
} while (0)

    PULLUP_HDR(m_head, sizeof(*evl));
    evl = mtod(m_head, struct ether_vlan_header *);
    if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
        ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
    else
        ehlen = ETHER_HDR_LEN;

#ifdef INET
    if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
        struct ip *ip;
        int iphlen;

        PULLUP_HDR(m_head, ehlen + sizeof(*ip));
        ip = mtodo(m_head, ehlen);
        iphlen = ip->ip_hl << 2;

        PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
        th = mtodo(m_head, ehlen + iphlen);

        ip->ip_len = 0;
        ip->ip_sum = 0;
        th->th_sum = in_pseudo(ip->ip_src.s_addr,
            ip->ip_dst.s_addr, htons(IPPROTO_TCP));
    }
#endif
#if defined(INET6) && defined(INET)
    else
#endif
#ifdef INET6
    {
        struct ip6_hdr *ip6;

        PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
        ip6 = mtodo(m_head, ehlen);
        if (ip6->ip6_nxt != IPPROTO_TCP) {
            m_freem(m_head);
            return (NULL);
        }

        PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
        th = mtodo(m_head, ehlen + sizeof(*ip6));

        ip6->ip6_plen = 0;
        th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
    }
#endif
    return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc)
{
    struct ifnet *ifp = sc->hn_ifp;
    uint32_t filter;
    int error = 0;

    HN_LOCK_ASSERT(sc);

    if (ifp->if_flags & IFF_PROMISC) {
        filter = NDIS_PACKET_TYPE_PROMISCUOUS;
    } else {
        filter = NDIS_PACKET_TYPE_DIRECTED;
        if (ifp->if_flags & IFF_BROADCAST)
            filter |= NDIS_PACKET_TYPE_BROADCAST;
        /* TODO: support multicast list */
        if ((ifp->if_flags & IFF_ALLMULTI) ||
            !TAILQ_EMPTY(&ifp->if_multiaddrs))
            filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
    }

    if (sc->hn_rx_filter != filter) {
        error = hn_rndis_set_rxfilter(sc, filter);
        if (!error)
            sc->hn_rx_filter = filter;
    }
    return (error);
}

static void
hn_set_txagg(struct hn_softc *sc)
{
    uint32_t size, pkts;
    int i;

    /*
     * Setup aggregation size.
     */
    if (sc->hn_agg_size < 0)
        size = UINT32_MAX;
    else
        size = sc->hn_agg_size;

    if (sc->hn_rndis_agg_size < size)
        size = sc->hn_rndis_agg_size;

    /* NOTE: We only aggregate packets using chimney sending buffers. */
    if (size > (uint32_t)sc->hn_chim_szmax)
        size = sc->hn_chim_szmax;

    if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
        /* Disable */
        size = 0;
        pkts = 0;
        goto done;
    }

    /* NOTE: Type of the per TX ring setting is 'int'. */
    if (size > INT_MAX)
        size = INT_MAX;

    /*
     * Setup aggregation packet count.
     */
    if (sc->hn_agg_pkts < 0)
        pkts = UINT32_MAX;
    else
        pkts = sc->hn_agg_pkts;

    if (sc->hn_rndis_agg_pkts < pkts)
        pkts = sc->hn_rndis_agg_pkts;

    if (pkts <= 1) {
        /* Disable */
        size = 0;
        pkts = 0;
        goto done;
    }

    /* NOTE: Type of the per TX ring setting is 'short'. */
    if (pkts > SHRT_MAX)
        pkts = SHRT_MAX;

done:
    /* NOTE: Type of the per TX ring setting is 'short'. */
    if (sc->hn_rndis_agg_align > SHRT_MAX) {
        /* Disable */
        size = 0;
        pkts = 0;
    }

    if (bootverbose) {
        if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
            size, pkts, sc->hn_rndis_agg_align);
    }

    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

        mtx_lock(&txr->hn_tx_lock);
        txr->hn_agg_szmax = size;
        txr->hn_agg_pktmax = pkts;
        txr->hn_agg_align = sc->hn_rndis_agg_align;
        mtx_unlock(&txr->hn_tx_lock);
    }
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

    KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
    if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
        return txr->hn_txdesc_cnt;
    return hn_tx_swq_depth;
}

#ifndef RSS
static int
hn_rss_reconfig(struct hn_softc *sc)
{
    int error;

    HN_LOCK_ASSERT(sc);

    if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
        return (ENXIO);

    /*
     * Disable RSS first.
     *
     * NOTE:
     * Direct reconfiguration by setting the UNCHG flags does
     * _not_ work properly.
     */
    if (bootverbose)
        if_printf(sc->hn_ifp, "disable RSS\n");
    error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
    if (error) {
        if_printf(sc->hn_ifp, "RSS disable failed\n");
        return (error);
    }

    /*
     * Reenable the RSS w/ the updated RSS key or indirect
     * table.
     */
    if (bootverbose)
        if_printf(sc->hn_ifp, "reconfig RSS\n");
    error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
    if (error) {
        if_printf(sc->hn_ifp, "RSS reconfig failed\n");
        return (error);
    }
    return (0);
}
#endif	/* !RSS */

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
    struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
    int i, nchan;

    nchan = sc->hn_rx_ring_inuse;
    KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

    /*
     * Check indirect table to make sure that all channels in it
     * can be used.
     */
    for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
        if (rss->rss_ind[i] >= nchan) {
            if_printf(sc->hn_ifp,
                "RSS indirect table %d fixup: %u -> %d\n",
                i, rss->rss_ind[i], nchan - 1);
            rss->rss_ind[i] = nchan - 1;
        }
    }
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

    return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
    struct hn_softc *sc = ifp->if_softc;

    ifmr->ifm_status = IFM_AVALID;
    ifmr->ifm_active = IFM_ETHER;

    if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
        ifmr->ifm_active |= IFM_NONE;
        return;
    }
    ifmr->ifm_status |= IFM_ACTIVE;
    ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
    .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
        0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

    if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
        &g_net_vsc_device_type) == 0) {
        device_set_desc(dev, "Hyper-V Network Interface");
        return BUS_PROBE_DEFAULT;
    }
    return ENXIO;
}

static int
hn_attach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    uint8_t eaddr[ETHER_ADDR_LEN];
    struct ifnet *ifp = NULL;
    int error, ring_cnt, tx_ring_cnt;

    sc->hn_dev = dev;
    sc->hn_prichan = vmbus_get_channel(dev);
    HN_LOCK_INIT(sc);

    /*
     * Initialize these tunables once.
     */
    sc->hn_agg_size = hn_tx_agg_size;
    sc->hn_agg_pkts = hn_tx_agg_pkts;

    /*
     * Setup taskqueue for transmission.
     */
    if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
        int i;

        sc->hn_tx_taskqs =
            malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
            M_DEVBUF, M_WAITOK);
        for (i = 0; i < hn_tx_taskq_cnt; ++i) {
            sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
                M_WAITOK, taskqueue_thread_enqueue,
                &sc->hn_tx_taskqs[i]);
            taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
                "%s tx%d", device_get_nameunit(dev), i);
        }
    } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
        sc->hn_tx_taskqs = hn_tx_taskque;
    }

    /*
     * Setup taskqueue for management tasks, e.g. link status.
     */
    sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
        taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
    taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
        device_get_nameunit(dev));
    TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
    TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
    TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
        hn_netchg_status_taskfunc, sc);

    /*
     * Allocate ifnet and setup its name earlier, so that if_printf
     * can be used by functions, which will be called after
     * ether_ifattach().
     */
    ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
    ifp->if_softc = sc;
    if_initname(ifp, device_get_name(dev), device_get_unit(dev));

    /*
     * Initialize ifmedia earlier so that it can be unconditionally
     * destroyed, if error happened later on.
     */
    ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

    /*
     * Figure out the # of RX rings (ring_cnt) and the # of TX rings
     * to use (tx_ring_cnt).
     *
     * NOTE:
     * The # of RX rings to use is same as the # of channels to use.
     */
    ring_cnt = hn_chan_cnt;
    if (ring_cnt <= 0) {
        /* Default */
        ring_cnt = mp_ncpus;
        if (ring_cnt > HN_RING_CNT_DEF_MAX)
            ring_cnt = HN_RING_CNT_DEF_MAX;
    } else if (ring_cnt > mp_ncpus) {
        ring_cnt = mp_ncpus;
    }
#ifdef RSS
    if (ring_cnt > rss_getnumbuckets())
        ring_cnt = rss_getnumbuckets();
#endif

    tx_ring_cnt = hn_tx_ring_cnt;
    if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
        tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
    if (hn_use_if_start) {
        /* ifnet.if_start only needs one TX ring. */
        tx_ring_cnt = 1;
    }
#endif

    /*
     * Set the leader CPU for channels.
     */
    sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

    /*
     * Create enough TX/RX rings, even if only limited number of
     * channels can be allocated.
     */
    error = hn_create_tx_data(sc, tx_ring_cnt);
    if (error)
        goto failed;
    error = hn_create_rx_data(sc, ring_cnt);
    if (error)
        goto failed;

    /*
     * Create transaction context for NVS and RNDIS transactions.
     */
    sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
        HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
    if (sc->hn_xact == NULL) {
        error = ENXIO;
        goto failed;
    }

    /*
     * Install orphan handler for the revocation of this device's
     * primary channel.
     *
     * NOTE:
     * The processing order is critical here:
     * Install the orphan handler, _before_ testing whether this
     * device's primary channel has been revoked or not.
     */
    vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
    if (vmbus_chan_is_revoked(sc->hn_prichan)) {
        error = ENXIO;
        goto failed;
    }

    /*
     * Attach the synthetic parts, i.e. NVS and RNDIS.
     */
    error = hn_synth_attach(sc, ETHERMTU);
    if (error)
        goto failed;

    error = hn_rndis_get_eaddr(sc, eaddr);
    if (error)
        goto failed;

#if __FreeBSD_version >= 1100099
    if (sc->hn_rx_ring_inuse > 1) {
        /*
         * Reduce TCP segment aggregation limit for multiple
         * RX rings to increase ACK timeliness.
         */
        hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
    }
#endif

    /*
     * Fix up TX stuff after the synthetic parts are attached.
     */
    hn_fixup_tx_data(sc);

    ctx = device_get_sysctl_ctx(dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
        &sc->hn_nvs_ver, 0, "NVS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_ndis_version_sysctl, "A", "NDIS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_caps_sysctl, "A", "capabilities");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_hwassist_sysctl, "A", "hwassist");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_rxfilter_sysctl, "A", "rxfilter");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_hash_sysctl, "A", "RSS hash");
    SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
        CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
    /*
     * Don't allow RSS key/indirect table changes, if RSS is defined.
     */
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
        CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_key_sysctl, "IU", "RSS key");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
        CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
        CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
        "RNDIS offered packet transmission aggregation size limit");
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
        CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
        "RNDIS offered packet transmission aggregation count limit");
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
        CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
        "RNDIS packet transmission aggregation alignment");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_txagg_size_sysctl, "I",
        "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_txagg_pkts_sysctl, "I",
        "Packet transmission aggregation packets, "
        "0 -- disable, -1 -- auto");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
        CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_polling_sysctl, "I",
        "Polling frequency: [100,1000000], 0 disable polling");

    /*
     * Setup the ifmedia, which has been initialized earlier.
     */
    ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
    ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
    /* XXX ifmedia_set really should do this for us */
    sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

    /*
     * Setup the ifnet for this interface.
     */
    ifp->if_baudrate = IF_Gbps(10);
    ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    ifp->if_ioctl = hn_ioctl;
    ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
    if (hn_use_if_start) {
        int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

        ifp->if_start = hn_start;
        IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
        ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
        IFQ_SET_READY(&ifp->if_snd);
    } else
#endif
    {
        ifp->if_transmit = hn_transmit;
        ifp->if_qflush = hn_xmit_qflush;
    }

    ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
    /* We can't diff IPv6 packets from IPv4 packets on RX path. */
    ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
    if (sc->hn_caps & HN_CAP_VLAN) {
        /* XXX not sure about VLAN_MTU. */
        ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
    }

    ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
    if (ifp->if_hwassist & HN_CSUM_IP_MASK)
        ifp->if_capabilities |= IFCAP_TXCSUM;
    if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
        ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
    if (sc->hn_caps & HN_CAP_TSO4) {
        ifp->if_capabilities |= IFCAP_TSO4;
        ifp->if_hwassist |= CSUM_IP_TSO;
    }
    if (sc->hn_caps & HN_CAP_TSO6) {
        ifp->if_capabilities |= IFCAP_TSO6;
        ifp->if_hwassist |= CSUM_IP6_TSO;
    }

    /* Enable all available capabilities by default. */
    ifp->if_capenable = ifp->if_capabilities;

    /*
     * Disable IPv6 TSO and TXCSUM by default, they still can
     * be enabled through SIOCSIFCAP.
     */
    ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
    ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

    if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
        hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
        ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
        ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
    }

    ether_ifattach(ifp, eaddr);

    if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
        if_printf(ifp, "TSO segcnt %u segsz %u\n",
            ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
    }

    /* Inform the upper layer about the long frame support. */
    ifp->if_hdrlen = sizeof(struct ether_vlan_header);

    /*
     * Kick off link status check.
     */
    sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
    hn_update_link_status(sc);

    return (0);
failed:
    if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
        hn_synth_detach(sc);
    hn_detach(dev);
    return (error);
}

static int
hn_detach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct ifnet *ifp = sc->hn_ifp;

    if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
        /*
         * In case that the vmbus missed the orphan handler
         * installation.
         */
        vmbus_xact_ctx_orphan(sc->hn_xact);
    }

    if (device_is_attached(dev)) {
        HN_LOCK(sc);
        if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                hn_stop(sc);
            /*
             * NOTE:
             * hn_stop() only suspends data, so management
             * stuff has to be suspended manually here.
             */
            hn_suspend_mgmt(sc);
            hn_synth_detach(sc);
        }
        HN_UNLOCK(sc);
        ether_ifdetach(ifp);
    }

    ifmedia_removeall(&sc->hn_media);
    hn_destroy_rx_data(sc);
    hn_destroy_tx_data(sc);

    if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
        int i;

        for (i = 0; i < hn_tx_taskq_cnt; ++i)
            taskqueue_free(sc->hn_tx_taskqs[i]);
        free(sc->hn_tx_taskqs, M_DEVBUF);
    }
    taskqueue_free(sc->hn_mgmt_taskq0);

    if (sc->hn_xact != NULL) {
        /*
         * Uninstall the orphan handler _before_ the xact is
         * destructed.
         */
        vmbus_chan_unset_orphan(sc->hn_prichan);
        vmbus_xact_ctx_destroy(sc->hn_xact);
    }

    if_free(ifp);

    HN_LOCK_DESTROY(sc);
    return (0);
}

static int
hn_shutdown(device_t dev)
{

    return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
    uint32_t link_status;
    int error;

    error = hn_rndis_get_linkstatus(sc, &link_status);
    if (error) {
        /* XXX what to do? */
        return;
    }

    if (link_status == NDIS_MEDIA_STATE_CONNECTED)
        sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
    else
        sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
    if_link_state_change(sc->hn_ifp,
        (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
        LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
        return;
    hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    /* Prevent any link status checks from running. */
    sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

    /*
     * Fake up a [link down --> link up] state change; 5 seconds
     * delay is used, which closely simulates miibus reaction
     * upon link down event.
     */
    sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
    if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
    taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
        &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    /* Re-allow link status checks. */
    sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
    hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
    struct mbuf *m = *m_head;
    int error;

    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

    error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
        m, segs, nsegs, BUS_DMA_NOWAIT);
    if (error == EFBIG) {
        struct mbuf *m_new;

        m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
        if (m_new == NULL)
            return ENOBUFS;
        else
            *m_head = m = m_new;
        txr->hn_tx_collapsed++;

        error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
            txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
    }
    if (!error) {
        bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
            BUS_DMASYNC_PREWRITE);
        txd->flags |= HN_TXD_FLAG_DMAMAP;
    }
    return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
        ("put an onlist txd %#x", txd->flags));
    KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("put an onagg txd %#x", txd->flags));

    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    if (atomic_fetchadd_int(&txd->refs, -1) != 1)
        return 0;

    if (!STAILQ_EMPTY(&txd->agg_list)) {
        struct hn_txdesc *tmp_txd;

        while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
            int freed;

            KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
                ("recursive aggregation on aggregated txdesc"));
            KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
                ("not aggregated txdesc"));
            KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
                ("aggregated txdesc uses dmamap"));
            KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
                ("aggregated txdesc consumes "
                 "chimney sending buffer"));
            KASSERT(tmp_txd->chim_size == 0,
                ("aggregated txdesc has non-zero "
                 "chimney sending size"));

            STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
            tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
            freed = hn_txdesc_put(txr, tmp_txd);
            KASSERT(freed, ("failed to free aggregated txdesc"));
        }
    }

    if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
        KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
            ("chim txd uses dmamap"));
        hn_chim_free(txr->hn_sc, txd->chim_index);
        txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
        txd->chim_size = 0;
    } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
        bus_dmamap_sync(txr->hn_tx_data_dtag,
            txd->data_dmap, BUS_DMASYNC_POSTWRITE);
        bus_dmamap_unload(txr->hn_tx_data_dtag,
            txd->data_dmap);
        txd->flags &= ~HN_TXD_FLAG_DMAMAP;
    }

    if (txd->m != NULL) {
        m_freem(txd->m);
        txd->m = NULL;
    }

    txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    KASSERT(txr->hn_txdesc_avail >= 0 &&
        txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
        ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
    txr->hn_txdesc_avail++;
    SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
    atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
    buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

    return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    txd = SLIST_FIRST(&txr->hn_txlist);
    if (txd != NULL) {
        KASSERT(txr->hn_txdesc_avail > 0,
            ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
        txr->hn_txdesc_avail--;
        SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
    }
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

    if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
        atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
        KASSERT(txd->m == NULL && txd->refs == 0 &&
            STAILQ_EMPTY(&txd->agg_list) &&
            txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
            txd->chim_size == 0 &&
            (txd->flags & HN_TXD_FLAG_ONLIST) &&
            (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
            (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
        txd->flags &= ~HN_TXD_FLAG_ONLIST;
        txd->refs = 1;
    }
    return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

    /* 0->1 transition will never work */
    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

    KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("recursive aggregation on aggregating txdesc"));

    KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("already aggregated"));
    KASSERT(STAILQ_EMPTY(&txd->agg_list),
        ("recursive aggregation on to-be-aggregated txdesc"));

    txd->flags |= HN_TXD_FLAG_ONAGG;
    STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
    bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
        pending = true;
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    if (!buf_ring_full(txr->hn_txdesc_br))
        pending = true;
#endif
    return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
    txr->hn_has_txeof = 0;
    txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
    struct hn_txdesc *txd = sndc->hn_cbarg;
    struct hn_tx_ring *txr;

    txr = txd->txr;
    KASSERT(txr->hn_chan == chan,
        ("channel mismatch, on chan%u, should be chan%u",
         vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

    txr->hn_has_txeof = 1;
    hn_txdesc_put(txr, txd);

    ++txr->hn_txdone_cnt;
    if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
        txr->hn_txdone_cnt = 0;
        if (txr->hn_oactive)
            hn_txeof(txr);
    }
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
    tcp_lro_flush_all(&rxr->hn_lro);
#endif

    /*
     * NOTE:
     * 'txr' could be NULL, if multiple channels and
     * ifnet.if_start method are enabled.
     */
    if (txr == NULL || !txr->hn_has_txeof)
        return;

    txr->hn_txdone_cnt = 0;
    hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

    KASSERT(ofs >= sizeof(struct rndis_packet_msg),
        ("invalid RNDIS packet msg offset %u", ofs));
    return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
    const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
    struct rndis_pktinfo *pi;

    KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
        ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

    /*
     * Per-packet-info does not move; it only grows.
     *
     * NOTE:
     * rm_pktinfooffset in this phase counts from the beginning
     * of rndis_packet_msg.
     */
    KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
        ("%u pktinfo overflows RNDIS packet msg", pi_type));
    pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
        pkt->rm_pktinfolen);
    pkt->rm_pktinfolen += pi_size;

    pi->rm_size = pi_size;
    pi->rm_type = pi_type;
    pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

    /* Data immediately follow per-packet-info. */
    pkt->rm_dataoffset += pi_size;

    /* Update RNDIS packet msg length */
    pkt->rm_len += pi_size;

    return (pi->rm_data);
}

static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;
    struct mbuf *m;
    int error, pkts;

    txd = txr->hn_agg_txd;
    KASSERT(txd != NULL, ("no aggregate txdesc"));

    /*
     * Since hn_txpkt() will reset this temporary stat, save
     * it now, so that oerrors can be updated properly, if
     * hn_txpkt() ever fails.
     */
    pkts = txr->hn_stat_pkts;

    /*
     * Since txd's mbuf will _not_ be freed upon hn_txpkt()
     * failure, save it for later freeing, if hn_txpkt() ever
     * fails.
     */
    m = txd->m;
    error = hn_txpkt(ifp, txr, txd);
    if (__predict_false(error)) {
        /* txd is freed, but m is not. */
        m_freem(m);

        txr->hn_flush_failed++;
        if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
    }

    /* Reset all aggregation states. */
    txr->hn_agg_txd = NULL;
    txr->hn_agg_szleft = 0;
    txr->hn_agg_pktleft = 0;
    txr->hn_agg_prevpkt = NULL;

    return (error);
}

static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
    void *chim;

    if (txr->hn_agg_txd != NULL) {
        if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
            struct hn_txdesc *agg_txd = txr->hn_agg_txd;
            struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
            int olen;

            /*
             * Update the previous RNDIS packet's total length,
             * it can be increased due to the mandatory alignment
             * padding for this RNDIS packet.  And update the
             * aggregating txdesc's chimney sending buffer size
             * accordingly.
             *
             * XXX
             * Zero-out the padding, as required by the RNDIS spec.
             */
            olen = pkt->rm_len;
            pkt->rm_len = roundup2(olen, txr->hn_agg_align);
            agg_txd->chim_size += pkt->rm_len - olen;

            /* Link this txdesc to the parent. */
            hn_txdesc_agg(agg_txd, txd);

            chim = (uint8_t *)pkt + pkt->rm_len;
            /* Save the current packet for later fixup. */
            txr->hn_agg_prevpkt = chim;

            txr->hn_agg_pktleft--;
            txr->hn_agg_szleft -= pktsize;
            if (txr->hn_agg_szleft <=
                HN_PKTSIZE_MIN(txr->hn_agg_align)) {
                /*
                 * Probably can't aggregate more packets,
                 * flush this aggregating txdesc proactively.
                 */
                txr->hn_agg_pktleft = 0;
            }
            /* Done! */
            return (chim);
        }
        hn_flush_txagg(ifp, txr);
    }
    KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

    txr->hn_tx_chimney_tried++;
    txd->chim_index = hn_chim_alloc(txr->hn_sc);
    if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
        return (NULL);
    txr->hn_tx_chimney++;

    chim = txr->hn_sc->hn_chim +
        (txd->chim_index * txr->hn_sc->hn_chim_szmax);

    if (txr->hn_agg_pktmax > 1 &&
        txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
        txr->hn_agg_txd = txd;
        txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
        txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
        txr->hn_agg_prevpkt = chim;
    }
    return (chim);
}

/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
    bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
    int error, nsegs, i;
    struct mbuf *m_head = *m_head0;
    struct rndis_packet_msg *pkt;
    uint32_t *pi_data;
    void *chim = NULL;
    int pkt_hlen, pkt_size;

    pkt = txd->rndis_pkt;
    pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
    if (pkt_size < txr->hn_chim_size) {
        chim = hn_try_txagg(ifp, txr, txd, pkt_size);
        if (chim != NULL)
            pkt = chim;
    } else {
        if (txr->hn_agg_txd != NULL)
            hn_flush_txagg(ifp, txr);
    }

    pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
    pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
    pkt->rm_dataoffset = sizeof(*pkt);
    pkt->rm_datalen = m_head->m_pkthdr.len;
    pkt->rm_oobdataoffset = 0;
    pkt->rm_oobdatalen = 0;
    pkt->rm_oobdataelements = 0;
    pkt->rm_pktinfooffset = sizeof(*pkt);
    pkt->rm_pktinfolen = 0;
    pkt->rm_vchandle = 0;
    pkt->rm_reserved = 0;

    if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
        /*
         * Set the hash value for this packet, so that the host could
         * dispatch the TX done event for this packet back to this TX
         * ring's channel.
         */
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
        *pi_data = txr->hn_tx_idx;
    }

    if (m_head->m_flags & M_VLANTAG) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
        *pi_data = NDIS_VLAN_INFO_MAKE(
            EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
    }

    if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
        if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
            *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET6
        {
            *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#endif	/* INET6 || INET */
    } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
        if (m_head->m_pkthdr.csum_flags &
            (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
            *pi_data = NDIS_TXCSUM_INFO_IPV6;
        } else {
            *pi_data = NDIS_TXCSUM_INFO_IPV4;
            if (m_head->m_pkthdr.csum_flags & CSUM_IP)
                *pi_data |= NDIS_TXCSUM_INFO_IPCS;
        }

        if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
            *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
        else if (m_head->m_pkthdr.csum_flags &
            (CSUM_IP_UDP | CSUM_IP6_UDP))
            *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
    }

    pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
    /* Convert RNDIS packet message offsets */
    pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
    pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

    /*
     * Fast path: Chimney sending.
     */
    if (chim != NULL) {
        struct hn_txdesc *tgt_txd = txd;

        if (txr->hn_agg_txd != NULL) {
            tgt_txd = txr->hn_agg_txd;
#ifdef INVARIANTS
            *m_head0 = NULL;
#endif
        }

        KASSERT(pkt == chim,
            ("RNDIS pkt not in chimney sending buffer"));
        KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
            ("chimney sending buffer is not used"));
        tgt_txd->chim_size += pkt->rm_len;

        m_copydata(m_head, 0, m_head->m_pkthdr.len,
            ((uint8_t *)chim) + pkt_hlen);

        txr->hn_gpa_cnt = 0;
        txr->hn_sendpkt = hn_txpkt_chim;
        goto done;
    }

    KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
        ("chimney buffer is used"));
    KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

    error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
    if (__predict_false(error)) {
        int freed;

        /*
         * This mbuf is not linked w/ the txd yet, so free it now.
         */
1899 */ 1900 m_freem(m_head); 1901 *m_head0 = NULL; 1902 1903 freed = hn_txdesc_put(txr, txd); 1904 KASSERT(freed != 0, 1905 ("fail to free txd upon txdma error")); 1906 1907 txr->hn_txdma_failed++; 1908 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 1909 return error; 1910 } 1911 *m_head0 = m_head; 1912 1913 /* +1 RNDIS packet message */ 1914 txr->hn_gpa_cnt = nsegs + 1; 1915 1916 /* send packet with page buffer */ 1917 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 1918 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 1919 txr->hn_gpa[0].gpa_len = pkt_hlen; 1920 1921 /* 1922 * Fill the page buffers with mbuf info after the page 1923 * buffer for RNDIS packet message. 1924 */ 1925 for (i = 0; i < nsegs; ++i) { 1926 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 1927 1928 gpa->gpa_page = atop(segs[i].ds_addr); 1929 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 1930 gpa->gpa_len = segs[i].ds_len; 1931 } 1932 1933 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 1934 txd->chim_size = 0; 1935 txr->hn_sendpkt = hn_txpkt_sglist; 1936 done: 1937 txd->m = m_head; 1938 1939 /* Set the completion routine */ 1940 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 1941 1942 /* Update temporary stats for later use. */ 1943 txr->hn_stat_pkts++; 1944 txr->hn_stat_size += m_head->m_pkthdr.len; 1945 if (m_head->m_flags & M_MCAST) 1946 txr->hn_stat_mcasts++; 1947 1948 return 0; 1949 } 1950 1951 /* 1952 * NOTE: 1953 * If this function fails, then txd will be freed, but the mbuf 1954 * associated w/ the txd will _not_ be freed. 1955 */ 1956 static int 1957 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 1958 { 1959 int error, send_failed = 0, has_bpf; 1960 1961 again: 1962 has_bpf = bpf_peers_present(ifp->if_bpf); 1963 if (has_bpf) { 1964 /* 1965 * Make sure that this txd and any aggregated txds are not 1966 * freed before ETHER_BPF_MTAP. 1967 */ 1968 hn_txdesc_hold(txd); 1969 } 1970 error = txr->hn_sendpkt(txr, txd); 1971 if (!error) { 1972 if (has_bpf) { 1973 const struct hn_txdesc *tmp_txd; 1974 1975 ETHER_BPF_MTAP(ifp, txd->m); 1976 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 1977 ETHER_BPF_MTAP(ifp, tmp_txd->m); 1978 } 1979 1980 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 1981 #ifdef HN_IFSTART_SUPPORT 1982 if (!hn_use_if_start) 1983 #endif 1984 { 1985 if_inc_counter(ifp, IFCOUNTER_OBYTES, 1986 txr->hn_stat_size); 1987 if (txr->hn_stat_mcasts != 0) { 1988 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1989 txr->hn_stat_mcasts); 1990 } 1991 } 1992 txr->hn_pkts += txr->hn_stat_pkts; 1993 txr->hn_sends++; 1994 } 1995 if (has_bpf) 1996 hn_txdesc_put(txr, txd); 1997 1998 if (__predict_false(error)) { 1999 int freed; 2000 2001 /* 2002 * This should "really rarely" happen. 2003 * 2004 * XXX Too many RX to be acked or too many sideband 2005 * commands to run? Ask netvsc_channel_rollup() 2006 * to kick start later. 2007 */ 2008 txr->hn_has_txeof = 1; 2009 if (!send_failed) { 2010 txr->hn_send_failed++; 2011 send_failed = 1; 2012 /* 2013 * Try sending again after set hn_has_txeof; 2014 * in case that we missed the last 2015 * netvsc_channel_rollup(). 2016 */ 2017 goto again; 2018 } 2019 if_printf(ifp, "send failed\n"); 2020 2021 /* 2022 * Caller will perform further processing on the 2023 * associated mbuf, so don't free it in hn_txdesc_put(); 2024 * only unload it from the DMA map in hn_txdesc_put(), 2025 * if it was loaded. 
2026 */ 2027 txd->m = NULL; 2028 freed = hn_txdesc_put(txr, txd); 2029 KASSERT(freed != 0, 2030 ("fail to free txd upon send error")); 2031 2032 txr->hn_send_failed++; 2033 } 2034 2035 /* Reset temporary stats, after this sending is done. */ 2036 txr->hn_stat_size = 0; 2037 txr->hn_stat_pkts = 0; 2038 txr->hn_stat_mcasts = 0; 2039 2040 return (error); 2041 } 2042 2043 /* 2044 * Append the specified data to the indicated mbuf chain, 2045 * Extend the mbuf chain if the new data does not fit in 2046 * existing space. 2047 * 2048 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2049 * There should be an equivalent in the kernel mbuf code, 2050 * but there does not appear to be one yet. 2051 * 2052 * Differs from m_append() in that additional mbufs are 2053 * allocated with cluster size MJUMPAGESIZE, and filled 2054 * accordingly. 2055 * 2056 * Return 1 if able to complete the job; otherwise 0. 2057 */ 2058 static int 2059 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2060 { 2061 struct mbuf *m, *n; 2062 int remainder, space; 2063 2064 for (m = m0; m->m_next != NULL; m = m->m_next) 2065 ; 2066 remainder = len; 2067 space = M_TRAILINGSPACE(m); 2068 if (space > 0) { 2069 /* 2070 * Copy into available space. 2071 */ 2072 if (space > remainder) 2073 space = remainder; 2074 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2075 m->m_len += space; 2076 cp += space; 2077 remainder -= space; 2078 } 2079 while (remainder > 0) { 2080 /* 2081 * Allocate a new mbuf; could check space 2082 * and allocate a cluster instead. 2083 */ 2084 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2085 if (n == NULL) 2086 break; 2087 n->m_len = min(MJUMPAGESIZE, remainder); 2088 bcopy(cp, mtod(n, caddr_t), n->m_len); 2089 cp += n->m_len; 2090 remainder -= n->m_len; 2091 m->m_next = n; 2092 m = n; 2093 } 2094 if (m0->m_flags & M_PKTHDR) 2095 m0->m_pkthdr.len += len - remainder; 2096 2097 return (remainder == 0); 2098 } 2099 2100 #if defined(INET) || defined(INET6) 2101 static __inline int 2102 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2103 { 2104 #if __FreeBSD_version >= 1100095 2105 if (hn_lro_mbufq_depth) { 2106 tcp_lro_queue_mbuf(lc, m); 2107 return 0; 2108 } 2109 #endif 2110 return tcp_lro_rx(lc, m, 0); 2111 } 2112 #endif 2113 2114 static int 2115 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2116 const struct hn_rxinfo *info) 2117 { 2118 struct ifnet *ifp = rxr->hn_ifp; 2119 struct mbuf *m_new; 2120 int size, do_lro = 0, do_csum = 1; 2121 int hash_type; 2122 2123 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) 2124 return (0); 2125 2126 /* 2127 * Bail out if packet contains more data than configured MTU. 2128 */ 2129 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { 2130 return (0); 2131 } else if (dlen <= MHLEN) { 2132 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2133 if (m_new == NULL) { 2134 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2135 return (0); 2136 } 2137 memcpy(mtod(m_new, void *), data, dlen); 2138 m_new->m_pkthdr.len = m_new->m_len = dlen; 2139 rxr->hn_small_pkts++; 2140 } else { 2141 /* 2142 * Get an mbuf with a cluster. For packets 2K or less, 2143 * get a standard 2K cluster. For anything larger, get a 2144 * 4K cluster. Any buffers larger than 4K can cause problems 2145 * if looped around to the Hyper-V TX channel, so avoid them. 
2146 */ 2147 size = MCLBYTES; 2148 if (dlen > MCLBYTES) { 2149 /* 4096 */ 2150 size = MJUMPAGESIZE; 2151 } 2152 2153 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2154 if (m_new == NULL) { 2155 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2156 return (0); 2157 } 2158 2159 hv_m_append(m_new, dlen, data); 2160 } 2161 m_new->m_pkthdr.rcvif = ifp; 2162 2163 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2164 do_csum = 0; 2165 2166 /* receive side checksum offload */ 2167 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2168 /* IP csum offload */ 2169 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2170 m_new->m_pkthdr.csum_flags |= 2171 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2172 rxr->hn_csum_ip++; 2173 } 2174 2175 /* TCP/UDP csum offload */ 2176 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2177 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2178 m_new->m_pkthdr.csum_flags |= 2179 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2180 m_new->m_pkthdr.csum_data = 0xffff; 2181 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2182 rxr->hn_csum_tcp++; 2183 else 2184 rxr->hn_csum_udp++; 2185 } 2186 2187 /* 2188 * XXX 2189 * As of this write (Oct 28th, 2016), host side will turn 2190 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2191 * the do_lro setting here is actually _not_ accurate. We 2192 * depend on the RSS hash type check to reset do_lro. 2193 */ 2194 if ((info->csum_info & 2195 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2196 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2197 do_lro = 1; 2198 } else { 2199 const struct ether_header *eh; 2200 uint16_t etype; 2201 int hoff; 2202 2203 hoff = sizeof(*eh); 2204 if (m_new->m_len < hoff) 2205 goto skip; 2206 eh = mtod(m_new, struct ether_header *); 2207 etype = ntohs(eh->ether_type); 2208 if (etype == ETHERTYPE_VLAN) { 2209 const struct ether_vlan_header *evl; 2210 2211 hoff = sizeof(*evl); 2212 if (m_new->m_len < hoff) 2213 goto skip; 2214 evl = mtod(m_new, struct ether_vlan_header *); 2215 etype = ntohs(evl->evl_proto); 2216 } 2217 2218 if (etype == ETHERTYPE_IP) { 2219 int pr; 2220 2221 pr = hn_check_iplen(m_new, hoff); 2222 if (pr == IPPROTO_TCP) { 2223 if (do_csum && 2224 (rxr->hn_trust_hcsum & 2225 HN_TRUST_HCSUM_TCP)) { 2226 rxr->hn_csum_trusted++; 2227 m_new->m_pkthdr.csum_flags |= 2228 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2229 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2230 m_new->m_pkthdr.csum_data = 0xffff; 2231 } 2232 do_lro = 1; 2233 } else if (pr == IPPROTO_UDP) { 2234 if (do_csum && 2235 (rxr->hn_trust_hcsum & 2236 HN_TRUST_HCSUM_UDP)) { 2237 rxr->hn_csum_trusted++; 2238 m_new->m_pkthdr.csum_flags |= 2239 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2240 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2241 m_new->m_pkthdr.csum_data = 0xffff; 2242 } 2243 } else if (pr != IPPROTO_DONE && do_csum && 2244 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2245 rxr->hn_csum_trusted++; 2246 m_new->m_pkthdr.csum_flags |= 2247 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2248 } 2249 } 2250 } 2251 skip: 2252 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2253 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2254 NDIS_VLAN_INFO_ID(info->vlan_info), 2255 NDIS_VLAN_INFO_PRI(info->vlan_info), 2256 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2257 m_new->m_flags |= M_VLANTAG; 2258 } 2259 2260 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2261 rxr->hn_rss_pkts++; 2262 m_new->m_pkthdr.flowid = info->hash_value; 2263 hash_type = M_HASHTYPE_OPAQUE_HASH; 2264 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2265 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2266 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2267 2268 /* 2269 * NOTE: 2270 * do_lro is resetted, if the hash types are not TCP 2271 * related. See the comment in the above csum_flags 2272 * setup section. 2273 */ 2274 switch (type) { 2275 case NDIS_HASH_IPV4: 2276 hash_type = M_HASHTYPE_RSS_IPV4; 2277 do_lro = 0; 2278 break; 2279 2280 case NDIS_HASH_TCP_IPV4: 2281 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2282 break; 2283 2284 case NDIS_HASH_IPV6: 2285 hash_type = M_HASHTYPE_RSS_IPV6; 2286 do_lro = 0; 2287 break; 2288 2289 case NDIS_HASH_IPV6_EX: 2290 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2291 do_lro = 0; 2292 break; 2293 2294 case NDIS_HASH_TCP_IPV6: 2295 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2296 break; 2297 2298 case NDIS_HASH_TCP_IPV6_EX: 2299 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2300 break; 2301 } 2302 } 2303 } else { 2304 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2305 hash_type = M_HASHTYPE_OPAQUE; 2306 } 2307 M_HASHTYPE_SET(m_new, hash_type); 2308 2309 /* 2310 * Note: Moved RX completion back to hv_nv_on_receive() so all 2311 * messages (not just data messages) will trigger a response. 2312 */ 2313 2314 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2315 rxr->hn_pkts++; 2316 2317 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2318 #if defined(INET) || defined(INET6) 2319 struct lro_ctrl *lro = &rxr->hn_lro; 2320 2321 if (lro->lro_cnt) { 2322 rxr->hn_lro_tried++; 2323 if (hn_lro_rx(lro, m_new) == 0) { 2324 /* DONE! */ 2325 return 0; 2326 } 2327 } 2328 #endif 2329 } 2330 2331 /* We're not holding the lock here, so don't release it */ 2332 (*ifp->if_input)(ifp, m_new); 2333 2334 return (0); 2335 } 2336 2337 static int 2338 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2339 { 2340 struct hn_softc *sc = ifp->if_softc; 2341 struct ifreq *ifr = (struct ifreq *)data; 2342 int mask, error = 0; 2343 2344 switch (cmd) { 2345 case SIOCSIFMTU: 2346 if (ifr->ifr_mtu > HN_MTU_MAX) { 2347 error = EINVAL; 2348 break; 2349 } 2350 2351 HN_LOCK(sc); 2352 2353 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2354 HN_UNLOCK(sc); 2355 break; 2356 } 2357 2358 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2359 /* Can't change MTU */ 2360 HN_UNLOCK(sc); 2361 error = EOPNOTSUPP; 2362 break; 2363 } 2364 2365 if (ifp->if_mtu == ifr->ifr_mtu) { 2366 HN_UNLOCK(sc); 2367 break; 2368 } 2369 2370 /* Disable polling. */ 2371 hn_polling(sc, 0); 2372 2373 /* 2374 * Suspend this interface before the synthetic parts 2375 * are ripped. 2376 */ 2377 hn_suspend(sc); 2378 2379 /* 2380 * Detach the synthetics parts, i.e. NVS and RNDIS. 2381 */ 2382 hn_synth_detach(sc); 2383 2384 /* 2385 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2386 * with the new MTU setting. 2387 */ 2388 error = hn_synth_attach(sc, ifr->ifr_mtu); 2389 if (error) { 2390 HN_UNLOCK(sc); 2391 break; 2392 } 2393 2394 /* 2395 * Commit the requested MTU, after the synthetic parts 2396 * have been successfully attached. 2397 */ 2398 ifp->if_mtu = ifr->ifr_mtu; 2399 2400 /* 2401 * Make sure that various parameters based on MTU are 2402 * still valid, after the MTU change. 2403 */ 2404 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2405 hn_set_chim_size(sc, sc->hn_chim_szmax); 2406 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2407 #if __FreeBSD_version >= 1100099 2408 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2409 HN_LRO_LENLIM_MIN(ifp)) 2410 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2411 #endif 2412 2413 /* 2414 * All done! Resume the interface now. 
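 * (hn_resume() undoes the hn_suspend() done above, now that NVS and
 * RNDIS have been re-attached with the new MTU.)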
2415 */ 2416 hn_resume(sc); 2417 2418 /* 2419 * Re-enable polling if this interface is running and 2420 * the polling is requested. 2421 */ 2422 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 2423 hn_polling(sc, sc->hn_pollhz); 2424 2425 HN_UNLOCK(sc); 2426 break; 2427 2428 case SIOCSIFFLAGS: 2429 HN_LOCK(sc); 2430 2431 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2432 HN_UNLOCK(sc); 2433 break; 2434 } 2435 2436 if (ifp->if_flags & IFF_UP) { 2437 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2438 /* 2439 * Caller meight hold mutex, e.g. 2440 * bpf; use busy-wait for the RNDIS 2441 * reply. 2442 */ 2443 HN_NO_SLEEPING(sc); 2444 hn_set_rxfilter(sc); 2445 HN_SLEEPING_OK(sc); 2446 } else { 2447 hn_init_locked(sc); 2448 } 2449 } else { 2450 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2451 hn_stop(sc); 2452 } 2453 sc->hn_if_flags = ifp->if_flags; 2454 2455 HN_UNLOCK(sc); 2456 break; 2457 2458 case SIOCSIFCAP: 2459 HN_LOCK(sc); 2460 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2461 2462 if (mask & IFCAP_TXCSUM) { 2463 ifp->if_capenable ^= IFCAP_TXCSUM; 2464 if (ifp->if_capenable & IFCAP_TXCSUM) 2465 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2466 else 2467 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2468 } 2469 if (mask & IFCAP_TXCSUM_IPV6) { 2470 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2471 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2472 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2473 else 2474 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2475 } 2476 2477 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2478 if (mask & IFCAP_RXCSUM) 2479 ifp->if_capenable ^= IFCAP_RXCSUM; 2480 #ifdef foo 2481 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2482 if (mask & IFCAP_RXCSUM_IPV6) 2483 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2484 #endif 2485 2486 if (mask & IFCAP_LRO) 2487 ifp->if_capenable ^= IFCAP_LRO; 2488 2489 if (mask & IFCAP_TSO4) { 2490 ifp->if_capenable ^= IFCAP_TSO4; 2491 if (ifp->if_capenable & IFCAP_TSO4) 2492 ifp->if_hwassist |= CSUM_IP_TSO; 2493 else 2494 ifp->if_hwassist &= ~CSUM_IP_TSO; 2495 } 2496 if (mask & IFCAP_TSO6) { 2497 ifp->if_capenable ^= IFCAP_TSO6; 2498 if (ifp->if_capenable & IFCAP_TSO6) 2499 ifp->if_hwassist |= CSUM_IP6_TSO; 2500 else 2501 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2502 } 2503 2504 HN_UNLOCK(sc); 2505 break; 2506 2507 case SIOCADDMULTI: 2508 case SIOCDELMULTI: 2509 HN_LOCK(sc); 2510 2511 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2512 HN_UNLOCK(sc); 2513 break; 2514 } 2515 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2516 /* 2517 * Multicast uses mutex; use busy-wait for 2518 * the RNDIS reply. 2519 */ 2520 HN_NO_SLEEPING(sc); 2521 hn_set_rxfilter(sc); 2522 HN_SLEEPING_OK(sc); 2523 } 2524 2525 HN_UNLOCK(sc); 2526 break; 2527 2528 case SIOCSIFMEDIA: 2529 case SIOCGIFMEDIA: 2530 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2531 break; 2532 2533 default: 2534 error = ether_ioctl(ifp, cmd, data); 2535 break; 2536 } 2537 return (error); 2538 } 2539 2540 static void 2541 hn_stop(struct hn_softc *sc) 2542 { 2543 struct ifnet *ifp = sc->hn_ifp; 2544 int i; 2545 2546 HN_LOCK_ASSERT(sc); 2547 2548 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2549 ("synthetic parts were not attached")); 2550 2551 /* Disable polling. */ 2552 hn_polling(sc, 0); 2553 2554 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2555 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2556 hn_suspend_data(sc); 2557 2558 /* Clear OACTIVE bit. 
*/ 2559 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2560 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2561 sc->hn_tx_ring[i].hn_oactive = 0; 2562 } 2563 2564 static void 2565 hn_init_locked(struct hn_softc *sc) 2566 { 2567 struct ifnet *ifp = sc->hn_ifp; 2568 int i; 2569 2570 HN_LOCK_ASSERT(sc); 2571 2572 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2573 return; 2574 2575 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2576 return; 2577 2578 /* Configure RX filter */ 2579 hn_set_rxfilter(sc); 2580 2581 /* Clear OACTIVE bit. */ 2582 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2583 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2584 sc->hn_tx_ring[i].hn_oactive = 0; 2585 2586 /* Clear TX 'suspended' bit. */ 2587 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2588 2589 /* Everything is ready; unleash! */ 2590 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2591 2592 /* Re-enable polling if requested. */ 2593 if (sc->hn_pollhz > 0) 2594 hn_polling(sc, sc->hn_pollhz); 2595 } 2596 2597 static void 2598 hn_init(void *xsc) 2599 { 2600 struct hn_softc *sc = xsc; 2601 2602 HN_LOCK(sc); 2603 hn_init_locked(sc); 2604 HN_UNLOCK(sc); 2605 } 2606 2607 #if __FreeBSD_version >= 1100099 2608 2609 static int 2610 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2611 { 2612 struct hn_softc *sc = arg1; 2613 unsigned int lenlim; 2614 int error; 2615 2616 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2617 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2618 if (error || req->newptr == NULL) 2619 return error; 2620 2621 HN_LOCK(sc); 2622 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2623 lenlim > TCP_LRO_LENGTH_MAX) { 2624 HN_UNLOCK(sc); 2625 return EINVAL; 2626 } 2627 hn_set_lro_lenlim(sc, lenlim); 2628 HN_UNLOCK(sc); 2629 2630 return 0; 2631 } 2632 2633 static int 2634 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2635 { 2636 struct hn_softc *sc = arg1; 2637 int ackcnt, error, i; 2638 2639 /* 2640 * lro_ackcnt_lim is append count limit, 2641 * +1 to turn it into aggregation limit. 2642 */ 2643 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2644 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2645 if (error || req->newptr == NULL) 2646 return error; 2647 2648 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2649 return EINVAL; 2650 2651 /* 2652 * Convert aggregation limit back to append 2653 * count limit. 
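 * (lro_ackcnt_lim counts ACKs appended to the first one, hence the
 * +1/-1 translation between the sysctl value and the LRO setting.)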
2654 */ 2655 --ackcnt; 2656 HN_LOCK(sc); 2657 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2658 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2659 HN_UNLOCK(sc); 2660 return 0; 2661 } 2662 2663 #endif 2664 2665 static int 2666 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2667 { 2668 struct hn_softc *sc = arg1; 2669 int hcsum = arg2; 2670 int on, error, i; 2671 2672 on = 0; 2673 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2674 on = 1; 2675 2676 error = sysctl_handle_int(oidp, &on, 0, req); 2677 if (error || req->newptr == NULL) 2678 return error; 2679 2680 HN_LOCK(sc); 2681 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2682 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2683 2684 if (on) 2685 rxr->hn_trust_hcsum |= hcsum; 2686 else 2687 rxr->hn_trust_hcsum &= ~hcsum; 2688 } 2689 HN_UNLOCK(sc); 2690 return 0; 2691 } 2692 2693 static int 2694 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2695 { 2696 struct hn_softc *sc = arg1; 2697 int chim_size, error; 2698 2699 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2700 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2701 if (error || req->newptr == NULL) 2702 return error; 2703 2704 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2705 return EINVAL; 2706 2707 HN_LOCK(sc); 2708 hn_set_chim_size(sc, chim_size); 2709 HN_UNLOCK(sc); 2710 return 0; 2711 } 2712 2713 #if __FreeBSD_version < 1100095 2714 static int 2715 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2716 { 2717 struct hn_softc *sc = arg1; 2718 int ofs = arg2, i, error; 2719 struct hn_rx_ring *rxr; 2720 uint64_t stat; 2721 2722 stat = 0; 2723 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2724 rxr = &sc->hn_rx_ring[i]; 2725 stat += *((int *)((uint8_t *)rxr + ofs)); 2726 } 2727 2728 error = sysctl_handle_64(oidp, &stat, 0, req); 2729 if (error || req->newptr == NULL) 2730 return error; 2731 2732 /* Zero out this stat. */ 2733 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2734 rxr = &sc->hn_rx_ring[i]; 2735 *((int *)((uint8_t *)rxr + ofs)) = 0; 2736 } 2737 return 0; 2738 } 2739 #else 2740 static int 2741 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2742 { 2743 struct hn_softc *sc = arg1; 2744 int ofs = arg2, i, error; 2745 struct hn_rx_ring *rxr; 2746 uint64_t stat; 2747 2748 stat = 0; 2749 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2750 rxr = &sc->hn_rx_ring[i]; 2751 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2752 } 2753 2754 error = sysctl_handle_64(oidp, &stat, 0, req); 2755 if (error || req->newptr == NULL) 2756 return error; 2757 2758 /* Zero out this stat. */ 2759 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2760 rxr = &sc->hn_rx_ring[i]; 2761 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2762 } 2763 return 0; 2764 } 2765 2766 #endif 2767 2768 static int 2769 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2770 { 2771 struct hn_softc *sc = arg1; 2772 int ofs = arg2, i, error; 2773 struct hn_rx_ring *rxr; 2774 u_long stat; 2775 2776 stat = 0; 2777 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2778 rxr = &sc->hn_rx_ring[i]; 2779 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2780 } 2781 2782 error = sysctl_handle_long(oidp, &stat, 0, req); 2783 if (error || req->newptr == NULL) 2784 return error; 2785 2786 /* Zero out this stat. 
*/ 2787 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2788 rxr = &sc->hn_rx_ring[i]; 2789 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2790 } 2791 return 0; 2792 } 2793 2794 static int 2795 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2796 { 2797 struct hn_softc *sc = arg1; 2798 int ofs = arg2, i, error; 2799 struct hn_tx_ring *txr; 2800 u_long stat; 2801 2802 stat = 0; 2803 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2804 txr = &sc->hn_tx_ring[i]; 2805 stat += *((u_long *)((uint8_t *)txr + ofs)); 2806 } 2807 2808 error = sysctl_handle_long(oidp, &stat, 0, req); 2809 if (error || req->newptr == NULL) 2810 return error; 2811 2812 /* Zero out this stat. */ 2813 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2814 txr = &sc->hn_tx_ring[i]; 2815 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2816 } 2817 return 0; 2818 } 2819 2820 static int 2821 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2822 { 2823 struct hn_softc *sc = arg1; 2824 int ofs = arg2, i, error, conf; 2825 struct hn_tx_ring *txr; 2826 2827 txr = &sc->hn_tx_ring[0]; 2828 conf = *((int *)((uint8_t *)txr + ofs)); 2829 2830 error = sysctl_handle_int(oidp, &conf, 0, req); 2831 if (error || req->newptr == NULL) 2832 return error; 2833 2834 HN_LOCK(sc); 2835 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2836 txr = &sc->hn_tx_ring[i]; 2837 *((int *)((uint8_t *)txr + ofs)) = conf; 2838 } 2839 HN_UNLOCK(sc); 2840 2841 return 0; 2842 } 2843 2844 static int 2845 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2846 { 2847 struct hn_softc *sc = arg1; 2848 int error, size; 2849 2850 size = sc->hn_agg_size; 2851 error = sysctl_handle_int(oidp, &size, 0, req); 2852 if (error || req->newptr == NULL) 2853 return (error); 2854 2855 HN_LOCK(sc); 2856 sc->hn_agg_size = size; 2857 hn_set_txagg(sc); 2858 HN_UNLOCK(sc); 2859 2860 return (0); 2861 } 2862 2863 static int 2864 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 2865 { 2866 struct hn_softc *sc = arg1; 2867 int error, pkts; 2868 2869 pkts = sc->hn_agg_pkts; 2870 error = sysctl_handle_int(oidp, &pkts, 0, req); 2871 if (error || req->newptr == NULL) 2872 return (error); 2873 2874 HN_LOCK(sc); 2875 sc->hn_agg_pkts = pkts; 2876 hn_set_txagg(sc); 2877 HN_UNLOCK(sc); 2878 2879 return (0); 2880 } 2881 2882 static int 2883 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 2884 { 2885 struct hn_softc *sc = arg1; 2886 int pkts; 2887 2888 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 2889 return (sysctl_handle_int(oidp, &pkts, 0, req)); 2890 } 2891 2892 static int 2893 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 2894 { 2895 struct hn_softc *sc = arg1; 2896 int align; 2897 2898 align = sc->hn_tx_ring[0].hn_agg_align; 2899 return (sysctl_handle_int(oidp, &align, 0, req)); 2900 } 2901 2902 static void 2903 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 2904 { 2905 if (pollhz == 0) 2906 vmbus_chan_poll_disable(chan); 2907 else 2908 vmbus_chan_poll_enable(chan, pollhz); 2909 } 2910 2911 static void 2912 hn_polling(struct hn_softc *sc, u_int pollhz) 2913 { 2914 int nsubch = sc->hn_rx_ring_inuse - 1; 2915 2916 HN_LOCK_ASSERT(sc); 2917 2918 if (nsubch > 0) { 2919 struct vmbus_channel **subch; 2920 int i; 2921 2922 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 2923 for (i = 0; i < nsubch; ++i) 2924 hn_chan_polling(subch[i], pollhz); 2925 vmbus_subchan_rel(subch, nsubch); 2926 } 2927 hn_chan_polling(sc->hn_prichan, pollhz); 2928 } 2929 2930 static int 2931 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 2932 { 2933 struct hn_softc *sc = arg1; 2934 int pollhz, error; 2935 2936 pollhz = sc->hn_pollhz; 2937 error = sysctl_handle_int(oidp, &pollhz, 0, req); 2938 
if (error || req->newptr == NULL) 2939 return (error); 2940 2941 if (pollhz != 0 && 2942 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 2943 return (EINVAL); 2944 2945 HN_LOCK(sc); 2946 if (sc->hn_pollhz != pollhz) { 2947 sc->hn_pollhz = pollhz; 2948 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 2949 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 2950 hn_polling(sc, sc->hn_pollhz); 2951 } 2952 HN_UNLOCK(sc); 2953 2954 return (0); 2955 } 2956 2957 static int 2958 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 2959 { 2960 struct hn_softc *sc = arg1; 2961 char verstr[16]; 2962 2963 snprintf(verstr, sizeof(verstr), "%u.%u", 2964 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 2965 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 2966 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 2967 } 2968 2969 static int 2970 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 2971 { 2972 struct hn_softc *sc = arg1; 2973 char caps_str[128]; 2974 uint32_t caps; 2975 2976 HN_LOCK(sc); 2977 caps = sc->hn_caps; 2978 HN_UNLOCK(sc); 2979 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 2980 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 2981 } 2982 2983 static int 2984 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 2985 { 2986 struct hn_softc *sc = arg1; 2987 char assist_str[128]; 2988 uint32_t hwassist; 2989 2990 HN_LOCK(sc); 2991 hwassist = sc->hn_ifp->if_hwassist; 2992 HN_UNLOCK(sc); 2993 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 2994 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 2995 } 2996 2997 static int 2998 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 2999 { 3000 struct hn_softc *sc = arg1; 3001 char filter_str[128]; 3002 uint32_t filter; 3003 3004 HN_LOCK(sc); 3005 filter = sc->hn_rx_filter; 3006 HN_UNLOCK(sc); 3007 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3008 NDIS_PACKET_TYPES); 3009 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3010 } 3011 3012 #ifndef RSS 3013 3014 static int 3015 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3016 { 3017 struct hn_softc *sc = arg1; 3018 int error; 3019 3020 HN_LOCK(sc); 3021 3022 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3023 if (error || req->newptr == NULL) 3024 goto back; 3025 3026 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3027 if (error) 3028 goto back; 3029 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 3030 3031 if (sc->hn_rx_ring_inuse > 1) { 3032 error = hn_rss_reconfig(sc); 3033 } else { 3034 /* Not RSS capable, at least for now; just save the RSS key. */ 3035 error = 0; 3036 } 3037 back: 3038 HN_UNLOCK(sc); 3039 return (error); 3040 } 3041 3042 static int 3043 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 3044 { 3045 struct hn_softc *sc = arg1; 3046 int error; 3047 3048 HN_LOCK(sc); 3049 3050 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3051 if (error || req->newptr == NULL) 3052 goto back; 3053 3054 /* 3055 * Don't allow RSS indirect table change, if this interface is not 3056 * RSS capable currently. 
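 * With a single RX ring in use there are no subchannels for the
 * indirection table to spread traffic to; only the RSS key (see
 * above) may be updated in that state.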
3057 */ 3058 if (sc->hn_rx_ring_inuse == 1) { 3059 error = EOPNOTSUPP; 3060 goto back; 3061 } 3062 3063 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3064 if (error) 3065 goto back; 3066 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 3067 3068 hn_rss_ind_fixup(sc); 3069 error = hn_rss_reconfig(sc); 3070 back: 3071 HN_UNLOCK(sc); 3072 return (error); 3073 } 3074 3075 #endif /* !RSS */ 3076 3077 static int 3078 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 3079 { 3080 struct hn_softc *sc = arg1; 3081 char hash_str[128]; 3082 uint32_t hash; 3083 3084 HN_LOCK(sc); 3085 hash = sc->hn_rss_hash; 3086 HN_UNLOCK(sc); 3087 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 3088 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 3089 } 3090 3091 static int 3092 hn_check_iplen(const struct mbuf *m, int hoff) 3093 { 3094 const struct ip *ip; 3095 int len, iphlen, iplen; 3096 const struct tcphdr *th; 3097 int thoff; /* TCP data offset */ 3098 3099 len = hoff + sizeof(struct ip); 3100 3101 /* The packet must be at least the size of an IP header. */ 3102 if (m->m_pkthdr.len < len) 3103 return IPPROTO_DONE; 3104 3105 /* The fixed IP header must reside completely in the first mbuf. */ 3106 if (m->m_len < len) 3107 return IPPROTO_DONE; 3108 3109 ip = mtodo(m, hoff); 3110 3111 /* Bound check the packet's stated IP header length. */ 3112 iphlen = ip->ip_hl << 2; 3113 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3114 return IPPROTO_DONE; 3115 3116 /* The full IP header must reside completely in the one mbuf. */ 3117 if (m->m_len < hoff + iphlen) 3118 return IPPROTO_DONE; 3119 3120 iplen = ntohs(ip->ip_len); 3121 3122 /* 3123 * Check that the amount of data in the buffers is as 3124 * at least much as the IP header would have us expect. 3125 */ 3126 if (m->m_pkthdr.len < hoff + iplen) 3127 return IPPROTO_DONE; 3128 3129 /* 3130 * Ignore IP fragments. 3131 */ 3132 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3133 return IPPROTO_DONE; 3134 3135 /* 3136 * The TCP/IP or UDP/IP header must be entirely contained within 3137 * the first fragment of a packet. 3138 */ 3139 switch (ip->ip_p) { 3140 case IPPROTO_TCP: 3141 if (iplen < iphlen + sizeof(struct tcphdr)) 3142 return IPPROTO_DONE; 3143 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3144 return IPPROTO_DONE; 3145 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3146 thoff = th->th_off << 2; 3147 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3148 return IPPROTO_DONE; 3149 if (m->m_len < hoff + iphlen + thoff) 3150 return IPPROTO_DONE; 3151 break; 3152 case IPPROTO_UDP: 3153 if (iplen < iphlen + sizeof(struct udphdr)) 3154 return IPPROTO_DONE; 3155 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3156 return IPPROTO_DONE; 3157 break; 3158 default: 3159 if (iplen < iphlen) 3160 return IPPROTO_DONE; 3161 break; 3162 } 3163 return ip->ip_p; 3164 } 3165 3166 static int 3167 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3168 { 3169 struct sysctl_oid_list *child; 3170 struct sysctl_ctx_list *ctx; 3171 device_t dev = sc->hn_dev; 3172 #if defined(INET) || defined(INET6) 3173 #if __FreeBSD_version >= 1100095 3174 int lroent_cnt; 3175 #endif 3176 #endif 3177 int i; 3178 3179 /* 3180 * Create RXBUF for reception. 3181 * 3182 * NOTE: 3183 * - It is shared by all channels. 3184 * - A large enough buffer is allocated, certain version of NVSes 3185 * may further limit the usable space. 
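 * - HN_RXBUF_SIZE bytes are allocated up front, independent of the
 *   RX ring count.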
3186 */ 3187 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3188 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3189 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3190 if (sc->hn_rxbuf == NULL) { 3191 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3192 return (ENOMEM); 3193 } 3194 3195 sc->hn_rx_ring_cnt = ring_cnt; 3196 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3197 3198 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3199 M_DEVBUF, M_WAITOK | M_ZERO); 3200 3201 #if defined(INET) || defined(INET6) 3202 #if __FreeBSD_version >= 1100095 3203 lroent_cnt = hn_lro_entry_count; 3204 if (lroent_cnt < TCP_LRO_ENTRIES) 3205 lroent_cnt = TCP_LRO_ENTRIES; 3206 if (bootverbose) 3207 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3208 #endif 3209 #endif /* INET || INET6 */ 3210 3211 ctx = device_get_sysctl_ctx(dev); 3212 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3213 3214 /* Create dev.hn.UNIT.rx sysctl tree */ 3215 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3216 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3217 3218 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3219 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3220 3221 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3222 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3223 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3224 if (rxr->hn_br == NULL) { 3225 device_printf(dev, "allocate bufring failed\n"); 3226 return (ENOMEM); 3227 } 3228 3229 if (hn_trust_hosttcp) 3230 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3231 if (hn_trust_hostudp) 3232 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3233 if (hn_trust_hostip) 3234 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3235 rxr->hn_ifp = sc->hn_ifp; 3236 if (i < sc->hn_tx_ring_cnt) 3237 rxr->hn_txr = &sc->hn_tx_ring[i]; 3238 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3239 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3240 rxr->hn_rx_idx = i; 3241 rxr->hn_rxbuf = sc->hn_rxbuf; 3242 3243 /* 3244 * Initialize LRO. 
3245 */ 3246 #if defined(INET) || defined(INET6) 3247 #if __FreeBSD_version >= 1100095 3248 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3249 hn_lro_mbufq_depth); 3250 #else 3251 tcp_lro_init(&rxr->hn_lro); 3252 rxr->hn_lro.ifp = sc->hn_ifp; 3253 #endif 3254 #if __FreeBSD_version >= 1100099 3255 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3256 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3257 #endif 3258 #endif /* INET || INET6 */ 3259 3260 if (sc->hn_rx_sysctl_tree != NULL) { 3261 char name[16]; 3262 3263 /* 3264 * Create per RX ring sysctl tree: 3265 * dev.hn.UNIT.rx.RINGID 3266 */ 3267 snprintf(name, sizeof(name), "%d", i); 3268 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3269 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3270 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3271 3272 if (rxr->hn_rx_sysctl_tree != NULL) { 3273 SYSCTL_ADD_ULONG(ctx, 3274 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3275 OID_AUTO, "packets", CTLFLAG_RW, 3276 &rxr->hn_pkts, "# of packets received"); 3277 SYSCTL_ADD_ULONG(ctx, 3278 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3279 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3280 &rxr->hn_rss_pkts, 3281 "# of packets w/ RSS info received"); 3282 SYSCTL_ADD_INT(ctx, 3283 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3284 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3285 &rxr->hn_pktbuf_len, 0, 3286 "Temporary channel packet buffer length"); 3287 } 3288 } 3289 } 3290 3291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3292 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3293 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3294 #if __FreeBSD_version < 1100095 3295 hn_rx_stat_int_sysctl, 3296 #else 3297 hn_rx_stat_u64_sysctl, 3298 #endif 3299 "LU", "LRO queued"); 3300 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3301 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3302 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3303 #if __FreeBSD_version < 1100095 3304 hn_rx_stat_int_sysctl, 3305 #else 3306 hn_rx_stat_u64_sysctl, 3307 #endif 3308 "LU", "LRO flushed"); 3309 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3310 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3311 __offsetof(struct hn_rx_ring, hn_lro_tried), 3312 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3313 #if __FreeBSD_version >= 1100099 3314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3315 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3316 hn_lro_lenlim_sysctl, "IU", 3317 "Max # of data bytes to be aggregated by LRO"); 3318 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3319 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3320 hn_lro_ackcnt_sysctl, "I", 3321 "Max # of ACKs to be aggregated by LRO"); 3322 #endif 3323 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3324 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3325 hn_trust_hcsum_sysctl, "I", 3326 "Trust tcp segement verification on host side, " 3327 "when csum info is missing"); 3328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3329 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3330 hn_trust_hcsum_sysctl, "I", 3331 "Trust udp datagram verification on host side, " 3332 "when csum info is missing"); 3333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3334 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3335 hn_trust_hcsum_sysctl, "I", 3336 "Trust ip packet verification on host side, " 3337 "when csum info is missing"); 3338 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3339 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3340 
__offsetof(struct hn_rx_ring, hn_csum_ip), 3341 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3343 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3344 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3345 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3347 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3348 __offsetof(struct hn_rx_ring, hn_csum_udp), 3349 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3350 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3351 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3352 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3353 hn_rx_stat_ulong_sysctl, "LU", 3354 "# of packets that we trust host's csum verification"); 3355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3356 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3357 __offsetof(struct hn_rx_ring, hn_small_pkts), 3358 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3360 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3361 __offsetof(struct hn_rx_ring, hn_ack_failed), 3362 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3363 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3364 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3365 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3366 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3367 3368 return (0); 3369 } 3370 3371 static void 3372 hn_destroy_rx_data(struct hn_softc *sc) 3373 { 3374 int i; 3375 3376 if (sc->hn_rxbuf != NULL) { 3377 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3378 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3379 else 3380 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3381 sc->hn_rxbuf = NULL; 3382 } 3383 3384 if (sc->hn_rx_ring_cnt == 0) 3385 return; 3386 3387 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3388 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3389 3390 if (rxr->hn_br == NULL) 3391 continue; 3392 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3393 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3394 } else { 3395 device_printf(sc->hn_dev, 3396 "%dth channel bufring is referenced", i); 3397 } 3398 rxr->hn_br = NULL; 3399 3400 #if defined(INET) || defined(INET6) 3401 tcp_lro_free(&rxr->hn_lro); 3402 #endif 3403 free(rxr->hn_pktbuf, M_DEVBUF); 3404 } 3405 free(sc->hn_rx_ring, M_DEVBUF); 3406 sc->hn_rx_ring = NULL; 3407 3408 sc->hn_rx_ring_cnt = 0; 3409 sc->hn_rx_ring_inuse = 0; 3410 } 3411 3412 static int 3413 hn_tx_ring_create(struct hn_softc *sc, int id) 3414 { 3415 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3416 device_t dev = sc->hn_dev; 3417 bus_dma_tag_t parent_dtag; 3418 int error, i; 3419 3420 txr->hn_sc = sc; 3421 txr->hn_tx_idx = id; 3422 3423 #ifndef HN_USE_TXDESC_BUFRING 3424 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3425 #endif 3426 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3427 3428 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3429 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3430 M_DEVBUF, M_WAITOK | M_ZERO); 3431 #ifndef HN_USE_TXDESC_BUFRING 3432 SLIST_INIT(&txr->hn_txlist); 3433 #else 3434 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3435 M_WAITOK, &txr->hn_tx_lock); 3436 #endif 3437 3438 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3439 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3440 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3441 } else { 3442 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3443 } 3444 3445 #ifdef HN_IFSTART_SUPPORT 3446 if (hn_use_if_start) { 3447 txr->hn_txeof = hn_start_txeof; 3448 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3449 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3450 } else 3451 #endif 3452 { 3453 int br_depth; 3454 3455 txr->hn_txeof = hn_xmit_txeof; 3456 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3457 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3458 3459 br_depth = hn_get_txswq_depth(txr); 3460 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3461 M_WAITOK, &txr->hn_tx_lock); 3462 } 3463 3464 txr->hn_direct_tx_size = hn_direct_tx_size; 3465 3466 /* 3467 * Always schedule transmission instead of trying to do direct 3468 * transmission. This one gives the best performance so far. 3469 */ 3470 txr->hn_sched_tx = 1; 3471 3472 parent_dtag = bus_get_dma_tag(dev); 3473 3474 /* DMA tag for RNDIS packet messages. */ 3475 error = bus_dma_tag_create(parent_dtag, /* parent */ 3476 HN_RNDIS_PKT_ALIGN, /* alignment */ 3477 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3478 BUS_SPACE_MAXADDR, /* lowaddr */ 3479 BUS_SPACE_MAXADDR, /* highaddr */ 3480 NULL, NULL, /* filter, filterarg */ 3481 HN_RNDIS_PKT_LEN, /* maxsize */ 3482 1, /* nsegments */ 3483 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3484 0, /* flags */ 3485 NULL, /* lockfunc */ 3486 NULL, /* lockfuncarg */ 3487 &txr->hn_tx_rndis_dtag); 3488 if (error) { 3489 device_printf(dev, "failed to create rndis dmatag\n"); 3490 return error; 3491 } 3492 3493 /* DMA tag for data. */ 3494 error = bus_dma_tag_create(parent_dtag, /* parent */ 3495 1, /* alignment */ 3496 HN_TX_DATA_BOUNDARY, /* boundary */ 3497 BUS_SPACE_MAXADDR, /* lowaddr */ 3498 BUS_SPACE_MAXADDR, /* highaddr */ 3499 NULL, NULL, /* filter, filterarg */ 3500 HN_TX_DATA_MAXSIZE, /* maxsize */ 3501 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3502 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3503 0, /* flags */ 3504 NULL, /* lockfunc */ 3505 NULL, /* lockfuncarg */ 3506 &txr->hn_tx_data_dtag); 3507 if (error) { 3508 device_printf(dev, "failed to create data dmatag\n"); 3509 return error; 3510 } 3511 3512 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3513 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3514 3515 txd->txr = txr; 3516 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3517 STAILQ_INIT(&txd->agg_list); 3518 3519 /* 3520 * Allocate and load RNDIS packet message. 3521 */ 3522 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3523 (void **)&txd->rndis_pkt, 3524 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3525 &txd->rndis_pkt_dmap); 3526 if (error) { 3527 device_printf(dev, 3528 "failed to allocate rndis_packet_msg, %d\n", i); 3529 return error; 3530 } 3531 3532 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3533 txd->rndis_pkt_dmap, 3534 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3535 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3536 BUS_DMA_NOWAIT); 3537 if (error) { 3538 device_printf(dev, 3539 "failed to load rndis_packet_msg, %d\n", i); 3540 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3541 txd->rndis_pkt, txd->rndis_pkt_dmap); 3542 return error; 3543 } 3544 3545 /* DMA map for TX data. 
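 * This is the map used later when the mbuf chain is loaded for the
 * scatter/gather (non-chimney) send path; see hn_txdesc_dmamap_load()
 * in hn_encap().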
*/ 3546 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3547 &txd->data_dmap); 3548 if (error) { 3549 device_printf(dev, 3550 "failed to allocate tx data dmamap\n"); 3551 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3552 txd->rndis_pkt_dmap); 3553 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3554 txd->rndis_pkt, txd->rndis_pkt_dmap); 3555 return error; 3556 } 3557 3558 /* All set, put it to list */ 3559 txd->flags |= HN_TXD_FLAG_ONLIST; 3560 #ifndef HN_USE_TXDESC_BUFRING 3561 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3562 #else 3563 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3564 #endif 3565 } 3566 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3567 3568 if (sc->hn_tx_sysctl_tree != NULL) { 3569 struct sysctl_oid_list *child; 3570 struct sysctl_ctx_list *ctx; 3571 char name[16]; 3572 3573 /* 3574 * Create per TX ring sysctl tree: 3575 * dev.hn.UNIT.tx.RINGID 3576 */ 3577 ctx = device_get_sysctl_ctx(dev); 3578 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3579 3580 snprintf(name, sizeof(name), "%d", id); 3581 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3582 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3583 3584 if (txr->hn_tx_sysctl_tree != NULL) { 3585 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3586 3587 #ifdef HN_DEBUG 3588 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3589 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3590 "# of available TX descs"); 3591 #endif 3592 #ifdef HN_IFSTART_SUPPORT 3593 if (!hn_use_if_start) 3594 #endif 3595 { 3596 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3597 CTLFLAG_RD, &txr->hn_oactive, 0, 3598 "over active"); 3599 } 3600 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3601 CTLFLAG_RW, &txr->hn_pkts, 3602 "# of packets transmitted"); 3603 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3604 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3605 } 3606 } 3607 3608 return 0; 3609 } 3610 3611 static void 3612 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3613 { 3614 struct hn_tx_ring *txr = txd->txr; 3615 3616 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3617 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3618 3619 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3620 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3621 txd->rndis_pkt_dmap); 3622 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3623 } 3624 3625 static void 3626 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3627 { 3628 3629 KASSERT(txd->refs == 0 || txd->refs == 1, 3630 ("invalid txd refs %d", txd->refs)); 3631 3632 /* Aggregated txds will be freed by their aggregating txd. */ 3633 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3634 int freed; 3635 3636 freed = hn_txdesc_put(txr, txd); 3637 KASSERT(freed, ("can't free txdesc")); 3638 } 3639 } 3640 3641 static void 3642 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3643 { 3644 int i; 3645 3646 if (txr->hn_txdesc == NULL) 3647 return; 3648 3649 /* 3650 * NOTE: 3651 * Because the freeing of aggregated txds will be deferred 3652 * to the aggregating txd, two passes are used here: 3653 * - The first pass GCes any pending txds. This GC is necessary, 3654 * since if the channels are revoked, hypervisor will not 3655 * deliver send-done for all pending txds. 3656 * - The second pass frees the busdma stuffs, i.e. after all txds 3657 * were freed. 
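 * The order matters: the DMA maps may still be unloaded by
 * hn_txdesc_put() during the first pass, so the busdma teardown has
 * to come strictly after it.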
3658 */ 3659 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3660 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3661 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3662 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3663 3664 if (txr->hn_tx_data_dtag != NULL) 3665 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3666 if (txr->hn_tx_rndis_dtag != NULL) 3667 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3668 3669 #ifdef HN_USE_TXDESC_BUFRING 3670 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3671 #endif 3672 3673 free(txr->hn_txdesc, M_DEVBUF); 3674 txr->hn_txdesc = NULL; 3675 3676 if (txr->hn_mbuf_br != NULL) 3677 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3678 3679 #ifndef HN_USE_TXDESC_BUFRING 3680 mtx_destroy(&txr->hn_txlist_spin); 3681 #endif 3682 mtx_destroy(&txr->hn_tx_lock); 3683 } 3684 3685 static int 3686 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3687 { 3688 struct sysctl_oid_list *child; 3689 struct sysctl_ctx_list *ctx; 3690 int i; 3691 3692 /* 3693 * Create TXBUF for chimney sending. 3694 * 3695 * NOTE: It is shared by all channels. 3696 */ 3697 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3698 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3699 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3700 if (sc->hn_chim == NULL) { 3701 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3702 return (ENOMEM); 3703 } 3704 3705 sc->hn_tx_ring_cnt = ring_cnt; 3706 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3707 3708 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3709 M_DEVBUF, M_WAITOK | M_ZERO); 3710 3711 ctx = device_get_sysctl_ctx(sc->hn_dev); 3712 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3713 3714 /* Create dev.hn.UNIT.tx sysctl tree */ 3715 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3716 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3717 3718 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3719 int error; 3720 3721 error = hn_tx_ring_create(sc, i); 3722 if (error) 3723 return error; 3724 } 3725 3726 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3727 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3728 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3729 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3730 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3731 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3732 __offsetof(struct hn_tx_ring, hn_send_failed), 3733 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3734 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3735 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3736 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3737 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3738 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3739 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3740 __offsetof(struct hn_tx_ring, hn_flush_failed), 3741 hn_tx_stat_ulong_sysctl, "LU", 3742 "# of packet transmission aggregation flush failure"); 3743 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3744 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3745 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3746 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3747 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3748 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3749 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3750 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3751 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3752 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3753 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3754 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3755 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3756 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3757 "# of total TX descs"); 3758 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3759 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3760 "Chimney send packet size upper boundary"); 3761 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3762 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3763 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3764 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3765 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3766 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3767 hn_tx_conf_int_sysctl, "I", 3768 "Size of the packet for direct transmission"); 3769 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3770 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3771 __offsetof(struct hn_tx_ring, hn_sched_tx), 3772 hn_tx_conf_int_sysctl, "I", 3773 "Always schedule transmission " 3774 "instead of doing direct transmission"); 3775 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3776 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3777 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3778 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3779 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3780 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3781 "Applied packet transmission aggregation size"); 3782 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3783 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3784 hn_txagg_pktmax_sysctl, "I", 3785 "Applied packet transmission aggregation packets"); 3786 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3787 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3788 hn_txagg_align_sysctl, "I", 3789 "Applied packet transmission aggregation alignment"); 3790 3791 return 0; 3792 } 3793 3794 static void 3795 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3796 { 3797 int i; 3798 3799 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3800 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3801 } 3802 3803 static void 3804 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3805 { 3806 struct ifnet *ifp = sc->hn_ifp; 3807 int tso_minlen; 3808 3809 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3810 return; 3811 3812 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3813 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3814 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3815 3816 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3817 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3818 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3819 3820 if (tso_maxlen < tso_minlen) 3821 tso_maxlen = tso_minlen; 3822 else if (tso_maxlen > IP_MAXPACKET) 3823 tso_maxlen = IP_MAXPACKET; 3824 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3825 tso_maxlen = sc->hn_ndis_tso_szmax; 3826 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3827 if (bootverbose) 3828 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3829 } 3830 3831 static void 3832 hn_fixup_tx_data(struct hn_softc *sc) 3833 { 3834 uint64_t csum_assist; 3835 int i; 3836 3837 hn_set_chim_size(sc, sc->hn_chim_szmax); 3838 if (hn_tx_chimney_size > 0 && 3839 hn_tx_chimney_size < sc->hn_chim_szmax) 3840 hn_set_chim_size(sc, hn_tx_chimney_size); 3841 3842 csum_assist = 0; 3843 if (sc->hn_caps & HN_CAP_IPCS) 3844 csum_assist |= CSUM_IP; 3845 if (sc->hn_caps & HN_CAP_TCP4CS) 3846 csum_assist |= CSUM_IP_TCP; 3847 if (sc->hn_caps & HN_CAP_UDP4CS) 3848 
csum_assist |= CSUM_IP_UDP; 3849 if (sc->hn_caps & HN_CAP_TCP6CS) 3850 csum_assist |= CSUM_IP6_TCP; 3851 if (sc->hn_caps & HN_CAP_UDP6CS) 3852 csum_assist |= CSUM_IP6_UDP; 3853 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3854 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 3855 3856 if (sc->hn_caps & HN_CAP_HASHVAL) { 3857 /* 3858 * Support HASHVAL pktinfo on TX path. 3859 */ 3860 if (bootverbose) 3861 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 3862 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3863 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 3864 } 3865 } 3866 3867 static void 3868 hn_destroy_tx_data(struct hn_softc *sc) 3869 { 3870 int i; 3871 3872 if (sc->hn_chim != NULL) { 3873 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 3874 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 3875 } else { 3876 device_printf(sc->hn_dev, 3877 "chimney sending buffer is referenced"); 3878 } 3879 sc->hn_chim = NULL; 3880 } 3881 3882 if (sc->hn_tx_ring_cnt == 0) 3883 return; 3884 3885 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3886 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 3887 3888 free(sc->hn_tx_ring, M_DEVBUF); 3889 sc->hn_tx_ring = NULL; 3890 3891 sc->hn_tx_ring_cnt = 0; 3892 sc->hn_tx_ring_inuse = 0; 3893 } 3894 3895 #ifdef HN_IFSTART_SUPPORT 3896 3897 static void 3898 hn_start_taskfunc(void *xtxr, int pending __unused) 3899 { 3900 struct hn_tx_ring *txr = xtxr; 3901 3902 mtx_lock(&txr->hn_tx_lock); 3903 hn_start_locked(txr, 0); 3904 mtx_unlock(&txr->hn_tx_lock); 3905 } 3906 3907 static int 3908 hn_start_locked(struct hn_tx_ring *txr, int len) 3909 { 3910 struct hn_softc *sc = txr->hn_sc; 3911 struct ifnet *ifp = sc->hn_ifp; 3912 int sched = 0; 3913 3914 KASSERT(hn_use_if_start, 3915 ("hn_start_locked is called, when if_start is disabled")); 3916 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 3917 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 3918 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3919 3920 if (__predict_false(txr->hn_suspended)) 3921 return (0); 3922 3923 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 3924 IFF_DRV_RUNNING) 3925 return (0); 3926 3927 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 3928 struct hn_txdesc *txd; 3929 struct mbuf *m_head; 3930 int error; 3931 3932 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 3933 if (m_head == NULL) 3934 break; 3935 3936 if (len > 0 && m_head->m_pkthdr.len > len) { 3937 /* 3938 * This sending could be time consuming; let callers 3939 * dispatch this packet sending (and sending of any 3940 * following up packets) to tx taskqueue. 
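 * ('len' is the direct-transmission threshold, hn_direct_tx_size
 * when called from hn_start(); returning sched != 0 tells the caller
 * to enqueue hn_tx_task and continue from the taskqueue.)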
3941 */ 3942 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3943 sched = 1; 3944 break; 3945 } 3946 3947 #if defined(INET6) || defined(INET) 3948 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3949 m_head = hn_tso_fixup(m_head); 3950 if (__predict_false(m_head == NULL)) { 3951 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3952 continue; 3953 } 3954 } 3955 #endif 3956 3957 txd = hn_txdesc_get(txr); 3958 if (txd == NULL) { 3959 txr->hn_no_txdescs++; 3960 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3961 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3962 break; 3963 } 3964 3965 error = hn_encap(ifp, txr, txd, &m_head); 3966 if (error) { 3967 /* Both txd and m_head are freed */ 3968 KASSERT(txr->hn_agg_txd == NULL, 3969 ("encap failed w/ pending aggregating txdesc")); 3970 continue; 3971 } 3972 3973 if (txr->hn_agg_pktleft == 0) { 3974 if (txr->hn_agg_txd != NULL) { 3975 KASSERT(m_head == NULL, 3976 ("pending mbuf for aggregating txdesc")); 3977 error = hn_flush_txagg(ifp, txr); 3978 if (__predict_false(error)) { 3979 atomic_set_int(&ifp->if_drv_flags, 3980 IFF_DRV_OACTIVE); 3981 break; 3982 } 3983 } else { 3984 KASSERT(m_head != NULL, ("mbuf was freed")); 3985 error = hn_txpkt(ifp, txr, txd); 3986 if (__predict_false(error)) { 3987 /* txd is freed, but m_head is not */ 3988 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3989 atomic_set_int(&ifp->if_drv_flags, 3990 IFF_DRV_OACTIVE); 3991 break; 3992 } 3993 } 3994 } 3995 #ifdef INVARIANTS 3996 else { 3997 KASSERT(txr->hn_agg_txd != NULL, 3998 ("no aggregating txdesc")); 3999 KASSERT(m_head == NULL, 4000 ("pending mbuf for aggregating txdesc")); 4001 } 4002 #endif 4003 } 4004 4005 /* Flush pending aggerated transmission. */ 4006 if (txr->hn_agg_txd != NULL) 4007 hn_flush_txagg(ifp, txr); 4008 return (sched); 4009 } 4010 4011 static void 4012 hn_start(struct ifnet *ifp) 4013 { 4014 struct hn_softc *sc = ifp->if_softc; 4015 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 4016 4017 if (txr->hn_sched_tx) 4018 goto do_sched; 4019 4020 if (mtx_trylock(&txr->hn_tx_lock)) { 4021 int sched; 4022 4023 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4024 mtx_unlock(&txr->hn_tx_lock); 4025 if (!sched) 4026 return; 4027 } 4028 do_sched: 4029 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4030 } 4031 4032 static void 4033 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 4034 { 4035 struct hn_tx_ring *txr = xtxr; 4036 4037 mtx_lock(&txr->hn_tx_lock); 4038 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 4039 hn_start_locked(txr, 0); 4040 mtx_unlock(&txr->hn_tx_lock); 4041 } 4042 4043 static void 4044 hn_start_txeof(struct hn_tx_ring *txr) 4045 { 4046 struct hn_softc *sc = txr->hn_sc; 4047 struct ifnet *ifp = sc->hn_ifp; 4048 4049 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4050 4051 if (txr->hn_sched_tx) 4052 goto do_sched; 4053 4054 if (mtx_trylock(&txr->hn_tx_lock)) { 4055 int sched; 4056 4057 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4058 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4059 mtx_unlock(&txr->hn_tx_lock); 4060 if (sched) { 4061 taskqueue_enqueue(txr->hn_tx_taskq, 4062 &txr->hn_tx_task); 4063 } 4064 } else { 4065 do_sched: 4066 /* 4067 * Release the OACTIVE earlier, with the hope, that 4068 * others could catch up. The task will clear the 4069 * flag again with the hn_tx_lock to avoid possible 4070 * races. 
4071 */ 4072 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4073 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4074 } 4075 } 4076 4077 #endif /* HN_IFSTART_SUPPORT */ 4078 4079 static int 4080 hn_xmit(struct hn_tx_ring *txr, int len) 4081 { 4082 struct hn_softc *sc = txr->hn_sc; 4083 struct ifnet *ifp = sc->hn_ifp; 4084 struct mbuf *m_head; 4085 int sched = 0; 4086 4087 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4088 #ifdef HN_IFSTART_SUPPORT 4089 KASSERT(hn_use_if_start == 0, 4090 ("hn_xmit is called, when if_start is enabled")); 4091 #endif 4092 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4093 4094 if (__predict_false(txr->hn_suspended)) 4095 return (0); 4096 4097 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4098 return (0); 4099 4100 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4101 struct hn_txdesc *txd; 4102 int error; 4103 4104 if (len > 0 && m_head->m_pkthdr.len > len) { 4105 /* 4106 * This sending could be time consuming; let callers 4107 * dispatch this packet sending (and sending of any 4108 * following up packets) to tx taskqueue. 4109 */ 4110 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4111 sched = 1; 4112 break; 4113 } 4114 4115 txd = hn_txdesc_get(txr); 4116 if (txd == NULL) { 4117 txr->hn_no_txdescs++; 4118 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4119 txr->hn_oactive = 1; 4120 break; 4121 } 4122 4123 error = hn_encap(ifp, txr, txd, &m_head); 4124 if (error) { 4125 /* Both txd and m_head are freed; discard */ 4126 KASSERT(txr->hn_agg_txd == NULL, 4127 ("encap failed w/ pending aggregating txdesc")); 4128 drbr_advance(ifp, txr->hn_mbuf_br); 4129 continue; 4130 } 4131 4132 if (txr->hn_agg_pktleft == 0) { 4133 if (txr->hn_agg_txd != NULL) { 4134 KASSERT(m_head == NULL, 4135 ("pending mbuf for aggregating txdesc")); 4136 error = hn_flush_txagg(ifp, txr); 4137 if (__predict_false(error)) { 4138 txr->hn_oactive = 1; 4139 break; 4140 } 4141 } else { 4142 KASSERT(m_head != NULL, ("mbuf was freed")); 4143 error = hn_txpkt(ifp, txr, txd); 4144 if (__predict_false(error)) { 4145 /* txd is freed, but m_head is not */ 4146 drbr_putback(ifp, txr->hn_mbuf_br, 4147 m_head); 4148 txr->hn_oactive = 1; 4149 break; 4150 } 4151 } 4152 } 4153 #ifdef INVARIANTS 4154 else { 4155 KASSERT(txr->hn_agg_txd != NULL, 4156 ("no aggregating txdesc")); 4157 KASSERT(m_head == NULL, 4158 ("pending mbuf for aggregating txdesc")); 4159 } 4160 #endif 4161 4162 /* Sent */ 4163 drbr_advance(ifp, txr->hn_mbuf_br); 4164 } 4165 4166 /* Flush pending aggerated transmission. */ 4167 if (txr->hn_agg_txd != NULL) 4168 hn_flush_txagg(ifp, txr); 4169 return (sched); 4170 } 4171 4172 static int 4173 hn_transmit(struct ifnet *ifp, struct mbuf *m) 4174 { 4175 struct hn_softc *sc = ifp->if_softc; 4176 struct hn_tx_ring *txr; 4177 int error, idx = 0; 4178 4179 #if defined(INET6) || defined(INET) 4180 /* 4181 * Perform TSO packet header fixup now, since the TSO 4182 * packet header should be cache-hot. 
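	 * (the headers were just built by the stack; a later fixup could
	 * find them cache-cold)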
4183 */ 4184 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4185 m = hn_tso_fixup(m); 4186 if (__predict_false(m == NULL)) { 4187 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4188 return EIO; 4189 } 4190 } 4191 #endif 4192 4193 /* 4194 * Select the TX ring based on flowid 4195 */ 4196 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4197 #ifdef RSS 4198 uint32_t bid; 4199 4200 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4201 &bid) == 0) 4202 idx = bid % sc->hn_tx_ring_inuse; 4203 else 4204 #endif 4205 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4206 } 4207 txr = &sc->hn_tx_ring[idx]; 4208 4209 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4210 if (error) { 4211 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4212 return error; 4213 } 4214 4215 if (txr->hn_oactive) 4216 return 0; 4217 4218 if (txr->hn_sched_tx) 4219 goto do_sched; 4220 4221 if (mtx_trylock(&txr->hn_tx_lock)) { 4222 int sched; 4223 4224 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4225 mtx_unlock(&txr->hn_tx_lock); 4226 if (!sched) 4227 return 0; 4228 } 4229 do_sched: 4230 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4231 return 0; 4232 } 4233 4234 static void 4235 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4236 { 4237 struct mbuf *m; 4238 4239 mtx_lock(&txr->hn_tx_lock); 4240 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4241 m_freem(m); 4242 mtx_unlock(&txr->hn_tx_lock); 4243 } 4244 4245 static void 4246 hn_xmit_qflush(struct ifnet *ifp) 4247 { 4248 struct hn_softc *sc = ifp->if_softc; 4249 int i; 4250 4251 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4252 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4253 if_qflush(ifp); 4254 } 4255 4256 static void 4257 hn_xmit_txeof(struct hn_tx_ring *txr) 4258 { 4259 4260 if (txr->hn_sched_tx) 4261 goto do_sched; 4262 4263 if (mtx_trylock(&txr->hn_tx_lock)) { 4264 int sched; 4265 4266 txr->hn_oactive = 0; 4267 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4268 mtx_unlock(&txr->hn_tx_lock); 4269 if (sched) { 4270 taskqueue_enqueue(txr->hn_tx_taskq, 4271 &txr->hn_tx_task); 4272 } 4273 } else { 4274 do_sched: 4275 /* 4276 * Release the oactive earlier, with the hope, that 4277 * others could catch up. The task will clear the 4278 * oactive again with the hn_tx_lock to avoid possible 4279 * races. 4280 */ 4281 txr->hn_oactive = 0; 4282 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4283 } 4284 } 4285 4286 static void 4287 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4288 { 4289 struct hn_tx_ring *txr = xtxr; 4290 4291 mtx_lock(&txr->hn_tx_lock); 4292 hn_xmit(txr, 0); 4293 mtx_unlock(&txr->hn_tx_lock); 4294 } 4295 4296 static void 4297 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4298 { 4299 struct hn_tx_ring *txr = xtxr; 4300 4301 mtx_lock(&txr->hn_tx_lock); 4302 txr->hn_oactive = 0; 4303 hn_xmit(txr, 0); 4304 mtx_unlock(&txr->hn_tx_lock); 4305 } 4306 4307 static int 4308 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4309 { 4310 struct vmbus_chan_br cbr; 4311 struct hn_rx_ring *rxr; 4312 struct hn_tx_ring *txr = NULL; 4313 int idx, error; 4314 4315 idx = vmbus_chan_subidx(chan); 4316 4317 /* 4318 * Link this channel to RX/TX ring. 
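	 * (the channel sub-index selects the RX ring; the TX ring of the
	 * same index is linked as well when idx < hn_tx_ring_inuse)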
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 and < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];
		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));

	/*
	 * Open this channel.
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = HN_TXBR_SIZE;
	cbr.cbr_rxsz = HN_RXBR_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if (error == EISCONN) {
			if_printf(sc->hn_ifp, "bufring is connected after "
			    "chan%u open failure\n", vmbus_chan_id(chan));
			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
		} else {
			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
			    vmbus_chan_id(chan), error);
		}
	}
	return (error);
}

static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Unlink this channel from its RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 and < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later.
*/ 4432 } 4433 } 4434 vmbus_subchan_rel(subchans, subchan_cnt); 4435 4436 if (error) { 4437 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4438 } else { 4439 if (bootverbose) { 4440 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4441 subchan_cnt); 4442 } 4443 } 4444 return (error); 4445 } 4446 4447 static void 4448 hn_detach_allchans(struct hn_softc *sc) 4449 { 4450 struct vmbus_channel **subchans; 4451 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4452 int i; 4453 4454 if (subchan_cnt == 0) 4455 goto back; 4456 4457 /* Detach the sub-channels. */ 4458 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4459 for (i = 0; i < subchan_cnt; ++i) 4460 hn_chan_detach(sc, subchans[i]); 4461 vmbus_subchan_rel(subchans, subchan_cnt); 4462 4463 back: 4464 /* 4465 * Detach the primary channel, _after_ all sub-channels 4466 * are detached. 4467 */ 4468 hn_chan_detach(sc, sc->hn_prichan); 4469 4470 /* Wait for sub-channels to be destroyed, if any. */ 4471 vmbus_subchan_drain(sc->hn_prichan); 4472 4473 #ifdef INVARIANTS 4474 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4475 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4476 HN_RX_FLAG_ATTACHED) == 0, 4477 ("%dth RX ring is still attached", i)); 4478 } 4479 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4480 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4481 HN_TX_FLAG_ATTACHED) == 0, 4482 ("%dth TX ring is still attached", i)); 4483 } 4484 #endif 4485 } 4486 4487 static int 4488 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4489 { 4490 struct vmbus_channel **subchans; 4491 int nchan, rxr_cnt, error; 4492 4493 nchan = *nsubch + 1; 4494 if (nchan == 1) { 4495 /* 4496 * Multiple RX/TX rings are not requested. 4497 */ 4498 *nsubch = 0; 4499 return (0); 4500 } 4501 4502 /* 4503 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4504 * table entries. 4505 */ 4506 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4507 if (error) { 4508 /* No RSS; this is benign. */ 4509 *nsubch = 0; 4510 return (0); 4511 } 4512 if (bootverbose) { 4513 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4514 rxr_cnt, nchan); 4515 } 4516 4517 if (nchan > rxr_cnt) 4518 nchan = rxr_cnt; 4519 if (nchan == 1) { 4520 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4521 *nsubch = 0; 4522 return (0); 4523 } 4524 4525 /* 4526 * Allocate sub-channels from NVS. 4527 */ 4528 *nsubch = nchan - 1; 4529 error = hn_nvs_alloc_subchans(sc, nsubch); 4530 if (error || *nsubch == 0) { 4531 /* Failed to allocate sub-channels. */ 4532 *nsubch = 0; 4533 return (0); 4534 } 4535 4536 /* 4537 * Wait for all sub-channels to become ready before moving on. 
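	 * (vmbus_subchan_get() blocks until the sub-channels are offered;
	 * they are released immediately since only the wait matters here)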
4538 */ 4539 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4540 vmbus_subchan_rel(subchans, *nsubch); 4541 return (0); 4542 } 4543 4544 static bool 4545 hn_synth_attachable(const struct hn_softc *sc) 4546 { 4547 int i; 4548 4549 if (sc->hn_flags & HN_FLAG_ERRORS) 4550 return (false); 4551 4552 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4553 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4554 4555 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4556 return (false); 4557 } 4558 return (true); 4559 } 4560 4561 static int 4562 hn_synth_attach(struct hn_softc *sc, int mtu) 4563 { 4564 #define ATTACHED_NVS 0x0002 4565 #define ATTACHED_RNDIS 0x0004 4566 4567 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4568 int error, nsubch, nchan, i; 4569 uint32_t old_caps, attached = 0; 4570 4571 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4572 ("synthetic parts were attached")); 4573 4574 if (!hn_synth_attachable(sc)) 4575 return (ENXIO); 4576 4577 /* Save capabilities for later verification. */ 4578 old_caps = sc->hn_caps; 4579 sc->hn_caps = 0; 4580 4581 /* Clear RSS stuffs. */ 4582 sc->hn_rss_ind_size = 0; 4583 sc->hn_rss_hash = 0; 4584 4585 /* 4586 * Attach the primary channel _before_ attaching NVS and RNDIS. 4587 */ 4588 error = hn_chan_attach(sc, sc->hn_prichan); 4589 if (error) 4590 goto failed; 4591 4592 /* 4593 * Attach NVS. 4594 */ 4595 error = hn_nvs_attach(sc, mtu); 4596 if (error) 4597 goto failed; 4598 attached |= ATTACHED_NVS; 4599 4600 /* 4601 * Attach RNDIS _after_ NVS is attached. 4602 */ 4603 error = hn_rndis_attach(sc, mtu); 4604 if (error) 4605 goto failed; 4606 attached |= ATTACHED_RNDIS; 4607 4608 /* 4609 * Make sure capabilities are not changed. 4610 */ 4611 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4612 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4613 old_caps, sc->hn_caps); 4614 error = ENXIO; 4615 goto failed; 4616 } 4617 4618 /* 4619 * Allocate sub-channels for multi-TX/RX rings. 4620 * 4621 * NOTE: 4622 * The # of RX rings that can be used is equivalent to the # of 4623 * channels to be requested. 4624 */ 4625 nsubch = sc->hn_rx_ring_cnt - 1; 4626 error = hn_synth_alloc_subchans(sc, &nsubch); 4627 if (error) 4628 goto failed; 4629 /* NOTE: _Full_ synthetic parts detach is required now. */ 4630 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4631 4632 /* 4633 * Set the # of TX/RX rings that could be used according to 4634 * the # of channels that NVS offered. 4635 */ 4636 nchan = nsubch + 1; 4637 hn_set_ring_inuse(sc, nchan); 4638 if (nchan == 1) { 4639 /* Only the primary channel can be used; done */ 4640 goto back; 4641 } 4642 4643 /* 4644 * Attach the sub-channels. 4645 * 4646 * NOTE: hn_set_ring_inuse() _must_ have been called. 4647 */ 4648 error = hn_attach_subchans(sc); 4649 if (error) 4650 goto failed; 4651 4652 /* 4653 * Configure RSS key and indirect table _after_ all sub-channels 4654 * are attached. 4655 */ 4656 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4657 /* 4658 * RSS key is not set yet; set it to the default RSS key. 4659 */ 4660 if (bootverbose) 4661 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4662 #ifdef RSS 4663 rss_getkey(rss->rss_key); 4664 #else 4665 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4666 #endif 4667 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4668 } 4669 4670 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4671 /* 4672 * RSS indirect table is not set yet; set it up in round- 4673 * robin fashion. 
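		 * (entry i maps to channel i % nchan; with the RSS option
		 * the kernel's indirection-to-bucket mapping is used instead)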
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
			uint32_t subidx;

#ifdef RSS
			subidx = rss_get_indirection_to_bucket(i);
#else
			subidx = i;
#endif
			rss->rss_ind[i] = subidx % nchan;
		}
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * The # of usable channels may have changed, so we have to
		 * make sure that all entries in the RSS indirect table
		 * are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fixup transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_synth_detach(sc);
	} else {
		if (attached & ATTACHED_RNDIS)
			hn_rndis_detach(sc);
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}

/*
 * NOTE:
 * The interface must have been suspended through hn_suspend() before
 * this function gets called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{

	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("synthetic parts were not attached"));

	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS. */
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);

	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
}

static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

#ifdef RSS
	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
		    rss_getnumbuckets());
	}
#endif

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}

static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}

static void
hn_suspend_data(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	struct hn_tx_ring *txr;
	int i, nsubch;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
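		 * (i.e., poll until hn_tx_ring_pending() reports no
		 * outstanding sends)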
4817 * 4818 * NOTE: 4819 * We will _not_ receive all pending send-done, if the 4820 * primary channel is revoked. 4821 */ 4822 while (hn_tx_ring_pending(txr) && 4823 !vmbus_chan_is_revoked(sc->hn_prichan)) 4824 pause("hnwtx", 1 /* 1 tick */); 4825 } 4826 4827 /* 4828 * Disable RX by clearing RX filter. 4829 */ 4830 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 4831 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); 4832 4833 /* 4834 * Give RNDIS enough time to flush all pending data packets. 4835 */ 4836 pause("waitrx", (200 * hz) / 1000); 4837 4838 /* 4839 * Drain RX/TX bufrings and interrupts. 4840 */ 4841 nsubch = sc->hn_rx_ring_inuse - 1; 4842 if (nsubch > 0) 4843 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4844 4845 if (subch != NULL) { 4846 for (i = 0; i < nsubch; ++i) 4847 hn_chan_drain(sc, subch[i]); 4848 } 4849 hn_chan_drain(sc, sc->hn_prichan); 4850 4851 if (subch != NULL) 4852 vmbus_subchan_rel(subch, nsubch); 4853 4854 /* 4855 * Drain any pending TX tasks. 4856 * 4857 * NOTE: 4858 * The above hn_chan_drain() can dispatch TX tasks, so the TX 4859 * tasks will have to be drained _after_ the above hn_chan_drain() 4860 * calls. 4861 */ 4862 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4863 txr = &sc->hn_tx_ring[i]; 4864 4865 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 4866 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 4867 } 4868 } 4869 4870 static void 4871 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 4872 { 4873 4874 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 4875 } 4876 4877 static void 4878 hn_suspend_mgmt(struct hn_softc *sc) 4879 { 4880 struct task task; 4881 4882 HN_LOCK_ASSERT(sc); 4883 4884 /* 4885 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 4886 * through hn_mgmt_taskq. 4887 */ 4888 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 4889 vmbus_chan_run_task(sc->hn_prichan, &task); 4890 4891 /* 4892 * Make sure that all pending management tasks are completed. 4893 */ 4894 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 4895 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 4896 taskqueue_drain_all(sc->hn_mgmt_taskq0); 4897 } 4898 4899 static void 4900 hn_suspend(struct hn_softc *sc) 4901 { 4902 4903 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4904 hn_suspend_data(sc); 4905 hn_suspend_mgmt(sc); 4906 } 4907 4908 static void 4909 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 4910 { 4911 int i; 4912 4913 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 4914 ("invalid TX ring count %d", tx_ring_cnt)); 4915 4916 for (i = 0; i < tx_ring_cnt; ++i) { 4917 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4918 4919 mtx_lock(&txr->hn_tx_lock); 4920 txr->hn_suspended = 0; 4921 mtx_unlock(&txr->hn_tx_lock); 4922 } 4923 } 4924 4925 static void 4926 hn_resume_data(struct hn_softc *sc) 4927 { 4928 int i; 4929 4930 HN_LOCK_ASSERT(sc); 4931 4932 /* 4933 * Re-enable RX. 4934 */ 4935 hn_set_rxfilter(sc); 4936 4937 /* 4938 * Make sure to clear suspend status on "all" TX rings, 4939 * since hn_tx_ring_inuse can be changed after 4940 * hn_suspend_data(). 4941 */ 4942 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 4943 4944 #ifdef HN_IFSTART_SUPPORT 4945 if (!hn_use_if_start) 4946 #endif 4947 { 4948 /* 4949 * Flush unused drbrs, since hn_tx_ring_inuse may be 4950 * reduced. 4951 */ 4952 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 4953 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4954 } 4955 4956 /* 4957 * Kick start TX. 
4958 */ 4959 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4960 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4961 4962 /* 4963 * Use txeof task, so that any pending oactive can be 4964 * cleared properly. 4965 */ 4966 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4967 } 4968 } 4969 4970 static void 4971 hn_resume_mgmt(struct hn_softc *sc) 4972 { 4973 4974 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 4975 4976 /* 4977 * Kick off network change detection, if it was pending. 4978 * If no network change was pending, start link status 4979 * checks, which is more lightweight than network change 4980 * detection. 4981 */ 4982 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 4983 hn_change_network(sc); 4984 else 4985 hn_update_link_status(sc); 4986 } 4987 4988 static void 4989 hn_resume(struct hn_softc *sc) 4990 { 4991 4992 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4993 hn_resume_data(sc); 4994 hn_resume_mgmt(sc); 4995 } 4996 4997 static void 4998 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 4999 { 5000 const struct rndis_status_msg *msg; 5001 int ofs; 5002 5003 if (dlen < sizeof(*msg)) { 5004 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 5005 return; 5006 } 5007 msg = data; 5008 5009 switch (msg->rm_status) { 5010 case RNDIS_STATUS_MEDIA_CONNECT: 5011 case RNDIS_STATUS_MEDIA_DISCONNECT: 5012 hn_update_link_status(sc); 5013 break; 5014 5015 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 5016 /* Not really useful; ignore. */ 5017 break; 5018 5019 case RNDIS_STATUS_NETWORK_CHANGE: 5020 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 5021 if (dlen < ofs + msg->rm_stbuflen || 5022 msg->rm_stbuflen < sizeof(uint32_t)) { 5023 if_printf(sc->hn_ifp, "network changed\n"); 5024 } else { 5025 uint32_t change; 5026 5027 memcpy(&change, ((const uint8_t *)msg) + ofs, 5028 sizeof(change)); 5029 if_printf(sc->hn_ifp, "network changed, change %u\n", 5030 change); 5031 } 5032 hn_change_network(sc); 5033 break; 5034 5035 default: 5036 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 5037 msg->rm_status); 5038 break; 5039 } 5040 } 5041 5042 static int 5043 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 5044 { 5045 const struct rndis_pktinfo *pi = info_data; 5046 uint32_t mask = 0; 5047 5048 while (info_dlen != 0) { 5049 const void *data; 5050 uint32_t dlen; 5051 5052 if (__predict_false(info_dlen < sizeof(*pi))) 5053 return (EINVAL); 5054 if (__predict_false(info_dlen < pi->rm_size)) 5055 return (EINVAL); 5056 info_dlen -= pi->rm_size; 5057 5058 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 5059 return (EINVAL); 5060 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 5061 return (EINVAL); 5062 dlen = pi->rm_size - pi->rm_pktinfooffset; 5063 data = pi->rm_data; 5064 5065 switch (pi->rm_type) { 5066 case NDIS_PKTINFO_TYPE_VLAN: 5067 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 5068 return (EINVAL); 5069 info->vlan_info = *((const uint32_t *)data); 5070 mask |= HN_RXINFO_VLAN; 5071 break; 5072 5073 case NDIS_PKTINFO_TYPE_CSUM: 5074 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 5075 return (EINVAL); 5076 info->csum_info = *((const uint32_t *)data); 5077 mask |= HN_RXINFO_CSUM; 5078 break; 5079 5080 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5081 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5082 return (EINVAL); 5083 info->hash_value = *((const uint32_t *)data); 5084 mask |= HN_RXINFO_HASHVAL; 5085 break; 5086 5087 case HN_NDIS_PKTINFO_TYPE_HASHINF: 5088 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 5089 
return (EINVAL); 5090 info->hash_info = *((const uint32_t *)data); 5091 mask |= HN_RXINFO_HASHINF; 5092 break; 5093 5094 default: 5095 goto next; 5096 } 5097 5098 if (mask == HN_RXINFO_ALL) { 5099 /* All found; done */ 5100 break; 5101 } 5102 next: 5103 pi = (const struct rndis_pktinfo *) 5104 ((const uint8_t *)pi + pi->rm_size); 5105 } 5106 5107 /* 5108 * Final fixup. 5109 * - If there is no hash value, invalidate the hash info. 5110 */ 5111 if ((mask & HN_RXINFO_HASHVAL) == 0) 5112 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 5113 return (0); 5114 } 5115 5116 static __inline bool 5117 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 5118 { 5119 5120 if (off < check_off) { 5121 if (__predict_true(off + len <= check_off)) 5122 return (false); 5123 } else if (off > check_off) { 5124 if (__predict_true(check_off + check_len <= off)) 5125 return (false); 5126 } 5127 return (true); 5128 } 5129 5130 static void 5131 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5132 { 5133 const struct rndis_packet_msg *pkt; 5134 struct hn_rxinfo info; 5135 int data_off, pktinfo_off, data_len, pktinfo_len; 5136 5137 /* 5138 * Check length. 5139 */ 5140 if (__predict_false(dlen < sizeof(*pkt))) { 5141 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 5142 return; 5143 } 5144 pkt = data; 5145 5146 if (__predict_false(dlen < pkt->rm_len)) { 5147 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 5148 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 5149 return; 5150 } 5151 if (__predict_false(pkt->rm_len < 5152 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 5153 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 5154 "msglen %u, data %u, oob %u, pktinfo %u\n", 5155 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 5156 pkt->rm_pktinfolen); 5157 return; 5158 } 5159 if (__predict_false(pkt->rm_datalen == 0)) { 5160 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 5161 return; 5162 } 5163 5164 /* 5165 * Check offests. 5166 */ 5167 #define IS_OFFSET_INVALID(ofs) \ 5168 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 5169 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 5170 5171 /* XXX Hyper-V does not meet data offset alignment requirement */ 5172 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 5173 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5174 "data offset %u\n", pkt->rm_dataoffset); 5175 return; 5176 } 5177 if (__predict_false(pkt->rm_oobdataoffset > 0 && 5178 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 5179 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5180 "oob offset %u\n", pkt->rm_oobdataoffset); 5181 return; 5182 } 5183 if (__predict_true(pkt->rm_pktinfooffset > 0) && 5184 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 5185 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5186 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 5187 return; 5188 } 5189 5190 #undef IS_OFFSET_INVALID 5191 5192 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 5193 data_len = pkt->rm_datalen; 5194 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 5195 pktinfo_len = pkt->rm_pktinfolen; 5196 5197 /* 5198 * Check OOB coverage. 
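	 * (OOB data, if present, must fit within the message and must not
	 * overlap the data or pktinfo regions)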
5199 */ 5200 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5201 int oob_off, oob_len; 5202 5203 if_printf(rxr->hn_ifp, "got oobdata\n"); 5204 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5205 oob_len = pkt->rm_oobdatalen; 5206 5207 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5208 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5209 "oob overflow, msglen %u, oob abs %d len %d\n", 5210 pkt->rm_len, oob_off, oob_len); 5211 return; 5212 } 5213 5214 /* 5215 * Check against data. 5216 */ 5217 if (hn_rndis_check_overlap(oob_off, oob_len, 5218 data_off, data_len)) { 5219 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5220 "oob overlaps data, oob abs %d len %d, " 5221 "data abs %d len %d\n", 5222 oob_off, oob_len, data_off, data_len); 5223 return; 5224 } 5225 5226 /* 5227 * Check against pktinfo. 5228 */ 5229 if (pktinfo_len != 0 && 5230 hn_rndis_check_overlap(oob_off, oob_len, 5231 pktinfo_off, pktinfo_len)) { 5232 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5233 "oob overlaps pktinfo, oob abs %d len %d, " 5234 "pktinfo abs %d len %d\n", 5235 oob_off, oob_len, pktinfo_off, pktinfo_len); 5236 return; 5237 } 5238 } 5239 5240 /* 5241 * Check per-packet-info coverage and find useful per-packet-info. 5242 */ 5243 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5244 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5245 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5246 if (__predict_true(pktinfo_len != 0)) { 5247 bool overlap; 5248 int error; 5249 5250 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5251 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5252 "pktinfo overflow, msglen %u, " 5253 "pktinfo abs %d len %d\n", 5254 pkt->rm_len, pktinfo_off, pktinfo_len); 5255 return; 5256 } 5257 5258 /* 5259 * Check packet info coverage. 5260 */ 5261 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5262 data_off, data_len); 5263 if (__predict_false(overlap)) { 5264 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5265 "pktinfo overlap data, pktinfo abs %d len %d, " 5266 "data abs %d len %d\n", 5267 pktinfo_off, pktinfo_len, data_off, data_len); 5268 return; 5269 } 5270 5271 /* 5272 * Find useful per-packet-info. 5273 */ 5274 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5275 pktinfo_len, &info); 5276 if (__predict_false(error)) { 5277 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5278 "pktinfo\n"); 5279 return; 5280 } 5281 } 5282 5283 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5284 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5285 "data overflow, msglen %u, data abs %d len %d\n", 5286 pkt->rm_len, data_off, data_len); 5287 return; 5288 } 5289 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5290 } 5291 5292 static __inline void 5293 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5294 { 5295 const struct rndis_msghdr *hdr; 5296 5297 if (__predict_false(dlen < sizeof(*hdr))) { 5298 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5299 return; 5300 } 5301 hdr = data; 5302 5303 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5304 /* Hot data path. */ 5305 hn_rndis_rx_data(rxr, data, dlen); 5306 /* Done! 
*/ 5307 return; 5308 } 5309 5310 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5311 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5312 else 5313 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5314 } 5315 5316 static void 5317 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5318 { 5319 const struct hn_nvs_hdr *hdr; 5320 5321 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5322 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5323 return; 5324 } 5325 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5326 5327 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5328 /* Useless; ignore */ 5329 return; 5330 } 5331 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5332 } 5333 5334 static void 5335 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5336 const struct vmbus_chanpkt_hdr *pkt) 5337 { 5338 struct hn_nvs_sendctx *sndc; 5339 5340 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5341 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5342 VMBUS_CHANPKT_DATALEN(pkt)); 5343 /* 5344 * NOTE: 5345 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5346 * its callback. 5347 */ 5348 } 5349 5350 static void 5351 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5352 const struct vmbus_chanpkt_hdr *pkthdr) 5353 { 5354 const struct vmbus_chanpkt_rxbuf *pkt; 5355 const struct hn_nvs_hdr *nvs_hdr; 5356 int count, i, hlen; 5357 5358 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5359 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5360 return; 5361 } 5362 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5363 5364 /* Make sure that this is a RNDIS message. */ 5365 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5366 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5367 nvs_hdr->nvs_type); 5368 return; 5369 } 5370 5371 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5372 if (__predict_false(hlen < sizeof(*pkt))) { 5373 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5374 return; 5375 } 5376 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5377 5378 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5379 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5380 pkt->cp_rxbuf_id); 5381 return; 5382 } 5383 5384 count = pkt->cp_rxbuf_cnt; 5385 if (__predict_false(hlen < 5386 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5387 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5388 return; 5389 } 5390 5391 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5392 for (i = 0; i < count; ++i) { 5393 int ofs, len; 5394 5395 ofs = pkt->cp_rxbuf[i].rb_ofs; 5396 len = pkt->cp_rxbuf[i].rb_len; 5397 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5398 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5399 "ofs %d, len %d\n", i, ofs, len); 5400 continue; 5401 } 5402 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5403 } 5404 5405 /* 5406 * Ack the consumed RXBUF associated w/ this channel packet, 5407 * so that this RXBUF can be recycled by the hypervisor. 
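	 * (the ack is a completion packet carrying HN_NVS_TYPE_RNDIS_ACK
	 * for the packet's transaction id)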
5408 */ 5409 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5410 } 5411 5412 static void 5413 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5414 uint64_t tid) 5415 { 5416 struct hn_nvs_rndis_ack ack; 5417 int retries, error; 5418 5419 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5420 ack.nvs_status = HN_NVS_STATUS_OK; 5421 5422 retries = 0; 5423 again: 5424 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5425 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5426 if (__predict_false(error == EAGAIN)) { 5427 /* 5428 * NOTE: 5429 * This should _not_ happen in real world, since the 5430 * consumption of the TX bufring from the TX path is 5431 * controlled. 5432 */ 5433 if (rxr->hn_ack_failed == 0) 5434 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5435 rxr->hn_ack_failed++; 5436 retries++; 5437 if (retries < 10) { 5438 DELAY(100); 5439 goto again; 5440 } 5441 /* RXBUF leaks! */ 5442 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5443 } 5444 } 5445 5446 static void 5447 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5448 { 5449 struct hn_rx_ring *rxr = xrxr; 5450 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5451 5452 for (;;) { 5453 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5454 int error, pktlen; 5455 5456 pktlen = rxr->hn_pktbuf_len; 5457 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5458 if (__predict_false(error == ENOBUFS)) { 5459 void *nbuf; 5460 int nlen; 5461 5462 /* 5463 * Expand channel packet buffer. 5464 * 5465 * XXX 5466 * Use M_WAITOK here, since allocation failure 5467 * is fatal. 5468 */ 5469 nlen = rxr->hn_pktbuf_len * 2; 5470 while (nlen < pktlen) 5471 nlen *= 2; 5472 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5473 5474 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5475 rxr->hn_pktbuf_len, nlen); 5476 5477 free(rxr->hn_pktbuf, M_DEVBUF); 5478 rxr->hn_pktbuf = nbuf; 5479 rxr->hn_pktbuf_len = nlen; 5480 /* Retry! */ 5481 continue; 5482 } else if (__predict_false(error == EAGAIN)) { 5483 /* No more channel packets; done! */ 5484 break; 5485 } 5486 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5487 5488 switch (pkt->cph_type) { 5489 case VMBUS_CHANPKT_TYPE_COMP: 5490 hn_nvs_handle_comp(sc, chan, pkt); 5491 break; 5492 5493 case VMBUS_CHANPKT_TYPE_RXBUF: 5494 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5495 break; 5496 5497 case VMBUS_CHANPKT_TYPE_INBAND: 5498 hn_nvs_handle_notify(sc, pkt); 5499 break; 5500 5501 default: 5502 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5503 pkt->cph_type); 5504 break; 5505 } 5506 } 5507 hn_chan_rollup(rxr, rxr->hn_txr); 5508 } 5509 5510 static void 5511 hn_tx_taskq_create(void *arg __unused) 5512 { 5513 int i; 5514 5515 /* 5516 * Fix the # of TX taskqueues. 5517 */ 5518 if (hn_tx_taskq_cnt <= 0) 5519 hn_tx_taskq_cnt = 1; 5520 else if (hn_tx_taskq_cnt > mp_ncpus) 5521 hn_tx_taskq_cnt = mp_ncpus; 5522 5523 /* 5524 * Fix the TX taskqueue mode. 
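	 * (unrecognized modes fall back to HN_TX_TASKQ_M_INDEP)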
5525 */ 5526 switch (hn_tx_taskq_mode) { 5527 case HN_TX_TASKQ_M_INDEP: 5528 case HN_TX_TASKQ_M_GLOBAL: 5529 case HN_TX_TASKQ_M_EVTTQ: 5530 break; 5531 default: 5532 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5533 break; 5534 } 5535 5536 if (vm_guest != VM_GUEST_HV) 5537 return; 5538 5539 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5540 return; 5541 5542 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5543 M_DEVBUF, M_WAITOK); 5544 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5545 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5546 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5547 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5548 "hn tx%d", i); 5549 } 5550 } 5551 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5552 hn_tx_taskq_create, NULL); 5553 5554 static void 5555 hn_tx_taskq_destroy(void *arg __unused) 5556 { 5557 5558 if (hn_tx_taskque != NULL) { 5559 int i; 5560 5561 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5562 taskqueue_free(hn_tx_taskque[i]); 5563 free(hn_tx_taskque, M_DEVBUF); 5564 } 5565 } 5566 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5567 hn_tx_taskq_destroy, NULL); 5568