1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/bus.h> 65 #include <sys/kernel.h> 66 #include <sys/limits.h> 67 #include <sys/malloc.h> 68 #include <sys/mbuf.h> 69 #include <sys/module.h> 70 #include <sys/queue.h> 71 #include <sys/lock.h> 72 #include <sys/smp.h> 73 #include <sys/socket.h> 74 #include <sys/sockio.h> 75 #include <sys/sx.h> 76 #include <sys/sysctl.h> 77 #include <sys/systm.h> 78 #include <sys/taskqueue.h> 79 #include <sys/buf_ring.h> 80 81 #include <machine/atomic.h> 82 #include <machine/in_cksum.h> 83 84 #include <net/bpf.h> 85 #include <net/ethernet.h> 86 #include <net/if.h> 87 #include <net/if_media.h> 88 #include <net/if_types.h> 89 #include <net/if_var.h> 90 #include <net/rndis.h> 91 #ifdef RSS 92 #include <net/rss_config.h> 93 #endif 94 95 #include <netinet/in_systm.h> 96 #include <netinet/in.h> 97 #include <netinet/ip.h> 98 #include <netinet/ip6.h> 99 #include <netinet/tcp.h> 100 #include <netinet/tcp_lro.h> 101 #include <netinet/udp.h> 102 103 #include <dev/hyperv/include/hyperv.h> 104 #include <dev/hyperv/include/hyperv_busdma.h> 105 #include <dev/hyperv/include/vmbus.h> 106 #include <dev/hyperv/include/vmbus_xact.h> 107 108 #include <dev/hyperv/netvsc/ndis.h> 109 #include <dev/hyperv/netvsc/if_hnreg.h> 110 #include <dev/hyperv/netvsc/if_hnvar.h> 111 #include <dev/hyperv/netvsc/hn_nvs.h> 112 #include <dev/hyperv/netvsc/hn_rndis.h> 113 114 #include "vmbus_if.h" 115 116 #define HN_IFSTART_SUPPORT 117 118 #define HN_RING_CNT_DEF_MAX 8 119 120 /* YYY should get it from the underlying channel */ 121 #define HN_TX_DESC_CNT 512 122 123 #define HN_RNDIS_PKT_LEN \ 124 (sizeof(struct rndis_packet_msg) + \ 125 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 126 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 127 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 128 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 129 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 130 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 131 132 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 133 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 134 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 135 /* -1 for RNDIS packet message */ 136 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 137 138 #define HN_DIRECT_TX_SIZE_DEF 128 139 140 #define HN_EARLY_TXEOF_THRESH 8 141 142 #define HN_PKTBUF_LEN_DEF (16 * 1024) 143 144 #define HN_LROENT_CNT_DEF 128 145 146 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 147 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 148 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 149 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 150 151 #define HN_LRO_ACKCNT_DEF 1 152 153 #define HN_LOCK_INIT(sc) \ 154 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 155 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 156 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 157 #define HN_LOCK(sc) \ 158 do { \ 159 while (sx_try_xlock(&(sc)->hn_lock) == 0) \ 160 DELAY(1000); \ 161 } while (0) 162 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 163 164 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 165 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 166 #define HN_CSUM_IP_HWASSIST(sc) \ 167 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 168 #define HN_CSUM_IP6_HWASSIST(sc) \ 169 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 170 171 #define HN_PKTSIZE_MIN(align) \ 172 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 173 HN_RNDIS_PKT_LEN, (align)) 174 #define HN_PKTSIZE(m, align) \ 175 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 176 177 #ifdef RSS 178 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 179 #else 180 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 181 #endif 182 183 struct hn_txdesc { 184 #ifndef HN_USE_TXDESC_BUFRING 185 SLIST_ENTRY(hn_txdesc) link; 186 #endif 187 STAILQ_ENTRY(hn_txdesc) agg_link; 188 189 /* Aggregated txdescs, in sending order. */ 190 STAILQ_HEAD(, hn_txdesc) agg_list; 191 192 /* The oldest packet, if transmission aggregation happens. */ 193 struct mbuf *m; 194 struct hn_tx_ring *txr; 195 int refs; 196 uint32_t flags; /* HN_TXD_FLAG_ */ 197 struct hn_nvs_sendctx send_ctx; 198 uint32_t chim_index; 199 int chim_size; 200 201 bus_dmamap_t data_dmap; 202 203 bus_addr_t rndis_pkt_paddr; 204 struct rndis_packet_msg *rndis_pkt; 205 bus_dmamap_t rndis_pkt_dmap; 206 }; 207 208 #define HN_TXD_FLAG_ONLIST 0x0001 209 #define HN_TXD_FLAG_DMAMAP 0x0002 210 #define HN_TXD_FLAG_ONAGG 0x0004 211 212 struct hn_rxinfo { 213 uint32_t vlan_info; 214 uint32_t csum_info; 215 uint32_t hash_info; 216 uint32_t hash_value; 217 }; 218 219 #define HN_RXINFO_VLAN 0x0001 220 #define HN_RXINFO_CSUM 0x0002 221 #define HN_RXINFO_HASHINF 0x0004 222 #define HN_RXINFO_HASHVAL 0x0008 223 #define HN_RXINFO_ALL \ 224 (HN_RXINFO_VLAN | \ 225 HN_RXINFO_CSUM | \ 226 HN_RXINFO_HASHINF | \ 227 HN_RXINFO_HASHVAL) 228 229 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff 230 #define HN_NDIS_RXCSUM_INFO_INVALID 0 231 #define HN_NDIS_HASH_INFO_INVALID 0 232 233 static int hn_probe(device_t); 234 static int hn_attach(device_t); 235 static int hn_detach(device_t); 236 static int hn_shutdown(device_t); 237 static void hn_chan_callback(struct vmbus_channel *, 238 void *); 239 240 static void hn_init(void *); 241 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 242 #ifdef HN_IFSTART_SUPPORT 243 static void hn_start(struct ifnet *); 244 #endif 245 static int hn_transmit(struct ifnet *, struct mbuf *); 246 static void hn_xmit_qflush(struct ifnet *); 247 static int hn_ifmedia_upd(struct ifnet *); 248 static void hn_ifmedia_sts(struct ifnet *, 249 struct ifmediareq *); 250 251 static int hn_rndis_rxinfo(const void *, int, 252 struct hn_rxinfo *); 253 static void hn_rndis_rx_data(struct hn_rx_ring *, 254 const void *, int); 255 static void hn_rndis_rx_status(struct hn_softc *, 256 const void *, int); 257 258 static void hn_nvs_handle_notify(struct hn_softc *, 259 const struct vmbus_chanpkt_hdr *); 260 static void hn_nvs_handle_comp(struct hn_softc *, 261 struct 
vmbus_channel *, 262 const struct vmbus_chanpkt_hdr *); 263 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 264 struct vmbus_channel *, 265 const struct vmbus_chanpkt_hdr *); 266 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 267 struct vmbus_channel *, uint64_t); 268 269 #if __FreeBSD_version >= 1100099 270 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 271 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 272 #endif 273 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 274 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 275 #if __FreeBSD_version < 1100095 276 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 277 #else 278 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 279 #endif 280 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 281 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 282 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 283 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 284 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 285 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 286 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 287 #ifndef RSS 288 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 289 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 290 #endif 291 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 292 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 293 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 294 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 295 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 296 297 static void hn_stop(struct hn_softc *); 298 static void hn_init_locked(struct hn_softc *); 299 static int hn_chan_attach(struct hn_softc *, 300 struct vmbus_channel *); 301 static void hn_chan_detach(struct hn_softc *, 302 struct vmbus_channel *); 303 static int hn_attach_subchans(struct hn_softc *); 304 static void hn_detach_allchans(struct hn_softc *); 305 static void hn_chan_rollup(struct hn_rx_ring *, 306 struct hn_tx_ring *); 307 static void hn_set_ring_inuse(struct hn_softc *, int); 308 static int hn_synth_attach(struct hn_softc *, int); 309 static void hn_synth_detach(struct hn_softc *); 310 static int hn_synth_alloc_subchans(struct hn_softc *, 311 int *); 312 static bool hn_synth_attachable(const struct hn_softc *); 313 static void hn_suspend(struct hn_softc *); 314 static void hn_suspend_data(struct hn_softc *); 315 static void hn_suspend_mgmt(struct hn_softc *); 316 static void hn_resume(struct hn_softc *); 317 static void hn_resume_data(struct hn_softc *); 318 static void hn_resume_mgmt(struct hn_softc *); 319 static void hn_suspend_mgmt_taskfunc(void *, int); 320 static void hn_chan_drain(struct hn_softc *, 321 struct vmbus_channel *); 322 323 static void hn_update_link_status(struct hn_softc *); 324 static void hn_change_network(struct hn_softc *); 325 static void hn_link_taskfunc(void *, int); 326 static void hn_netchg_init_taskfunc(void *, int); 327 static void hn_netchg_status_taskfunc(void *, int); 328 static void hn_link_status(struct hn_softc *); 329 330 static int hn_create_rx_data(struct hn_softc *, int); 331 static void hn_destroy_rx_data(struct hn_softc *); 332 static int hn_check_iplen(const struct mbuf *, int); 333 static int hn_set_rxfilter(struct hn_softc *); 334 #ifndef RSS 335 static int hn_rss_reconfig(struct hn_softc *); 336 #endif 337 static void hn_rss_ind_fixup(struct hn_softc *); 338 static int hn_rxpkt(struct hn_rx_ring *, const void *, 339 int, const struct hn_rxinfo *); 340 341 static int 
hn_tx_ring_create(struct hn_softc *, int);
static void hn_tx_ring_destroy(struct hn_tx_ring *);
static int hn_create_tx_data(struct hn_softc *, int);
static void hn_fixup_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void hn_txdesc_gc(struct hn_tx_ring *,
    struct hn_txdesc *);
static int hn_encap(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *, struct mbuf **);
static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *);
static void hn_set_chim_size(struct hn_softc *, int);
static void hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static void hn_resume_tx(struct hn_softc *, int);
static void hn_set_txagg(struct hn_softc *);
static void *hn_try_txagg(struct ifnet *,
    struct hn_tx_ring *, struct hn_txdesc *,
    int);
static int hn_get_txswq_depth(const struct hn_tx_ring *);
static void hn_txpkt_done(struct hn_nvs_sendctx *,
    struct hn_softc *, struct vmbus_channel *,
    const void *, int);
static int hn_txpkt_sglist(struct hn_tx_ring *,
    struct hn_txdesc *);
static int hn_txpkt_chim(struct hn_tx_ring *,
    struct hn_txdesc *);
static int hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int hn_start_locked(struct hn_tx_ring *, int);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof(struct hn_tx_ring *);
static void hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segments verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagrams verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packets verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
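
/*
 * All of the CTLFLAG_RDTUN knobs above are loader tunables, so they can
 * also be set from /boot/loader.conf before the driver attaches.  An
 * illustrative example (the values shown are examples only, not
 * recommendations):
 *
 *	hw.hn.trust_hosttcp="1"
 *	hw.hn.chan_cnt="4"
 *	hw.hn.tx_agg_size="0"
 *
 * When a tunable is left unset, the defaults declared above are used.
 */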
489 490 static u_int hn_cpu_index; /* next CPU for channel */ 491 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 492 493 #ifndef RSS 494 static const uint8_t 495 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 496 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 497 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 498 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 499 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 500 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 501 }; 502 #endif /* !RSS */ 503 504 static device_method_t hn_methods[] = { 505 /* Device interface */ 506 DEVMETHOD(device_probe, hn_probe), 507 DEVMETHOD(device_attach, hn_attach), 508 DEVMETHOD(device_detach, hn_detach), 509 DEVMETHOD(device_shutdown, hn_shutdown), 510 DEVMETHOD_END 511 }; 512 513 static driver_t hn_driver = { 514 "hn", 515 hn_methods, 516 sizeof(struct hn_softc) 517 }; 518 519 static devclass_t hn_devclass; 520 521 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); 522 MODULE_VERSION(hn, 1); 523 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 524 525 #if __FreeBSD_version >= 1100099 526 static void 527 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 528 { 529 int i; 530 531 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 532 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 533 } 534 #endif 535 536 static int 537 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 538 { 539 540 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 541 txd->chim_size == 0, ("invalid rndis sglist txd")); 542 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 543 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 544 } 545 546 static int 547 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 548 { 549 struct hn_nvs_rndis rndis; 550 551 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 552 txd->chim_size > 0, ("invalid rndis chim txd")); 553 554 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 555 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 556 rndis.nvs_chim_idx = txd->chim_index; 557 rndis.nvs_chim_sz = txd->chim_size; 558 559 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 560 &rndis, sizeof(rndis), &txd->send_ctx)); 561 } 562 563 static __inline uint32_t 564 hn_chim_alloc(struct hn_softc *sc) 565 { 566 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 567 u_long *bmap = sc->hn_chim_bmap; 568 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 569 570 for (i = 0; i < bmap_cnt; ++i) { 571 int idx; 572 573 idx = ffsl(~bmap[i]); 574 if (idx == 0) 575 continue; 576 577 --idx; /* ffsl is 1-based */ 578 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 579 ("invalid i %d and idx %d", i, idx)); 580 581 if (atomic_testandset_long(&bmap[i], idx)) 582 continue; 583 584 ret = i * LONG_BIT + idx; 585 break; 586 } 587 return (ret); 588 } 589 590 static __inline void 591 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 592 { 593 u_long mask; 594 uint32_t idx; 595 596 idx = chim_idx / LONG_BIT; 597 KASSERT(idx < sc->hn_chim_bmap_cnt, 598 ("invalid chimney index 0x%x", chim_idx)); 599 600 mask = 1UL << (chim_idx % LONG_BIT); 601 KASSERT(sc->hn_chim_bmap[idx] & mask, 602 ("index bitmap 0x%lx, chimney index %u, " 603 "bitmap idx %d, bitmask 0x%lx", 604 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 605 606 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 607 } 608 609 #if defined(INET6) || defined(INET) 610 /* 611 * NOTE: If this function failed, the m_head would be freed. 
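 *
 * hn_tso_fixup() pulls the Ethernet, IP and TCP headers into the
 * first mbuf, clears ip_len/ip_sum (or ip6_plen for IPv6) and seeds
 * th_sum with the pseudo header checksum before the packet is handed
 * to the host for segmentation.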
612 */ 613 static __inline struct mbuf * 614 hn_tso_fixup(struct mbuf *m_head) 615 { 616 struct ether_vlan_header *evl; 617 struct tcphdr *th; 618 int ehlen; 619 620 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 621 622 #define PULLUP_HDR(m, len) \ 623 do { \ 624 if (__predict_false((m)->m_len < (len))) { \ 625 (m) = m_pullup((m), (len)); \ 626 if ((m) == NULL) \ 627 return (NULL); \ 628 } \ 629 } while (0) 630 631 PULLUP_HDR(m_head, sizeof(*evl)); 632 evl = mtod(m_head, struct ether_vlan_header *); 633 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 634 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 635 else 636 ehlen = ETHER_HDR_LEN; 637 638 #ifdef INET 639 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 640 struct ip *ip; 641 int iphlen; 642 643 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 644 ip = mtodo(m_head, ehlen); 645 iphlen = ip->ip_hl << 2; 646 647 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 648 th = mtodo(m_head, ehlen + iphlen); 649 650 ip->ip_len = 0; 651 ip->ip_sum = 0; 652 th->th_sum = in_pseudo(ip->ip_src.s_addr, 653 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 654 } 655 #endif 656 #if defined(INET6) && defined(INET) 657 else 658 #endif 659 #ifdef INET6 660 { 661 struct ip6_hdr *ip6; 662 663 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 664 ip6 = mtodo(m_head, ehlen); 665 if (ip6->ip6_nxt != IPPROTO_TCP) { 666 m_freem(m_head); 667 return (NULL); 668 } 669 670 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 671 th = mtodo(m_head, ehlen + sizeof(*ip6)); 672 673 ip6->ip6_plen = 0; 674 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 675 } 676 #endif 677 return (m_head); 678 679 #undef PULLUP_HDR 680 } 681 #endif /* INET6 || INET */ 682 683 static int 684 hn_set_rxfilter(struct hn_softc *sc) 685 { 686 struct ifnet *ifp = sc->hn_ifp; 687 uint32_t filter; 688 int error = 0; 689 690 HN_LOCK_ASSERT(sc); 691 692 if (ifp->if_flags & IFF_PROMISC) { 693 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 694 } else { 695 filter = NDIS_PACKET_TYPE_DIRECTED; 696 if (ifp->if_flags & IFF_BROADCAST) 697 filter |= NDIS_PACKET_TYPE_BROADCAST; 698 /* TODO: support multicast list */ 699 if ((ifp->if_flags & IFF_ALLMULTI) || 700 !TAILQ_EMPTY(&ifp->if_multiaddrs)) 701 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 702 } 703 704 if (sc->hn_rx_filter != filter) { 705 error = hn_rndis_set_rxfilter(sc, filter); 706 if (!error) 707 sc->hn_rx_filter = filter; 708 } 709 return (error); 710 } 711 712 static void 713 hn_set_txagg(struct hn_softc *sc) 714 { 715 uint32_t size, pkts; 716 int i; 717 718 /* 719 * Setup aggregation size. 720 */ 721 if (sc->hn_agg_size < 0) 722 size = UINT32_MAX; 723 else 724 size = sc->hn_agg_size; 725 726 if (sc->hn_rndis_agg_size < size) 727 size = sc->hn_rndis_agg_size; 728 729 /* NOTE: We only aggregate packets using chimney sending buffers. */ 730 if (size > (uint32_t)sc->hn_chim_szmax) 731 size = sc->hn_chim_szmax; 732 733 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 734 /* Disable */ 735 size = 0; 736 pkts = 0; 737 goto done; 738 } 739 740 /* NOTE: Type of the per TX ring setting is 'int'. */ 741 if (size > INT_MAX) 742 size = INT_MAX; 743 744 /* 745 * Setup aggregation packet count. 746 */ 747 if (sc->hn_agg_pkts < 0) 748 pkts = UINT32_MAX; 749 else 750 pkts = sc->hn_agg_pkts; 751 752 if (sc->hn_rndis_agg_pkts < pkts) 753 pkts = sc->hn_rndis_agg_pkts; 754 755 if (pkts <= 1) { 756 /* Disable */ 757 size = 0; 758 pkts = 0; 759 goto done; 760 } 761 762 /* NOTE: Type of the per TX ring setting is 'short'. 
*/ 763 if (pkts > SHRT_MAX) 764 pkts = SHRT_MAX; 765 766 done: 767 /* NOTE: Type of the per TX ring setting is 'short'. */ 768 if (sc->hn_rndis_agg_align > SHRT_MAX) { 769 /* Disable */ 770 size = 0; 771 pkts = 0; 772 } 773 774 if (bootverbose) { 775 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 776 size, pkts, sc->hn_rndis_agg_align); 777 } 778 779 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 780 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 781 782 mtx_lock(&txr->hn_tx_lock); 783 txr->hn_agg_szmax = size; 784 txr->hn_agg_pktmax = pkts; 785 txr->hn_agg_align = sc->hn_rndis_agg_align; 786 mtx_unlock(&txr->hn_tx_lock); 787 } 788 } 789 790 static int 791 hn_get_txswq_depth(const struct hn_tx_ring *txr) 792 { 793 794 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 795 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 796 return txr->hn_txdesc_cnt; 797 return hn_tx_swq_depth; 798 } 799 800 #ifndef RSS 801 static int 802 hn_rss_reconfig(struct hn_softc *sc) 803 { 804 int error; 805 806 HN_LOCK_ASSERT(sc); 807 808 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 809 return (ENXIO); 810 811 /* 812 * Disable RSS first. 813 * 814 * NOTE: 815 * Direct reconfiguration by setting the UNCHG flags does 816 * _not_ work properly. 817 */ 818 if (bootverbose) 819 if_printf(sc->hn_ifp, "disable RSS\n"); 820 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 821 if (error) { 822 if_printf(sc->hn_ifp, "RSS disable failed\n"); 823 return (error); 824 } 825 826 /* 827 * Reenable the RSS w/ the updated RSS key or indirect 828 * table. 829 */ 830 if (bootverbose) 831 if_printf(sc->hn_ifp, "reconfig RSS\n"); 832 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 833 if (error) { 834 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 835 return (error); 836 } 837 return (0); 838 } 839 #endif /* !RSS */ 840 841 static void 842 hn_rss_ind_fixup(struct hn_softc *sc) 843 { 844 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 845 int i, nchan; 846 847 nchan = sc->hn_rx_ring_inuse; 848 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 849 850 /* 851 * Check indirect table to make sure that all channels in it 852 * can be used. 
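 *
 * Entries that reference a ring at or beyond hn_rx_ring_inuse are
 * clamped to the last usable ring below.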
 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
975 * 976 * NOTE: 977 * The # of RX rings to use is same as the # of channels to use. 978 */ 979 ring_cnt = hn_chan_cnt; 980 if (ring_cnt <= 0) { 981 /* Default */ 982 ring_cnt = mp_ncpus; 983 if (ring_cnt > HN_RING_CNT_DEF_MAX) 984 ring_cnt = HN_RING_CNT_DEF_MAX; 985 } else if (ring_cnt > mp_ncpus) { 986 ring_cnt = mp_ncpus; 987 } 988 #ifdef RSS 989 if (ring_cnt > rss_getnumbuckets()) 990 ring_cnt = rss_getnumbuckets(); 991 #endif 992 993 tx_ring_cnt = hn_tx_ring_cnt; 994 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 995 tx_ring_cnt = ring_cnt; 996 #ifdef HN_IFSTART_SUPPORT 997 if (hn_use_if_start) { 998 /* ifnet.if_start only needs one TX ring. */ 999 tx_ring_cnt = 1; 1000 } 1001 #endif 1002 1003 /* 1004 * Set the leader CPU for channels. 1005 */ 1006 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 1007 1008 /* 1009 * Create enough TX/RX rings, even if only limited number of 1010 * channels can be allocated. 1011 */ 1012 error = hn_create_tx_data(sc, tx_ring_cnt); 1013 if (error) 1014 goto failed; 1015 error = hn_create_rx_data(sc, ring_cnt); 1016 if (error) 1017 goto failed; 1018 1019 /* 1020 * Create transaction context for NVS and RNDIS transactions. 1021 */ 1022 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 1023 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 1024 if (sc->hn_xact == NULL) { 1025 error = ENXIO; 1026 goto failed; 1027 } 1028 1029 /* 1030 * Install orphan handler for the revocation of this device's 1031 * primary channel. 1032 * 1033 * NOTE: 1034 * The processing order is critical here: 1035 * Install the orphan handler, _before_ testing whether this 1036 * device's primary channel has been revoked or not. 1037 */ 1038 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 1039 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 1040 error = ENXIO; 1041 goto failed; 1042 } 1043 1044 /* 1045 * Attach the synthetic parts, i.e. NVS and RNDIS. 1046 */ 1047 error = hn_synth_attach(sc, ETHERMTU); 1048 if (error) 1049 goto failed; 1050 1051 error = hn_rndis_get_eaddr(sc, eaddr); 1052 if (error) 1053 goto failed; 1054 1055 #if __FreeBSD_version >= 1100099 1056 if (sc->hn_rx_ring_inuse > 1) { 1057 /* 1058 * Reduce TCP segment aggregation limit for multiple 1059 * RX rings to increase ACK timeliness. 1060 */ 1061 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 1062 } 1063 #endif 1064 1065 /* 1066 * Fixup TX stuffs after synthetic parts are attached. 
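 *
 * hn_fixup_tx_data() propagates the negotiated chimney size and the
 * per-ring checksum/TSO assist settings used below.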
1067 */ 1068 hn_fixup_tx_data(sc); 1069 1070 ctx = device_get_sysctl_ctx(dev); 1071 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 1072 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 1073 &sc->hn_nvs_ver, 0, "NVS version"); 1074 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 1075 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1076 hn_ndis_version_sysctl, "A", "NDIS version"); 1077 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 1078 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1079 hn_caps_sysctl, "A", "capabilities"); 1080 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 1081 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1082 hn_hwassist_sysctl, "A", "hwassist"); 1083 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 1084 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1085 hn_rxfilter_sysctl, "A", "rxfilter"); 1086 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 1087 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1088 hn_rss_hash_sysctl, "A", "RSS hash"); 1089 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 1090 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 1091 #ifndef RSS 1092 /* 1093 * Don't allow RSS key/indirect table changes, if RSS is defined. 1094 */ 1095 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 1096 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1097 hn_rss_key_sysctl, "IU", "RSS key"); 1098 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 1099 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1100 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 1101 #endif 1102 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 1103 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 1104 "RNDIS offered packet transmission aggregation size limit"); 1105 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 1106 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 1107 "RNDIS offered packet transmission aggregation count limit"); 1108 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 1109 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 1110 "RNDIS packet transmission aggregation alignment"); 1111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 1112 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1113 hn_txagg_size_sysctl, "I", 1114 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 1115 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 1116 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1117 hn_txagg_pkts_sysctl, "I", 1118 "Packet transmission aggregation packets, " 1119 "0 -- disable, -1 -- auto"); 1120 1121 /* 1122 * Setup the ifmedia, which has been initialized earlier. 1123 */ 1124 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 1125 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 1126 /* XXX ifmedia_set really should do this for us */ 1127 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 1128 1129 /* 1130 * Setup the ifnet for this interface. 
 */

	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default; they can still
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuff has to be suspended manually here.
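			 *
			 * Both the data and the management paths must be
			 * quiesced before hn_synth_detach() tears down the
			 * NVS/RNDIS state.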
1239 */ 1240 hn_suspend_mgmt(sc); 1241 hn_synth_detach(sc); 1242 } 1243 HN_UNLOCK(sc); 1244 ether_ifdetach(ifp); 1245 } 1246 1247 ifmedia_removeall(&sc->hn_media); 1248 hn_destroy_rx_data(sc); 1249 hn_destroy_tx_data(sc); 1250 1251 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 1252 int i; 1253 1254 for (i = 0; i < hn_tx_taskq_cnt; ++i) 1255 taskqueue_free(sc->hn_tx_taskqs[i]); 1256 free(sc->hn_tx_taskqs, M_DEVBUF); 1257 } 1258 taskqueue_free(sc->hn_mgmt_taskq0); 1259 1260 if (sc->hn_xact != NULL) { 1261 /* 1262 * Uninstall the orphan handler _before_ the xact is 1263 * destructed. 1264 */ 1265 vmbus_chan_unset_orphan(sc->hn_prichan); 1266 vmbus_xact_ctx_destroy(sc->hn_xact); 1267 } 1268 1269 if_free(ifp); 1270 1271 HN_LOCK_DESTROY(sc); 1272 return (0); 1273 } 1274 1275 static int 1276 hn_shutdown(device_t dev) 1277 { 1278 1279 return (0); 1280 } 1281 1282 static void 1283 hn_link_status(struct hn_softc *sc) 1284 { 1285 uint32_t link_status; 1286 int error; 1287 1288 error = hn_rndis_get_linkstatus(sc, &link_status); 1289 if (error) { 1290 /* XXX what to do? */ 1291 return; 1292 } 1293 1294 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 1295 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 1296 else 1297 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1298 if_link_state_change(sc->hn_ifp, 1299 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 1300 LINK_STATE_UP : LINK_STATE_DOWN); 1301 } 1302 1303 static void 1304 hn_link_taskfunc(void *xsc, int pending __unused) 1305 { 1306 struct hn_softc *sc = xsc; 1307 1308 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 1309 return; 1310 hn_link_status(sc); 1311 } 1312 1313 static void 1314 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 1315 { 1316 struct hn_softc *sc = xsc; 1317 1318 /* Prevent any link status checks from running. */ 1319 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 1320 1321 /* 1322 * Fake up a [link down --> link up] state change; 5 seconds 1323 * delay is used, which closely simulates miibus reaction 1324 * upon link down event. 1325 */ 1326 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1327 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 1328 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 1329 &sc->hn_netchg_status, 5 * hz); 1330 } 1331 1332 static void 1333 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 1334 { 1335 struct hn_softc *sc = xsc; 1336 1337 /* Re-allow link status checks. 
 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail <
txr->hn_txdesc_cnt, 1453 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 1454 txr->hn_txdesc_avail++; 1455 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 1456 mtx_unlock_spin(&txr->hn_txlist_spin); 1457 #else /* HN_USE_TXDESC_BUFRING */ 1458 #ifdef HN_DEBUG 1459 atomic_add_int(&txr->hn_txdesc_avail, 1); 1460 #endif 1461 buf_ring_enqueue(txr->hn_txdesc_br, txd); 1462 #endif /* !HN_USE_TXDESC_BUFRING */ 1463 1464 return 1; 1465 } 1466 1467 static __inline struct hn_txdesc * 1468 hn_txdesc_get(struct hn_tx_ring *txr) 1469 { 1470 struct hn_txdesc *txd; 1471 1472 #ifndef HN_USE_TXDESC_BUFRING 1473 mtx_lock_spin(&txr->hn_txlist_spin); 1474 txd = SLIST_FIRST(&txr->hn_txlist); 1475 if (txd != NULL) { 1476 KASSERT(txr->hn_txdesc_avail > 0, 1477 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 1478 txr->hn_txdesc_avail--; 1479 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 1480 } 1481 mtx_unlock_spin(&txr->hn_txlist_spin); 1482 #else 1483 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 1484 #endif 1485 1486 if (txd != NULL) { 1487 #ifdef HN_USE_TXDESC_BUFRING 1488 #ifdef HN_DEBUG 1489 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 1490 #endif 1491 #endif /* HN_USE_TXDESC_BUFRING */ 1492 KASSERT(txd->m == NULL && txd->refs == 0 && 1493 STAILQ_EMPTY(&txd->agg_list) && 1494 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 1495 txd->chim_size == 0 && 1496 (txd->flags & HN_TXD_FLAG_ONLIST) && 1497 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 1498 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 1499 txd->flags &= ~HN_TXD_FLAG_ONLIST; 1500 txd->refs = 1; 1501 } 1502 return txd; 1503 } 1504 1505 static __inline void 1506 hn_txdesc_hold(struct hn_txdesc *txd) 1507 { 1508 1509 /* 0->1 transition will never work */ 1510 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 1511 atomic_add_int(&txd->refs, 1); 1512 } 1513 1514 static __inline void 1515 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 1516 { 1517 1518 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1519 ("recursive aggregation on aggregating txdesc")); 1520 1521 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1522 ("already aggregated")); 1523 KASSERT(STAILQ_EMPTY(&txd->agg_list), 1524 ("recursive aggregation on to-be-aggregated txdesc")); 1525 1526 txd->flags |= HN_TXD_FLAG_ONAGG; 1527 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 1528 } 1529 1530 static bool 1531 hn_tx_ring_pending(struct hn_tx_ring *txr) 1532 { 1533 bool pending = false; 1534 1535 #ifndef HN_USE_TXDESC_BUFRING 1536 mtx_lock_spin(&txr->hn_txlist_spin); 1537 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 1538 pending = true; 1539 mtx_unlock_spin(&txr->hn_txlist_spin); 1540 #else 1541 if (!buf_ring_full(txr->hn_txdesc_br)) 1542 pending = true; 1543 #endif 1544 return (pending); 1545 } 1546 1547 static __inline void 1548 hn_txeof(struct hn_tx_ring *txr) 1549 { 1550 txr->hn_has_txeof = 0; 1551 txr->hn_txeof(txr); 1552 } 1553 1554 static void 1555 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 1556 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 1557 { 1558 struct hn_txdesc *txd = sndc->hn_cbarg; 1559 struct hn_tx_ring *txr; 1560 1561 txr = txd->txr; 1562 KASSERT(txr->hn_chan == chan, 1563 ("channel mismatch, on chan%u, should be chan%u", 1564 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 1565 1566 txr->hn_has_txeof = 1; 1567 hn_txdesc_put(txr, txd); 1568 1569 ++txr->hn_txdone_cnt; 1570 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 1571 txr->hn_txdone_cnt = 0; 1572 if 
(txr->hn_oactive) 1573 hn_txeof(txr); 1574 } 1575 } 1576 1577 static void 1578 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 1579 { 1580 #if defined(INET) || defined(INET6) 1581 tcp_lro_flush_all(&rxr->hn_lro); 1582 #endif 1583 1584 /* 1585 * NOTE: 1586 * 'txr' could be NULL, if multiple channels and 1587 * ifnet.if_start method are enabled. 1588 */ 1589 if (txr == NULL || !txr->hn_has_txeof) 1590 return; 1591 1592 txr->hn_txdone_cnt = 0; 1593 hn_txeof(txr); 1594 } 1595 1596 static __inline uint32_t 1597 hn_rndis_pktmsg_offset(uint32_t ofs) 1598 { 1599 1600 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 1601 ("invalid RNDIS packet msg offset %u", ofs)); 1602 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 1603 } 1604 1605 static __inline void * 1606 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 1607 size_t pi_dlen, uint32_t pi_type) 1608 { 1609 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 1610 struct rndis_pktinfo *pi; 1611 1612 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 1613 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 1614 1615 /* 1616 * Per-packet-info does not move; it only grows. 1617 * 1618 * NOTE: 1619 * rm_pktinfooffset in this phase counts from the beginning 1620 * of rndis_packet_msg. 1621 */ 1622 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 1623 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 1624 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 1625 pkt->rm_pktinfolen); 1626 pkt->rm_pktinfolen += pi_size; 1627 1628 pi->rm_size = pi_size; 1629 pi->rm_type = pi_type; 1630 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 1631 1632 /* Data immediately follow per-packet-info. */ 1633 pkt->rm_dataoffset += pi_size; 1634 1635 /* Update RNDIS packet msg length */ 1636 pkt->rm_len += pi_size; 1637 1638 return (pi->rm_data); 1639 } 1640 1641 static __inline int 1642 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 1643 { 1644 struct hn_txdesc *txd; 1645 struct mbuf *m; 1646 int error, pkts; 1647 1648 txd = txr->hn_agg_txd; 1649 KASSERT(txd != NULL, ("no aggregate txdesc")); 1650 1651 /* 1652 * Since hn_txpkt() will reset this temporary stat, save 1653 * it now, so that oerrors can be updated properly, if 1654 * hn_txpkt() ever fails. 1655 */ 1656 pkts = txr->hn_stat_pkts; 1657 1658 /* 1659 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 1660 * failure, save it for later freeing, if hn_txpkt() ever 1661 * fails. 1662 */ 1663 m = txd->m; 1664 error = hn_txpkt(ifp, txr, txd); 1665 if (__predict_false(error)) { 1666 /* txd is freed, but m is not. */ 1667 m_freem(m); 1668 1669 txr->hn_flush_failed++; 1670 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 1671 } 1672 1673 /* Reset all aggregation states. */ 1674 txr->hn_agg_txd = NULL; 1675 txr->hn_agg_szleft = 0; 1676 txr->hn_agg_pktleft = 0; 1677 txr->hn_agg_prevpkt = NULL; 1678 1679 return (error); 1680 } 1681 1682 static void * 1683 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1684 int pktsize) 1685 { 1686 void *chim; 1687 1688 if (txr->hn_agg_txd != NULL) { 1689 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 1690 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 1691 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 1692 int olen; 1693 1694 /* 1695 * Update the previous RNDIS packet's total length, 1696 * it can be increased due to the mandatory alignment 1697 * padding for this RNDIS packet. 
And update the 1698 * aggregating txdesc's chimney sending buffer size 1699 * accordingly. 1700 * 1701 * XXX 1702 * Zero-out the padding, as required by the RNDIS spec. 1703 */ 1704 olen = pkt->rm_len; 1705 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 1706 agg_txd->chim_size += pkt->rm_len - olen; 1707 1708 /* Link this txdesc to the parent. */ 1709 hn_txdesc_agg(agg_txd, txd); 1710 1711 chim = (uint8_t *)pkt + pkt->rm_len; 1712 /* Save the current packet for later fixup. */ 1713 txr->hn_agg_prevpkt = chim; 1714 1715 txr->hn_agg_pktleft--; 1716 txr->hn_agg_szleft -= pktsize; 1717 if (txr->hn_agg_szleft <= 1718 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1719 /* 1720 * Probably can't aggregate more packets, 1721 * flush this aggregating txdesc proactively. 1722 */ 1723 txr->hn_agg_pktleft = 0; 1724 } 1725 /* Done! */ 1726 return (chim); 1727 } 1728 hn_flush_txagg(ifp, txr); 1729 } 1730 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 1731 1732 txr->hn_tx_chimney_tried++; 1733 txd->chim_index = hn_chim_alloc(txr->hn_sc); 1734 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 1735 return (NULL); 1736 txr->hn_tx_chimney++; 1737 1738 chim = txr->hn_sc->hn_chim + 1739 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 1740 1741 if (txr->hn_agg_pktmax > 1 && 1742 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 1743 txr->hn_agg_txd = txd; 1744 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 1745 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 1746 txr->hn_agg_prevpkt = chim; 1747 } 1748 return (chim); 1749 } 1750 1751 /* 1752 * NOTE: 1753 * If this function fails, then both txd and m_head0 will be freed. 1754 */ 1755 static int 1756 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 1757 struct mbuf **m_head0) 1758 { 1759 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 1760 int error, nsegs, i; 1761 struct mbuf *m_head = *m_head0; 1762 struct rndis_packet_msg *pkt; 1763 uint32_t *pi_data; 1764 void *chim = NULL; 1765 int pkt_hlen, pkt_size; 1766 1767 pkt = txd->rndis_pkt; 1768 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 1769 if (pkt_size < txr->hn_chim_size) { 1770 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 1771 if (chim != NULL) 1772 pkt = chim; 1773 } else { 1774 if (txr->hn_agg_txd != NULL) 1775 hn_flush_txagg(ifp, txr); 1776 } 1777 1778 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 1779 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; 1780 pkt->rm_dataoffset = sizeof(*pkt); 1781 pkt->rm_datalen = m_head->m_pkthdr.len; 1782 pkt->rm_oobdataoffset = 0; 1783 pkt->rm_oobdatalen = 0; 1784 pkt->rm_oobdataelements = 0; 1785 pkt->rm_pktinfooffset = sizeof(*pkt); 1786 pkt->rm_pktinfolen = 0; 1787 pkt->rm_vchandle = 0; 1788 pkt->rm_reserved = 0; 1789 1790 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 1791 /* 1792 * Set the hash value for this packet, so that the host could 1793 * dispatch the TX done event for this packet back to this TX 1794 * ring's channel. 
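		 * (The per-ring hn_tx_idx is used as the hash value here.)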
1795 */ 1796 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1797 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 1798 *pi_data = txr->hn_tx_idx; 1799 } 1800 1801 if (m_head->m_flags & M_VLANTAG) { 1802 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1803 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 1804 *pi_data = NDIS_VLAN_INFO_MAKE( 1805 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 1806 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 1807 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 1808 } 1809 1810 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 1811 #if defined(INET6) || defined(INET) 1812 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1813 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 1814 #ifdef INET 1815 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 1816 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, 1817 m_head->m_pkthdr.tso_segsz); 1818 } 1819 #endif 1820 #if defined(INET6) && defined(INET) 1821 else 1822 #endif 1823 #ifdef INET6 1824 { 1825 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, 1826 m_head->m_pkthdr.tso_segsz); 1827 } 1828 #endif 1829 #endif /* INET6 || INET */ 1830 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 1831 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 1832 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 1833 if (m_head->m_pkthdr.csum_flags & 1834 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 1835 *pi_data = NDIS_TXCSUM_INFO_IPV6; 1836 } else { 1837 *pi_data = NDIS_TXCSUM_INFO_IPV4; 1838 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 1839 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 1840 } 1841 1842 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) 1843 *pi_data |= NDIS_TXCSUM_INFO_TCPCS; 1844 else if (m_head->m_pkthdr.csum_flags & 1845 (CSUM_IP_UDP | CSUM_IP6_UDP)) 1846 *pi_data |= NDIS_TXCSUM_INFO_UDPCS; 1847 } 1848 1849 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 1850 /* Convert RNDIS packet message offsets */ 1851 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset); 1852 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 1853 1854 /* 1855 * Fast path: Chimney sending. 1856 */ 1857 if (chim != NULL) { 1858 struct hn_txdesc *tgt_txd = txd; 1859 1860 if (txr->hn_agg_txd != NULL) { 1861 tgt_txd = txr->hn_agg_txd; 1862 #ifdef INVARIANTS 1863 *m_head0 = NULL; 1864 #endif 1865 } 1866 1867 KASSERT(pkt == chim, 1868 ("RNDIS pkt not in chimney sending buffer")); 1869 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 1870 ("chimney sending buffer is not used")); 1871 tgt_txd->chim_size += pkt->rm_len; 1872 1873 m_copydata(m_head, 0, m_head->m_pkthdr.len, 1874 ((uint8_t *)chim) + pkt_hlen); 1875 1876 txr->hn_gpa_cnt = 0; 1877 txr->hn_sendpkt = hn_txpkt_chim; 1878 goto done; 1879 } 1880 1881 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 1882 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 1883 ("chimney buffer is used")); 1884 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 1885 1886 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 1887 if (__predict_false(error)) { 1888 int freed; 1889 1890 /* 1891 * This mbuf is not linked w/ the txd yet, so free it now. 
1892 */ 1893 m_freem(m_head); 1894 *m_head0 = NULL; 1895 1896 freed = hn_txdesc_put(txr, txd); 1897 KASSERT(freed != 0, 1898 ("fail to free txd upon txdma error")); 1899 1900 txr->hn_txdma_failed++; 1901 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 1902 return error; 1903 } 1904 *m_head0 = m_head; 1905 1906 /* +1 RNDIS packet message */ 1907 txr->hn_gpa_cnt = nsegs + 1; 1908 1909 /* send packet with page buffer */ 1910 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 1911 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 1912 txr->hn_gpa[0].gpa_len = pkt_hlen; 1913 1914 /* 1915 * Fill the page buffers with mbuf info after the page 1916 * buffer for RNDIS packet message. 1917 */ 1918 for (i = 0; i < nsegs; ++i) { 1919 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 1920 1921 gpa->gpa_page = atop(segs[i].ds_addr); 1922 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 1923 gpa->gpa_len = segs[i].ds_len; 1924 } 1925 1926 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 1927 txd->chim_size = 0; 1928 txr->hn_sendpkt = hn_txpkt_sglist; 1929 done: 1930 txd->m = m_head; 1931 1932 /* Set the completion routine */ 1933 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 1934 1935 /* Update temporary stats for later use. */ 1936 txr->hn_stat_pkts++; 1937 txr->hn_stat_size += m_head->m_pkthdr.len; 1938 if (m_head->m_flags & M_MCAST) 1939 txr->hn_stat_mcasts++; 1940 1941 return 0; 1942 } 1943 1944 /* 1945 * NOTE: 1946 * If this function fails, then txd will be freed, but the mbuf 1947 * associated w/ the txd will _not_ be freed. 1948 */ 1949 static int 1950 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 1951 { 1952 int error, send_failed = 0, has_bpf; 1953 1954 again: 1955 has_bpf = bpf_peers_present(ifp->if_bpf); 1956 if (has_bpf) { 1957 /* 1958 * Make sure that this txd and any aggregated txds are not 1959 * freed before ETHER_BPF_MTAP. 1960 */ 1961 hn_txdesc_hold(txd); 1962 } 1963 error = txr->hn_sendpkt(txr, txd); 1964 if (!error) { 1965 if (has_bpf) { 1966 const struct hn_txdesc *tmp_txd; 1967 1968 ETHER_BPF_MTAP(ifp, txd->m); 1969 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 1970 ETHER_BPF_MTAP(ifp, tmp_txd->m); 1971 } 1972 1973 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 1974 #ifdef HN_IFSTART_SUPPORT 1975 if (!hn_use_if_start) 1976 #endif 1977 { 1978 if_inc_counter(ifp, IFCOUNTER_OBYTES, 1979 txr->hn_stat_size); 1980 if (txr->hn_stat_mcasts != 0) { 1981 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1982 txr->hn_stat_mcasts); 1983 } 1984 } 1985 txr->hn_pkts += txr->hn_stat_pkts; 1986 txr->hn_sends++; 1987 } 1988 if (has_bpf) 1989 hn_txdesc_put(txr, txd); 1990 1991 if (__predict_false(error)) { 1992 int freed; 1993 1994 /* 1995 * This should "really rarely" happen. 1996 * 1997 * XXX Too many RX to be acked or too many sideband 1998 * commands to run? Ask netvsc_channel_rollup() 1999 * to kick start later. 2000 */ 2001 txr->hn_has_txeof = 1; 2002 if (!send_failed) { 2003 txr->hn_send_failed++; 2004 send_failed = 1; 2005 /* 2006 * Try sending again after set hn_has_txeof; 2007 * in case that we missed the last 2008 * netvsc_channel_rollup(). 2009 */ 2010 goto again; 2011 } 2012 if_printf(ifp, "send failed\n"); 2013 2014 /* 2015 * Caller will perform further processing on the 2016 * associated mbuf, so don't free it in hn_txdesc_put(); 2017 * only unload it from the DMA map in hn_txdesc_put(), 2018 * if it was loaded. 
2019 */ 2020 txd->m = NULL; 2021 freed = hn_txdesc_put(txr, txd); 2022 KASSERT(freed != 0, 2023 ("fail to free txd upon send error")); 2024 2025 txr->hn_send_failed++; 2026 } 2027 2028 /* Reset temporary stats, after this sending is done. */ 2029 txr->hn_stat_size = 0; 2030 txr->hn_stat_pkts = 0; 2031 txr->hn_stat_mcasts = 0; 2032 2033 return (error); 2034 } 2035 2036 /* 2037 * Append the specified data to the indicated mbuf chain, 2038 * Extend the mbuf chain if the new data does not fit in 2039 * existing space. 2040 * 2041 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2042 * There should be an equivalent in the kernel mbuf code, 2043 * but there does not appear to be one yet. 2044 * 2045 * Differs from m_append() in that additional mbufs are 2046 * allocated with cluster size MJUMPAGESIZE, and filled 2047 * accordingly. 2048 * 2049 * Return 1 if able to complete the job; otherwise 0. 2050 */ 2051 static int 2052 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2053 { 2054 struct mbuf *m, *n; 2055 int remainder, space; 2056 2057 for (m = m0; m->m_next != NULL; m = m->m_next) 2058 ; 2059 remainder = len; 2060 space = M_TRAILINGSPACE(m); 2061 if (space > 0) { 2062 /* 2063 * Copy into available space. 2064 */ 2065 if (space > remainder) 2066 space = remainder; 2067 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2068 m->m_len += space; 2069 cp += space; 2070 remainder -= space; 2071 } 2072 while (remainder > 0) { 2073 /* 2074 * Allocate a new mbuf; could check space 2075 * and allocate a cluster instead. 2076 */ 2077 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2078 if (n == NULL) 2079 break; 2080 n->m_len = min(MJUMPAGESIZE, remainder); 2081 bcopy(cp, mtod(n, caddr_t), n->m_len); 2082 cp += n->m_len; 2083 remainder -= n->m_len; 2084 m->m_next = n; 2085 m = n; 2086 } 2087 if (m0->m_flags & M_PKTHDR) 2088 m0->m_pkthdr.len += len - remainder; 2089 2090 return (remainder == 0); 2091 } 2092 2093 #if defined(INET) || defined(INET6) 2094 static __inline int 2095 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2096 { 2097 #if __FreeBSD_version >= 1100095 2098 if (hn_lro_mbufq_depth) { 2099 tcp_lro_queue_mbuf(lc, m); 2100 return 0; 2101 } 2102 #endif 2103 return tcp_lro_rx(lc, m, 0); 2104 } 2105 #endif 2106 2107 static int 2108 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2109 const struct hn_rxinfo *info) 2110 { 2111 struct ifnet *ifp = rxr->hn_ifp; 2112 struct mbuf *m_new; 2113 int size, do_lro = 0, do_csum = 1; 2114 int hash_type; 2115 2116 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) 2117 return (0); 2118 2119 /* 2120 * Bail out if packet contains more data than configured MTU. 2121 */ 2122 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { 2123 return (0); 2124 } else if (dlen <= MHLEN) { 2125 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2126 if (m_new == NULL) { 2127 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2128 return (0); 2129 } 2130 memcpy(mtod(m_new, void *), data, dlen); 2131 m_new->m_pkthdr.len = m_new->m_len = dlen; 2132 rxr->hn_small_pkts++; 2133 } else { 2134 /* 2135 * Get an mbuf with a cluster. For packets 2K or less, 2136 * get a standard 2K cluster. For anything larger, get a 2137 * 4K cluster. Any buffers larger than 4K can cause problems 2138 * if looped around to the Hyper-V TX channel, so avoid them. 
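 *
 * To recap the sizing decision around this block: frames that fit in
 * MHLEN were already copied into a plain header mbuf above; anything
 * up to MCLBYTES lands in a standard 2K cluster here, and larger
 * frames (bounded by the MTU check at the top of this function) go
 * into a single MJUMPAGESIZE cluster.  Both cluster cases are filled
 * through hv_m_append().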
2139 */ 2140 size = MCLBYTES; 2141 if (dlen > MCLBYTES) { 2142 /* 4096 */ 2143 size = MJUMPAGESIZE; 2144 } 2145 2146 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2147 if (m_new == NULL) { 2148 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2149 return (0); 2150 } 2151 2152 hv_m_append(m_new, dlen, data); 2153 } 2154 m_new->m_pkthdr.rcvif = ifp; 2155 2156 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2157 do_csum = 0; 2158 2159 /* receive side checksum offload */ 2160 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2161 /* IP csum offload */ 2162 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2163 m_new->m_pkthdr.csum_flags |= 2164 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2165 rxr->hn_csum_ip++; 2166 } 2167 2168 /* TCP/UDP csum offload */ 2169 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2170 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2171 m_new->m_pkthdr.csum_flags |= 2172 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2173 m_new->m_pkthdr.csum_data = 0xffff; 2174 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2175 rxr->hn_csum_tcp++; 2176 else 2177 rxr->hn_csum_udp++; 2178 } 2179 2180 /* 2181 * XXX 2182 * As of this write (Oct 28th, 2016), host side will turn 2183 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2184 * the do_lro setting here is actually _not_ accurate. We 2185 * depend on the RSS hash type check to reset do_lro. 2186 */ 2187 if ((info->csum_info & 2188 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2189 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2190 do_lro = 1; 2191 } else { 2192 const struct ether_header *eh; 2193 uint16_t etype; 2194 int hoff; 2195 2196 hoff = sizeof(*eh); 2197 if (m_new->m_len < hoff) 2198 goto skip; 2199 eh = mtod(m_new, struct ether_header *); 2200 etype = ntohs(eh->ether_type); 2201 if (etype == ETHERTYPE_VLAN) { 2202 const struct ether_vlan_header *evl; 2203 2204 hoff = sizeof(*evl); 2205 if (m_new->m_len < hoff) 2206 goto skip; 2207 evl = mtod(m_new, struct ether_vlan_header *); 2208 etype = ntohs(evl->evl_proto); 2209 } 2210 2211 if (etype == ETHERTYPE_IP) { 2212 int pr; 2213 2214 pr = hn_check_iplen(m_new, hoff); 2215 if (pr == IPPROTO_TCP) { 2216 if (do_csum && 2217 (rxr->hn_trust_hcsum & 2218 HN_TRUST_HCSUM_TCP)) { 2219 rxr->hn_csum_trusted++; 2220 m_new->m_pkthdr.csum_flags |= 2221 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2222 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2223 m_new->m_pkthdr.csum_data = 0xffff; 2224 } 2225 do_lro = 1; 2226 } else if (pr == IPPROTO_UDP) { 2227 if (do_csum && 2228 (rxr->hn_trust_hcsum & 2229 HN_TRUST_HCSUM_UDP)) { 2230 rxr->hn_csum_trusted++; 2231 m_new->m_pkthdr.csum_flags |= 2232 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2233 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2234 m_new->m_pkthdr.csum_data = 0xffff; 2235 } 2236 } else if (pr != IPPROTO_DONE && do_csum && 2237 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2238 rxr->hn_csum_trusted++; 2239 m_new->m_pkthdr.csum_flags |= 2240 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2241 } 2242 } 2243 } 2244 skip: 2245 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2246 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2247 NDIS_VLAN_INFO_ID(info->vlan_info), 2248 NDIS_VLAN_INFO_PRI(info->vlan_info), 2249 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2250 m_new->m_flags |= M_VLANTAG; 2251 } 2252 2253 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2254 rxr->hn_rss_pkts++; 2255 m_new->m_pkthdr.flowid = info->hash_value; 2256 hash_type = M_HASHTYPE_OPAQUE_HASH; 2257 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2258 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2259 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2260 2261 /* 2262 * NOTE: 2263 * do_lro is resetted, if the hash types are not TCP 2264 * related. See the comment in the above csum_flags 2265 * setup section. 2266 */ 2267 switch (type) { 2268 case NDIS_HASH_IPV4: 2269 hash_type = M_HASHTYPE_RSS_IPV4; 2270 do_lro = 0; 2271 break; 2272 2273 case NDIS_HASH_TCP_IPV4: 2274 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2275 break; 2276 2277 case NDIS_HASH_IPV6: 2278 hash_type = M_HASHTYPE_RSS_IPV6; 2279 do_lro = 0; 2280 break; 2281 2282 case NDIS_HASH_IPV6_EX: 2283 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2284 do_lro = 0; 2285 break; 2286 2287 case NDIS_HASH_TCP_IPV6: 2288 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2289 break; 2290 2291 case NDIS_HASH_TCP_IPV6_EX: 2292 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2293 break; 2294 } 2295 } 2296 } else { 2297 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2298 hash_type = M_HASHTYPE_OPAQUE; 2299 } 2300 M_HASHTYPE_SET(m_new, hash_type); 2301 2302 /* 2303 * Note: Moved RX completion back to hv_nv_on_receive() so all 2304 * messages (not just data messages) will trigger a response. 2305 */ 2306 2307 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2308 rxr->hn_pkts++; 2309 2310 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2311 #if defined(INET) || defined(INET6) 2312 struct lro_ctrl *lro = &rxr->hn_lro; 2313 2314 if (lro->lro_cnt) { 2315 rxr->hn_lro_tried++; 2316 if (hn_lro_rx(lro, m_new) == 0) { 2317 /* DONE! */ 2318 return 0; 2319 } 2320 } 2321 #endif 2322 } 2323 2324 /* We're not holding the lock here, so don't release it */ 2325 (*ifp->if_input)(ifp, m_new); 2326 2327 return (0); 2328 } 2329 2330 static int 2331 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2332 { 2333 struct hn_softc *sc = ifp->if_softc; 2334 struct ifreq *ifr = (struct ifreq *)data; 2335 int mask, error = 0; 2336 2337 switch (cmd) { 2338 case SIOCSIFMTU: 2339 if (ifr->ifr_mtu > HN_MTU_MAX) { 2340 error = EINVAL; 2341 break; 2342 } 2343 2344 HN_LOCK(sc); 2345 2346 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2347 HN_UNLOCK(sc); 2348 break; 2349 } 2350 2351 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2352 /* Can't change MTU */ 2353 HN_UNLOCK(sc); 2354 error = EOPNOTSUPP; 2355 break; 2356 } 2357 2358 if (ifp->if_mtu == ifr->ifr_mtu) { 2359 HN_UNLOCK(sc); 2360 break; 2361 } 2362 2363 /* 2364 * Suspend this interface before the synthetic parts 2365 * are ripped. 2366 */ 2367 hn_suspend(sc); 2368 2369 /* 2370 * Detach the synthetics parts, i.e. NVS and RNDIS. 2371 */ 2372 hn_synth_detach(sc); 2373 2374 /* 2375 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2376 * with the new MTU setting. 2377 */ 2378 error = hn_synth_attach(sc, ifr->ifr_mtu); 2379 if (error) { 2380 HN_UNLOCK(sc); 2381 break; 2382 } 2383 2384 /* 2385 * Commit the requested MTU, after the synthetic parts 2386 * have been successfully attached. 2387 */ 2388 ifp->if_mtu = ifr->ifr_mtu; 2389 2390 /* 2391 * Make sure that various parameters based on MTU are 2392 * still valid, after the MTU change. 2393 */ 2394 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2395 hn_set_chim_size(sc, sc->hn_chim_szmax); 2396 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2397 #if __FreeBSD_version >= 1100099 2398 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2399 HN_LRO_LENLIM_MIN(ifp)) 2400 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2401 #endif 2402 2403 /* 2404 * All done! Resume the interface now. 
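 *
 * The full MTU-change sequence handled above is thus:
 * hn_suspend() -> hn_synth_detach() -> hn_synth_attach() with the
 * new MTU -> commit if_mtu -> re-clamp the chimney size, TSO max
 * size and LRO length limit -> hn_resume().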
*/ 2406 hn_resume(sc); 2407 2408 HN_UNLOCK(sc); 2409 break; 2410 2411 case SIOCSIFFLAGS: 2412 HN_LOCK(sc); 2413 2414 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2415 HN_UNLOCK(sc); 2416 break; 2417 } 2418 2419 if (ifp->if_flags & IFF_UP) { 2420 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2421 /* 2422 * Caller might hold a mutex, e.g. 2423 * bpf; use busy-wait for the RNDIS 2424 * reply. 2425 */ 2426 HN_NO_SLEEPING(sc); 2427 hn_set_rxfilter(sc); 2428 HN_SLEEPING_OK(sc); 2429 } else { 2430 hn_init_locked(sc); 2431 } 2432 } else { 2433 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2434 hn_stop(sc); 2435 } 2436 sc->hn_if_flags = ifp->if_flags; 2437 2438 HN_UNLOCK(sc); 2439 break; 2440 2441 case SIOCSIFCAP: 2442 HN_LOCK(sc); 2443 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2444 2445 if (mask & IFCAP_TXCSUM) { 2446 ifp->if_capenable ^= IFCAP_TXCSUM; 2447 if (ifp->if_capenable & IFCAP_TXCSUM) 2448 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2449 else 2450 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2451 } 2452 if (mask & IFCAP_TXCSUM_IPV6) { 2453 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2454 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2455 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2456 else 2457 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2458 } 2459 2460 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2461 if (mask & IFCAP_RXCSUM) 2462 ifp->if_capenable ^= IFCAP_RXCSUM; 2463 #ifdef foo 2464 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2465 if (mask & IFCAP_RXCSUM_IPV6) 2466 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2467 #endif 2468 2469 if (mask & IFCAP_LRO) 2470 ifp->if_capenable ^= IFCAP_LRO; 2471 2472 if (mask & IFCAP_TSO4) { 2473 ifp->if_capenable ^= IFCAP_TSO4; 2474 if (ifp->if_capenable & IFCAP_TSO4) 2475 ifp->if_hwassist |= CSUM_IP_TSO; 2476 else 2477 ifp->if_hwassist &= ~CSUM_IP_TSO; 2478 } 2479 if (mask & IFCAP_TSO6) { 2480 ifp->if_capenable ^= IFCAP_TSO6; 2481 if (ifp->if_capenable & IFCAP_TSO6) 2482 ifp->if_hwassist |= CSUM_IP6_TSO; 2483 else 2484 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2485 } 2486 2487 HN_UNLOCK(sc); 2488 break; 2489 2490 case SIOCADDMULTI: 2491 case SIOCDELMULTI: 2492 HN_LOCK(sc); 2493 2494 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2495 HN_UNLOCK(sc); 2496 break; 2497 } 2498 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2499 /* 2500 * Multicast uses mutex; use busy-wait for 2501 * the RNDIS reply. 2502 */ 2503 HN_NO_SLEEPING(sc); 2504 hn_set_rxfilter(sc); 2505 HN_SLEEPING_OK(sc); 2506 } 2507 2508 HN_UNLOCK(sc); 2509 break; 2510 2511 case SIOCSIFMEDIA: 2512 case SIOCGIFMEDIA: 2513 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2514 break; 2515 2516 default: 2517 error = ether_ioctl(ifp, cmd, data); 2518 break; 2519 } 2520 return (error); 2521 } 2522 2523 static void 2524 hn_stop(struct hn_softc *sc) 2525 { 2526 struct ifnet *ifp = sc->hn_ifp; 2527 int i; 2528 2529 HN_LOCK_ASSERT(sc); 2530 2531 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2532 ("synthetic parts were not attached")); 2533 2534 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2535 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2536 hn_suspend_data(sc); 2537 2538 /* Clear OACTIVE bit.
*/ 2539 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2540 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2541 sc->hn_tx_ring[i].hn_oactive = 0; 2542 } 2543 2544 static void 2545 hn_init_locked(struct hn_softc *sc) 2546 { 2547 struct ifnet *ifp = sc->hn_ifp; 2548 int i; 2549 2550 HN_LOCK_ASSERT(sc); 2551 2552 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2553 return; 2554 2555 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2556 return; 2557 2558 /* Configure RX filter */ 2559 hn_set_rxfilter(sc); 2560 2561 /* Clear OACTIVE bit. */ 2562 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2563 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2564 sc->hn_tx_ring[i].hn_oactive = 0; 2565 2566 /* Clear TX 'suspended' bit. */ 2567 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2568 2569 /* Everything is ready; unleash! */ 2570 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2571 } 2572 2573 static void 2574 hn_init(void *xsc) 2575 { 2576 struct hn_softc *sc = xsc; 2577 2578 HN_LOCK(sc); 2579 hn_init_locked(sc); 2580 HN_UNLOCK(sc); 2581 } 2582 2583 #if __FreeBSD_version >= 1100099 2584 2585 static int 2586 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2587 { 2588 struct hn_softc *sc = arg1; 2589 unsigned int lenlim; 2590 int error; 2591 2592 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2593 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2594 if (error || req->newptr == NULL) 2595 return error; 2596 2597 HN_LOCK(sc); 2598 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2599 lenlim > TCP_LRO_LENGTH_MAX) { 2600 HN_UNLOCK(sc); 2601 return EINVAL; 2602 } 2603 hn_set_lro_lenlim(sc, lenlim); 2604 HN_UNLOCK(sc); 2605 2606 return 0; 2607 } 2608 2609 static int 2610 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2611 { 2612 struct hn_softc *sc = arg1; 2613 int ackcnt, error, i; 2614 2615 /* 2616 * lro_ackcnt_lim is append count limit, 2617 * +1 to turn it into aggregation limit. 2618 */ 2619 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2620 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2621 if (error || req->newptr == NULL) 2622 return error; 2623 2624 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2625 return EINVAL; 2626 2627 /* 2628 * Convert aggregation limit back to append 2629 * count limit. 
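 *
 * For example, a value of 2 written through this sysctl (an
 * aggregation limit of two ACKs) is stored below as an
 * lro_ackcnt_lim of 1, i.e. one ACK appended to the first.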
2630 */ 2631 --ackcnt; 2632 HN_LOCK(sc); 2633 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2634 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2635 HN_UNLOCK(sc); 2636 return 0; 2637 } 2638 2639 #endif 2640 2641 static int 2642 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2643 { 2644 struct hn_softc *sc = arg1; 2645 int hcsum = arg2; 2646 int on, error, i; 2647 2648 on = 0; 2649 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2650 on = 1; 2651 2652 error = sysctl_handle_int(oidp, &on, 0, req); 2653 if (error || req->newptr == NULL) 2654 return error; 2655 2656 HN_LOCK(sc); 2657 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2658 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2659 2660 if (on) 2661 rxr->hn_trust_hcsum |= hcsum; 2662 else 2663 rxr->hn_trust_hcsum &= ~hcsum; 2664 } 2665 HN_UNLOCK(sc); 2666 return 0; 2667 } 2668 2669 static int 2670 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2671 { 2672 struct hn_softc *sc = arg1; 2673 int chim_size, error; 2674 2675 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2676 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2677 if (error || req->newptr == NULL) 2678 return error; 2679 2680 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2681 return EINVAL; 2682 2683 HN_LOCK(sc); 2684 hn_set_chim_size(sc, chim_size); 2685 HN_UNLOCK(sc); 2686 return 0; 2687 } 2688 2689 #if __FreeBSD_version < 1100095 2690 static int 2691 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2692 { 2693 struct hn_softc *sc = arg1; 2694 int ofs = arg2, i, error; 2695 struct hn_rx_ring *rxr; 2696 uint64_t stat; 2697 2698 stat = 0; 2699 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2700 rxr = &sc->hn_rx_ring[i]; 2701 stat += *((int *)((uint8_t *)rxr + ofs)); 2702 } 2703 2704 error = sysctl_handle_64(oidp, &stat, 0, req); 2705 if (error || req->newptr == NULL) 2706 return error; 2707 2708 /* Zero out this stat. */ 2709 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2710 rxr = &sc->hn_rx_ring[i]; 2711 *((int *)((uint8_t *)rxr + ofs)) = 0; 2712 } 2713 return 0; 2714 } 2715 #else 2716 static int 2717 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2718 { 2719 struct hn_softc *sc = arg1; 2720 int ofs = arg2, i, error; 2721 struct hn_rx_ring *rxr; 2722 uint64_t stat; 2723 2724 stat = 0; 2725 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2726 rxr = &sc->hn_rx_ring[i]; 2727 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2728 } 2729 2730 error = sysctl_handle_64(oidp, &stat, 0, req); 2731 if (error || req->newptr == NULL) 2732 return error; 2733 2734 /* Zero out this stat. */ 2735 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2736 rxr = &sc->hn_rx_ring[i]; 2737 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2738 } 2739 return 0; 2740 } 2741 2742 #endif 2743 2744 static int 2745 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2746 { 2747 struct hn_softc *sc = arg1; 2748 int ofs = arg2, i, error; 2749 struct hn_rx_ring *rxr; 2750 u_long stat; 2751 2752 stat = 0; 2753 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2754 rxr = &sc->hn_rx_ring[i]; 2755 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2756 } 2757 2758 error = sysctl_handle_long(oidp, &stat, 0, req); 2759 if (error || req->newptr == NULL) 2760 return error; 2761 2762 /* Zero out this stat. 
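 *
 * Note that this handler (like the other per-ring stat handlers)
 * only reaches this point when the node is written: reads merely
 * sum the per-ring counters, while any write zeroes them.  That is
 * why these stat nodes are registered CTLFLAG_RW; e.g.
 * "sysctl dev.hn.0.csum_tcp=0" clears the counter on every ring.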
*/ 2763 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2764 rxr = &sc->hn_rx_ring[i]; 2765 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2766 } 2767 return 0; 2768 } 2769 2770 static int 2771 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2772 { 2773 struct hn_softc *sc = arg1; 2774 int ofs = arg2, i, error; 2775 struct hn_tx_ring *txr; 2776 u_long stat; 2777 2778 stat = 0; 2779 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2780 txr = &sc->hn_tx_ring[i]; 2781 stat += *((u_long *)((uint8_t *)txr + ofs)); 2782 } 2783 2784 error = sysctl_handle_long(oidp, &stat, 0, req); 2785 if (error || req->newptr == NULL) 2786 return error; 2787 2788 /* Zero out this stat. */ 2789 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2790 txr = &sc->hn_tx_ring[i]; 2791 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2792 } 2793 return 0; 2794 } 2795 2796 static int 2797 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2798 { 2799 struct hn_softc *sc = arg1; 2800 int ofs = arg2, i, error, conf; 2801 struct hn_tx_ring *txr; 2802 2803 txr = &sc->hn_tx_ring[0]; 2804 conf = *((int *)((uint8_t *)txr + ofs)); 2805 2806 error = sysctl_handle_int(oidp, &conf, 0, req); 2807 if (error || req->newptr == NULL) 2808 return error; 2809 2810 HN_LOCK(sc); 2811 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2812 txr = &sc->hn_tx_ring[i]; 2813 *((int *)((uint8_t *)txr + ofs)) = conf; 2814 } 2815 HN_UNLOCK(sc); 2816 2817 return 0; 2818 } 2819 2820 static int 2821 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2822 { 2823 struct hn_softc *sc = arg1; 2824 int error, size; 2825 2826 size = sc->hn_agg_size; 2827 error = sysctl_handle_int(oidp, &size, 0, req); 2828 if (error || req->newptr == NULL) 2829 return (error); 2830 2831 HN_LOCK(sc); 2832 sc->hn_agg_size = size; 2833 hn_set_txagg(sc); 2834 HN_UNLOCK(sc); 2835 2836 return (0); 2837 } 2838 2839 static int 2840 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 2841 { 2842 struct hn_softc *sc = arg1; 2843 int error, pkts; 2844 2845 pkts = sc->hn_agg_pkts; 2846 error = sysctl_handle_int(oidp, &pkts, 0, req); 2847 if (error || req->newptr == NULL) 2848 return (error); 2849 2850 HN_LOCK(sc); 2851 sc->hn_agg_pkts = pkts; 2852 hn_set_txagg(sc); 2853 HN_UNLOCK(sc); 2854 2855 return (0); 2856 } 2857 2858 static int 2859 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 2860 { 2861 struct hn_softc *sc = arg1; 2862 int pkts; 2863 2864 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 2865 return (sysctl_handle_int(oidp, &pkts, 0, req)); 2866 } 2867 2868 static int 2869 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 2870 { 2871 struct hn_softc *sc = arg1; 2872 int align; 2873 2874 align = sc->hn_tx_ring[0].hn_agg_align; 2875 return (sysctl_handle_int(oidp, &align, 0, req)); 2876 } 2877 2878 static int 2879 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 2880 { 2881 struct hn_softc *sc = arg1; 2882 char verstr[16]; 2883 2884 snprintf(verstr, sizeof(verstr), "%u.%u", 2885 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 2886 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 2887 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 2888 } 2889 2890 static int 2891 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 2892 { 2893 struct hn_softc *sc = arg1; 2894 char caps_str[128]; 2895 uint32_t caps; 2896 2897 HN_LOCK(sc); 2898 caps = sc->hn_caps; 2899 HN_UNLOCK(sc); 2900 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 2901 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 2902 } 2903 2904 static int 2905 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 2906 { 2907 struct hn_softc *sc = arg1; 2908 char assist_str[128]; 2909 uint32_t hwassist; 2910 2911 HN_LOCK(sc); 
2912 hwassist = sc->hn_ifp->if_hwassist; 2913 HN_UNLOCK(sc); 2914 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 2915 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 2916 } 2917 2918 static int 2919 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 2920 { 2921 struct hn_softc *sc = arg1; 2922 char filter_str[128]; 2923 uint32_t filter; 2924 2925 HN_LOCK(sc); 2926 filter = sc->hn_rx_filter; 2927 HN_UNLOCK(sc); 2928 snprintf(filter_str, sizeof(filter_str), "%b", filter, 2929 NDIS_PACKET_TYPES); 2930 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 2931 } 2932 2933 #ifndef RSS 2934 2935 static int 2936 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 2937 { 2938 struct hn_softc *sc = arg1; 2939 int error; 2940 2941 HN_LOCK(sc); 2942 2943 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 2944 if (error || req->newptr == NULL) 2945 goto back; 2946 2947 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 2948 if (error) 2949 goto back; 2950 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 2951 2952 if (sc->hn_rx_ring_inuse > 1) { 2953 error = hn_rss_reconfig(sc); 2954 } else { 2955 /* Not RSS capable, at least for now; just save the RSS key. */ 2956 error = 0; 2957 } 2958 back: 2959 HN_UNLOCK(sc); 2960 return (error); 2961 } 2962 2963 static int 2964 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 2965 { 2966 struct hn_softc *sc = arg1; 2967 int error; 2968 2969 HN_LOCK(sc); 2970 2971 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 2972 if (error || req->newptr == NULL) 2973 goto back; 2974 2975 /* 2976 * Don't allow RSS indirect table change, if this interface is not 2977 * RSS capable currently. 2978 */ 2979 if (sc->hn_rx_ring_inuse == 1) { 2980 error = EOPNOTSUPP; 2981 goto back; 2982 } 2983 2984 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 2985 if (error) 2986 goto back; 2987 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 2988 2989 hn_rss_ind_fixup(sc); 2990 error = hn_rss_reconfig(sc); 2991 back: 2992 HN_UNLOCK(sc); 2993 return (error); 2994 } 2995 2996 #endif /* !RSS */ 2997 2998 static int 2999 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 3000 { 3001 struct hn_softc *sc = arg1; 3002 char hash_str[128]; 3003 uint32_t hash; 3004 3005 HN_LOCK(sc); 3006 hash = sc->hn_rss_hash; 3007 HN_UNLOCK(sc); 3008 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 3009 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 3010 } 3011 3012 static int 3013 hn_check_iplen(const struct mbuf *m, int hoff) 3014 { 3015 const struct ip *ip; 3016 int len, iphlen, iplen; 3017 const struct tcphdr *th; 3018 int thoff; /* TCP data offset */ 3019 3020 len = hoff + sizeof(struct ip); 3021 3022 /* The packet must be at least the size of an IP header. */ 3023 if (m->m_pkthdr.len < len) 3024 return IPPROTO_DONE; 3025 3026 /* The fixed IP header must reside completely in the first mbuf. */ 3027 if (m->m_len < len) 3028 return IPPROTO_DONE; 3029 3030 ip = mtodo(m, hoff); 3031 3032 /* Bound check the packet's stated IP header length. */ 3033 iphlen = ip->ip_hl << 2; 3034 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3035 return IPPROTO_DONE; 3036 3037 /* The full IP header must reside completely in the one mbuf. */ 3038 if (m->m_len < hoff + iphlen) 3039 return IPPROTO_DONE; 3040 3041 iplen = ntohs(ip->ip_len); 3042 3043 /* 3044 * Check that the amount of data in the buffers is as 3045 * at least much as the IP header would have us expect. 
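 *
 * (As with the other sanity checks in this function, returning
 * IPPROTO_DONE tells hn_rxpkt() that the headers could not be
 * validated, so no host-checksum trust flags are set and the packet
 * is not considered for LRO; otherwise ip_p is returned.)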
3046 */ 3047 if (m->m_pkthdr.len < hoff + iplen) 3048 return IPPROTO_DONE; 3049 3050 /* 3051 * Ignore IP fragments. 3052 */ 3053 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3054 return IPPROTO_DONE; 3055 3056 /* 3057 * The TCP/IP or UDP/IP header must be entirely contained within 3058 * the first fragment of a packet. 3059 */ 3060 switch (ip->ip_p) { 3061 case IPPROTO_TCP: 3062 if (iplen < iphlen + sizeof(struct tcphdr)) 3063 return IPPROTO_DONE; 3064 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3065 return IPPROTO_DONE; 3066 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3067 thoff = th->th_off << 2; 3068 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3069 return IPPROTO_DONE; 3070 if (m->m_len < hoff + iphlen + thoff) 3071 return IPPROTO_DONE; 3072 break; 3073 case IPPROTO_UDP: 3074 if (iplen < iphlen + sizeof(struct udphdr)) 3075 return IPPROTO_DONE; 3076 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3077 return IPPROTO_DONE; 3078 break; 3079 default: 3080 if (iplen < iphlen) 3081 return IPPROTO_DONE; 3082 break; 3083 } 3084 return ip->ip_p; 3085 } 3086 3087 static int 3088 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3089 { 3090 struct sysctl_oid_list *child; 3091 struct sysctl_ctx_list *ctx; 3092 device_t dev = sc->hn_dev; 3093 #if defined(INET) || defined(INET6) 3094 #if __FreeBSD_version >= 1100095 3095 int lroent_cnt; 3096 #endif 3097 #endif 3098 int i; 3099 3100 /* 3101 * Create RXBUF for reception. 3102 * 3103 * NOTE: 3104 * - It is shared by all channels. 3105 * - A large enough buffer is allocated, certain version of NVSes 3106 * may further limit the usable space. 3107 */ 3108 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3109 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3110 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3111 if (sc->hn_rxbuf == NULL) { 3112 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3113 return (ENOMEM); 3114 } 3115 3116 sc->hn_rx_ring_cnt = ring_cnt; 3117 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3118 3119 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3120 M_DEVBUF, M_WAITOK | M_ZERO); 3121 3122 #if defined(INET) || defined(INET6) 3123 #if __FreeBSD_version >= 1100095 3124 lroent_cnt = hn_lro_entry_count; 3125 if (lroent_cnt < TCP_LRO_ENTRIES) 3126 lroent_cnt = TCP_LRO_ENTRIES; 3127 if (bootverbose) 3128 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3129 #endif 3130 #endif /* INET || INET6 */ 3131 3132 ctx = device_get_sysctl_ctx(dev); 3133 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3134 3135 /* Create dev.hn.UNIT.rx sysctl tree */ 3136 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3137 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3138 3139 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3140 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3141 3142 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3143 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3144 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3145 if (rxr->hn_br == NULL) { 3146 device_printf(dev, "allocate bufring failed\n"); 3147 return (ENOMEM); 3148 } 3149 3150 if (hn_trust_hosttcp) 3151 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3152 if (hn_trust_hostudp) 3153 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3154 if (hn_trust_hostip) 3155 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3156 rxr->hn_ifp = sc->hn_ifp; 3157 if (i < sc->hn_tx_ring_cnt) 3158 rxr->hn_txr = &sc->hn_tx_ring[i]; 3159 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3160 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 
3161 rxr->hn_rx_idx = i; 3162 rxr->hn_rxbuf = sc->hn_rxbuf; 3163 3164 /* 3165 * Initialize LRO. 3166 */ 3167 #if defined(INET) || defined(INET6) 3168 #if __FreeBSD_version >= 1100095 3169 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3170 hn_lro_mbufq_depth); 3171 #else 3172 tcp_lro_init(&rxr->hn_lro); 3173 rxr->hn_lro.ifp = sc->hn_ifp; 3174 #endif 3175 #if __FreeBSD_version >= 1100099 3176 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3177 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3178 #endif 3179 #endif /* INET || INET6 */ 3180 3181 if (sc->hn_rx_sysctl_tree != NULL) { 3182 char name[16]; 3183 3184 /* 3185 * Create per RX ring sysctl tree: 3186 * dev.hn.UNIT.rx.RINGID 3187 */ 3188 snprintf(name, sizeof(name), "%d", i); 3189 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3190 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3191 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3192 3193 if (rxr->hn_rx_sysctl_tree != NULL) { 3194 SYSCTL_ADD_ULONG(ctx, 3195 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3196 OID_AUTO, "packets", CTLFLAG_RW, 3197 &rxr->hn_pkts, "# of packets received"); 3198 SYSCTL_ADD_ULONG(ctx, 3199 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3200 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3201 &rxr->hn_rss_pkts, 3202 "# of packets w/ RSS info received"); 3203 SYSCTL_ADD_INT(ctx, 3204 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3205 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3206 &rxr->hn_pktbuf_len, 0, 3207 "Temporary channel packet buffer length"); 3208 } 3209 } 3210 } 3211 3212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3213 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3214 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3215 #if __FreeBSD_version < 1100095 3216 hn_rx_stat_int_sysctl, 3217 #else 3218 hn_rx_stat_u64_sysctl, 3219 #endif 3220 "LU", "LRO queued"); 3221 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3222 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3223 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3224 #if __FreeBSD_version < 1100095 3225 hn_rx_stat_int_sysctl, 3226 #else 3227 hn_rx_stat_u64_sysctl, 3228 #endif 3229 "LU", "LRO flushed"); 3230 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3231 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3232 __offsetof(struct hn_rx_ring, hn_lro_tried), 3233 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3234 #if __FreeBSD_version >= 1100099 3235 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3236 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3237 hn_lro_lenlim_sysctl, "IU", 3238 "Max # of data bytes to be aggregated by LRO"); 3239 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3240 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3241 hn_lro_ackcnt_sysctl, "I", 3242 "Max # of ACKs to be aggregated by LRO"); 3243 #endif 3244 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3245 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3246 hn_trust_hcsum_sysctl, "I", 3247 "Trust tcp segment verification on host side, " 3248 "when csum info is missing"); 3249 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3250 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3251 hn_trust_hcsum_sysctl, "I", 3252 "Trust udp datagram verification on host side, " 3253 "when csum info is missing"); 3254 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3255 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3256 hn_trust_hcsum_sysctl, "I", 3257 "Trust ip packet verification on host side, " 3258 "when csum info is missing"); 3259
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3260 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3261 __offsetof(struct hn_rx_ring, hn_csum_ip), 3262 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3263 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3264 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3265 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3266 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3267 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3268 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3269 __offsetof(struct hn_rx_ring, hn_csum_udp), 3270 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3271 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3272 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3273 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3274 hn_rx_stat_ulong_sysctl, "LU", 3275 "# of packets that we trust host's csum verification"); 3276 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3277 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3278 __offsetof(struct hn_rx_ring, hn_small_pkts), 3279 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3280 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3281 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3282 __offsetof(struct hn_rx_ring, hn_ack_failed), 3283 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3284 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3285 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3286 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3287 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3288 3289 return (0); 3290 } 3291 3292 static void 3293 hn_destroy_rx_data(struct hn_softc *sc) 3294 { 3295 int i; 3296 3297 if (sc->hn_rxbuf != NULL) { 3298 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3299 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3300 else 3301 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3302 sc->hn_rxbuf = NULL; 3303 } 3304 3305 if (sc->hn_rx_ring_cnt == 0) 3306 return; 3307 3308 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3309 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3310 3311 if (rxr->hn_br == NULL) 3312 continue; 3313 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3314 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3315 } else { 3316 device_printf(sc->hn_dev, 3317 "%dth channel bufring is referenced", i); 3318 } 3319 rxr->hn_br = NULL; 3320 3321 #if defined(INET) || defined(INET6) 3322 tcp_lro_free(&rxr->hn_lro); 3323 #endif 3324 free(rxr->hn_pktbuf, M_DEVBUF); 3325 } 3326 free(sc->hn_rx_ring, M_DEVBUF); 3327 sc->hn_rx_ring = NULL; 3328 3329 sc->hn_rx_ring_cnt = 0; 3330 sc->hn_rx_ring_inuse = 0; 3331 } 3332 3333 static int 3334 hn_tx_ring_create(struct hn_softc *sc, int id) 3335 { 3336 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3337 device_t dev = sc->hn_dev; 3338 bus_dma_tag_t parent_dtag; 3339 int error, i; 3340 3341 txr->hn_sc = sc; 3342 txr->hn_tx_idx = id; 3343 3344 #ifndef HN_USE_TXDESC_BUFRING 3345 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3346 #endif 3347 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3348 3349 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3350 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3351 M_DEVBUF, M_WAITOK | M_ZERO); 3352 #ifndef HN_USE_TXDESC_BUFRING 3353 SLIST_INIT(&txr->hn_txlist); 3354 #else 3355 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3356 M_WAITOK, &txr->hn_tx_lock); 3357 #endif 3358 3359 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3360 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 
3361 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3362 } else { 3363 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3364 } 3365 3366 #ifdef HN_IFSTART_SUPPORT 3367 if (hn_use_if_start) { 3368 txr->hn_txeof = hn_start_txeof; 3369 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3370 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3371 } else 3372 #endif 3373 { 3374 int br_depth; 3375 3376 txr->hn_txeof = hn_xmit_txeof; 3377 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3378 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3379 3380 br_depth = hn_get_txswq_depth(txr); 3381 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3382 M_WAITOK, &txr->hn_tx_lock); 3383 } 3384 3385 txr->hn_direct_tx_size = hn_direct_tx_size; 3386 3387 /* 3388 * Always schedule transmission instead of trying to do direct 3389 * transmission. This one gives the best performance so far. 3390 */ 3391 txr->hn_sched_tx = 1; 3392 3393 parent_dtag = bus_get_dma_tag(dev); 3394 3395 /* DMA tag for RNDIS packet messages. */ 3396 error = bus_dma_tag_create(parent_dtag, /* parent */ 3397 HN_RNDIS_PKT_ALIGN, /* alignment */ 3398 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3399 BUS_SPACE_MAXADDR, /* lowaddr */ 3400 BUS_SPACE_MAXADDR, /* highaddr */ 3401 NULL, NULL, /* filter, filterarg */ 3402 HN_RNDIS_PKT_LEN, /* maxsize */ 3403 1, /* nsegments */ 3404 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3405 0, /* flags */ 3406 NULL, /* lockfunc */ 3407 NULL, /* lockfuncarg */ 3408 &txr->hn_tx_rndis_dtag); 3409 if (error) { 3410 device_printf(dev, "failed to create rndis dmatag\n"); 3411 return error; 3412 } 3413 3414 /* DMA tag for data. */ 3415 error = bus_dma_tag_create(parent_dtag, /* parent */ 3416 1, /* alignment */ 3417 HN_TX_DATA_BOUNDARY, /* boundary */ 3418 BUS_SPACE_MAXADDR, /* lowaddr */ 3419 BUS_SPACE_MAXADDR, /* highaddr */ 3420 NULL, NULL, /* filter, filterarg */ 3421 HN_TX_DATA_MAXSIZE, /* maxsize */ 3422 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3423 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3424 0, /* flags */ 3425 NULL, /* lockfunc */ 3426 NULL, /* lockfuncarg */ 3427 &txr->hn_tx_data_dtag); 3428 if (error) { 3429 device_printf(dev, "failed to create data dmatag\n"); 3430 return error; 3431 } 3432 3433 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3434 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3435 3436 txd->txr = txr; 3437 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3438 STAILQ_INIT(&txd->agg_list); 3439 3440 /* 3441 * Allocate and load RNDIS packet message. 3442 */ 3443 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3444 (void **)&txd->rndis_pkt, 3445 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3446 &txd->rndis_pkt_dmap); 3447 if (error) { 3448 device_printf(dev, 3449 "failed to allocate rndis_packet_msg, %d\n", i); 3450 return error; 3451 } 3452 3453 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3454 txd->rndis_pkt_dmap, 3455 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3456 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3457 BUS_DMA_NOWAIT); 3458 if (error) { 3459 device_printf(dev, 3460 "failed to load rndis_packet_msg, %d\n", i); 3461 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3462 txd->rndis_pkt, txd->rndis_pkt_dmap); 3463 return error; 3464 } 3465 3466 /* DMA map for TX data. 
*/ 3467 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3468 &txd->data_dmap); 3469 if (error) { 3470 device_printf(dev, 3471 "failed to allocate tx data dmamap\n"); 3472 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3473 txd->rndis_pkt_dmap); 3474 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3475 txd->rndis_pkt, txd->rndis_pkt_dmap); 3476 return error; 3477 } 3478 3479 /* All set, put it to list */ 3480 txd->flags |= HN_TXD_FLAG_ONLIST; 3481 #ifndef HN_USE_TXDESC_BUFRING 3482 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3483 #else 3484 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3485 #endif 3486 } 3487 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3488 3489 if (sc->hn_tx_sysctl_tree != NULL) { 3490 struct sysctl_oid_list *child; 3491 struct sysctl_ctx_list *ctx; 3492 char name[16]; 3493 3494 /* 3495 * Create per TX ring sysctl tree: 3496 * dev.hn.UNIT.tx.RINGID 3497 */ 3498 ctx = device_get_sysctl_ctx(dev); 3499 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3500 3501 snprintf(name, sizeof(name), "%d", id); 3502 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3503 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3504 3505 if (txr->hn_tx_sysctl_tree != NULL) { 3506 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3507 3508 #ifdef HN_DEBUG 3509 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3510 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3511 "# of available TX descs"); 3512 #endif 3513 #ifdef HN_IFSTART_SUPPORT 3514 if (!hn_use_if_start) 3515 #endif 3516 { 3517 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3518 CTLFLAG_RD, &txr->hn_oactive, 0, 3519 "over active"); 3520 } 3521 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3522 CTLFLAG_RW, &txr->hn_pkts, 3523 "# of packets transmitted"); 3524 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3525 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3526 } 3527 } 3528 3529 return 0; 3530 } 3531 3532 static void 3533 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3534 { 3535 struct hn_tx_ring *txr = txd->txr; 3536 3537 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3538 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3539 3540 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3541 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3542 txd->rndis_pkt_dmap); 3543 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3544 } 3545 3546 static void 3547 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3548 { 3549 3550 KASSERT(txd->refs == 0 || txd->refs == 1, 3551 ("invalid txd refs %d", txd->refs)); 3552 3553 /* Aggregated txds will be freed by their aggregating txd. */ 3554 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3555 int freed; 3556 3557 freed = hn_txdesc_put(txr, txd); 3558 KASSERT(freed, ("can't free txdesc")); 3559 } 3560 } 3561 3562 static void 3563 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3564 { 3565 int i; 3566 3567 if (txr->hn_txdesc == NULL) 3568 return; 3569 3570 /* 3571 * NOTE: 3572 * Because the freeing of aggregated txds will be deferred 3573 * to the aggregating txd, two passes are used here: 3574 * - The first pass GCes any pending txds. This GC is necessary, 3575 * since if the channels are revoked, hypervisor will not 3576 * deliver send-done for all pending txds. 3577 * - The second pass frees the busdma stuffs, i.e. after all txds 3578 * were freed. 
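 *
 * hn_txdesc_gc() below simply runs hn_txdesc_put() on every
 * descriptor that still holds a reference, except those sitting
 * on an aggregation list, which are freed by their aggregating
 * txdesc instead.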
3579 */ 3580 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3581 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3582 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3583 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3584 3585 if (txr->hn_tx_data_dtag != NULL) 3586 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3587 if (txr->hn_tx_rndis_dtag != NULL) 3588 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3589 3590 #ifdef HN_USE_TXDESC_BUFRING 3591 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3592 #endif 3593 3594 free(txr->hn_txdesc, M_DEVBUF); 3595 txr->hn_txdesc = NULL; 3596 3597 if (txr->hn_mbuf_br != NULL) 3598 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3599 3600 #ifndef HN_USE_TXDESC_BUFRING 3601 mtx_destroy(&txr->hn_txlist_spin); 3602 #endif 3603 mtx_destroy(&txr->hn_tx_lock); 3604 } 3605 3606 static int 3607 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3608 { 3609 struct sysctl_oid_list *child; 3610 struct sysctl_ctx_list *ctx; 3611 int i; 3612 3613 /* 3614 * Create TXBUF for chimney sending. 3615 * 3616 * NOTE: It is shared by all channels. 3617 */ 3618 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3619 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3620 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3621 if (sc->hn_chim == NULL) { 3622 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3623 return (ENOMEM); 3624 } 3625 3626 sc->hn_tx_ring_cnt = ring_cnt; 3627 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3628 3629 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3630 M_DEVBUF, M_WAITOK | M_ZERO); 3631 3632 ctx = device_get_sysctl_ctx(sc->hn_dev); 3633 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3634 3635 /* Create dev.hn.UNIT.tx sysctl tree */ 3636 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3637 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3638 3639 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3640 int error; 3641 3642 error = hn_tx_ring_create(sc, i); 3643 if (error) 3644 return error; 3645 } 3646 3647 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3648 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3649 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3650 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3651 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3652 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3653 __offsetof(struct hn_tx_ring, hn_send_failed), 3654 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3655 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3656 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3657 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3658 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3659 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3660 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3661 __offsetof(struct hn_tx_ring, hn_flush_failed), 3662 hn_tx_stat_ulong_sysctl, "LU", 3663 "# of packet transmission aggregation flush failure"); 3664 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3665 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3666 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3667 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3668 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3669 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3670 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3671 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3672 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3673 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3674 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3675 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3676 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3677 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3678 "# of total TX descs"); 3679 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3680 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3681 "Chimney send packet size upper boundary"); 3682 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3683 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3684 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3685 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3687 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3688 hn_tx_conf_int_sysctl, "I", 3689 "Size of the packet for direct transmission"); 3690 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3691 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3692 __offsetof(struct hn_tx_ring, hn_sched_tx), 3693 hn_tx_conf_int_sysctl, "I", 3694 "Always schedule transmission " 3695 "instead of doing direct transmission"); 3696 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3697 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3698 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3699 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3700 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3701 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3702 "Applied packet transmission aggregation size"); 3703 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3704 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3705 hn_txagg_pktmax_sysctl, "I", 3706 "Applied packet transmission aggregation packets"); 3707 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3708 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3709 hn_txagg_align_sysctl, "I", 3710 "Applied packet transmission aggregation alignment"); 3711 3712 return 0; 3713 } 3714 3715 static void 3716 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3717 { 3718 int i; 3719 3720 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3721 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3722 } 3723 3724 static void 3725 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3726 { 3727 struct ifnet *ifp = sc->hn_ifp; 3728 int tso_minlen; 3729 3730 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3731 return; 3732 3733 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3734 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3735 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3736 3737 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3738 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3739 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3740 3741 if (tso_maxlen < tso_minlen) 3742 tso_maxlen = tso_minlen; 3743 else if (tso_maxlen > IP_MAXPACKET) 3744 tso_maxlen = IP_MAXPACKET; 3745 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3746 tso_maxlen = sc->hn_ndis_tso_szmax; 3747 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3748 if (bootverbose) 3749 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3750 } 3751 3752 static void 3753 hn_fixup_tx_data(struct hn_softc *sc) 3754 { 3755 uint64_t csum_assist; 3756 int i; 3757 3758 hn_set_chim_size(sc, sc->hn_chim_szmax); 3759 if (hn_tx_chimney_size > 0 && 3760 hn_tx_chimney_size < sc->hn_chim_szmax) 3761 hn_set_chim_size(sc, hn_tx_chimney_size); 3762 3763 csum_assist = 0; 3764 if (sc->hn_caps & HN_CAP_IPCS) 3765 csum_assist |= CSUM_IP; 3766 if (sc->hn_caps & HN_CAP_TCP4CS) 3767 csum_assist |= CSUM_IP_TCP; 3768 if (sc->hn_caps & HN_CAP_UDP4CS) 3769 
csum_assist |= CSUM_IP_UDP; 3770 if (sc->hn_caps & HN_CAP_TCP6CS) 3771 csum_assist |= CSUM_IP6_TCP; 3772 if (sc->hn_caps & HN_CAP_UDP6CS) 3773 csum_assist |= CSUM_IP6_UDP; 3774 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3775 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 3776 3777 if (sc->hn_caps & HN_CAP_HASHVAL) { 3778 /* 3779 * Support HASHVAL pktinfo on TX path. 3780 */ 3781 if (bootverbose) 3782 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 3783 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3784 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 3785 } 3786 } 3787 3788 static void 3789 hn_destroy_tx_data(struct hn_softc *sc) 3790 { 3791 int i; 3792 3793 if (sc->hn_chim != NULL) { 3794 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 3795 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 3796 } else { 3797 device_printf(sc->hn_dev, 3798 "chimney sending buffer is referenced"); 3799 } 3800 sc->hn_chim = NULL; 3801 } 3802 3803 if (sc->hn_tx_ring_cnt == 0) 3804 return; 3805 3806 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3807 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 3808 3809 free(sc->hn_tx_ring, M_DEVBUF); 3810 sc->hn_tx_ring = NULL; 3811 3812 sc->hn_tx_ring_cnt = 0; 3813 sc->hn_tx_ring_inuse = 0; 3814 } 3815 3816 #ifdef HN_IFSTART_SUPPORT 3817 3818 static void 3819 hn_start_taskfunc(void *xtxr, int pending __unused) 3820 { 3821 struct hn_tx_ring *txr = xtxr; 3822 3823 mtx_lock(&txr->hn_tx_lock); 3824 hn_start_locked(txr, 0); 3825 mtx_unlock(&txr->hn_tx_lock); 3826 } 3827 3828 static int 3829 hn_start_locked(struct hn_tx_ring *txr, int len) 3830 { 3831 struct hn_softc *sc = txr->hn_sc; 3832 struct ifnet *ifp = sc->hn_ifp; 3833 int sched = 0; 3834 3835 KASSERT(hn_use_if_start, 3836 ("hn_start_locked is called, when if_start is disabled")); 3837 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 3838 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 3839 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3840 3841 if (__predict_false(txr->hn_suspended)) 3842 return (0); 3843 3844 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 3845 IFF_DRV_RUNNING) 3846 return (0); 3847 3848 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 3849 struct hn_txdesc *txd; 3850 struct mbuf *m_head; 3851 int error; 3852 3853 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 3854 if (m_head == NULL) 3855 break; 3856 3857 if (len > 0 && m_head->m_pkthdr.len > len) { 3858 /* 3859 * This sending could be time consuming; let callers 3860 * dispatch this packet sending (and sending of any 3861 * following up packets) to tx taskqueue. 
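 *
 * The "len" threshold is 0 (no limit) when we are already running
 * from the TX taskqueue, and hn_direct_tx_size when called from
 * hn_start()/hn_start_txeof() for direct transmission; returning
 * sched = 1 makes those callers enqueue hn_tx_task instead.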
3862 */ 3863 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3864 sched = 1; 3865 break; 3866 } 3867 3868 #if defined(INET6) || defined(INET) 3869 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3870 m_head = hn_tso_fixup(m_head); 3871 if (__predict_false(m_head == NULL)) { 3872 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3873 continue; 3874 } 3875 } 3876 #endif 3877 3878 txd = hn_txdesc_get(txr); 3879 if (txd == NULL) { 3880 txr->hn_no_txdescs++; 3881 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3882 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3883 break; 3884 } 3885 3886 error = hn_encap(ifp, txr, txd, &m_head); 3887 if (error) { 3888 /* Both txd and m_head are freed */ 3889 KASSERT(txr->hn_agg_txd == NULL, 3890 ("encap failed w/ pending aggregating txdesc")); 3891 continue; 3892 } 3893 3894 if (txr->hn_agg_pktleft == 0) { 3895 if (txr->hn_agg_txd != NULL) { 3896 KASSERT(m_head == NULL, 3897 ("pending mbuf for aggregating txdesc")); 3898 error = hn_flush_txagg(ifp, txr); 3899 if (__predict_false(error)) { 3900 atomic_set_int(&ifp->if_drv_flags, 3901 IFF_DRV_OACTIVE); 3902 break; 3903 } 3904 } else { 3905 KASSERT(m_head != NULL, ("mbuf was freed")); 3906 error = hn_txpkt(ifp, txr, txd); 3907 if (__predict_false(error)) { 3908 /* txd is freed, but m_head is not */ 3909 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3910 atomic_set_int(&ifp->if_drv_flags, 3911 IFF_DRV_OACTIVE); 3912 break; 3913 } 3914 } 3915 } 3916 #ifdef INVARIANTS 3917 else { 3918 KASSERT(txr->hn_agg_txd != NULL, 3919 ("no aggregating txdesc")); 3920 KASSERT(m_head == NULL, 3921 ("pending mbuf for aggregating txdesc")); 3922 } 3923 #endif 3924 } 3925 3926 /* Flush pending aggregated transmission. */ 3927 if (txr->hn_agg_txd != NULL) 3928 hn_flush_txagg(ifp, txr); 3929 return (sched); 3930 } 3931 3932 static void 3933 hn_start(struct ifnet *ifp) 3934 { 3935 struct hn_softc *sc = ifp->if_softc; 3936 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 3937 3938 if (txr->hn_sched_tx) 3939 goto do_sched; 3940 3941 if (mtx_trylock(&txr->hn_tx_lock)) { 3942 int sched; 3943 3944 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 3945 mtx_unlock(&txr->hn_tx_lock); 3946 if (!sched) 3947 return; 3948 } 3949 do_sched: 3950 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 3951 } 3952 3953 static void 3954 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 3955 { 3956 struct hn_tx_ring *txr = xtxr; 3957 3958 mtx_lock(&txr->hn_tx_lock); 3959 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 3960 hn_start_locked(txr, 0); 3961 mtx_unlock(&txr->hn_tx_lock); 3962 } 3963 3964 static void 3965 hn_start_txeof(struct hn_tx_ring *txr) 3966 { 3967 struct hn_softc *sc = txr->hn_sc; 3968 struct ifnet *ifp = sc->hn_ifp; 3969 3970 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 3971 3972 if (txr->hn_sched_tx) 3973 goto do_sched; 3974 3975 if (mtx_trylock(&txr->hn_tx_lock)) { 3976 int sched; 3977 3978 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3979 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 3980 mtx_unlock(&txr->hn_tx_lock); 3981 if (sched) { 3982 taskqueue_enqueue(txr->hn_tx_taskq, 3983 &txr->hn_tx_task); 3984 } 3985 } else { 3986 do_sched: 3987 /* 3988 * Release the OACTIVE earlier, in the hope that 3989 * others could catch up. The task will clear the 3990 * flag again with the hn_tx_lock to avoid possible 3991 * races.
3992 */ 3993 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3994 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 3995 } 3996 } 3997 3998 #endif /* HN_IFSTART_SUPPORT */ 3999 4000 static int 4001 hn_xmit(struct hn_tx_ring *txr, int len) 4002 { 4003 struct hn_softc *sc = txr->hn_sc; 4004 struct ifnet *ifp = sc->hn_ifp; 4005 struct mbuf *m_head; 4006 int sched = 0; 4007 4008 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4009 #ifdef HN_IFSTART_SUPPORT 4010 KASSERT(hn_use_if_start == 0, 4011 ("hn_xmit is called, when if_start is enabled")); 4012 #endif 4013 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4014 4015 if (__predict_false(txr->hn_suspended)) 4016 return (0); 4017 4018 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4019 return (0); 4020 4021 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4022 struct hn_txdesc *txd; 4023 int error; 4024 4025 if (len > 0 && m_head->m_pkthdr.len > len) { 4026 /* 4027 * This sending could be time consuming; let callers 4028 * dispatch this packet sending (and sending of any 4029 * following up packets) to tx taskqueue. 4030 */ 4031 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4032 sched = 1; 4033 break; 4034 } 4035 4036 txd = hn_txdesc_get(txr); 4037 if (txd == NULL) { 4038 txr->hn_no_txdescs++; 4039 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4040 txr->hn_oactive = 1; 4041 break; 4042 } 4043 4044 error = hn_encap(ifp, txr, txd, &m_head); 4045 if (error) { 4046 /* Both txd and m_head are freed; discard */ 4047 KASSERT(txr->hn_agg_txd == NULL, 4048 ("encap failed w/ pending aggregating txdesc")); 4049 drbr_advance(ifp, txr->hn_mbuf_br); 4050 continue; 4051 } 4052 4053 if (txr->hn_agg_pktleft == 0) { 4054 if (txr->hn_agg_txd != NULL) { 4055 KASSERT(m_head == NULL, 4056 ("pending mbuf for aggregating txdesc")); 4057 error = hn_flush_txagg(ifp, txr); 4058 if (__predict_false(error)) { 4059 txr->hn_oactive = 1; 4060 break; 4061 } 4062 } else { 4063 KASSERT(m_head != NULL, ("mbuf was freed")); 4064 error = hn_txpkt(ifp, txr, txd); 4065 if (__predict_false(error)) { 4066 /* txd is freed, but m_head is not */ 4067 drbr_putback(ifp, txr->hn_mbuf_br, 4068 m_head); 4069 txr->hn_oactive = 1; 4070 break; 4071 } 4072 } 4073 } 4074 #ifdef INVARIANTS 4075 else { 4076 KASSERT(txr->hn_agg_txd != NULL, 4077 ("no aggregating txdesc")); 4078 KASSERT(m_head == NULL, 4079 ("pending mbuf for aggregating txdesc")); 4080 } 4081 #endif 4082 4083 /* Sent */ 4084 drbr_advance(ifp, txr->hn_mbuf_br); 4085 } 4086 4087 /* Flush pending aggerated transmission. */ 4088 if (txr->hn_agg_txd != NULL) 4089 hn_flush_txagg(ifp, txr); 4090 return (sched); 4091 } 4092 4093 static int 4094 hn_transmit(struct ifnet *ifp, struct mbuf *m) 4095 { 4096 struct hn_softc *sc = ifp->if_softc; 4097 struct hn_tx_ring *txr; 4098 int error, idx = 0; 4099 4100 #if defined(INET6) || defined(INET) 4101 /* 4102 * Perform TSO packet header fixup now, since the TSO 4103 * packet header should be cache-hot. 
4104 */ 4105 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4106 m = hn_tso_fixup(m); 4107 if (__predict_false(m == NULL)) { 4108 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4109 return EIO; 4110 } 4111 } 4112 #endif 4113 4114 /* 4115 * Select the TX ring based on flowid 4116 */ 4117 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4118 #ifdef RSS 4119 uint32_t bid; 4120 4121 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4122 &bid) == 0) 4123 idx = bid % sc->hn_tx_ring_inuse; 4124 else 4125 #endif 4126 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4127 } 4128 txr = &sc->hn_tx_ring[idx]; 4129 4130 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4131 if (error) { 4132 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4133 return error; 4134 } 4135 4136 if (txr->hn_oactive) 4137 return 0; 4138 4139 if (txr->hn_sched_tx) 4140 goto do_sched; 4141 4142 if (mtx_trylock(&txr->hn_tx_lock)) { 4143 int sched; 4144 4145 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4146 mtx_unlock(&txr->hn_tx_lock); 4147 if (!sched) 4148 return 0; 4149 } 4150 do_sched: 4151 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4152 return 0; 4153 } 4154 4155 static void 4156 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4157 { 4158 struct mbuf *m; 4159 4160 mtx_lock(&txr->hn_tx_lock); 4161 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4162 m_freem(m); 4163 mtx_unlock(&txr->hn_tx_lock); 4164 } 4165 4166 static void 4167 hn_xmit_qflush(struct ifnet *ifp) 4168 { 4169 struct hn_softc *sc = ifp->if_softc; 4170 int i; 4171 4172 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4173 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4174 if_qflush(ifp); 4175 } 4176 4177 static void 4178 hn_xmit_txeof(struct hn_tx_ring *txr) 4179 { 4180 4181 if (txr->hn_sched_tx) 4182 goto do_sched; 4183 4184 if (mtx_trylock(&txr->hn_tx_lock)) { 4185 int sched; 4186 4187 txr->hn_oactive = 0; 4188 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4189 mtx_unlock(&txr->hn_tx_lock); 4190 if (sched) { 4191 taskqueue_enqueue(txr->hn_tx_taskq, 4192 &txr->hn_tx_task); 4193 } 4194 } else { 4195 do_sched: 4196 /* 4197 * Release the oactive earlier, with the hope, that 4198 * others could catch up. The task will clear the 4199 * oactive again with the hn_tx_lock to avoid possible 4200 * races. 4201 */ 4202 txr->hn_oactive = 0; 4203 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4204 } 4205 } 4206 4207 static void 4208 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4209 { 4210 struct hn_tx_ring *txr = xtxr; 4211 4212 mtx_lock(&txr->hn_tx_lock); 4213 hn_xmit(txr, 0); 4214 mtx_unlock(&txr->hn_tx_lock); 4215 } 4216 4217 static void 4218 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4219 { 4220 struct hn_tx_ring *txr = xtxr; 4221 4222 mtx_lock(&txr->hn_tx_lock); 4223 txr->hn_oactive = 0; 4224 hn_xmit(txr, 0); 4225 mtx_unlock(&txr->hn_tx_lock); 4226 } 4227 4228 static int 4229 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4230 { 4231 struct vmbus_chan_br cbr; 4232 struct hn_rx_ring *rxr; 4233 struct hn_tx_ring *txr = NULL; 4234 int idx, error; 4235 4236 idx = vmbus_chan_subidx(chan); 4237 4238 /* 4239 * Link this channel to RX/TX ring. 
4240 */ 4241 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4242 ("invalid channel index %d, should > 0 && < %d", 4243 idx, sc->hn_rx_ring_inuse)); 4244 rxr = &sc->hn_rx_ring[idx]; 4245 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 4246 ("RX ring %d already attached", idx)); 4247 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 4248 4249 if (bootverbose) { 4250 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 4251 idx, vmbus_chan_id(chan)); 4252 } 4253 4254 if (idx < sc->hn_tx_ring_inuse) { 4255 txr = &sc->hn_tx_ring[idx]; 4256 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 4257 ("TX ring %d already attached", idx)); 4258 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 4259 4260 txr->hn_chan = chan; 4261 if (bootverbose) { 4262 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 4263 idx, vmbus_chan_id(chan)); 4264 } 4265 } 4266 4267 /* Bind this channel to a proper CPU. */ 4268 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 4269 4270 /* 4271 * Open this channel 4272 */ 4273 cbr.cbr = rxr->hn_br; 4274 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 4275 cbr.cbr_txsz = HN_TXBR_SIZE; 4276 cbr.cbr_rxsz = HN_RXBR_SIZE; 4277 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 4278 if (error) { 4279 if (error == EISCONN) { 4280 if_printf(sc->hn_ifp, "bufring is connected after " 4281 "chan%u open failure\n", vmbus_chan_id(chan)); 4282 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4283 } else { 4284 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 4285 vmbus_chan_id(chan), error); 4286 } 4287 } 4288 return (error); 4289 } 4290 4291 static void 4292 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 4293 { 4294 struct hn_rx_ring *rxr; 4295 int idx, error; 4296 4297 idx = vmbus_chan_subidx(chan); 4298 4299 /* 4300 * Link this channel to RX/TX ring. 4301 */ 4302 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4303 ("invalid channel index %d, should > 0 && < %d", 4304 idx, sc->hn_rx_ring_inuse)); 4305 rxr = &sc->hn_rx_ring[idx]; 4306 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 4307 ("RX ring %d is not attached", idx)); 4308 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 4309 4310 if (idx < sc->hn_tx_ring_inuse) { 4311 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 4312 4313 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 4314 ("TX ring %d is not attached attached", idx)); 4315 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 4316 } 4317 4318 /* 4319 * Close this channel. 4320 * 4321 * NOTE: 4322 * Channel closing does _not_ destroy the target channel. 4323 */ 4324 error = vmbus_chan_close_direct(chan); 4325 if (error == EISCONN) { 4326 if_printf(sc->hn_ifp, "chan%u bufring is connected " 4327 "after being closed\n", vmbus_chan_id(chan)); 4328 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4329 } else if (error) { 4330 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 4331 vmbus_chan_id(chan), error); 4332 } 4333 } 4334 4335 static int 4336 hn_attach_subchans(struct hn_softc *sc) 4337 { 4338 struct vmbus_channel **subchans; 4339 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4340 int i, error = 0; 4341 4342 KASSERT(subchan_cnt > 0, ("no sub-channels")); 4343 4344 /* Attach the sub-channels. */ 4345 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4346 for (i = 0; i < subchan_cnt; ++i) { 4347 int error1; 4348 4349 error1 = hn_chan_attach(sc, subchans[i]); 4350 if (error1) { 4351 error = error1; 4352 /* Move on; all channels will be detached later. 
*/ 4353 } 4354 } 4355 vmbus_subchan_rel(subchans, subchan_cnt); 4356 4357 if (error) { 4358 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4359 } else { 4360 if (bootverbose) { 4361 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4362 subchan_cnt); 4363 } 4364 } 4365 return (error); 4366 } 4367 4368 static void 4369 hn_detach_allchans(struct hn_softc *sc) 4370 { 4371 struct vmbus_channel **subchans; 4372 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4373 int i; 4374 4375 if (subchan_cnt == 0) 4376 goto back; 4377 4378 /* Detach the sub-channels. */ 4379 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4380 for (i = 0; i < subchan_cnt; ++i) 4381 hn_chan_detach(sc, subchans[i]); 4382 vmbus_subchan_rel(subchans, subchan_cnt); 4383 4384 back: 4385 /* 4386 * Detach the primary channel, _after_ all sub-channels 4387 * are detached. 4388 */ 4389 hn_chan_detach(sc, sc->hn_prichan); 4390 4391 /* Wait for sub-channels to be destroyed, if any. */ 4392 vmbus_subchan_drain(sc->hn_prichan); 4393 4394 #ifdef INVARIANTS 4395 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4396 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4397 HN_RX_FLAG_ATTACHED) == 0, 4398 ("%dth RX ring is still attached", i)); 4399 } 4400 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4401 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4402 HN_TX_FLAG_ATTACHED) == 0, 4403 ("%dth TX ring is still attached", i)); 4404 } 4405 #endif 4406 } 4407 4408 static int 4409 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4410 { 4411 struct vmbus_channel **subchans; 4412 int nchan, rxr_cnt, error; 4413 4414 nchan = *nsubch + 1; 4415 if (nchan == 1) { 4416 /* 4417 * Multiple RX/TX rings are not requested. 4418 */ 4419 *nsubch = 0; 4420 return (0); 4421 } 4422 4423 /* 4424 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4425 * table entries. 4426 */ 4427 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4428 if (error) { 4429 /* No RSS; this is benign. */ 4430 *nsubch = 0; 4431 return (0); 4432 } 4433 if (bootverbose) { 4434 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4435 rxr_cnt, nchan); 4436 } 4437 4438 if (nchan > rxr_cnt) 4439 nchan = rxr_cnt; 4440 if (nchan == 1) { 4441 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4442 *nsubch = 0; 4443 return (0); 4444 } 4445 4446 /* 4447 * Allocate sub-channels from NVS. 4448 */ 4449 *nsubch = nchan - 1; 4450 error = hn_nvs_alloc_subchans(sc, nsubch); 4451 if (error || *nsubch == 0) { 4452 /* Failed to allocate sub-channels. */ 4453 *nsubch = 0; 4454 return (0); 4455 } 4456 4457 /* 4458 * Wait for all sub-channels to become ready before moving on. 
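* vmbus_subchan_get() below performs the actual wait; the references
* it returns are released immediately since only the rendezvous is
* needed here, and the sub-channels are attached later through
* hn_attach_subchans().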
4459 */ 4460 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4461 vmbus_subchan_rel(subchans, *nsubch); 4462 return (0); 4463 } 4464 4465 static bool 4466 hn_synth_attachable(const struct hn_softc *sc) 4467 { 4468 int i; 4469 4470 if (sc->hn_flags & HN_FLAG_ERRORS) 4471 return (false); 4472 4473 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4474 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4475 4476 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4477 return (false); 4478 } 4479 return (true); 4480 } 4481 4482 static int 4483 hn_synth_attach(struct hn_softc *sc, int mtu) 4484 { 4485 #define ATTACHED_NVS 0x0002 4486 #define ATTACHED_RNDIS 0x0004 4487 4488 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4489 int error, nsubch, nchan, i; 4490 uint32_t old_caps, attached = 0; 4491 4492 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4493 ("synthetic parts were attached")); 4494 4495 if (!hn_synth_attachable(sc)) 4496 return (ENXIO); 4497 4498 /* Save capabilities for later verification. */ 4499 old_caps = sc->hn_caps; 4500 sc->hn_caps = 0; 4501 4502 /* Clear RSS stuffs. */ 4503 sc->hn_rss_ind_size = 0; 4504 sc->hn_rss_hash = 0; 4505 4506 /* 4507 * Attach the primary channel _before_ attaching NVS and RNDIS. 4508 */ 4509 error = hn_chan_attach(sc, sc->hn_prichan); 4510 if (error) 4511 goto failed; 4512 4513 /* 4514 * Attach NVS. 4515 */ 4516 error = hn_nvs_attach(sc, mtu); 4517 if (error) 4518 goto failed; 4519 attached |= ATTACHED_NVS; 4520 4521 /* 4522 * Attach RNDIS _after_ NVS is attached. 4523 */ 4524 error = hn_rndis_attach(sc, mtu); 4525 if (error) 4526 goto failed; 4527 attached |= ATTACHED_RNDIS; 4528 4529 /* 4530 * Make sure capabilities are not changed. 4531 */ 4532 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4533 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4534 old_caps, sc->hn_caps); 4535 error = ENXIO; 4536 goto failed; 4537 } 4538 4539 /* 4540 * Allocate sub-channels for multi-TX/RX rings. 4541 * 4542 * NOTE: 4543 * The # of RX rings that can be used is equivalent to the # of 4544 * channels to be requested. 4545 */ 4546 nsubch = sc->hn_rx_ring_cnt - 1; 4547 error = hn_synth_alloc_subchans(sc, &nsubch); 4548 if (error) 4549 goto failed; 4550 /* NOTE: _Full_ synthetic parts detach is required now. */ 4551 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4552 4553 /* 4554 * Set the # of TX/RX rings that could be used according to 4555 * the # of channels that NVS offered. 4556 */ 4557 nchan = nsubch + 1; 4558 hn_set_ring_inuse(sc, nchan); 4559 if (nchan == 1) { 4560 /* Only the primary channel can be used; done */ 4561 goto back; 4562 } 4563 4564 /* 4565 * Attach the sub-channels. 4566 * 4567 * NOTE: hn_set_ring_inuse() _must_ have been called. 4568 */ 4569 error = hn_attach_subchans(sc); 4570 if (error) 4571 goto failed; 4572 4573 /* 4574 * Configure RSS key and indirect table _after_ all sub-channels 4575 * are attached. 4576 */ 4577 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4578 /* 4579 * RSS key is not set yet; set it to the default RSS key. 4580 */ 4581 if (bootverbose) 4582 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4583 #ifdef RSS 4584 rss_getkey(rss->rss_key); 4585 #else 4586 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4587 #endif 4588 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4589 } 4590 4591 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4592 /* 4593 * RSS indirect table is not set yet; set it up in round- 4594 * robin fashion. 
4595 */ 4596 if (bootverbose) { 4597 if_printf(sc->hn_ifp, "setup default RSS indirect " 4598 "table\n"); 4599 } 4600 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 4601 uint32_t subidx; 4602 4603 #ifdef RSS 4604 subidx = rss_get_indirection_to_bucket(i); 4605 #else 4606 subidx = i; 4607 #endif 4608 rss->rss_ind[i] = subidx % nchan; 4609 } 4610 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4611 } else { 4612 /* 4613 * # of usable channels may be changed, so we have to 4614 * make sure that all entries in RSS indirect table 4615 * are valid. 4616 * 4617 * NOTE: hn_set_ring_inuse() _must_ have been called. 4618 */ 4619 hn_rss_ind_fixup(sc); 4620 } 4621 4622 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 4623 if (error) 4624 goto failed; 4625 back: 4626 /* 4627 * Fixup transmission aggregation setup. 4628 */ 4629 hn_set_txagg(sc); 4630 return (0); 4631 4632 failed: 4633 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 4634 hn_synth_detach(sc); 4635 } else { 4636 if (attached & ATTACHED_RNDIS) 4637 hn_rndis_detach(sc); 4638 if (attached & ATTACHED_NVS) 4639 hn_nvs_detach(sc); 4640 hn_chan_detach(sc, sc->hn_prichan); 4641 /* Restore old capabilities. */ 4642 sc->hn_caps = old_caps; 4643 } 4644 return (error); 4645 4646 #undef ATTACHED_RNDIS 4647 #undef ATTACHED_NVS 4648 } 4649 4650 /* 4651 * NOTE: 4652 * The interface must have been suspended though hn_suspend(), before 4653 * this function get called. 4654 */ 4655 static void 4656 hn_synth_detach(struct hn_softc *sc) 4657 { 4658 4659 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4660 ("synthetic parts were not attached")); 4661 4662 /* Detach the RNDIS first. */ 4663 hn_rndis_detach(sc); 4664 4665 /* Detach NVS. */ 4666 hn_nvs_detach(sc); 4667 4668 /* Detach all of the channels. */ 4669 hn_detach_allchans(sc); 4670 4671 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 4672 } 4673 4674 static void 4675 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 4676 { 4677 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 4678 ("invalid ring count %d", ring_cnt)); 4679 4680 if (sc->hn_tx_ring_cnt > ring_cnt) 4681 sc->hn_tx_ring_inuse = ring_cnt; 4682 else 4683 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 4684 sc->hn_rx_ring_inuse = ring_cnt; 4685 4686 #ifdef RSS 4687 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 4688 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 4689 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 4690 rss_getnumbuckets()); 4691 } 4692 #endif 4693 4694 if (bootverbose) { 4695 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 4696 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 4697 } 4698 } 4699 4700 static void 4701 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 4702 { 4703 4704 /* 4705 * NOTE: 4706 * The TX bufring will not be drained by the hypervisor, 4707 * if the primary channel is revoked. 4708 */ 4709 while (!vmbus_chan_rx_empty(chan) || 4710 (!vmbus_chan_is_revoked(sc->hn_prichan) && 4711 !vmbus_chan_tx_empty(chan))) 4712 pause("waitch", 1); 4713 vmbus_chan_intr_drain(chan); 4714 } 4715 4716 static void 4717 hn_suspend_data(struct hn_softc *sc) 4718 { 4719 struct vmbus_channel **subch = NULL; 4720 struct hn_tx_ring *txr; 4721 int i, nsubch; 4722 4723 HN_LOCK_ASSERT(sc); 4724 4725 /* 4726 * Suspend TX. 4727 */ 4728 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4729 txr = &sc->hn_tx_ring[i]; 4730 4731 mtx_lock(&txr->hn_tx_lock); 4732 txr->hn_suspended = 1; 4733 mtx_unlock(&txr->hn_tx_lock); 4734 /* No one is able send more packets now. */ 4735 4736 /* 4737 * Wait for all pending sends to finish. 
4738 * 4739 * NOTE: 4740 * We will _not_ receive all pending send-done, if the 4741 * primary channel is revoked. 4742 */ 4743 while (hn_tx_ring_pending(txr) && 4744 !vmbus_chan_is_revoked(sc->hn_prichan)) 4745 pause("hnwtx", 1 /* 1 tick */); 4746 } 4747 4748 /* 4749 * Disable RX by clearing RX filter. 4750 */ 4751 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 4752 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); 4753 4754 /* 4755 * Give RNDIS enough time to flush all pending data packets. 4756 */ 4757 pause("waitrx", (200 * hz) / 1000); 4758 4759 /* 4760 * Drain RX/TX bufrings and interrupts. 4761 */ 4762 nsubch = sc->hn_rx_ring_inuse - 1; 4763 if (nsubch > 0) 4764 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4765 4766 if (subch != NULL) { 4767 for (i = 0; i < nsubch; ++i) 4768 hn_chan_drain(sc, subch[i]); 4769 } 4770 hn_chan_drain(sc, sc->hn_prichan); 4771 4772 if (subch != NULL) 4773 vmbus_subchan_rel(subch, nsubch); 4774 4775 /* 4776 * Drain any pending TX tasks. 4777 * 4778 * NOTE: 4779 * The above hn_chan_drain() can dispatch TX tasks, so the TX 4780 * tasks will have to be drained _after_ the above hn_chan_drain() 4781 * calls. 4782 */ 4783 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4784 txr = &sc->hn_tx_ring[i]; 4785 4786 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 4787 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 4788 } 4789 } 4790 4791 static void 4792 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 4793 { 4794 4795 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 4796 } 4797 4798 static void 4799 hn_suspend_mgmt(struct hn_softc *sc) 4800 { 4801 struct task task; 4802 4803 HN_LOCK_ASSERT(sc); 4804 4805 /* 4806 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 4807 * through hn_mgmt_taskq. 4808 */ 4809 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 4810 vmbus_chan_run_task(sc->hn_prichan, &task); 4811 4812 /* 4813 * Make sure that all pending management tasks are completed. 4814 */ 4815 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 4816 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 4817 taskqueue_drain_all(sc->hn_mgmt_taskq0); 4818 } 4819 4820 static void 4821 hn_suspend(struct hn_softc *sc) 4822 { 4823 4824 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4825 hn_suspend_data(sc); 4826 hn_suspend_mgmt(sc); 4827 } 4828 4829 static void 4830 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 4831 { 4832 int i; 4833 4834 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 4835 ("invalid TX ring count %d", tx_ring_cnt)); 4836 4837 for (i = 0; i < tx_ring_cnt; ++i) { 4838 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4839 4840 mtx_lock(&txr->hn_tx_lock); 4841 txr->hn_suspended = 0; 4842 mtx_unlock(&txr->hn_tx_lock); 4843 } 4844 } 4845 4846 static void 4847 hn_resume_data(struct hn_softc *sc) 4848 { 4849 int i; 4850 4851 HN_LOCK_ASSERT(sc); 4852 4853 /* 4854 * Re-enable RX. 4855 */ 4856 hn_set_rxfilter(sc); 4857 4858 /* 4859 * Make sure to clear suspend status on "all" TX rings, 4860 * since hn_tx_ring_inuse can be changed after 4861 * hn_suspend_data(). 4862 */ 4863 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 4864 4865 #ifdef HN_IFSTART_SUPPORT 4866 if (!hn_use_if_start) 4867 #endif 4868 { 4869 /* 4870 * Flush unused drbrs, since hn_tx_ring_inuse may be 4871 * reduced. 4872 */ 4873 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 4874 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4875 } 4876 4877 /* 4878 * Kick start TX. 
4879 */ 4880 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4881 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4882 4883 /* 4884 * Use txeof task, so that any pending oactive can be 4885 * cleared properly. 4886 */ 4887 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4888 } 4889 } 4890 4891 static void 4892 hn_resume_mgmt(struct hn_softc *sc) 4893 { 4894 4895 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 4896 4897 /* 4898 * Kick off network change detection, if it was pending. 4899 * If no network change was pending, start link status 4900 * checks, which is more lightweight than network change 4901 * detection. 4902 */ 4903 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 4904 hn_change_network(sc); 4905 else 4906 hn_update_link_status(sc); 4907 } 4908 4909 static void 4910 hn_resume(struct hn_softc *sc) 4911 { 4912 4913 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4914 hn_resume_data(sc); 4915 hn_resume_mgmt(sc); 4916 } 4917 4918 static void 4919 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 4920 { 4921 const struct rndis_status_msg *msg; 4922 int ofs; 4923 4924 if (dlen < sizeof(*msg)) { 4925 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 4926 return; 4927 } 4928 msg = data; 4929 4930 switch (msg->rm_status) { 4931 case RNDIS_STATUS_MEDIA_CONNECT: 4932 case RNDIS_STATUS_MEDIA_DISCONNECT: 4933 hn_update_link_status(sc); 4934 break; 4935 4936 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 4937 /* Not really useful; ignore. */ 4938 break; 4939 4940 case RNDIS_STATUS_NETWORK_CHANGE: 4941 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 4942 if (dlen < ofs + msg->rm_stbuflen || 4943 msg->rm_stbuflen < sizeof(uint32_t)) { 4944 if_printf(sc->hn_ifp, "network changed\n"); 4945 } else { 4946 uint32_t change; 4947 4948 memcpy(&change, ((const uint8_t *)msg) + ofs, 4949 sizeof(change)); 4950 if_printf(sc->hn_ifp, "network changed, change %u\n", 4951 change); 4952 } 4953 hn_change_network(sc); 4954 break; 4955 4956 default: 4957 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 4958 msg->rm_status); 4959 break; 4960 } 4961 } 4962 4963 static int 4964 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 4965 { 4966 const struct rndis_pktinfo *pi = info_data; 4967 uint32_t mask = 0; 4968 4969 while (info_dlen != 0) { 4970 const void *data; 4971 uint32_t dlen; 4972 4973 if (__predict_false(info_dlen < sizeof(*pi))) 4974 return (EINVAL); 4975 if (__predict_false(info_dlen < pi->rm_size)) 4976 return (EINVAL); 4977 info_dlen -= pi->rm_size; 4978 4979 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 4980 return (EINVAL); 4981 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 4982 return (EINVAL); 4983 dlen = pi->rm_size - pi->rm_pktinfooffset; 4984 data = pi->rm_data; 4985 4986 switch (pi->rm_type) { 4987 case NDIS_PKTINFO_TYPE_VLAN: 4988 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 4989 return (EINVAL); 4990 info->vlan_info = *((const uint32_t *)data); 4991 mask |= HN_RXINFO_VLAN; 4992 break; 4993 4994 case NDIS_PKTINFO_TYPE_CSUM: 4995 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 4996 return (EINVAL); 4997 info->csum_info = *((const uint32_t *)data); 4998 mask |= HN_RXINFO_CSUM; 4999 break; 5000 5001 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5002 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5003 return (EINVAL); 5004 info->hash_value = *((const uint32_t *)data); 5005 mask |= HN_RXINFO_HASHVAL; 5006 break; 5007 5008 case HN_NDIS_PKTINFO_TYPE_HASHINF: 5009 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 5010 
return (EINVAL); 5011 info->hash_info = *((const uint32_t *)data); 5012 mask |= HN_RXINFO_HASHINF; 5013 break; 5014 5015 default: 5016 goto next; 5017 } 5018 5019 if (mask == HN_RXINFO_ALL) { 5020 /* All found; done */ 5021 break; 5022 } 5023 next: 5024 pi = (const struct rndis_pktinfo *) 5025 ((const uint8_t *)pi + pi->rm_size); 5026 } 5027 5028 /* 5029 * Final fixup. 5030 * - If there is no hash value, invalidate the hash info. 5031 */ 5032 if ((mask & HN_RXINFO_HASHVAL) == 0) 5033 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 5034 return (0); 5035 } 5036 5037 static __inline bool 5038 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 5039 { 5040 5041 if (off < check_off) { 5042 if (__predict_true(off + len <= check_off)) 5043 return (false); 5044 } else if (off > check_off) { 5045 if (__predict_true(check_off + check_len <= off)) 5046 return (false); 5047 } 5048 return (true); 5049 } 5050 5051 static void 5052 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5053 { 5054 const struct rndis_packet_msg *pkt; 5055 struct hn_rxinfo info; 5056 int data_off, pktinfo_off, data_len, pktinfo_len; 5057 5058 /* 5059 * Check length. 5060 */ 5061 if (__predict_false(dlen < sizeof(*pkt))) { 5062 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 5063 return; 5064 } 5065 pkt = data; 5066 5067 if (__predict_false(dlen < pkt->rm_len)) { 5068 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 5069 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 5070 return; 5071 } 5072 if (__predict_false(pkt->rm_len < 5073 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 5074 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 5075 "msglen %u, data %u, oob %u, pktinfo %u\n", 5076 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 5077 pkt->rm_pktinfolen); 5078 return; 5079 } 5080 if (__predict_false(pkt->rm_datalen == 0)) { 5081 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 5082 return; 5083 } 5084 5085 /* 5086 * Check offests. 5087 */ 5088 #define IS_OFFSET_INVALID(ofs) \ 5089 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 5090 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 5091 5092 /* XXX Hyper-V does not meet data offset alignment requirement */ 5093 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 5094 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5095 "data offset %u\n", pkt->rm_dataoffset); 5096 return; 5097 } 5098 if (__predict_false(pkt->rm_oobdataoffset > 0 && 5099 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 5100 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5101 "oob offset %u\n", pkt->rm_oobdataoffset); 5102 return; 5103 } 5104 if (__predict_true(pkt->rm_pktinfooffset > 0) && 5105 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 5106 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5107 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 5108 return; 5109 } 5110 5111 #undef IS_OFFSET_INVALID 5112 5113 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 5114 data_len = pkt->rm_datalen; 5115 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 5116 pktinfo_len = pkt->rm_pktinfolen; 5117 5118 /* 5119 * Check OOB coverage. 
5120 */ 5121 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5122 int oob_off, oob_len; 5123 5124 if_printf(rxr->hn_ifp, "got oobdata\n"); 5125 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5126 oob_len = pkt->rm_oobdatalen; 5127 5128 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5129 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5130 "oob overflow, msglen %u, oob abs %d len %d\n", 5131 pkt->rm_len, oob_off, oob_len); 5132 return; 5133 } 5134 5135 /* 5136 * Check against data. 5137 */ 5138 if (hn_rndis_check_overlap(oob_off, oob_len, 5139 data_off, data_len)) { 5140 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5141 "oob overlaps data, oob abs %d len %d, " 5142 "data abs %d len %d\n", 5143 oob_off, oob_len, data_off, data_len); 5144 return; 5145 } 5146 5147 /* 5148 * Check against pktinfo. 5149 */ 5150 if (pktinfo_len != 0 && 5151 hn_rndis_check_overlap(oob_off, oob_len, 5152 pktinfo_off, pktinfo_len)) { 5153 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5154 "oob overlaps pktinfo, oob abs %d len %d, " 5155 "pktinfo abs %d len %d\n", 5156 oob_off, oob_len, pktinfo_off, pktinfo_len); 5157 return; 5158 } 5159 } 5160 5161 /* 5162 * Check per-packet-info coverage and find useful per-packet-info. 5163 */ 5164 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5165 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5166 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5167 if (__predict_true(pktinfo_len != 0)) { 5168 bool overlap; 5169 int error; 5170 5171 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5172 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5173 "pktinfo overflow, msglen %u, " 5174 "pktinfo abs %d len %d\n", 5175 pkt->rm_len, pktinfo_off, pktinfo_len); 5176 return; 5177 } 5178 5179 /* 5180 * Check packet info coverage. 5181 */ 5182 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5183 data_off, data_len); 5184 if (__predict_false(overlap)) { 5185 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5186 "pktinfo overlap data, pktinfo abs %d len %d, " 5187 "data abs %d len %d\n", 5188 pktinfo_off, pktinfo_len, data_off, data_len); 5189 return; 5190 } 5191 5192 /* 5193 * Find useful per-packet-info. 5194 */ 5195 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5196 pktinfo_len, &info); 5197 if (__predict_false(error)) { 5198 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5199 "pktinfo\n"); 5200 return; 5201 } 5202 } 5203 5204 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5205 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5206 "data overflow, msglen %u, data abs %d len %d\n", 5207 pkt->rm_len, data_off, data_len); 5208 return; 5209 } 5210 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5211 } 5212 5213 static __inline void 5214 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5215 { 5216 const struct rndis_msghdr *hdr; 5217 5218 if (__predict_false(dlen < sizeof(*hdr))) { 5219 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5220 return; 5221 } 5222 hdr = data; 5223 5224 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5225 /* Hot data path. */ 5226 hn_rndis_rx_data(rxr, data, dlen); 5227 /* Done! 
*/ 5228 return; 5229 } 5230 5231 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5232 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5233 else 5234 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5235 } 5236 5237 static void 5238 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5239 { 5240 const struct hn_nvs_hdr *hdr; 5241 5242 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5243 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5244 return; 5245 } 5246 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5247 5248 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5249 /* Useless; ignore */ 5250 return; 5251 } 5252 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5253 } 5254 5255 static void 5256 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5257 const struct vmbus_chanpkt_hdr *pkt) 5258 { 5259 struct hn_nvs_sendctx *sndc; 5260 5261 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5262 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5263 VMBUS_CHANPKT_DATALEN(pkt)); 5264 /* 5265 * NOTE: 5266 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5267 * its callback. 5268 */ 5269 } 5270 5271 static void 5272 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5273 const struct vmbus_chanpkt_hdr *pkthdr) 5274 { 5275 const struct vmbus_chanpkt_rxbuf *pkt; 5276 const struct hn_nvs_hdr *nvs_hdr; 5277 int count, i, hlen; 5278 5279 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5280 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5281 return; 5282 } 5283 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5284 5285 /* Make sure that this is a RNDIS message. */ 5286 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5287 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5288 nvs_hdr->nvs_type); 5289 return; 5290 } 5291 5292 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5293 if (__predict_false(hlen < sizeof(*pkt))) { 5294 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5295 return; 5296 } 5297 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5298 5299 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5300 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5301 pkt->cp_rxbuf_id); 5302 return; 5303 } 5304 5305 count = pkt->cp_rxbuf_cnt; 5306 if (__predict_false(hlen < 5307 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5308 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5309 return; 5310 } 5311 5312 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5313 for (i = 0; i < count; ++i) { 5314 int ofs, len; 5315 5316 ofs = pkt->cp_rxbuf[i].rb_ofs; 5317 len = pkt->cp_rxbuf[i].rb_len; 5318 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5319 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5320 "ofs %d, len %d\n", i, ofs, len); 5321 continue; 5322 } 5323 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5324 } 5325 5326 /* 5327 * Ack the consumed RXBUF associated w/ this channel packet, 5328 * so that this RXBUF can be recycled by the hypervisor. 
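* The ack is sent as a completion packet carrying the transaction id
* of the originating RXBUF packet; see hn_nvs_ack_rxbuf() below.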
5329 */ 5330 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5331 } 5332 5333 static void 5334 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5335 uint64_t tid) 5336 { 5337 struct hn_nvs_rndis_ack ack; 5338 int retries, error; 5339 5340 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5341 ack.nvs_status = HN_NVS_STATUS_OK; 5342 5343 retries = 0; 5344 again: 5345 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5346 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5347 if (__predict_false(error == EAGAIN)) { 5348 /* 5349 * NOTE: 5350 * This should _not_ happen in real world, since the 5351 * consumption of the TX bufring from the TX path is 5352 * controlled. 5353 */ 5354 if (rxr->hn_ack_failed == 0) 5355 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5356 rxr->hn_ack_failed++; 5357 retries++; 5358 if (retries < 10) { 5359 DELAY(100); 5360 goto again; 5361 } 5362 /* RXBUF leaks! */ 5363 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5364 } 5365 } 5366 5367 static void 5368 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5369 { 5370 struct hn_rx_ring *rxr = xrxr; 5371 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5372 5373 for (;;) { 5374 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5375 int error, pktlen; 5376 5377 pktlen = rxr->hn_pktbuf_len; 5378 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5379 if (__predict_false(error == ENOBUFS)) { 5380 void *nbuf; 5381 int nlen; 5382 5383 /* 5384 * Expand channel packet buffer. 5385 * 5386 * XXX 5387 * Use M_WAITOK here, since allocation failure 5388 * is fatal. 5389 */ 5390 nlen = rxr->hn_pktbuf_len * 2; 5391 while (nlen < pktlen) 5392 nlen *= 2; 5393 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5394 5395 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5396 rxr->hn_pktbuf_len, nlen); 5397 5398 free(rxr->hn_pktbuf, M_DEVBUF); 5399 rxr->hn_pktbuf = nbuf; 5400 rxr->hn_pktbuf_len = nlen; 5401 /* Retry! */ 5402 continue; 5403 } else if (__predict_false(error == EAGAIN)) { 5404 /* No more channel packets; done! */ 5405 break; 5406 } 5407 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5408 5409 switch (pkt->cph_type) { 5410 case VMBUS_CHANPKT_TYPE_COMP: 5411 hn_nvs_handle_comp(sc, chan, pkt); 5412 break; 5413 5414 case VMBUS_CHANPKT_TYPE_RXBUF: 5415 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5416 break; 5417 5418 case VMBUS_CHANPKT_TYPE_INBAND: 5419 hn_nvs_handle_notify(sc, pkt); 5420 break; 5421 5422 default: 5423 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5424 pkt->cph_type); 5425 break; 5426 } 5427 } 5428 hn_chan_rollup(rxr, rxr->hn_txr); 5429 } 5430 5431 static void 5432 hn_tx_taskq_create(void *arg __unused) 5433 { 5434 int i; 5435 5436 /* 5437 * Fix the # of TX taskqueues. 5438 */ 5439 if (hn_tx_taskq_cnt <= 0) 5440 hn_tx_taskq_cnt = 1; 5441 else if (hn_tx_taskq_cnt > mp_ncpus) 5442 hn_tx_taskq_cnt = mp_ncpus; 5443 5444 /* 5445 * Fix the TX taskqueue mode. 
5446 */ 5447 switch (hn_tx_taskq_mode) { 5448 case HN_TX_TASKQ_M_INDEP: 5449 case HN_TX_TASKQ_M_GLOBAL: 5450 case HN_TX_TASKQ_M_EVTTQ: 5451 break; 5452 default: 5453 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5454 break; 5455 } 5456 5457 if (vm_guest != VM_GUEST_HV) 5458 return; 5459 5460 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5461 return; 5462 5463 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5464 M_DEVBUF, M_WAITOK); 5465 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5466 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5467 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5468 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5469 "hn tx%d", i); 5470 } 5471 } 5472 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5473 hn_tx_taskq_create, NULL); 5474 5475 static void 5476 hn_tx_taskq_destroy(void *arg __unused) 5477 { 5478 5479 if (hn_tx_taskque != NULL) { 5480 int i; 5481 5482 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5483 taskqueue_free(hn_tx_taskque[i]); 5484 free(hn_tx_taskque, M_DEVBUF); 5485 } 5486 } 5487 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5488 hn_tx_taskq_destroy, NULL); 5489