1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 #include "opt_hn.h" 57 #include "opt_inet6.h" 58 #include "opt_inet.h" 59 #include "opt_rss.h" 60 61 #include <sys/param.h> 62 #include <sys/systm.h> 63 #include <sys/bus.h> 64 #include <sys/counter.h> 65 #include <sys/kernel.h> 66 #include <sys/limits.h> 67 #include <sys/malloc.h> 68 #include <sys/mbuf.h> 69 #include <sys/module.h> 70 #include <sys/queue.h> 71 #include <sys/lock.h> 72 #include <sys/proc.h> 73 #include <sys/rmlock.h> 74 #include <sys/sbuf.h> 75 #include <sys/sched.h> 76 #include <sys/smp.h> 77 #include <sys/socket.h> 78 #include <sys/sockio.h> 79 #include <sys/sx.h> 80 #include <sys/sysctl.h> 81 #include <sys/taskqueue.h> 82 #include <sys/buf_ring.h> 83 #include <sys/eventhandler.h> 84 #include <sys/epoch.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_extern.h> 88 #include <vm/pmap.h> 89 90 #include <machine/atomic.h> 91 #include <machine/in_cksum.h> 92 93 #include <net/bpf.h> 94 #include <net/ethernet.h> 95 #include <net/if.h> 96 #include <net/if_dl.h> 97 #include <net/if_media.h> 98 #include <net/if_types.h> 99 #include <net/if_var.h> 100 #include <net/rndis.h> 101 #ifdef RSS 102 #include <net/rss_config.h> 103 #endif 104 105 #include <netinet/in_systm.h> 106 #include <netinet/in.h> 107 #include <netinet/ip.h> 108 #include <netinet/ip6.h> 109 #include <netinet/tcp.h> 110 #include <netinet/tcp_lro.h> 111 #include <netinet/udp.h> 112 113 #include <dev/hyperv/include/hyperv.h> 114 #include <dev/hyperv/include/hyperv_busdma.h> 115 #include <dev/hyperv/include/vmbus.h> 116 #include <dev/hyperv/include/vmbus_xact.h> 117 118 #include <dev/hyperv/netvsc/ndis.h> 119 #include <dev/hyperv/netvsc/if_hnreg.h> 120 #include <dev/hyperv/netvsc/if_hnvar.h> 121 #include <dev/hyperv/netvsc/hn_nvs.h> 122 #include <dev/hyperv/netvsc/hn_rndis.h> 123 124 #include "vmbus_if.h" 125 126 #define HN_IFSTART_SUPPORT 127 128 #define HN_RING_CNT_DEF_MAX 8 129 130 #define HN_VFMAP_SIZE_DEF 8 131 132 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 133 134 /* YYY should get it from the underlying channel */ 135 #define HN_TX_DESC_CNT 512 136 137 #define HN_RNDIS_PKT_LEN \ 138 (sizeof(struct rndis_packet_msg) + \ 139 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 141 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 142 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 143 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 144 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 145 146 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 147 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 148 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 149 /* -1 for RNDIS packet message */ 150 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 151 152 #define HN_DIRECT_TX_SIZE_DEF 128 153 154 #define HN_EARLY_TXEOF_THRESH 8 155 156 #define HN_PKTBUF_LEN_DEF (16 * 1024) 157 158 #define HN_LROENT_CNT_DEF 128 159 160 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 161 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 162 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 163 #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) 164 165 #define HN_LRO_ACKCNT_DEF 1 166 167 #define HN_LOCK_INIT(sc) \ 168 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 169 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 170 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 171 #define HN_LOCK(sc) \ 172 do { \ 173 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 174 /* Relinquish cpu to avoid deadlock */ \ 175 sched_relinquish(curthread); \ 176 DELAY(1000); \ 177 } \ 178 } while (0) 179 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 180 181 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 182 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 183 #define HN_CSUM_IP_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 185 #define HN_CSUM_IP6_HWASSIST(sc) \ 186 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 187 188 #define HN_PKTSIZE_MIN(align) \ 189 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 190 HN_RNDIS_PKT_LEN, (align)) 191 #define HN_PKTSIZE(m, align) \ 192 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 193 194 #ifdef RSS 195 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 196 #else 197 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 198 #endif 199 200 struct hn_txdesc { 201 #ifndef HN_USE_TXDESC_BUFRING 202 SLIST_ENTRY(hn_txdesc) link; 203 #endif 204 STAILQ_ENTRY(hn_txdesc) agg_link; 205 206 /* Aggregated txdescs, in sending order. */ 207 STAILQ_HEAD(, hn_txdesc) agg_list; 208 209 /* The oldest packet, if transmission aggregation happens. */ 210 struct mbuf *m; 211 struct hn_tx_ring *txr; 212 int refs; 213 uint32_t flags; /* HN_TXD_FLAG_ */ 214 struct hn_nvs_sendctx send_ctx; 215 uint32_t chim_index; 216 int chim_size; 217 218 bus_dmamap_t data_dmap; 219 220 bus_addr_t rndis_pkt_paddr; 221 struct rndis_packet_msg *rndis_pkt; 222 bus_dmamap_t rndis_pkt_dmap; 223 }; 224 225 #define HN_TXD_FLAG_ONLIST 0x0001 226 #define HN_TXD_FLAG_DMAMAP 0x0002 227 #define HN_TXD_FLAG_ONAGG 0x0004 228 229 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 230 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 231 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 232 233 struct packet_info_id { 234 uint8_t ver; 235 uint8_t flag; 236 uint16_t pkt_id; 237 }; 238 239 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 240 241 242 struct hn_rxinfo { 243 const uint32_t *vlan_info; 244 const uint32_t *csum_info; 245 const uint32_t *hash_info; 246 const uint32_t *hash_value; 247 const struct packet_info_id *pktinfo_id; 248 }; 249 250 struct hn_rxvf_setarg { 251 struct hn_rx_ring *rxr; 252 if_t vf_ifp; 253 }; 254 255 #define HN_RXINFO_VLAN 0x0001 256 #define HN_RXINFO_CSUM 0x0002 257 #define HN_RXINFO_HASHINF 0x0004 258 #define HN_RXINFO_HASHVAL 0x0008 259 #define HN_RXINFO_PKTINFO_ID 0x0010 260 #define HN_RXINFO_ALL \ 261 (HN_RXINFO_VLAN | \ 262 HN_RXINFO_CSUM | \ 263 HN_RXINFO_HASHINF | \ 264 HN_RXINFO_HASHVAL | \ 265 HN_RXINFO_PKTINFO_ID) 266 267 static int hn_probe(device_t); 268 static int hn_attach(device_t); 269 static int hn_detach(device_t); 270 static int hn_shutdown(device_t); 271 static void hn_chan_callback(struct vmbus_channel *, 272 void *); 273 274 static void hn_init(void *); 275 static int hn_ioctl(if_t, u_long, caddr_t); 276 #ifdef HN_IFSTART_SUPPORT 277 static void hn_start(if_t); 278 #endif 279 static int hn_transmit(if_t, struct mbuf *); 280 static void hn_xmit_qflush(if_t); 281 static int hn_ifmedia_upd(if_t); 282 static void hn_ifmedia_sts(if_t, 283 struct 
ifmediareq *); 284 285 static void hn_ifnet_event(void *, if_t, int); 286 static void hn_ifaddr_event(void *, if_t); 287 static void hn_ifnet_attevent(void *, if_t); 288 static void hn_ifnet_detevent(void *, if_t); 289 static void hn_ifnet_lnkevent(void *, if_t, int); 290 291 static bool hn_ismyvf(const struct hn_softc *, 292 const if_t); 293 static void hn_rxvf_change(struct hn_softc *, 294 if_t, bool); 295 static void hn_rxvf_set(struct hn_softc *, if_t); 296 static void hn_rxvf_set_task(void *, int); 297 static void hn_xpnt_vf_input(if_t, struct mbuf *); 298 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 299 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 300 struct ifreq *); 301 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 302 static bool hn_xpnt_vf_isready(struct hn_softc *); 303 static void hn_xpnt_vf_setready(struct hn_softc *); 304 static void hn_xpnt_vf_init_taskfunc(void *, int); 305 static void hn_xpnt_vf_init(struct hn_softc *); 306 static void hn_xpnt_vf_setenable(struct hn_softc *); 307 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 308 static void hn_vf_rss_fixup(struct hn_softc *, bool); 309 static void hn_vf_rss_restore(struct hn_softc *); 310 311 static int hn_rndis_rxinfo(const void *, int, 312 struct hn_rxinfo *); 313 static void hn_rndis_rx_data(struct hn_rx_ring *, 314 const void *, int); 315 static void hn_rndis_rx_status(struct hn_softc *, 316 const void *, int); 317 static void hn_rndis_init_fixat(struct hn_softc *, int); 318 319 static void hn_nvs_handle_notify(struct hn_softc *, 320 const struct vmbus_chanpkt_hdr *); 321 static void hn_nvs_handle_comp(struct hn_softc *, 322 struct vmbus_channel *, 323 const struct vmbus_chanpkt_hdr *); 324 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 325 struct vmbus_channel *, 326 const struct vmbus_chanpkt_hdr *); 327 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 328 struct vmbus_channel *, uint64_t); 329 330 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 331 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 342 #ifndef RSS 343 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 345 #endif 346 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 349 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int 
hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 360 361 static void hn_stop(struct hn_softc *, bool); 362 static void hn_init_locked(struct hn_softc *); 363 static int hn_chan_attach(struct hn_softc *, 364 struct vmbus_channel *); 365 static void hn_chan_detach(struct hn_softc *, 366 struct vmbus_channel *); 367 static int hn_attach_subchans(struct hn_softc *); 368 static void hn_detach_allchans(struct hn_softc *); 369 static void hn_chan_rollup(struct hn_rx_ring *, 370 struct hn_tx_ring *); 371 static void hn_set_ring_inuse(struct hn_softc *, int); 372 static int hn_synth_attach(struct hn_softc *, int); 373 static void hn_synth_detach(struct hn_softc *); 374 static int hn_synth_alloc_subchans(struct hn_softc *, 375 int *); 376 static bool hn_synth_attachable(const struct hn_softc *); 377 static void hn_suspend(struct hn_softc *); 378 static void hn_suspend_data(struct hn_softc *); 379 static void hn_suspend_mgmt(struct hn_softc *); 380 static void hn_resume(struct hn_softc *); 381 static void hn_resume_data(struct hn_softc *); 382 static void hn_resume_mgmt(struct hn_softc *); 383 static void hn_suspend_mgmt_taskfunc(void *, int); 384 static void hn_chan_drain(struct hn_softc *, 385 struct vmbus_channel *); 386 static void hn_disable_rx(struct hn_softc *); 387 static void hn_drain_rxtx(struct hn_softc *, int); 388 static void hn_polling(struct hn_softc *, u_int); 389 static void hn_chan_polling(struct vmbus_channel *, u_int); 390 static void hn_mtu_change_fixup(struct hn_softc *); 391 392 static void hn_update_link_status(struct hn_softc *); 393 static void hn_change_network(struct hn_softc *); 394 static void hn_link_taskfunc(void *, int); 395 static void hn_netchg_init_taskfunc(void *, int); 396 static void hn_netchg_status_taskfunc(void *, int); 397 static void hn_link_status(struct hn_softc *); 398 399 static int hn_create_rx_data(struct hn_softc *, int); 400 static void hn_destroy_rx_data(struct hn_softc *); 401 static int hn_check_iplen(const struct mbuf *, int); 402 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 403 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 404 static int hn_rxfilter_config(struct hn_softc *); 405 static int hn_rss_reconfig(struct hn_softc *); 406 static void hn_rss_ind_fixup(struct hn_softc *); 407 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 408 static int hn_rxpkt(struct hn_rx_ring *); 409 static uint32_t hn_rss_type_fromndis(uint32_t); 410 static uint32_t hn_rss_type_tondis(uint32_t); 411 412 static int hn_tx_ring_create(struct hn_softc *, int); 413 static void hn_tx_ring_destroy(struct hn_tx_ring *); 414 static int hn_create_tx_data(struct hn_softc *, int); 415 static void hn_fixup_tx_data(struct hn_softc *); 416 static void hn_fixup_rx_data(struct hn_softc *); 417 static void hn_destroy_tx_data(struct hn_softc *); 418 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 419 static void hn_txdesc_gc(struct hn_tx_ring *, 420 struct hn_txdesc *); 421 static int hn_encap(if_t, struct hn_tx_ring *, 422 struct hn_txdesc *, struct mbuf **); 423 static int hn_txpkt(if_t, struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static void hn_set_chim_size(struct hn_softc *, int); 426 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 427 static bool hn_tx_ring_pending(struct hn_tx_ring *); 428 static void hn_tx_ring_qflush(struct hn_tx_ring *); 429 static void hn_resume_tx(struct hn_softc *, int); 430 static void hn_set_txagg(struct hn_softc *); 431 static void *hn_try_txagg(if_t, 432 struct hn_tx_ring *, struct 
hn_txdesc *, 433 int); 434 static int hn_get_txswq_depth(const struct hn_tx_ring *); 435 static void hn_txpkt_done(struct hn_nvs_sendctx *, 436 struct hn_softc *, struct vmbus_channel *, 437 const void *, int); 438 static int hn_txpkt_sglist(struct hn_tx_ring *, 439 struct hn_txdesc *); 440 static int hn_txpkt_chim(struct hn_tx_ring *, 441 struct hn_txdesc *); 442 static int hn_xmit(struct hn_tx_ring *, int); 443 static void hn_xmit_taskfunc(void *, int); 444 static void hn_xmit_txeof(struct hn_tx_ring *); 445 static void hn_xmit_txeof_taskfunc(void *, int); 446 #ifdef HN_IFSTART_SUPPORT 447 static int hn_start_locked(struct hn_tx_ring *, int); 448 static void hn_start_taskfunc(void *, int); 449 static void hn_start_txeof(struct hn_tx_ring *); 450 static void hn_start_txeof_taskfunc(void *, int); 451 #endif 452 453 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 454 455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 456 "Hyper-V network interface"); 457 458 /* Trust tcp segment verification on host side. */ 459 static int hn_trust_hosttcp = 1; 460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 461 &hn_trust_hosttcp, 0, 462 "Trust tcp segment verification on host side, " 463 "when csum info is missing (global setting)"); 464 465 /* Trust udp datagrams verification on host side. */ 466 static int hn_trust_hostudp = 1; 467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 468 &hn_trust_hostudp, 0, 469 "Trust udp datagram verification on host side, " 470 "when csum info is missing (global setting)"); 471 472 /* Trust ip packets verification on host side. */ 473 static int hn_trust_hostip = 1; 474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 475 &hn_trust_hostip, 0, 476 "Trust ip packet verification on host side, " 477 "when csum info is missing (global setting)"); 478 479 /* 480 * Offload UDP/IPv4 checksum. 481 */ 482 static int hn_enable_udp4cs = 1; 483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 484 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 485 486 /* 487 * Offload UDP/IPv6 checksum. 488 */ 489 static int hn_enable_udp6cs = 1; 490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 491 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 492 493 /* Stats. */ 494 static counter_u64_t hn_udpcs_fixup; 495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 496 &hn_udpcs_fixup, "# of UDP checksum fixup"); 497 498 /* 499 * See hn_set_hlen(). 500 * 501 * This value is for Azure. For Hyper-V, set this above 502 * 65536 to disable UDP datagram checksum fixup. 
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static if_t *hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;
709 710 --idx; /* ffsl is 1-based */ 711 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 712 ("invalid i %d and idx %d", i, idx)); 713 714 if (atomic_testandset_long(&bmap[i], idx)) 715 continue; 716 717 ret = i * LONG_BIT + idx; 718 break; 719 } 720 return (ret); 721 } 722 723 static __inline void 724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 725 { 726 u_long mask; 727 uint32_t idx; 728 729 idx = chim_idx / LONG_BIT; 730 KASSERT(idx < sc->hn_chim_bmap_cnt, 731 ("invalid chimney index 0x%x", chim_idx)); 732 733 mask = 1UL << (chim_idx % LONG_BIT); 734 KASSERT(sc->hn_chim_bmap[idx] & mask, 735 ("index bitmap 0x%lx, chimney index %u, " 736 "bitmap idx %d, bitmask 0x%lx", 737 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 738 739 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 740 } 741 742 #if defined(INET6) || defined(INET) 743 744 #define PULLUP_HDR(m, len) \ 745 do { \ 746 if (__predict_false((m)->m_len < (len))) { \ 747 (m) = m_pullup((m), (len)); \ 748 if ((m) == NULL) \ 749 return (NULL); \ 750 } \ 751 } while (0) 752 753 /* 754 * NOTE: If this function failed, the m_head would be freed. 755 */ 756 static __inline struct mbuf * 757 hn_tso_fixup(struct mbuf *m_head) 758 { 759 struct ether_vlan_header *evl; 760 struct tcphdr *th; 761 int ehlen; 762 763 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 764 765 PULLUP_HDR(m_head, sizeof(*evl)); 766 evl = mtod(m_head, struct ether_vlan_header *); 767 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 768 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 769 else 770 ehlen = ETHER_HDR_LEN; 771 m_head->m_pkthdr.l2hlen = ehlen; 772 773 #ifdef INET 774 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 775 struct ip *ip; 776 int iphlen; 777 778 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 779 ip = mtodo(m_head, ehlen); 780 iphlen = ip->ip_hl << 2; 781 m_head->m_pkthdr.l3hlen = iphlen; 782 783 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 784 th = mtodo(m_head, ehlen + iphlen); 785 786 ip->ip_len = 0; 787 ip->ip_sum = 0; 788 th->th_sum = in_pseudo(ip->ip_src.s_addr, 789 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 790 } 791 #endif 792 #if defined(INET6) && defined(INET) 793 else 794 #endif 795 #ifdef INET6 796 { 797 struct ip6_hdr *ip6; 798 799 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 800 ip6 = mtodo(m_head, ehlen); 801 if (ip6->ip6_nxt != IPPROTO_TCP) { 802 m_freem(m_head); 803 return (NULL); 804 } 805 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 806 807 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 808 th = mtodo(m_head, ehlen + sizeof(*ip6)); 809 810 ip6->ip6_plen = 0; 811 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 812 } 813 #endif 814 return (m_head); 815 } 816 817 /* 818 * NOTE: If this function failed, the m_head would be freed. 
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	if_t ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
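	 *
	 * NDIS_PACKET_TYPE_PROMISCUOUS asks the host to deliver all
	 * packets, which is a safe superset of whatever RX filter the
	 * VF is actually using.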
937 */ 938 if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 939 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 940 } else { 941 filter = NDIS_PACKET_TYPE_DIRECTED; 942 if (if_getflags(ifp) & IFF_BROADCAST) 943 filter |= NDIS_PACKET_TYPE_BROADCAST; 944 /* TODO: support multicast list */ 945 if ((if_getflags(ifp) & IFF_ALLMULTI) || 946 !if_maddr_empty(ifp)) 947 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 948 } 949 return (hn_set_rxfilter(sc, filter)); 950 } 951 952 static void 953 hn_set_txagg(struct hn_softc *sc) 954 { 955 uint32_t size, pkts; 956 int i; 957 958 /* 959 * Setup aggregation size. 960 */ 961 if (sc->hn_agg_size < 0) 962 size = UINT32_MAX; 963 else 964 size = sc->hn_agg_size; 965 966 if (sc->hn_rndis_agg_size < size) 967 size = sc->hn_rndis_agg_size; 968 969 /* NOTE: We only aggregate packets using chimney sending buffers. */ 970 if (size > (uint32_t)sc->hn_chim_szmax) 971 size = sc->hn_chim_szmax; 972 973 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 974 /* Disable */ 975 size = 0; 976 pkts = 0; 977 goto done; 978 } 979 980 /* NOTE: Type of the per TX ring setting is 'int'. */ 981 if (size > INT_MAX) 982 size = INT_MAX; 983 984 /* 985 * Setup aggregation packet count. 986 */ 987 if (sc->hn_agg_pkts < 0) 988 pkts = UINT32_MAX; 989 else 990 pkts = sc->hn_agg_pkts; 991 992 if (sc->hn_rndis_agg_pkts < pkts) 993 pkts = sc->hn_rndis_agg_pkts; 994 995 if (pkts <= 1) { 996 /* Disable */ 997 size = 0; 998 pkts = 0; 999 goto done; 1000 } 1001 1002 /* NOTE: Type of the per TX ring setting is 'short'. */ 1003 if (pkts > SHRT_MAX) 1004 pkts = SHRT_MAX; 1005 1006 done: 1007 /* NOTE: Type of the per TX ring setting is 'short'. */ 1008 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1009 /* Disable */ 1010 size = 0; 1011 pkts = 0; 1012 } 1013 1014 if (bootverbose) { 1015 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1016 size, pkts, sc->hn_rndis_agg_align); 1017 } 1018 1019 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1020 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1021 1022 mtx_lock(&txr->hn_tx_lock); 1023 txr->hn_agg_szmax = size; 1024 txr->hn_agg_pktmax = pkts; 1025 txr->hn_agg_align = sc->hn_rndis_agg_align; 1026 mtx_unlock(&txr->hn_tx_lock); 1027 } 1028 } 1029 1030 static int 1031 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1032 { 1033 1034 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1035 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1036 return txr->hn_txdesc_cnt; 1037 return hn_tx_swq_depth; 1038 } 1039 1040 static int 1041 hn_rss_reconfig(struct hn_softc *sc) 1042 { 1043 int error; 1044 1045 HN_LOCK_ASSERT(sc); 1046 1047 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1048 return (ENXIO); 1049 1050 /* 1051 * Disable RSS first. 1052 * 1053 * NOTE: 1054 * Direct reconfiguration by setting the UNCHG flags does 1055 * _not_ work properly. 1056 */ 1057 if (bootverbose) 1058 if_printf(sc->hn_ifp, "disable RSS\n"); 1059 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1060 if (error) { 1061 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1062 return (error); 1063 } 1064 1065 /* 1066 * Reenable the RSS w/ the updated RSS key or indirect 1067 * table. 
1068 */ 1069 if (bootverbose) 1070 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1071 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1072 if (error) { 1073 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1074 return (error); 1075 } 1076 return (0); 1077 } 1078 1079 static void 1080 hn_rss_ind_fixup(struct hn_softc *sc) 1081 { 1082 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1083 int i, nchan; 1084 1085 nchan = sc->hn_rx_ring_inuse; 1086 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1087 1088 /* 1089 * Check indirect table to make sure that all channels in it 1090 * can be used. 1091 */ 1092 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1093 if (rss->rss_ind[i] >= nchan) { 1094 if_printf(sc->hn_ifp, 1095 "RSS indirect table %d fixup: %u -> %d\n", 1096 i, rss->rss_ind[i], nchan - 1); 1097 rss->rss_ind[i] = nchan - 1; 1098 } 1099 } 1100 } 1101 1102 static int 1103 hn_ifmedia_upd(if_t ifp __unused) 1104 { 1105 1106 return EOPNOTSUPP; 1107 } 1108 1109 static void 1110 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) 1111 { 1112 struct hn_softc *sc = if_getsoftc(ifp); 1113 1114 ifmr->ifm_status = IFM_AVALID; 1115 ifmr->ifm_active = IFM_ETHER; 1116 1117 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1118 ifmr->ifm_active |= IFM_NONE; 1119 return; 1120 } 1121 ifmr->ifm_status |= IFM_ACTIVE; 1122 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1123 } 1124 1125 static void 1126 hn_rxvf_set_task(void *xarg, int pending __unused) 1127 { 1128 struct hn_rxvf_setarg *arg = xarg; 1129 1130 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1131 } 1132 1133 static void 1134 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp) 1135 { 1136 struct hn_rx_ring *rxr; 1137 struct hn_rxvf_setarg arg; 1138 struct task task; 1139 int i; 1140 1141 HN_LOCK_ASSERT(sc); 1142 1143 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1144 1145 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1146 rxr = &sc->hn_rx_ring[i]; 1147 1148 if (i < sc->hn_rx_ring_inuse) { 1149 arg.rxr = rxr; 1150 arg.vf_ifp = vf_ifp; 1151 vmbus_chan_run_task(rxr->hn_chan, &task); 1152 } else { 1153 rxr->hn_rxvf_ifp = vf_ifp; 1154 } 1155 } 1156 } 1157 1158 static bool 1159 hn_ismyvf(const struct hn_softc *sc, const if_t ifp) 1160 { 1161 if_t hn_ifp; 1162 1163 hn_ifp = sc->hn_ifp; 1164 1165 if (ifp == hn_ifp) 1166 return (false); 1167 1168 if (if_getalloctype(ifp) != IFT_ETHER) 1169 return (false); 1170 1171 /* Ignore lagg/vlan interfaces */ 1172 if (strcmp(if_getdname(ifp), "lagg") == 0 || 1173 strcmp(if_getdname(ifp), "vlan") == 0) 1174 return (false); 1175 1176 /* 1177 * During detach events if_getifaddr(ifp) might be NULL. 
1178 * Make sure the bcmp() below doesn't panic on that: 1179 */ 1180 if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL) 1181 return (false); 1182 1183 if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0) 1184 return (false); 1185 1186 return (true); 1187 } 1188 1189 static void 1190 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf) 1191 { 1192 if_t hn_ifp; 1193 1194 HN_LOCK(sc); 1195 1196 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1197 goto out; 1198 1199 if (!hn_ismyvf(sc, ifp)) 1200 goto out; 1201 hn_ifp = sc->hn_ifp; 1202 1203 if (rxvf) { 1204 if (sc->hn_flags & HN_FLAG_RXVF) 1205 goto out; 1206 1207 sc->hn_flags |= HN_FLAG_RXVF; 1208 hn_rxfilter_config(sc); 1209 } else { 1210 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1211 goto out; 1212 1213 sc->hn_flags &= ~HN_FLAG_RXVF; 1214 if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING) 1215 hn_rxfilter_config(sc); 1216 else 1217 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1218 } 1219 1220 hn_nvs_set_datapath(sc, 1221 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1222 1223 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1224 1225 if (rxvf) { 1226 hn_vf_rss_fixup(sc, true); 1227 hn_suspend_mgmt(sc); 1228 sc->hn_link_flags &= 1229 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1230 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1231 } else { 1232 hn_vf_rss_restore(sc); 1233 hn_resume_mgmt(sc); 1234 } 1235 1236 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1237 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1238 1239 if (bootverbose) { 1240 if_printf(hn_ifp, "datapath is switched %s %s\n", 1241 rxvf ? "to" : "from", if_name(ifp)); 1242 } 1243 out: 1244 HN_UNLOCK(sc); 1245 } 1246 1247 static void 1248 hn_ifnet_event(void *arg, if_t ifp, int event) 1249 { 1250 1251 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1252 return; 1253 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1254 } 1255 1256 static void 1257 hn_ifaddr_event(void *arg, if_t ifp) 1258 { 1259 1260 hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP); 1261 } 1262 1263 static int 1264 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1265 { 1266 if_t ifp, vf_ifp; 1267 uint64_t tmp; 1268 int error; 1269 1270 HN_LOCK_ASSERT(sc); 1271 ifp = sc->hn_ifp; 1272 vf_ifp = sc->hn_vf_ifp; 1273 1274 /* 1275 * Fix up requested capabilities w/ supported capabilities, 1276 * since the supported capabilities could have been changed. 1277 */ 1278 ifr->ifr_reqcap &= if_getcapabilities(ifp); 1279 /* Pass SIOCSIFCAP to VF. */ 1280 error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread); 1281 1282 /* 1283 * NOTE: 1284 * The error will be propagated to the callers, however, it 1285 * is _not_ useful here. 1286 */ 1287 1288 /* 1289 * Merge VF's enabled capabilities. 
 */
	if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp));

	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc);
	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc);
	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO;
	if (if_getcapenable(ifp) & IFCAP_TSO4)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO;
	if (if_getcapenable(ifp) & IFCAP_TSO6)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	if_t vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	if_t ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!if_maddr_empty(ifp))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
}

static void
hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	if_t hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (if_getindex(vf_ifp) < hn_vfmap_size)
		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check. As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF an effort in
			 * vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
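		 * The VF's ifindex is not (or is no longer) mapped to an
		 * hn(4) interface, so there is nowhere to deliver these
		 * packets.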
1404 */ 1405 while (m != NULL) { 1406 mn = m->m_nextpkt; 1407 m->m_nextpkt = NULL; 1408 m_freem(m); 1409 m = mn; 1410 } 1411 } 1412 } 1413 1414 static void 1415 hn_mtu_change_fixup(struct hn_softc *sc) 1416 { 1417 if_t ifp; 1418 1419 HN_LOCK_ASSERT(sc); 1420 ifp = sc->hn_ifp; 1421 1422 hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp)); 1423 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1424 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1425 } 1426 1427 static uint32_t 1428 hn_rss_type_fromndis(uint32_t rss_hash) 1429 { 1430 uint32_t types = 0; 1431 1432 if (rss_hash & NDIS_HASH_IPV4) 1433 types |= RSS_TYPE_IPV4; 1434 if (rss_hash & NDIS_HASH_TCP_IPV4) 1435 types |= RSS_TYPE_TCP_IPV4; 1436 if (rss_hash & NDIS_HASH_IPV6) 1437 types |= RSS_TYPE_IPV6; 1438 if (rss_hash & NDIS_HASH_IPV6_EX) 1439 types |= RSS_TYPE_IPV6_EX; 1440 if (rss_hash & NDIS_HASH_TCP_IPV6) 1441 types |= RSS_TYPE_TCP_IPV6; 1442 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1443 types |= RSS_TYPE_TCP_IPV6_EX; 1444 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1445 types |= RSS_TYPE_UDP_IPV4; 1446 return (types); 1447 } 1448 1449 static uint32_t 1450 hn_rss_type_tondis(uint32_t types) 1451 { 1452 uint32_t rss_hash = 0; 1453 1454 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1455 ("UDP6 and UDP6EX are not supported")); 1456 1457 if (types & RSS_TYPE_IPV4) 1458 rss_hash |= NDIS_HASH_IPV4; 1459 if (types & RSS_TYPE_TCP_IPV4) 1460 rss_hash |= NDIS_HASH_TCP_IPV4; 1461 if (types & RSS_TYPE_IPV6) 1462 rss_hash |= NDIS_HASH_IPV6; 1463 if (types & RSS_TYPE_IPV6_EX) 1464 rss_hash |= NDIS_HASH_IPV6_EX; 1465 if (types & RSS_TYPE_TCP_IPV6) 1466 rss_hash |= NDIS_HASH_TCP_IPV6; 1467 if (types & RSS_TYPE_TCP_IPV6_EX) 1468 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1469 if (types & RSS_TYPE_UDP_IPV4) 1470 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1471 return (rss_hash); 1472 } 1473 1474 static void 1475 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1476 { 1477 int i; 1478 1479 HN_LOCK_ASSERT(sc); 1480 1481 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1482 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1483 } 1484 1485 static void 1486 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1487 { 1488 if_t ifp, vf_ifp; 1489 struct ifrsshash ifrh; 1490 struct ifrsskey ifrk; 1491 int error; 1492 uint32_t my_types, diff_types, mbuf_types = 0; 1493 1494 HN_LOCK_ASSERT(sc); 1495 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1496 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1497 1498 if (sc->hn_rx_ring_inuse == 1) { 1499 /* No RSS on synthetic parts; done. */ 1500 return; 1501 } 1502 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1503 /* Synthetic parts do not support Toeplitz; done. */ 1504 return; 1505 } 1506 1507 ifp = sc->hn_ifp; 1508 vf_ifp = sc->hn_vf_ifp; 1509 1510 /* 1511 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1512 * supported. 
 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    if_name(vf_ifp), error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    if_name(vf_ifp), ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    if_name(vf_ifp), ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash. Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    if_name(vf_ifp), error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    if_name(vf_ifp), ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed. "
		    "VF %#x, mine %#x\n", if_name(vf_ifp),
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4. This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash. However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1605 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1606 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1607 } 1608 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1609 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1610 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1611 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1612 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1613 } 1614 if ((my_types & RSS_TYPE_UDP_IPV6) && 1615 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1616 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1617 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1618 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1619 } 1620 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1621 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1622 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1623 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1624 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1625 } 1626 1627 /* 1628 * Indirect table does not matter. 1629 */ 1630 1631 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1632 hn_rss_type_tondis(my_types); 1633 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1634 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1635 1636 if (reconf) { 1637 error = hn_rss_reconfig(sc); 1638 if (error) { 1639 /* XXX roll-back? */ 1640 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1641 /* XXX keep going. */ 1642 } 1643 } 1644 done: 1645 /* Hash deliverability for mbufs. */ 1646 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1647 } 1648 1649 static void 1650 hn_vf_rss_restore(struct hn_softc *sc) 1651 { 1652 1653 HN_LOCK_ASSERT(sc); 1654 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1655 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1656 1657 if (sc->hn_rx_ring_inuse == 1) 1658 goto done; 1659 1660 /* 1661 * Restore hash types. Key does _not_ matter. 1662 */ 1663 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1664 int error; 1665 1666 sc->hn_rss_hash = sc->hn_rss_hcap; 1667 error = hn_rss_reconfig(sc); 1668 if (error) { 1669 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1670 error); 1671 /* XXX keep going. */ 1672 } 1673 } 1674 done: 1675 /* Hash deliverability for mbufs. */ 1676 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1677 } 1678 1679 static void 1680 hn_xpnt_vf_setready(struct hn_softc *sc) 1681 { 1682 if_t ifp, vf_ifp; 1683 struct ifreq ifr; 1684 1685 HN_LOCK_ASSERT(sc); 1686 ifp = sc->hn_ifp; 1687 vf_ifp = sc->hn_vf_ifp; 1688 1689 /* 1690 * Mark the VF ready. 1691 */ 1692 sc->hn_vf_rdytick = 0; 1693 1694 /* 1695 * Save information for restoration. 1696 */ 1697 sc->hn_saved_caps = if_getcapabilities(ifp); 1698 sc->hn_saved_tsomax = if_gethwtsomax(ifp); 1699 sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); 1700 sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); 1701 1702 /* 1703 * Intersect supported/enabled capabilities. 1704 * 1705 * NOTE: 1706 * if_hwassist is not changed here. 1707 */ 1708 if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); 1709 if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); 1710 1711 /* 1712 * Fix TSO settings. 
1713 */ 1714 if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) 1715 if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); 1716 if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) 1717 if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); 1718 if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) 1719 if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); 1720 1721 /* 1722 * Change VF's enabled capabilities. 1723 */ 1724 memset(&ifr, 0, sizeof(ifr)); 1725 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1726 ifr.ifr_reqcap = if_getcapenable(ifp); 1727 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1728 1729 if (if_getmtu(ifp) != ETHERMTU) { 1730 int error; 1731 1732 /* 1733 * Change VF's MTU. 1734 */ 1735 memset(&ifr, 0, sizeof(ifr)); 1736 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1737 ifr.ifr_mtu = if_getmtu(ifp); 1738 error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); 1739 if (error) { 1740 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1741 if_name(vf_ifp), if_getmtu(ifp)); 1742 if (if_getmtu(ifp) > ETHERMTU) { 1743 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1744 1745 /* 1746 * XXX 1747 * No need to adjust the synthetic parts' MTU; 1748 * failure of the adjustment will cause us 1749 * infinite headache. 1750 */ 1751 if_setmtu(ifp, ETHERMTU); 1752 hn_mtu_change_fixup(sc); 1753 } 1754 } 1755 } 1756 } 1757 1758 static bool 1759 hn_xpnt_vf_isready(struct hn_softc *sc) 1760 { 1761 1762 HN_LOCK_ASSERT(sc); 1763 1764 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1765 return (false); 1766 1767 if (sc->hn_vf_rdytick == 0) 1768 return (true); 1769 1770 if (sc->hn_vf_rdytick > ticks) 1771 return (false); 1772 1773 /* Mark VF as ready. */ 1774 hn_xpnt_vf_setready(sc); 1775 return (true); 1776 } 1777 1778 static void 1779 hn_xpnt_vf_setenable(struct hn_softc *sc) 1780 { 1781 int i; 1782 1783 HN_LOCK_ASSERT(sc); 1784 1785 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1786 rm_wlock(&sc->hn_vf_lock); 1787 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1788 rm_wunlock(&sc->hn_vf_lock); 1789 1790 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1791 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1792 } 1793 1794 static void 1795 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1796 { 1797 int i; 1798 1799 HN_LOCK_ASSERT(sc); 1800 1801 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1802 rm_wlock(&sc->hn_vf_lock); 1803 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1804 if (clear_vf) 1805 sc->hn_vf_ifp = NULL; 1806 rm_wunlock(&sc->hn_vf_lock); 1807 1808 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1809 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1810 } 1811 1812 static void 1813 hn_xpnt_vf_init(struct hn_softc *sc) 1814 { 1815 int error; 1816 1817 HN_LOCK_ASSERT(sc); 1818 1819 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1820 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1821 1822 if (bootverbose) { 1823 if_printf(sc->hn_ifp, "try bringing up %s\n", 1824 if_name(sc->hn_vf_ifp)); 1825 } 1826 1827 /* 1828 * Bring the VF up. 1829 */ 1830 hn_xpnt_vf_saveifflags(sc); 1831 if_setflagbits(sc->hn_ifp, IFF_UP, 0); 1832 error = hn_xpnt_vf_iocsetflags(sc); 1833 if (error) { 1834 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1835 if_name(sc->hn_vf_ifp), error); 1836 return; 1837 } 1838 1839 /* 1840 * NOTE: 1841 * Datapath setting must happen _after_ bringing the VF up. 
 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fixup RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}

static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    if_name(sc->hn_vf_ifp));
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_attevent(void *xsc, if_t ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    if_name(sc->hn_vf_ifp));
		goto done;
	}

	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));

		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (if_getindex(ifp) >= hn_vfmap_size) {
		if_t *newmap;
		int newsize;

		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(if_t) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = if_getinputfn(ifp);
		if_setinputfn(ifp, hn_xpnt_vf_input);

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
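		 * The wait is hn_xpnt_vf_attwait seconds (the
		 * hw.hn.vf_xpnt_attwait tunable); hn_xpnt_vf_isready()
		 * treats the VF as ready once this deadline passes.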
1966 */ 1967 wait_ticks = hn_xpnt_vf_attwait * hz; 1968 sc->hn_vf_rdytick = ticks + wait_ticks; 1969 1970 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1971 wait_ticks); 1972 } 1973 done: 1974 HN_UNLOCK(sc); 1975 } 1976 1977 static void 1978 hn_ifnet_detevent(void *xsc, if_t ifp) 1979 { 1980 struct hn_softc *sc = xsc; 1981 1982 HN_LOCK(sc); 1983 1984 if (sc->hn_vf_ifp == NULL) 1985 goto done; 1986 1987 if (!hn_ismyvf(sc, ifp)) 1988 goto done; 1989 1990 if (hn_xpnt_vf) { 1991 /* 1992 * Make sure that the delayed initialization is not running. 1993 * 1994 * NOTE: 1995 * - This lock _must_ be released, since the hn_vf_init task 1996 * will try holding this lock. 1997 * - It is safe to release this lock here, since the 1998 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 1999 * 2000 * XXX racy, if hn(4) ever detached. 2001 */ 2002 HN_UNLOCK(sc); 2003 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2004 HN_LOCK(sc); 2005 2006 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2007 if_name(sc->hn_ifp))); 2008 if_setinputfn(ifp, sc->hn_vf_input); 2009 sc->hn_vf_input = NULL; 2010 2011 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2012 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2013 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2014 2015 if (sc->hn_vf_rdytick == 0) { 2016 /* 2017 * The VF was ready; restore some settings. 2018 */ 2019 if_setcapabilities(ifp, sc->hn_saved_caps); 2020 /* 2021 * NOTE: 2022 * There is _no_ need to fixup if_capenable and 2023 * if_hwassist, since the if_capabilities before 2024 * restoration was an intersection of the VF's 2025 * if_capabilites and the synthetic device's 2026 * if_capabilites. 2027 */ 2028 if_sethwtsomax(ifp, sc->hn_saved_tsomax); 2029 if_sethwtsomaxsegcount(sc->hn_ifp, 2030 sc->hn_saved_tsosegcnt); 2031 if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz); 2032 } 2033 2034 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2035 /* 2036 * Restore RSS settings. 2037 */ 2038 hn_vf_rss_restore(sc); 2039 2040 /* 2041 * Resume link status management, which was suspended 2042 * by hn_ifnet_attevent(). 2043 */ 2044 hn_resume_mgmt(sc); 2045 } 2046 } 2047 2048 /* Mark transparent mode VF as disabled. 
*/ 2049 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2050 2051 rm_wlock(&hn_vfmap_lock); 2052 2053 KASSERT(if_getindex(ifp) < hn_vfmap_size, 2054 ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); 2055 if (hn_vfmap[if_getindex(ifp)] != NULL) { 2056 KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, 2057 ("%s: ifindex %d was mapped to %s", 2058 if_name(ifp), if_getindex(ifp), 2059 if_name(hn_vfmap[if_getindex(ifp)]))); 2060 hn_vfmap[if_getindex(ifp)] = NULL; 2061 } 2062 2063 rm_wunlock(&hn_vfmap_lock); 2064 done: 2065 HN_UNLOCK(sc); 2066 } 2067 2068 static void 2069 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) 2070 { 2071 struct hn_softc *sc = xsc; 2072 2073 if (sc->hn_vf_ifp == ifp) 2074 if_link_state_change(sc->hn_ifp, link_state); 2075 } 2076 2077 static int 2078 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) 2079 { 2080 struct hn_softc *sc = arg1; 2081 unsigned int tsomax; 2082 int error; 2083 2084 tsomax = if_gethwtsomax(sc->hn_ifp); 2085 error = sysctl_handle_int(oidp, &tsomax, 0, req); 2086 return error; 2087 } 2088 2089 static int 2090 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) 2091 { 2092 struct hn_softc *sc = arg1; 2093 unsigned int tsomaxsegcnt; 2094 int error; 2095 2096 tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); 2097 error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); 2098 return error; 2099 } 2100 2101 static int 2102 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) 2103 { 2104 struct hn_softc *sc = arg1; 2105 unsigned int tsomaxsegsz; 2106 int error; 2107 2108 tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); 2109 error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); 2110 return error; 2111 } 2112 2113 static int 2114 hn_probe(device_t dev) 2115 { 2116 2117 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2118 device_set_desc(dev, "Hyper-V Network Interface"); 2119 return BUS_PROBE_DEFAULT; 2120 } 2121 return ENXIO; 2122 } 2123 2124 static int 2125 hn_attach(device_t dev) 2126 { 2127 struct hn_softc *sc = device_get_softc(dev); 2128 struct sysctl_oid_list *child; 2129 struct sysctl_ctx_list *ctx; 2130 uint8_t eaddr[ETHER_ADDR_LEN]; 2131 if_t ifp = NULL; 2132 int error, ring_cnt, tx_ring_cnt; 2133 uint32_t mtu; 2134 2135 sc->hn_dev = dev; 2136 sc->hn_prichan = vmbus_get_channel(dev); 2137 HN_LOCK_INIT(sc); 2138 rm_init(&sc->hn_vf_lock, "hnvf"); 2139 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2140 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2141 2142 /* 2143 * Initialize these tunables once. 2144 */ 2145 sc->hn_agg_size = hn_tx_agg_size; 2146 sc->hn_agg_pkts = hn_tx_agg_pkts; 2147 2148 /* 2149 * Setup taskqueue for transmission. 2150 */ 2151 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2152 int i; 2153 2154 sc->hn_tx_taskqs = 2155 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2156 M_DEVBUF, M_WAITOK); 2157 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2158 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2159 M_WAITOK, taskqueue_thread_enqueue, 2160 &sc->hn_tx_taskqs[i]); 2161 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2162 "%s tx%d", device_get_nameunit(dev), i); 2163 } 2164 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2165 sc->hn_tx_taskqs = hn_tx_taskque; 2166 } 2167 2168 /* 2169 * Setup taskqueue for mangement tasks, e.g. link status. 
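 * hn_mgmt_taskq0 is created with a single thread, which keeps the link status and network-change tasks serialized.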
2170 */ 2171 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2172 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2173 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2174 device_get_nameunit(dev)); 2175 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2176 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2177 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2178 hn_netchg_status_taskfunc, sc); 2179 2180 if (hn_xpnt_vf) { 2181 /* 2182 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2183 */ 2184 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2185 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2186 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2187 device_get_nameunit(dev)); 2188 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2189 hn_xpnt_vf_init_taskfunc, sc); 2190 } 2191 2192 /* 2193 * Allocate ifnet and setup its name earlier, so that if_printf 2194 * can be used by functions, which will be called after 2195 * ether_ifattach(). 2196 */ 2197 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2198 if_setsoftc(ifp, sc); 2199 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2200 2201 /* 2202 * Initialize ifmedia earlier so that it can be unconditionally 2203 * destroyed, if error happened later on. 2204 */ 2205 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2206 2207 /* 2208 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2209 * to use (tx_ring_cnt). 2210 * 2211 * NOTE: 2212 * The # of RX rings to use is same as the # of channels to use. 2213 */ 2214 ring_cnt = hn_chan_cnt; 2215 if (ring_cnt <= 0) { 2216 /* Default */ 2217 ring_cnt = mp_ncpus; 2218 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2219 ring_cnt = HN_RING_CNT_DEF_MAX; 2220 } else if (ring_cnt > mp_ncpus) { 2221 ring_cnt = mp_ncpus; 2222 } 2223 #ifdef RSS 2224 if (ring_cnt > rss_getnumbuckets()) 2225 ring_cnt = rss_getnumbuckets(); 2226 #endif 2227 2228 tx_ring_cnt = hn_tx_ring_cnt; 2229 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2230 tx_ring_cnt = ring_cnt; 2231 #ifdef HN_IFSTART_SUPPORT 2232 if (hn_use_if_start) { 2233 /* ifnet.if_start only needs one TX ring. */ 2234 tx_ring_cnt = 1; 2235 } 2236 #endif 2237 2238 /* 2239 * Set the leader CPU for channels. 2240 */ 2241 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2242 2243 /* 2244 * Create enough TX/RX rings, even if only limited number of 2245 * channels can be allocated. 2246 */ 2247 error = hn_create_tx_data(sc, tx_ring_cnt); 2248 if (error) 2249 goto failed; 2250 error = hn_create_rx_data(sc, ring_cnt); 2251 if (error) 2252 goto failed; 2253 2254 /* 2255 * Create transaction context for NVS and RNDIS transactions. 2256 */ 2257 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2258 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2259 if (sc->hn_xact == NULL) { 2260 error = ENXIO; 2261 goto failed; 2262 } 2263 2264 /* 2265 * Install orphan handler for the revocation of this device's 2266 * primary channel. 2267 * 2268 * NOTE: 2269 * The processing order is critical here: 2270 * Install the orphan handler, _before_ testing whether this 2271 * device's primary channel has been revoked or not. 2272 */ 2273 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2274 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2275 error = ENXIO; 2276 goto failed; 2277 } 2278 2279 /* 2280 * Attach the synthetic parts, i.e. NVS and RNDIS. 
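 * The synthetic parts are brought up with the default MTU first; if RNDIS later reports a smaller MTU, it is applied to the ifnet further down in this function.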
2281 */ 2282 error = hn_synth_attach(sc, ETHERMTU); 2283 if (error) 2284 goto failed; 2285 2286 error = hn_rndis_get_eaddr(sc, eaddr); 2287 if (error) 2288 goto failed; 2289 2290 error = hn_rndis_get_mtu(sc, &mtu); 2291 if (error) 2292 mtu = ETHERMTU; 2293 else if (bootverbose) 2294 device_printf(dev, "RNDIS mtu %u\n", mtu); 2295 2296 if (sc->hn_rx_ring_inuse > 1) { 2297 /* 2298 * Reduce TCP segment aggregation limit for multiple 2299 * RX rings to increase ACK timeliness. 2300 */ 2301 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2302 } 2303 2304 /* 2305 * Fixup TX/RX stuffs after synthetic parts are attached. 2306 */ 2307 hn_fixup_tx_data(sc); 2308 hn_fixup_rx_data(sc); 2309 2310 ctx = device_get_sysctl_ctx(dev); 2311 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2312 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2313 &sc->hn_nvs_ver, 0, "NVS version"); 2314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2315 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2316 hn_ndis_version_sysctl, "A", "NDIS version"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_caps_sysctl, "A", "capabilities"); 2320 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2321 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2322 hn_hwassist_sysctl, "A", "hwassist"); 2323 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", 2324 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, 2325 "IU", "max TSO size"); 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", 2327 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, 2328 "IU", "max # of TSO segments"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", 2330 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, 2331 "IU", "max size of TSO segment"); 2332 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2333 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2334 hn_rxfilter_sysctl, "A", "rxfilter"); 2335 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2336 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2337 hn_rss_hash_sysctl, "A", "RSS hash"); 2338 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2339 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2340 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2341 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2342 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2343 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2344 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2345 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2346 #ifndef RSS 2347 /* 2348 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2349 */ 2350 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2351 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2352 hn_rss_key_sysctl, "IU", "RSS key"); 2353 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2354 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2355 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2356 #endif 2357 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2358 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2359 "RNDIS offered packet transmission aggregation size limit"); 2360 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2361 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2362 "RNDIS offered packet transmission aggregation count limit"); 2363 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2364 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2365 "RNDIS packet transmission aggregation alignment"); 2366 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2367 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2368 hn_txagg_size_sysctl, "I", 2369 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2370 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2371 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2372 hn_txagg_pkts_sysctl, "I", 2373 "Packet transmission aggregation packets, " 2374 "0 -- disable, -1 -- auto"); 2375 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2376 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2377 hn_polling_sysctl, "I", 2378 "Polling frequency: [100,1000000], 0 disable polling"); 2379 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2380 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2381 hn_vf_sysctl, "A", "Virtual Function's name"); 2382 if (!hn_xpnt_vf) { 2383 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2384 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2385 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2386 } else { 2387 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2388 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2389 hn_xpnt_vf_enabled_sysctl, "I", 2390 "Transparent VF enabled"); 2391 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2392 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2393 hn_xpnt_vf_accbpf_sysctl, "I", 2394 "Accurate BPF for transparent VF"); 2395 } 2396 2397 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2398 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2399 "switch to rsc"); 2400 2401 /* 2402 * Setup the ifmedia, which has been initialized earlier. 2403 */ 2404 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2405 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2406 /* XXX ifmedia_set really should do this for us */ 2407 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2408 2409 /* 2410 * Setup the ifnet for this interface. 2411 */ 2412 2413 if_setbaudrate(ifp, IF_Gbps(10)); 2414 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); 2415 if_setioctlfn(ifp, hn_ioctl); 2416 if_setinitfn(ifp, hn_init); 2417 #ifdef HN_IFSTART_SUPPORT 2418 if (hn_use_if_start) { 2419 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2420 2421 if_setstartfn(ifp, hn_start); 2422 if_setsendqlen(ifp, qdepth); 2423 if_setsendqready(ifp); 2424 } else 2425 #endif 2426 { 2427 if_settransmitfn(ifp, hn_transmit); 2428 if_setqflushfn(ifp, hn_xmit_qflush); 2429 } 2430 2431 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); 2432 #ifdef foo 2433 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2434 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); 2435 #endif 2436 if (sc->hn_caps & HN_CAP_VLAN) { 2437 /* XXX not sure about VLAN_MTU. */ 2438 if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); 2439 } 2440 2441 if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); 2442 if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) 2443 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); 2444 if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) 2445 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); 2446 if (sc->hn_caps & HN_CAP_TSO4) { 2447 if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); 2448 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 2449 } 2450 if (sc->hn_caps & HN_CAP_TSO6) { 2451 if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); 2452 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 2453 } 2454 2455 /* Enable all available capabilities by default. */ 2456 if_setcapenable(ifp, if_getcapabilities(ifp)); 2457 2458 /* 2459 * Disable IPv6 TSO and TXCSUM by default, they still can 2460 * be enabled through SIOCSIFCAP. 2461 */ 2462 if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); 2463 if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); 2464 2465 if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { 2466 /* 2467 * Lock hn_set_tso_maxsize() to simplify its 2468 * internal logic. 2469 */ 2470 HN_LOCK(sc); 2471 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2472 HN_UNLOCK(sc); 2473 if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); 2474 if_sethwtsomaxsegsize(ifp, PAGE_SIZE); 2475 } 2476 2477 ether_ifattach(ifp, eaddr); 2478 2479 if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2480 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2481 if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); 2482 } 2483 if (mtu < ETHERMTU) { 2484 2485 if_setmtu(ifp, mtu); 2486 } 2487 2488 /* Inform the upper layer about the long frame support. */ 2489 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); 2490 2491 /* 2492 * Kick off link status check. 2493 */ 2494 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2495 hn_update_link_status(sc); 2496 2497 if (!hn_xpnt_vf) { 2498 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2499 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2500 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2501 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2502 } else { 2503 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2504 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2505 } 2506 2507 /* 2508 * NOTE: 2509 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2510 * since interface's LLADDR is needed; interface LLADDR is not 2511 * available when ifnet_arrival event is triggered. 2512 */ 2513 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2514 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2515 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2516 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2517 2518 return (0); 2519 failed: 2520 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2521 hn_synth_detach(sc); 2522 hn_detach(dev); 2523 return (error); 2524 } 2525 2526 static int 2527 hn_detach(device_t dev) 2528 { 2529 struct hn_softc *sc = device_get_softc(dev); 2530 if_t ifp = sc->hn_ifp, vf_ifp; 2531 2532 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2533 /* 2534 * In case that the vmbus missed the orphan handler 2535 * installation. 
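 * Orphaning the transaction context here aborts any transaction that could otherwise wait forever on the revoked primary channel.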
2536 */ 2537 vmbus_xact_ctx_orphan(sc->hn_xact); 2538 } 2539 2540 if (sc->hn_ifaddr_evthand != NULL) 2541 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2542 if (sc->hn_ifnet_evthand != NULL) 2543 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2544 if (sc->hn_ifnet_atthand != NULL) { 2545 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2546 sc->hn_ifnet_atthand); 2547 } 2548 if (sc->hn_ifnet_dethand != NULL) { 2549 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2550 sc->hn_ifnet_dethand); 2551 } 2552 if (sc->hn_ifnet_lnkhand != NULL) 2553 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2554 2555 vf_ifp = sc->hn_vf_ifp; 2556 __compiler_membar(); 2557 if (vf_ifp != NULL) 2558 hn_ifnet_detevent(sc, vf_ifp); 2559 2560 if (device_is_attached(dev)) { 2561 HN_LOCK(sc); 2562 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2563 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 2564 hn_stop(sc, true); 2565 /* 2566 * NOTE: 2567 * hn_stop() only suspends data, so managment 2568 * stuffs have to be suspended manually here. 2569 */ 2570 hn_suspend_mgmt(sc); 2571 hn_synth_detach(sc); 2572 } 2573 HN_UNLOCK(sc); 2574 ether_ifdetach(ifp); 2575 } 2576 2577 ifmedia_removeall(&sc->hn_media); 2578 hn_destroy_rx_data(sc); 2579 hn_destroy_tx_data(sc); 2580 2581 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2582 int i; 2583 2584 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2585 taskqueue_free(sc->hn_tx_taskqs[i]); 2586 free(sc->hn_tx_taskqs, M_DEVBUF); 2587 } 2588 taskqueue_free(sc->hn_mgmt_taskq0); 2589 if (sc->hn_vf_taskq != NULL) 2590 taskqueue_free(sc->hn_vf_taskq); 2591 2592 if (sc->hn_xact != NULL) { 2593 /* 2594 * Uninstall the orphan handler _before_ the xact is 2595 * destructed. 2596 */ 2597 vmbus_chan_unset_orphan(sc->hn_prichan); 2598 vmbus_xact_ctx_destroy(sc->hn_xact); 2599 } 2600 2601 if_free(ifp); 2602 2603 HN_LOCK_DESTROY(sc); 2604 rm_destroy(&sc->hn_vf_lock); 2605 return (0); 2606 } 2607 2608 static int 2609 hn_shutdown(device_t dev) 2610 { 2611 2612 return (0); 2613 } 2614 2615 static void 2616 hn_link_status(struct hn_softc *sc) 2617 { 2618 uint32_t link_status; 2619 int error; 2620 2621 error = hn_rndis_get_linkstatus(sc, &link_status); 2622 if (error) { 2623 /* XXX what to do? */ 2624 return; 2625 } 2626 2627 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2628 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2629 else 2630 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2631 if_link_state_change(sc->hn_ifp, 2632 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2633 LINK_STATE_UP : LINK_STATE_DOWN); 2634 } 2635 2636 static void 2637 hn_link_taskfunc(void *xsc, int pending __unused) 2638 { 2639 struct hn_softc *sc = xsc; 2640 2641 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2642 return; 2643 hn_link_status(sc); 2644 } 2645 2646 static void 2647 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2648 { 2649 struct hn_softc *sc = xsc; 2650 2651 /* Prevent any link status checks from running. */ 2652 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2653 2654 /* 2655 * Fake up a [link down --> link up] state change; 5 seconds 2656 * delay is used, which closely simulates miibus reaction 2657 * upon link down event. 
2658 */ 2659 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2660 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2661 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2662 &sc->hn_netchg_status, 5 * hz); 2663 } 2664 2665 static void 2666 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2667 { 2668 struct hn_softc *sc = xsc; 2669 2670 /* Re-allow link status checks. */ 2671 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2672 hn_link_status(sc); 2673 } 2674 2675 static void 2676 hn_update_link_status(struct hn_softc *sc) 2677 { 2678 2679 if (sc->hn_mgmt_taskq != NULL) 2680 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2681 } 2682 2683 static void 2684 hn_change_network(struct hn_softc *sc) 2685 { 2686 2687 if (sc->hn_mgmt_taskq != NULL) 2688 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2689 } 2690 2691 static __inline int 2692 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2693 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2694 { 2695 struct mbuf *m = *m_head; 2696 int error; 2697 2698 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2699 2700 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2701 m, segs, nsegs, BUS_DMA_NOWAIT); 2702 if (error == EFBIG) { 2703 struct mbuf *m_new; 2704 2705 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2706 if (m_new == NULL) 2707 return ENOBUFS; 2708 else 2709 *m_head = m = m_new; 2710 txr->hn_tx_collapsed++; 2711 2712 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2713 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2714 } 2715 if (!error) { 2716 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2717 BUS_DMASYNC_PREWRITE); 2718 txd->flags |= HN_TXD_FLAG_DMAMAP; 2719 } 2720 return error; 2721 } 2722 2723 static __inline int 2724 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2725 { 2726 2727 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2728 ("put an onlist txd %#x", txd->flags)); 2729 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2730 ("put an onagg txd %#x", txd->flags)); 2731 2732 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2733 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2734 return 0; 2735 2736 if (!STAILQ_EMPTY(&txd->agg_list)) { 2737 struct hn_txdesc *tmp_txd; 2738 2739 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2740 int freed __diagused; 2741 2742 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2743 ("resursive aggregation on aggregated txdesc")); 2744 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2745 ("not aggregated txdesc")); 2746 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2747 ("aggregated txdesc uses dmamap")); 2748 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2749 ("aggregated txdesc consumes " 2750 "chimney sending buffer")); 2751 KASSERT(tmp_txd->chim_size == 0, 2752 ("aggregated txdesc has non-zero " 2753 "chimney sending size")); 2754 2755 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2756 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2757 freed = hn_txdesc_put(txr, tmp_txd); 2758 KASSERT(freed, ("failed to free aggregated txdesc")); 2759 } 2760 } 2761 2762 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2763 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2764 ("chim txd uses dmamap")); 2765 hn_chim_free(txr->hn_sc, txd->chim_index); 2766 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2767 txd->chim_size = 0; 2768 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2769 bus_dmamap_sync(txr->hn_tx_data_dtag, 2770 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2771 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2772 txd->data_dmap); 2773 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2774 } 2775 2776 if (txd->m != NULL) { 2777 m_freem(txd->m); 2778 txd->m = NULL; 2779 } 2780 2781 txd->flags |= HN_TXD_FLAG_ONLIST; 2782 #ifndef HN_USE_TXDESC_BUFRING 2783 mtx_lock_spin(&txr->hn_txlist_spin); 2784 KASSERT(txr->hn_txdesc_avail >= 0 && 2785 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2786 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2787 txr->hn_txdesc_avail++; 2788 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2789 mtx_unlock_spin(&txr->hn_txlist_spin); 2790 #else /* HN_USE_TXDESC_BUFRING */ 2791 #ifdef HN_DEBUG 2792 atomic_add_int(&txr->hn_txdesc_avail, 1); 2793 #endif 2794 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2795 #endif /* !HN_USE_TXDESC_BUFRING */ 2796 2797 return 1; 2798 } 2799 2800 static __inline struct hn_txdesc * 2801 hn_txdesc_get(struct hn_tx_ring *txr) 2802 { 2803 struct hn_txdesc *txd; 2804 2805 #ifndef HN_USE_TXDESC_BUFRING 2806 mtx_lock_spin(&txr->hn_txlist_spin); 2807 txd = SLIST_FIRST(&txr->hn_txlist); 2808 if (txd != NULL) { 2809 KASSERT(txr->hn_txdesc_avail > 0, 2810 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2811 txr->hn_txdesc_avail--; 2812 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2813 } 2814 mtx_unlock_spin(&txr->hn_txlist_spin); 2815 #else 2816 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2817 #endif 2818 2819 if (txd != NULL) { 2820 #ifdef HN_USE_TXDESC_BUFRING 2821 #ifdef HN_DEBUG 2822 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2823 #endif 2824 #endif /* HN_USE_TXDESC_BUFRING */ 2825 KASSERT(txd->m == NULL && txd->refs == 0 && 2826 STAILQ_EMPTY(&txd->agg_list) && 2827 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2828 txd->chim_size == 0 && 2829 (txd->flags & HN_TXD_FLAG_ONLIST) && 2830 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2831 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2832 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2833 txd->refs = 1; 2834 } 2835 return txd; 2836 } 2837 2838 static __inline void 2839 hn_txdesc_hold(struct hn_txdesc *txd) 2840 { 2841 2842 /* 0->1 transition will never work */ 2843 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2844 atomic_add_int(&txd->refs, 1); 2845 } 2846 2847 static __inline void 2848 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2849 { 2850 2851 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2852 ("recursive aggregation on aggregating txdesc")); 2853 2854 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2855 ("already aggregated")); 2856 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2857 ("recursive aggregation on to-be-aggregated txdesc")); 2858 2859 txd->flags |= HN_TXD_FLAG_ONAGG; 2860 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2861 } 2862 2863 static bool 2864 hn_tx_ring_pending(struct hn_tx_ring *txr) 2865 { 2866 bool pending = false; 2867 2868 #ifndef HN_USE_TXDESC_BUFRING 2869 mtx_lock_spin(&txr->hn_txlist_spin); 2870 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2871 pending = true; 2872 mtx_unlock_spin(&txr->hn_txlist_spin); 2873 #else 2874 if (!buf_ring_full(txr->hn_txdesc_br)) 2875 pending = true; 2876 #endif 2877 return (pending); 2878 } 2879 2880 static __inline void 2881 hn_txeof(struct hn_tx_ring *txr) 2882 { 2883 txr->hn_has_txeof = 0; 2884 txr->hn_txeof(txr); 2885 } 2886 2887 static void 2888 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2889 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2890 { 2891 struct hn_txdesc *txd = sndc->hn_cbarg; 2892 struct 
hn_tx_ring *txr; 2893 2894 txr = txd->txr; 2895 KASSERT(txr->hn_chan == chan, 2896 ("channel mismatch, on chan%u, should be chan%u", 2897 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2898 2899 txr->hn_has_txeof = 1; 2900 hn_txdesc_put(txr, txd); 2901 2902 ++txr->hn_txdone_cnt; 2903 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2904 txr->hn_txdone_cnt = 0; 2905 if (txr->hn_oactive) 2906 hn_txeof(txr); 2907 } 2908 } 2909 2910 static void 2911 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2912 { 2913 #if defined(INET) || defined(INET6) 2914 struct epoch_tracker et; 2915 2916 NET_EPOCH_ENTER(et); 2917 tcp_lro_flush_all(&rxr->hn_lro); 2918 NET_EPOCH_EXIT(et); 2919 #endif 2920 2921 /* 2922 * NOTE: 2923 * 'txr' could be NULL, if multiple channels and 2924 * ifnet.if_start method are enabled. 2925 */ 2926 if (txr == NULL || !txr->hn_has_txeof) 2927 return; 2928 2929 txr->hn_txdone_cnt = 0; 2930 hn_txeof(txr); 2931 } 2932 2933 static __inline uint32_t 2934 hn_rndis_pktmsg_offset(uint32_t ofs) 2935 { 2936 2937 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2938 ("invalid RNDIS packet msg offset %u", ofs)); 2939 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2940 } 2941 2942 static __inline void * 2943 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2944 size_t pi_dlen, uint32_t pi_type) 2945 { 2946 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2947 struct rndis_pktinfo *pi; 2948 2949 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2950 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2951 2952 /* 2953 * Per-packet-info does not move; it only grows. 2954 * 2955 * NOTE: 2956 * rm_pktinfooffset in this phase counts from the beginning 2957 * of rndis_packet_msg. 2958 */ 2959 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2960 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2961 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2962 pkt->rm_pktinfolen); 2963 pkt->rm_pktinfolen += pi_size; 2964 2965 pi->rm_size = pi_size; 2966 pi->rm_type = pi_type; 2967 pi->rm_internal = 0; 2968 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2969 2970 return (pi->rm_data); 2971 } 2972 2973 static __inline int 2974 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) 2975 { 2976 struct hn_txdesc *txd; 2977 struct mbuf *m; 2978 int error, pkts; 2979 2980 txd = txr->hn_agg_txd; 2981 KASSERT(txd != NULL, ("no aggregate txdesc")); 2982 2983 /* 2984 * Since hn_txpkt() will reset this temporary stat, save 2985 * it now, so that oerrors can be updated properly, if 2986 * hn_txpkt() ever fails. 2987 */ 2988 pkts = txr->hn_stat_pkts; 2989 2990 /* 2991 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2992 * failure, save it for later freeing, if hn_txpkt() ever 2993 * fails. 2994 */ 2995 m = txd->m; 2996 error = hn_txpkt(ifp, txr, txd); 2997 if (__predict_false(error)) { 2998 /* txd is freed, but m is not. */ 2999 m_freem(m); 3000 3001 txr->hn_flush_failed++; 3002 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 3003 } 3004 3005 /* Reset all aggregation states. 
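hn_try_txagg() will set up a new aggregating txdesc on a later transmit, once aggregation is possible again.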
*/ 3006 txr->hn_agg_txd = NULL; 3007 txr->hn_agg_szleft = 0; 3008 txr->hn_agg_pktleft = 0; 3009 txr->hn_agg_prevpkt = NULL; 3010 3011 return (error); 3012 } 3013 3014 static void * 3015 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3016 int pktsize) 3017 { 3018 void *chim; 3019 3020 if (txr->hn_agg_txd != NULL) { 3021 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 3022 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 3023 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 3024 int olen; 3025 3026 /* 3027 * Update the previous RNDIS packet's total length, 3028 * it can be increased due to the mandatory alignment 3029 * padding for this RNDIS packet. And update the 3030 * aggregating txdesc's chimney sending buffer size 3031 * accordingly. 3032 * 3033 * XXX 3034 * Zero-out the padding, as required by the RNDIS spec. 3035 */ 3036 olen = pkt->rm_len; 3037 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3038 agg_txd->chim_size += pkt->rm_len - olen; 3039 3040 /* Link this txdesc to the parent. */ 3041 hn_txdesc_agg(agg_txd, txd); 3042 3043 chim = (uint8_t *)pkt + pkt->rm_len; 3044 /* Save the current packet for later fixup. */ 3045 txr->hn_agg_prevpkt = chim; 3046 3047 txr->hn_agg_pktleft--; 3048 txr->hn_agg_szleft -= pktsize; 3049 if (txr->hn_agg_szleft <= 3050 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3051 /* 3052 * Probably can't aggregate more packets, 3053 * flush this aggregating txdesc proactively. 3054 */ 3055 txr->hn_agg_pktleft = 0; 3056 } 3057 /* Done! */ 3058 return (chim); 3059 } 3060 hn_flush_txagg(ifp, txr); 3061 } 3062 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3063 3064 txr->hn_tx_chimney_tried++; 3065 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3066 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3067 return (NULL); 3068 txr->hn_tx_chimney++; 3069 3070 chim = txr->hn_sc->hn_chim + 3071 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3072 3073 if (txr->hn_agg_pktmax > 1 && 3074 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3075 txr->hn_agg_txd = txd; 3076 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3077 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3078 txr->hn_agg_prevpkt = chim; 3079 } 3080 return (chim); 3081 } 3082 3083 /* 3084 * NOTE: 3085 * If this function fails, then both txd and m_head0 will be freed. 3086 */ 3087 static int 3088 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3089 struct mbuf **m_head0) 3090 { 3091 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3092 int error, nsegs, i; 3093 struct mbuf *m_head = *m_head0; 3094 struct rndis_packet_msg *pkt; 3095 uint32_t *pi_data; 3096 void *chim = NULL; 3097 int pkt_hlen, pkt_size; 3098 3099 pkt = txd->rndis_pkt; 3100 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3101 if (pkt_size < txr->hn_chim_size) { 3102 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3103 if (chim != NULL) 3104 pkt = chim; 3105 } else { 3106 if (txr->hn_agg_txd != NULL) 3107 hn_flush_txagg(ifp, txr); 3108 } 3109 3110 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3111 pkt->rm_len = m_head->m_pkthdr.len; 3112 pkt->rm_dataoffset = 0; 3113 pkt->rm_datalen = m_head->m_pkthdr.len; 3114 pkt->rm_oobdataoffset = 0; 3115 pkt->rm_oobdatalen = 0; 3116 pkt->rm_oobdataelements = 0; 3117 pkt->rm_pktinfooffset = sizeof(*pkt); 3118 pkt->rm_pktinfolen = 0; 3119 pkt->rm_vchandle = 0; 3120 pkt->rm_reserved = 0; 3121 3122 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3123 /* 3124 * Set the hash value for this packet. 
3125 */ 3126 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3127 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3128 3129 if (M_HASHTYPE_ISHASH(m_head)) 3130 /* 3131 * The flowid field contains the hash value host 3132 * set in the rx queue if it is a ip forwarding pkt. 3133 * Set the same hash value so host can send on the 3134 * cpu it was received. 3135 */ 3136 *pi_data = m_head->m_pkthdr.flowid; 3137 else 3138 /* 3139 * Otherwise just put the tx queue index. 3140 */ 3141 *pi_data = txr->hn_tx_idx; 3142 } 3143 3144 if (m_head->m_flags & M_VLANTAG) { 3145 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3146 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3147 *pi_data = NDIS_VLAN_INFO_MAKE( 3148 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3149 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3150 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3151 } 3152 3153 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3154 #if defined(INET6) || defined(INET) 3155 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3156 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3157 #ifdef INET 3158 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3159 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3160 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3161 m_head->m_pkthdr.tso_segsz); 3162 } 3163 #endif 3164 #if defined(INET6) && defined(INET) 3165 else 3166 #endif 3167 #ifdef INET6 3168 { 3169 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3170 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3171 m_head->m_pkthdr.tso_segsz); 3172 } 3173 #endif 3174 #endif /* INET6 || INET */ 3175 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3176 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3177 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3178 if (m_head->m_pkthdr.csum_flags & 3179 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3180 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3181 } else { 3182 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3183 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3184 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3185 } 3186 3187 if (m_head->m_pkthdr.csum_flags & 3188 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3189 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3190 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3191 } else if (m_head->m_pkthdr.csum_flags & 3192 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3193 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3194 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3195 } 3196 } 3197 3198 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3199 /* Fixup RNDIS packet message total length */ 3200 pkt->rm_len += pkt_hlen; 3201 /* Convert RNDIS packet message offsets */ 3202 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3203 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3204 3205 /* 3206 * Fast path: Chimney sending. 
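 * The RNDIS message and the payload are copied into the chimney (send) buffer, so no scatter/gather page list is needed; hn_gpa_cnt stays zero for this packet.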
3207 */ 3208 if (chim != NULL) { 3209 struct hn_txdesc *tgt_txd = txd; 3210 3211 if (txr->hn_agg_txd != NULL) { 3212 tgt_txd = txr->hn_agg_txd; 3213 #ifdef INVARIANTS 3214 *m_head0 = NULL; 3215 #endif 3216 } 3217 3218 KASSERT(pkt == chim, 3219 ("RNDIS pkt not in chimney sending buffer")); 3220 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3221 ("chimney sending buffer is not used")); 3222 tgt_txd->chim_size += pkt->rm_len; 3223 3224 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3225 ((uint8_t *)chim) + pkt_hlen); 3226 3227 txr->hn_gpa_cnt = 0; 3228 txr->hn_sendpkt = hn_txpkt_chim; 3229 goto done; 3230 } 3231 3232 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3233 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3234 ("chimney buffer is used")); 3235 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3236 3237 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3238 if (__predict_false(error)) { 3239 int freed __diagused; 3240 3241 /* 3242 * This mbuf is not linked w/ the txd yet, so free it now. 3243 */ 3244 m_freem(m_head); 3245 *m_head0 = NULL; 3246 3247 freed = hn_txdesc_put(txr, txd); 3248 KASSERT(freed != 0, 3249 ("fail to free txd upon txdma error")); 3250 3251 txr->hn_txdma_failed++; 3252 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3253 return error; 3254 } 3255 *m_head0 = m_head; 3256 3257 /* +1 RNDIS packet message */ 3258 txr->hn_gpa_cnt = nsegs + 1; 3259 3260 /* send packet with page buffer */ 3261 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3262 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3263 txr->hn_gpa[0].gpa_len = pkt_hlen; 3264 3265 /* 3266 * Fill the page buffers with mbuf info after the page 3267 * buffer for RNDIS packet message. 3268 */ 3269 for (i = 0; i < nsegs; ++i) { 3270 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3271 3272 gpa->gpa_page = atop(segs[i].ds_addr); 3273 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3274 gpa->gpa_len = segs[i].ds_len; 3275 } 3276 3277 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3278 txd->chim_size = 0; 3279 txr->hn_sendpkt = hn_txpkt_sglist; 3280 done: 3281 txd->m = m_head; 3282 3283 /* Set the completion routine */ 3284 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3285 3286 /* Update temporary stats for later use. */ 3287 txr->hn_stat_pkts++; 3288 txr->hn_stat_size += m_head->m_pkthdr.len; 3289 if (m_head->m_flags & M_MCAST) 3290 txr->hn_stat_mcasts++; 3291 3292 return 0; 3293 } 3294 3295 /* 3296 * NOTE: 3297 * If this function fails, then txd will be freed, but the mbuf 3298 * associated w/ the txd will _not_ be freed. 3299 */ 3300 static int 3301 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3302 { 3303 int error, send_failed = 0, has_bpf; 3304 3305 again: 3306 has_bpf = bpf_peers_present(if_getbpf(ifp)); 3307 if (has_bpf) { 3308 /* 3309 * Make sure that this txd and any aggregated txds are not 3310 * freed before ETHER_BPF_MTAP. 
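 * The extra reference taken here is dropped right after the BPF taps below.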
3311 */ 3312 hn_txdesc_hold(txd); 3313 } 3314 error = txr->hn_sendpkt(txr, txd); 3315 if (!error) { 3316 if (has_bpf) { 3317 const struct hn_txdesc *tmp_txd; 3318 3319 ETHER_BPF_MTAP(ifp, txd->m); 3320 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3321 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3322 } 3323 3324 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3325 #ifdef HN_IFSTART_SUPPORT 3326 if (!hn_use_if_start) 3327 #endif 3328 { 3329 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3330 txr->hn_stat_size); 3331 if (txr->hn_stat_mcasts != 0) { 3332 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3333 txr->hn_stat_mcasts); 3334 } 3335 } 3336 txr->hn_pkts += txr->hn_stat_pkts; 3337 txr->hn_sends++; 3338 } 3339 if (has_bpf) 3340 hn_txdesc_put(txr, txd); 3341 3342 if (__predict_false(error)) { 3343 int freed __diagused; 3344 3345 /* 3346 * This should "really rarely" happen. 3347 * 3348 * XXX Too many RX to be acked or too many sideband 3349 * commands to run? Ask netvsc_channel_rollup() 3350 * to kick start later. 3351 */ 3352 txr->hn_has_txeof = 1; 3353 if (!send_failed) { 3354 txr->hn_send_failed++; 3355 send_failed = 1; 3356 /* 3357 * Try sending again after set hn_has_txeof; 3358 * in case that we missed the last 3359 * netvsc_channel_rollup(). 3360 */ 3361 goto again; 3362 } 3363 if_printf(ifp, "send failed\n"); 3364 3365 /* 3366 * Caller will perform further processing on the 3367 * associated mbuf, so don't free it in hn_txdesc_put(); 3368 * only unload it from the DMA map in hn_txdesc_put(), 3369 * if it was loaded. 3370 */ 3371 txd->m = NULL; 3372 freed = hn_txdesc_put(txr, txd); 3373 KASSERT(freed != 0, 3374 ("fail to free txd upon send error")); 3375 3376 txr->hn_send_failed++; 3377 } 3378 3379 /* Reset temporary stats, after this sending is done. */ 3380 txr->hn_stat_size = 0; 3381 txr->hn_stat_pkts = 0; 3382 txr->hn_stat_mcasts = 0; 3383 3384 return (error); 3385 } 3386 3387 /* 3388 * Append the specified data to the indicated mbuf chain, 3389 * Extend the mbuf chain if the new data does not fit in 3390 * existing space. 3391 * 3392 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3393 * There should be an equivalent in the kernel mbuf code, 3394 * but there does not appear to be one yet. 3395 * 3396 * Differs from m_append() in that additional mbufs are 3397 * allocated with cluster size MJUMPAGESIZE, and filled 3398 * accordingly. 3399 * 3400 * Return the last mbuf in the chain or NULL if failed to 3401 * allocate new mbuf. 3402 */ 3403 static struct mbuf * 3404 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3405 { 3406 struct mbuf *m, *n; 3407 int remainder, space; 3408 3409 for (m = m0; m->m_next != NULL; m = m->m_next) 3410 ; 3411 remainder = len; 3412 space = M_TRAILINGSPACE(m); 3413 if (space > 0) { 3414 /* 3415 * Copy into available space. 3416 */ 3417 if (space > remainder) 3418 space = remainder; 3419 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3420 m->m_len += space; 3421 cp += space; 3422 remainder -= space; 3423 } 3424 while (remainder > 0) { 3425 /* 3426 * Allocate a new mbuf; could check space 3427 * and allocate a cluster instead. 
3428 */ 3429 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3430 if (n == NULL) 3431 return NULL; 3432 n->m_len = min(MJUMPAGESIZE, remainder); 3433 bcopy(cp, mtod(n, caddr_t), n->m_len); 3434 cp += n->m_len; 3435 remainder -= n->m_len; 3436 m->m_next = n; 3437 m = n; 3438 } 3439 3440 return m; 3441 } 3442 3443 #if defined(INET) || defined(INET6) 3444 static __inline int 3445 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3446 { 3447 if (hn_lro_mbufq_depth) { 3448 tcp_lro_queue_mbuf(lc, m); 3449 return 0; 3450 } 3451 return tcp_lro_rx(lc, m, 0); 3452 } 3453 #endif 3454 3455 static int 3456 hn_rxpkt(struct hn_rx_ring *rxr) 3457 { 3458 if_t ifp, hn_ifp = rxr->hn_ifp; 3459 struct mbuf *m_new, *n; 3460 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3461 int hash_type = M_HASHTYPE_NONE; 3462 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3463 int i; 3464 3465 ifp = hn_ifp; 3466 if (rxr->hn_rxvf_ifp != NULL) { 3467 /* 3468 * Non-transparent mode VF; pretend this packet is from 3469 * the VF. 3470 */ 3471 ifp = rxr->hn_rxvf_ifp; 3472 is_vf = 1; 3473 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3474 /* Transparent mode VF. */ 3475 is_vf = 1; 3476 } 3477 3478 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { 3479 /* 3480 * NOTE: 3481 * See the NOTE of hn_rndis_init_fixat(). This 3482 * function can be reached, immediately after the 3483 * RNDIS is initialized but before the ifnet is 3484 * setup on the hn_attach() path; drop the unexpected 3485 * packets. 3486 */ 3487 return (0); 3488 } 3489 3490 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3491 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3492 return (0); 3493 } 3494 3495 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3496 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3497 if (m_new == NULL) { 3498 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3499 return (0); 3500 } 3501 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3502 rxr->rsc.frag_len[0]); 3503 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3504 } else { 3505 /* 3506 * Get an mbuf with a cluster. For packets 2K or less, 3507 * get a standard 2K cluster. For anything larger, get a 3508 * 4K cluster. Any buffers larger than 4K can cause problems 3509 * if looped around to the Hyper-V TX channel, so avoid them. 
3510 */ 3511 size = MCLBYTES; 3512 if (rxr->rsc.pktlen > MCLBYTES) { 3513 /* 4096 */ 3514 size = MJUMPAGESIZE; 3515 } 3516 3517 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3518 if (m_new == NULL) { 3519 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3520 return (0); 3521 } 3522 3523 n = m_new; 3524 for (i = 0; i < rxr->rsc.cnt; i++) { 3525 n = hv_m_append(n, rxr->rsc.frag_len[i], 3526 rxr->rsc.frag_data[i]); 3527 if (n == NULL) { 3528 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3529 return (0); 3530 } else { 3531 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3532 } 3533 } 3534 } 3535 if (rxr->rsc.pktlen <= MHLEN) 3536 rxr->hn_small_pkts++; 3537 3538 m_new->m_pkthdr.rcvif = ifp; 3539 3540 if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) 3541 do_csum = 0; 3542 3543 /* receive side checksum offload */ 3544 if (rxr->rsc.csum_info != NULL) { 3545 /* IP csum offload */ 3546 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3547 m_new->m_pkthdr.csum_flags |= 3548 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3549 rxr->hn_csum_ip++; 3550 } 3551 3552 /* TCP/UDP csum offload */ 3553 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3554 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3555 m_new->m_pkthdr.csum_flags |= 3556 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3557 m_new->m_pkthdr.csum_data = 0xffff; 3558 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3559 rxr->hn_csum_tcp++; 3560 else 3561 rxr->hn_csum_udp++; 3562 } 3563 3564 /* 3565 * XXX 3566 * As of this write (Oct 28th, 2016), host side will turn 3567 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3568 * the do_lro setting here is actually _not_ accurate. We 3569 * depend on the RSS hash type check to reset do_lro. 3570 */ 3571 if ((*(rxr->rsc.csum_info) & 3572 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3573 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3574 do_lro = 1; 3575 } else { 3576 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3577 if (l3proto == ETHERTYPE_IP) { 3578 if (l4proto == IPPROTO_TCP) { 3579 if (do_csum && 3580 (rxr->hn_trust_hcsum & 3581 HN_TRUST_HCSUM_TCP)) { 3582 rxr->hn_csum_trusted++; 3583 m_new->m_pkthdr.csum_flags |= 3584 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3585 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3586 m_new->m_pkthdr.csum_data = 0xffff; 3587 } 3588 do_lro = 1; 3589 } else if (l4proto == IPPROTO_UDP) { 3590 if (do_csum && 3591 (rxr->hn_trust_hcsum & 3592 HN_TRUST_HCSUM_UDP)) { 3593 rxr->hn_csum_trusted++; 3594 m_new->m_pkthdr.csum_flags |= 3595 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3596 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3597 m_new->m_pkthdr.csum_data = 0xffff; 3598 } 3599 } else if (l4proto != IPPROTO_DONE && do_csum && 3600 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3601 rxr->hn_csum_trusted++; 3602 m_new->m_pkthdr.csum_flags |= 3603 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3604 } 3605 } 3606 } 3607 3608 if (rxr->rsc.vlan_info != NULL) { 3609 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3610 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3611 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3612 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3613 m_new->m_flags |= M_VLANTAG; 3614 } 3615 3616 /* 3617 * If VF is activated (tranparent/non-transparent mode does not 3618 * matter here). 3619 * 3620 * - Disable LRO 3621 * 3622 * hn(4) will only receive broadcast packets, multicast packets, 3623 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3624 * packet types. 
3625 * 3626 * For non-transparent, we definitely _cannot_ enable LRO at 3627 * all, since the LRO flush will use hn(4) as the receiving 3628 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3629 */ 3630 if (is_vf) 3631 do_lro = 0; 3632 3633 /* 3634 * If VF is activated (tranparent/non-transparent mode does not 3635 * matter here), do _not_ mess with unsupported hash types or 3636 * functions. 3637 */ 3638 if (rxr->rsc.hash_info != NULL) { 3639 rxr->hn_rss_pkts++; 3640 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3641 if (!is_vf) 3642 hash_type = M_HASHTYPE_OPAQUE_HASH; 3643 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3644 NDIS_HASH_FUNCTION_TOEPLITZ) { 3645 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3646 rxr->hn_mbuf_hash); 3647 3648 /* 3649 * NOTE: 3650 * do_lro is resetted, if the hash types are not TCP 3651 * related. See the comment in the above csum_flags 3652 * setup section. 3653 */ 3654 switch (type) { 3655 case NDIS_HASH_IPV4: 3656 hash_type = M_HASHTYPE_RSS_IPV4; 3657 do_lro = 0; 3658 break; 3659 3660 case NDIS_HASH_TCP_IPV4: 3661 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3662 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3663 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3664 3665 if (is_vf) 3666 def_htype = M_HASHTYPE_NONE; 3667 3668 /* 3669 * UDP 4-tuple hash is delivered as 3670 * TCP 4-tuple hash. 3671 */ 3672 if (l3proto == ETHERTYPE_MAX) { 3673 hn_rxpkt_proto(m_new, 3674 &l3proto, &l4proto); 3675 } 3676 if (l3proto == ETHERTYPE_IP) { 3677 if (l4proto == IPPROTO_UDP && 3678 (rxr->hn_mbuf_hash & 3679 NDIS_HASH_UDP_IPV4_X)) { 3680 hash_type = 3681 M_HASHTYPE_RSS_UDP_IPV4; 3682 do_lro = 0; 3683 } else if (l4proto != 3684 IPPROTO_TCP) { 3685 hash_type = def_htype; 3686 do_lro = 0; 3687 } 3688 } else { 3689 hash_type = def_htype; 3690 do_lro = 0; 3691 } 3692 } 3693 break; 3694 3695 case NDIS_HASH_IPV6: 3696 hash_type = M_HASHTYPE_RSS_IPV6; 3697 do_lro = 0; 3698 break; 3699 3700 case NDIS_HASH_IPV6_EX: 3701 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3702 do_lro = 0; 3703 break; 3704 3705 case NDIS_HASH_TCP_IPV6: 3706 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3707 break; 3708 3709 case NDIS_HASH_TCP_IPV6_EX: 3710 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3711 break; 3712 } 3713 } 3714 } else if (!is_vf) { 3715 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3716 hash_type = M_HASHTYPE_OPAQUE; 3717 } 3718 M_HASHTYPE_SET(m_new, hash_type); 3719 3720 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3721 if (hn_ifp != ifp) { 3722 const struct ether_header *eh; 3723 3724 /* 3725 * Non-transparent mode VF is activated. 3726 */ 3727 3728 /* 3729 * Allow tapping on hn(4). 3730 */ 3731 ETHER_BPF_MTAP(hn_ifp, m_new); 3732 3733 /* 3734 * Update hn(4)'s stats. 3735 */ 3736 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3737 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3738 /* Checked at the beginning of this function. */ 3739 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3740 eh = mtod(m_new, struct ether_header *); 3741 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3742 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3743 } 3744 rxr->hn_pkts++; 3745 3746 if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { 3747 #if defined(INET) || defined(INET6) 3748 struct lro_ctrl *lro = &rxr->hn_lro; 3749 3750 if (lro->lro_cnt) { 3751 rxr->hn_lro_tried++; 3752 if (hn_lro_rx(lro, m_new) == 0) { 3753 /* DONE! 
*/ 3754 return 0; 3755 } 3756 } 3757 #endif 3758 } 3759 if_input(ifp, m_new); 3760 3761 return (0); 3762 } 3763 3764 static int 3765 hn_ioctl(if_t ifp, u_long cmd, caddr_t data) 3766 { 3767 struct hn_softc *sc = if_getsoftc(ifp); 3768 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3769 if_t vf_ifp; 3770 int mask, error = 0; 3771 struct ifrsskey *ifrk; 3772 struct ifrsshash *ifrh; 3773 uint32_t mtu; 3774 3775 switch (cmd) { 3776 case SIOCSIFMTU: 3777 if (ifr->ifr_mtu > HN_MTU_MAX) { 3778 error = EINVAL; 3779 break; 3780 } 3781 3782 HN_LOCK(sc); 3783 3784 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3785 HN_UNLOCK(sc); 3786 break; 3787 } 3788 3789 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3790 /* Can't change MTU */ 3791 HN_UNLOCK(sc); 3792 error = EOPNOTSUPP; 3793 break; 3794 } 3795 3796 if (if_getmtu(ifp) == ifr->ifr_mtu) { 3797 HN_UNLOCK(sc); 3798 break; 3799 } 3800 3801 if (hn_xpnt_vf_isready(sc)) { 3802 vf_ifp = sc->hn_vf_ifp; 3803 ifr_vf = *ifr; 3804 strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), 3805 sizeof(ifr_vf.ifr_name)); 3806 error = ifhwioctl(SIOCSIFMTU,vf_ifp, 3807 (caddr_t)&ifr_vf, curthread); 3808 if (error) { 3809 HN_UNLOCK(sc); 3810 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3811 if_name(vf_ifp), ifr->ifr_mtu, error); 3812 break; 3813 } 3814 } 3815 3816 /* 3817 * Suspend this interface before the synthetic parts 3818 * are ripped. 3819 */ 3820 hn_suspend(sc); 3821 3822 /* 3823 * Detach the synthetics parts, i.e. NVS and RNDIS. 3824 */ 3825 hn_synth_detach(sc); 3826 3827 /* 3828 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3829 * with the new MTU setting. 3830 */ 3831 error = hn_synth_attach(sc, ifr->ifr_mtu); 3832 if (error) { 3833 HN_UNLOCK(sc); 3834 break; 3835 } 3836 3837 error = hn_rndis_get_mtu(sc, &mtu); 3838 if (error) 3839 mtu = ifr->ifr_mtu; 3840 else if (bootverbose) 3841 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3842 3843 /* 3844 * Commit the requested MTU, after the synthetic parts 3845 * have been successfully attached. 3846 */ 3847 if (mtu >= ifr->ifr_mtu) { 3848 mtu = ifr->ifr_mtu; 3849 } else { 3850 if_printf(ifp, "fixup mtu %d -> %u\n", 3851 ifr->ifr_mtu, mtu); 3852 } 3853 if_setmtu(ifp, mtu); 3854 3855 /* 3856 * Synthetic parts' reattach may change the chimney 3857 * sending size; update it. 3858 */ 3859 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3860 hn_set_chim_size(sc, sc->hn_chim_szmax); 3861 3862 /* 3863 * Make sure that various parameters based on MTU are 3864 * still valid, after the MTU change. 3865 */ 3866 hn_mtu_change_fixup(sc); 3867 3868 /* 3869 * All done! Resume the interface now. 3870 */ 3871 hn_resume(sc); 3872 3873 if ((sc->hn_flags & HN_FLAG_RXVF) || 3874 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3875 /* 3876 * Since we have reattached the NVS part, 3877 * change the datapath to VF again; in case 3878 * that it is lost, after the NVS was detached. 3879 */ 3880 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3881 } 3882 3883 HN_UNLOCK(sc); 3884 break; 3885 3886 case SIOCSIFFLAGS: 3887 HN_LOCK(sc); 3888 3889 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3890 HN_UNLOCK(sc); 3891 break; 3892 } 3893 3894 if (hn_xpnt_vf_isready(sc)) 3895 hn_xpnt_vf_saveifflags(sc); 3896 3897 if (if_getflags(ifp) & IFF_UP) { 3898 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3899 /* 3900 * Caller meight hold mutex, e.g. 3901 * bpf; use busy-wait for the RNDIS 3902 * reply. 
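 * (HN_NO_SLEEPING below switches the RNDIS request wait to busy-wait, so this path stays safe under such a lock.)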
3903 */ 3904 HN_NO_SLEEPING(sc); 3905 hn_rxfilter_config(sc); 3906 HN_SLEEPING_OK(sc); 3907 3908 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3909 error = hn_xpnt_vf_iocsetflags(sc); 3910 } else { 3911 hn_init_locked(sc); 3912 } 3913 } else { 3914 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 3915 hn_stop(sc, false); 3916 } 3917 sc->hn_if_flags = if_getflags(ifp); 3918 3919 HN_UNLOCK(sc); 3920 break; 3921 3922 case SIOCSIFCAP: 3923 HN_LOCK(sc); 3924 3925 if (hn_xpnt_vf_isready(sc)) { 3926 ifr_vf = *ifr; 3927 strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), 3928 sizeof(ifr_vf.ifr_name)); 3929 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3930 HN_UNLOCK(sc); 3931 break; 3932 } 3933 3934 /* 3935 * Fix up requested capabilities w/ supported capabilities, 3936 * since the supported capabilities could have been changed. 3937 */ 3938 mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ 3939 if_getcapenable(ifp); 3940 3941 if (mask & IFCAP_TXCSUM) { 3942 if_togglecapenable(ifp, IFCAP_TXCSUM); 3943 if (if_getcapenable(ifp) & IFCAP_TXCSUM) 3944 if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); 3945 else 3946 if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); 3947 } 3948 if (mask & IFCAP_TXCSUM_IPV6) { 3949 if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); 3950 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) 3951 if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); 3952 else 3953 if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); 3954 } 3955 3956 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3957 if (mask & IFCAP_RXCSUM) 3958 if_togglecapenable(ifp, IFCAP_RXCSUM); 3959 #ifdef foo 3960 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3961 if (mask & IFCAP_RXCSUM_IPV6) 3962 if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); 3963 #endif 3964 3965 if (mask & IFCAP_LRO) 3966 if_togglecapenable(ifp, IFCAP_LRO); 3967 3968 if (mask & IFCAP_TSO4) { 3969 if_togglecapenable(ifp, IFCAP_TSO4); 3970 if (if_getcapenable(ifp) & IFCAP_TSO4) 3971 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 3972 else 3973 if_sethwassistbits(ifp, 0, CSUM_IP_TSO); 3974 } 3975 if (mask & IFCAP_TSO6) { 3976 if_togglecapenable(ifp, IFCAP_TSO6); 3977 if (if_getcapenable(ifp) & IFCAP_TSO6) 3978 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 3979 else 3980 if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); 3981 } 3982 3983 HN_UNLOCK(sc); 3984 break; 3985 3986 case SIOCADDMULTI: 3987 case SIOCDELMULTI: 3988 HN_LOCK(sc); 3989 3990 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3991 HN_UNLOCK(sc); 3992 break; 3993 } 3994 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3995 /* 3996 * Multicast uses mutex; use busy-wait for 3997 * the RNDIS reply. 3998 */ 3999 HN_NO_SLEEPING(sc); 4000 hn_rxfilter_config(sc); 4001 HN_SLEEPING_OK(sc); 4002 } 4003 4004 /* XXX vlan(4) style mcast addr maintenance */ 4005 if (hn_xpnt_vf_isready(sc)) { 4006 int old_if_flags; 4007 4008 old_if_flags = if_getflags(sc->hn_vf_ifp); 4009 hn_xpnt_vf_saveifflags(sc); 4010 4011 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 4012 ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & 4013 IFF_ALLMULTI)) 4014 error = hn_xpnt_vf_iocsetflags(sc); 4015 } 4016 4017 HN_UNLOCK(sc); 4018 break; 4019 4020 case SIOCSIFMEDIA: 4021 case SIOCGIFMEDIA: 4022 HN_LOCK(sc); 4023 if (hn_xpnt_vf_isready(sc)) { 4024 /* 4025 * SIOCGIFMEDIA expects ifmediareq, so don't 4026 * create and pass ifr_vf to the VF here; just 4027 * replace the ifr_name. 
4028 */ 4029 vf_ifp = sc->hn_vf_ifp; 4030 strlcpy(ifr->ifr_name, if_name(vf_ifp), 4031 sizeof(ifr->ifr_name)); 4032 error = ifhwioctl(cmd, vf_ifp, data, curthread); 4033 /* Restore the ifr_name. */ 4034 strlcpy(ifr->ifr_name, if_name(ifp), 4035 sizeof(ifr->ifr_name)); 4036 HN_UNLOCK(sc); 4037 break; 4038 } 4039 HN_UNLOCK(sc); 4040 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4041 break; 4042 4043 case SIOCGIFRSSHASH: 4044 ifrh = (struct ifrsshash *)data; 4045 HN_LOCK(sc); 4046 if (sc->hn_rx_ring_inuse == 1) { 4047 HN_UNLOCK(sc); 4048 ifrh->ifrh_func = RSS_FUNC_NONE; 4049 ifrh->ifrh_types = 0; 4050 break; 4051 } 4052 4053 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4054 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4055 else 4056 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4057 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4058 HN_UNLOCK(sc); 4059 break; 4060 4061 case SIOCGIFRSSKEY: 4062 ifrk = (struct ifrsskey *)data; 4063 HN_LOCK(sc); 4064 if (sc->hn_rx_ring_inuse == 1) { 4065 HN_UNLOCK(sc); 4066 ifrk->ifrk_func = RSS_FUNC_NONE; 4067 ifrk->ifrk_keylen = 0; 4068 break; 4069 } 4070 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4071 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4072 else 4073 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4074 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4075 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4076 NDIS_HASH_KEYSIZE_TOEPLITZ); 4077 HN_UNLOCK(sc); 4078 break; 4079 4080 default: 4081 error = ether_ioctl(ifp, cmd, data); 4082 break; 4083 } 4084 return (error); 4085 } 4086 4087 static void 4088 hn_stop(struct hn_softc *sc, bool detaching) 4089 { 4090 if_t ifp = sc->hn_ifp; 4091 int i; 4092 4093 HN_LOCK_ASSERT(sc); 4094 4095 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4096 ("synthetic parts were not attached")); 4097 4098 /* Clear RUNNING bit ASAP. */ 4099 if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); 4100 4101 /* Disable polling. */ 4102 hn_polling(sc, 0); 4103 4104 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4105 KASSERT(sc->hn_vf_ifp != NULL, 4106 ("%s: VF is not attached", if_name(ifp))); 4107 4108 /* Mark transparent mode VF as disabled. */ 4109 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4110 4111 /* 4112 * NOTE: 4113 * Datapath setting must happen _before_ bringing 4114 * the VF down. 4115 */ 4116 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4117 4118 /* 4119 * Bring the VF down. 4120 */ 4121 hn_xpnt_vf_saveifflags(sc); 4122 if_setflagbits(ifp, 0, IFF_UP); 4123 hn_xpnt_vf_iocsetflags(sc); 4124 } 4125 4126 /* Suspend data transfers. */ 4127 hn_suspend_data(sc); 4128 4129 /* Clear OACTIVE bit. */ 4130 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4131 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4132 sc->hn_tx_ring[i].hn_oactive = 0; 4133 4134 /* 4135 * If the non-transparent mode VF is active, make sure 4136 * that the RX filter still allows packet reception. 4137 */ 4138 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4139 hn_rxfilter_config(sc); 4140 } 4141 4142 static void 4143 hn_init_locked(struct hn_softc *sc) 4144 { 4145 if_t ifp = sc->hn_ifp; 4146 int i; 4147 4148 HN_LOCK_ASSERT(sc); 4149 4150 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4151 return; 4152 4153 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 4154 return; 4155 4156 /* Configure RX filter */ 4157 hn_rxfilter_config(sc); 4158 4159 /* Clear OACTIVE bit. */ 4160 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4161 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4162 sc->hn_tx_ring[i].hn_oactive = 0; 4163 4164 /* Clear TX 'suspended' bit. 
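 * The bit is cleared on all TX rings currently in use;
 * transmission actually starts once IFF_DRV_RUNNING is
 * set below.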
*/ 4165 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4166 4167 if (hn_xpnt_vf_isready(sc)) { 4168 /* Initialize transparent VF. */ 4169 hn_xpnt_vf_init(sc); 4170 } 4171 4172 /* Everything is ready; unleash! */ 4173 if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); 4174 4175 /* Re-enable polling if requested. */ 4176 if (sc->hn_pollhz > 0) 4177 hn_polling(sc, sc->hn_pollhz); 4178 } 4179 4180 static void 4181 hn_init(void *xsc) 4182 { 4183 struct hn_softc *sc = xsc; 4184 4185 HN_LOCK(sc); 4186 hn_init_locked(sc); 4187 HN_UNLOCK(sc); 4188 } 4189 4190 static int 4191 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4192 { 4193 struct hn_softc *sc = arg1; 4194 unsigned int lenlim; 4195 int error; 4196 4197 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4198 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4199 if (error || req->newptr == NULL) 4200 return error; 4201 4202 HN_LOCK(sc); 4203 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4204 lenlim > TCP_LRO_LENGTH_MAX) { 4205 HN_UNLOCK(sc); 4206 return EINVAL; 4207 } 4208 hn_set_lro_lenlim(sc, lenlim); 4209 HN_UNLOCK(sc); 4210 4211 return 0; 4212 } 4213 4214 static int 4215 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4216 { 4217 struct hn_softc *sc = arg1; 4218 int ackcnt, error, i; 4219 4220 /* 4221 * lro_ackcnt_lim is append count limit, 4222 * +1 to turn it into aggregation limit. 4223 */ 4224 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4225 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4226 if (error || req->newptr == NULL) 4227 return error; 4228 4229 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4230 return EINVAL; 4231 4232 /* 4233 * Convert aggregation limit back to append 4234 * count limit. 4235 */ 4236 --ackcnt; 4237 HN_LOCK(sc); 4238 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4239 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4240 HN_UNLOCK(sc); 4241 return 0; 4242 } 4243 4244 static int 4245 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4246 { 4247 struct hn_softc *sc = arg1; 4248 int hcsum = arg2; 4249 int on, error, i; 4250 4251 on = 0; 4252 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4253 on = 1; 4254 4255 error = sysctl_handle_int(oidp, &on, 0, req); 4256 if (error || req->newptr == NULL) 4257 return error; 4258 4259 HN_LOCK(sc); 4260 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4261 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4262 4263 if (on) 4264 rxr->hn_trust_hcsum |= hcsum; 4265 else 4266 rxr->hn_trust_hcsum &= ~hcsum; 4267 } 4268 HN_UNLOCK(sc); 4269 return 0; 4270 } 4271 4272 static int 4273 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4274 { 4275 struct hn_softc *sc = arg1; 4276 int chim_size, error; 4277 4278 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4279 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4280 if (error || req->newptr == NULL) 4281 return error; 4282 4283 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4284 return EINVAL; 4285 4286 HN_LOCK(sc); 4287 hn_set_chim_size(sc, chim_size); 4288 HN_UNLOCK(sc); 4289 return 0; 4290 } 4291 4292 static int 4293 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4294 { 4295 struct hn_softc *sc = arg1; 4296 int ofs = arg2, i, error; 4297 struct hn_rx_ring *rxr; 4298 uint64_t stat; 4299 4300 stat = 0; 4301 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4302 rxr = &sc->hn_rx_ring[i]; 4303 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4304 } 4305 4306 error = sysctl_handle_64(oidp, &stat, 0, req); 4307 if (error || req->newptr == NULL) 4308 return error; 4309 4310 /* Zero out this stat. 
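 *
 * Reaching here means the node was written to
 * (req->newptr != NULL): writing any value, e.g.
 * sysctl dev.hn.0.lro_queued=0, clears this counter on
 * every RX ring.  The aggregated value was already
 * copied out above.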
*/ 4311 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4312 rxr = &sc->hn_rx_ring[i]; 4313 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4314 } 4315 return 0; 4316 } 4317 4318 static int 4319 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4320 { 4321 struct hn_softc *sc = arg1; 4322 int ofs = arg2, i, error; 4323 struct hn_rx_ring *rxr; 4324 u_long stat; 4325 4326 stat = 0; 4327 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4328 rxr = &sc->hn_rx_ring[i]; 4329 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4330 } 4331 4332 error = sysctl_handle_long(oidp, &stat, 0, req); 4333 if (error || req->newptr == NULL) 4334 return error; 4335 4336 /* Zero out this stat. */ 4337 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4338 rxr = &sc->hn_rx_ring[i]; 4339 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4340 } 4341 return 0; 4342 } 4343 4344 static int 4345 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4346 { 4347 struct hn_softc *sc = arg1; 4348 int ofs = arg2, i, error; 4349 struct hn_tx_ring *txr; 4350 u_long stat; 4351 4352 stat = 0; 4353 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4354 txr = &sc->hn_tx_ring[i]; 4355 stat += *((u_long *)((uint8_t *)txr + ofs)); 4356 } 4357 4358 error = sysctl_handle_long(oidp, &stat, 0, req); 4359 if (error || req->newptr == NULL) 4360 return error; 4361 4362 /* Zero out this stat. */ 4363 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4364 txr = &sc->hn_tx_ring[i]; 4365 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4366 } 4367 return 0; 4368 } 4369 4370 static int 4371 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4372 { 4373 struct hn_softc *sc = arg1; 4374 int ofs = arg2, i, error, conf; 4375 struct hn_tx_ring *txr; 4376 4377 txr = &sc->hn_tx_ring[0]; 4378 conf = *((int *)((uint8_t *)txr + ofs)); 4379 4380 error = sysctl_handle_int(oidp, &conf, 0, req); 4381 if (error || req->newptr == NULL) 4382 return error; 4383 4384 HN_LOCK(sc); 4385 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4386 txr = &sc->hn_tx_ring[i]; 4387 *((int *)((uint8_t *)txr + ofs)) = conf; 4388 } 4389 HN_UNLOCK(sc); 4390 4391 return 0; 4392 } 4393 4394 static int 4395 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4396 { 4397 struct hn_softc *sc = arg1; 4398 int error, size; 4399 4400 size = sc->hn_agg_size; 4401 error = sysctl_handle_int(oidp, &size, 0, req); 4402 if (error || req->newptr == NULL) 4403 return (error); 4404 4405 HN_LOCK(sc); 4406 sc->hn_agg_size = size; 4407 hn_set_txagg(sc); 4408 HN_UNLOCK(sc); 4409 4410 return (0); 4411 } 4412 4413 static int 4414 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4415 { 4416 struct hn_softc *sc = arg1; 4417 int error, pkts; 4418 4419 pkts = sc->hn_agg_pkts; 4420 error = sysctl_handle_int(oidp, &pkts, 0, req); 4421 if (error || req->newptr == NULL) 4422 return (error); 4423 4424 HN_LOCK(sc); 4425 sc->hn_agg_pkts = pkts; 4426 hn_set_txagg(sc); 4427 HN_UNLOCK(sc); 4428 4429 return (0); 4430 } 4431 4432 static int 4433 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4434 { 4435 struct hn_softc *sc = arg1; 4436 int pkts; 4437 4438 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4439 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4440 } 4441 4442 static int 4443 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4444 { 4445 struct hn_softc *sc = arg1; 4446 int align; 4447 4448 align = sc->hn_tx_ring[0].hn_agg_align; 4449 return (sysctl_handle_int(oidp, &align, 0, req)); 4450 } 4451 4452 static void 4453 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4454 { 4455 if (pollhz == 0) 4456 vmbus_chan_poll_disable(chan); 4457 else 4458 vmbus_chan_poll_enable(chan, pollhz); 4459 } 4460 4461 static void 4462 
hn_polling(struct hn_softc *sc, u_int pollhz) 4463 { 4464 int nsubch = sc->hn_rx_ring_inuse - 1; 4465 4466 HN_LOCK_ASSERT(sc); 4467 4468 if (nsubch > 0) { 4469 struct vmbus_channel **subch; 4470 int i; 4471 4472 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4473 for (i = 0; i < nsubch; ++i) 4474 hn_chan_polling(subch[i], pollhz); 4475 vmbus_subchan_rel(subch, nsubch); 4476 } 4477 hn_chan_polling(sc->hn_prichan, pollhz); 4478 } 4479 4480 static int 4481 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4482 { 4483 struct hn_softc *sc = arg1; 4484 int pollhz, error; 4485 4486 pollhz = sc->hn_pollhz; 4487 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4488 if (error || req->newptr == NULL) 4489 return (error); 4490 4491 if (pollhz != 0 && 4492 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4493 return (EINVAL); 4494 4495 HN_LOCK(sc); 4496 if (sc->hn_pollhz != pollhz) { 4497 sc->hn_pollhz = pollhz; 4498 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && 4499 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4500 hn_polling(sc, sc->hn_pollhz); 4501 } 4502 HN_UNLOCK(sc); 4503 4504 return (0); 4505 } 4506 4507 static int 4508 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4509 { 4510 struct hn_softc *sc = arg1; 4511 char verstr[16]; 4512 4513 snprintf(verstr, sizeof(verstr), "%u.%u", 4514 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4515 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4516 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4517 } 4518 4519 static int 4520 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4521 { 4522 struct hn_softc *sc = arg1; 4523 char caps_str[128]; 4524 uint32_t caps; 4525 4526 HN_LOCK(sc); 4527 caps = sc->hn_caps; 4528 HN_UNLOCK(sc); 4529 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4530 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4531 } 4532 4533 static int 4534 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4535 { 4536 struct hn_softc *sc = arg1; 4537 char assist_str[128]; 4538 uint32_t hwassist; 4539 4540 HN_LOCK(sc); 4541 hwassist = if_gethwassist(sc->hn_ifp); 4542 HN_UNLOCK(sc); 4543 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4544 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4545 } 4546 4547 static int 4548 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4549 { 4550 struct hn_softc *sc = arg1; 4551 char filter_str[128]; 4552 uint32_t filter; 4553 4554 HN_LOCK(sc); 4555 filter = sc->hn_rx_filter; 4556 HN_UNLOCK(sc); 4557 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4558 NDIS_PACKET_TYPES); 4559 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4560 } 4561 4562 static int 4563 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4564 { 4565 struct hn_softc *sc = arg1; 4566 uint32_t mtu; 4567 int error; 4568 HN_LOCK(sc); 4569 error = hn_rndis_get_mtu(sc, &mtu); 4570 if (error) { 4571 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4572 goto back; 4573 } 4574 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4575 if (error || req->newptr == NULL) 4576 goto back; 4577 4578 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4579 if (error) 4580 goto back; 4581 error = hn_rndis_reconf_offload(sc, mtu); 4582 back: 4583 HN_UNLOCK(sc); 4584 return (error); 4585 } 4586 #ifndef RSS 4587 4588 static int 4589 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4590 { 4591 struct hn_softc *sc = arg1; 4592 int error; 4593 4594 HN_LOCK(sc); 4595 4596 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4597 if (error || req->newptr == NULL) 
4598 goto back; 4599 4600 if ((sc->hn_flags & HN_FLAG_RXVF) || 4601 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4602 /* 4603 * RSS key is synchronized w/ VF's, don't allow users 4604 * to change it. 4605 */ 4606 error = EBUSY; 4607 goto back; 4608 } 4609 4610 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4611 if (error) 4612 goto back; 4613 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4614 4615 if (sc->hn_rx_ring_inuse > 1) { 4616 error = hn_rss_reconfig(sc); 4617 } else { 4618 /* Not RSS capable, at least for now; just save the RSS key. */ 4619 error = 0; 4620 } 4621 back: 4622 HN_UNLOCK(sc); 4623 return (error); 4624 } 4625 4626 static int 4627 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4628 { 4629 struct hn_softc *sc = arg1; 4630 int error; 4631 4632 HN_LOCK(sc); 4633 4634 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4635 if (error || req->newptr == NULL) 4636 goto back; 4637 4638 /* 4639 * Don't allow RSS indirect table change, if this interface is not 4640 * RSS capable currently. 4641 */ 4642 if (sc->hn_rx_ring_inuse == 1) { 4643 error = EOPNOTSUPP; 4644 goto back; 4645 } 4646 4647 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4648 if (error) 4649 goto back; 4650 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4651 4652 hn_rss_ind_fixup(sc); 4653 error = hn_rss_reconfig(sc); 4654 back: 4655 HN_UNLOCK(sc); 4656 return (error); 4657 } 4658 4659 #endif /* !RSS */ 4660 4661 static int 4662 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4663 { 4664 struct hn_softc *sc = arg1; 4665 char hash_str[128]; 4666 uint32_t hash; 4667 4668 HN_LOCK(sc); 4669 hash = sc->hn_rss_hash; 4670 HN_UNLOCK(sc); 4671 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4672 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4673 } 4674 4675 static int 4676 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4677 { 4678 struct hn_softc *sc = arg1; 4679 char hash_str[128]; 4680 uint32_t hash; 4681 4682 HN_LOCK(sc); 4683 hash = sc->hn_rss_hcap; 4684 HN_UNLOCK(sc); 4685 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4686 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4687 } 4688 4689 static int 4690 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4691 { 4692 struct hn_softc *sc = arg1; 4693 char hash_str[128]; 4694 uint32_t hash; 4695 4696 HN_LOCK(sc); 4697 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4698 HN_UNLOCK(sc); 4699 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4700 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4701 } 4702 4703 static int 4704 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4705 { 4706 struct hn_softc *sc = arg1; 4707 char vf_name[IFNAMSIZ + 1]; 4708 if_t vf_ifp; 4709 4710 HN_LOCK(sc); 4711 vf_name[0] = '\0'; 4712 vf_ifp = sc->hn_vf_ifp; 4713 if (vf_ifp != NULL) 4714 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4715 HN_UNLOCK(sc); 4716 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4717 } 4718 4719 static int 4720 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4721 { 4722 struct hn_softc *sc = arg1; 4723 char vf_name[IFNAMSIZ + 1]; 4724 if_t vf_ifp; 4725 4726 HN_LOCK(sc); 4727 vf_name[0] = '\0'; 4728 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4729 if (vf_ifp != NULL) 4730 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4731 HN_UNLOCK(sc); 4732 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4733 } 4734 4735 static int 4736 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4737 { 4738 struct rm_priotracker pt; 4739 struct sbuf *sb; 4740 
int error, i; 4741 bool first; 4742 4743 error = sysctl_wire_old_buffer(req, 0); 4744 if (error != 0) 4745 return (error); 4746 4747 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4748 if (sb == NULL) 4749 return (ENOMEM); 4750 4751 rm_rlock(&hn_vfmap_lock, &pt); 4752 4753 first = true; 4754 for (i = 0; i < hn_vfmap_size; ++i) { 4755 struct epoch_tracker et; 4756 if_t ifp; 4757 4758 if (hn_vfmap[i] == NULL) 4759 continue; 4760 4761 NET_EPOCH_ENTER(et); 4762 ifp = ifnet_byindex(i); 4763 if (ifp != NULL) { 4764 if (first) 4765 sbuf_printf(sb, "%s", if_name(ifp)); 4766 else 4767 sbuf_printf(sb, " %s", if_name(ifp)); 4768 first = false; 4769 } 4770 NET_EPOCH_EXIT(et); 4771 } 4772 4773 rm_runlock(&hn_vfmap_lock, &pt); 4774 4775 error = sbuf_finish(sb); 4776 sbuf_delete(sb); 4777 return (error); 4778 } 4779 4780 static int 4781 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4782 { 4783 struct rm_priotracker pt; 4784 struct sbuf *sb; 4785 int error, i; 4786 bool first; 4787 4788 error = sysctl_wire_old_buffer(req, 0); 4789 if (error != 0) 4790 return (error); 4791 4792 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4793 if (sb == NULL) 4794 return (ENOMEM); 4795 4796 rm_rlock(&hn_vfmap_lock, &pt); 4797 4798 first = true; 4799 for (i = 0; i < hn_vfmap_size; ++i) { 4800 struct epoch_tracker et; 4801 if_t ifp, hn_ifp; 4802 4803 hn_ifp = hn_vfmap[i]; 4804 if (hn_ifp == NULL) 4805 continue; 4806 4807 NET_EPOCH_ENTER(et); 4808 ifp = ifnet_byindex(i); 4809 if (ifp != NULL) { 4810 if (first) { 4811 sbuf_printf(sb, "%s:%s", if_name(ifp), 4812 if_name(hn_ifp)); 4813 } else { 4814 sbuf_printf(sb, " %s:%s", if_name(ifp), 4815 if_name(hn_ifp)); 4816 } 4817 first = false; 4818 } 4819 NET_EPOCH_EXIT(et); 4820 } 4821 4822 rm_runlock(&hn_vfmap_lock, &pt); 4823 4824 error = sbuf_finish(sb); 4825 sbuf_delete(sb); 4826 return (error); 4827 } 4828 4829 static int 4830 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4831 { 4832 struct hn_softc *sc = arg1; 4833 int error, onoff = 0; 4834 4835 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4836 onoff = 1; 4837 error = sysctl_handle_int(oidp, &onoff, 0, req); 4838 if (error || req->newptr == NULL) 4839 return (error); 4840 4841 HN_LOCK(sc); 4842 /* NOTE: hn_vf_lock for hn_transmit() */ 4843 rm_wlock(&sc->hn_vf_lock); 4844 if (onoff) 4845 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4846 else 4847 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4848 rm_wunlock(&sc->hn_vf_lock); 4849 HN_UNLOCK(sc); 4850 4851 return (0); 4852 } 4853 4854 static int 4855 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4856 { 4857 struct hn_softc *sc = arg1; 4858 int enabled = 0; 4859 4860 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4861 enabled = 1; 4862 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4863 } 4864 4865 static int 4866 hn_check_iplen(const struct mbuf *m, int hoff) 4867 { 4868 const struct ip *ip; 4869 int len, iphlen, iplen; 4870 const struct tcphdr *th; 4871 int thoff; /* TCP data offset */ 4872 4873 len = hoff + sizeof(struct ip); 4874 4875 /* The packet must be at least the size of an IP header. */ 4876 if (m->m_pkthdr.len < len) 4877 return IPPROTO_DONE; 4878 4879 /* The fixed IP header must reside completely in the first mbuf. */ 4880 if (m->m_len < len) 4881 return IPPROTO_DONE; 4882 4883 ip = mtodo(m, hoff); 4884 4885 /* Bound check the packet's stated IP header length. */ 4886 iphlen = ip->ip_hl << 2; 4887 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4888 return IPPROTO_DONE; 4889 4890 /* The full IP header must reside completely in the one mbuf. 
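 *
 * For example, with hoff == 14 (an untagged Ethernet
 * header) and a 20-byte IPv4 header without options
 * (ip_hl == 5, so iphlen == 5 << 2 == 20), the first
 * mbuf must hold at least 34 bytes.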
*/ 4891 if (m->m_len < hoff + iphlen) 4892 return IPPROTO_DONE; 4893 4894 iplen = ntohs(ip->ip_len); 4895 4896 /* 4897 * Check that the amount of data in the buffers is as 4898 * at least much as the IP header would have us expect. 4899 */ 4900 if (m->m_pkthdr.len < hoff + iplen) 4901 return IPPROTO_DONE; 4902 4903 /* 4904 * Ignore IP fragments. 4905 */ 4906 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4907 return IPPROTO_DONE; 4908 4909 /* 4910 * The TCP/IP or UDP/IP header must be entirely contained within 4911 * the first fragment of a packet. 4912 */ 4913 switch (ip->ip_p) { 4914 case IPPROTO_TCP: 4915 if (iplen < iphlen + sizeof(struct tcphdr)) 4916 return IPPROTO_DONE; 4917 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4918 return IPPROTO_DONE; 4919 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4920 thoff = th->th_off << 2; 4921 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4922 return IPPROTO_DONE; 4923 if (m->m_len < hoff + iphlen + thoff) 4924 return IPPROTO_DONE; 4925 break; 4926 case IPPROTO_UDP: 4927 if (iplen < iphlen + sizeof(struct udphdr)) 4928 return IPPROTO_DONE; 4929 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4930 return IPPROTO_DONE; 4931 break; 4932 default: 4933 if (iplen < iphlen) 4934 return IPPROTO_DONE; 4935 break; 4936 } 4937 return ip->ip_p; 4938 } 4939 4940 static void 4941 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4942 { 4943 const struct ether_header *eh; 4944 uint16_t etype; 4945 int hoff; 4946 4947 hoff = sizeof(*eh); 4948 /* Checked at the beginning of this function. */ 4949 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4950 4951 eh = mtod(m_new, const struct ether_header *); 4952 etype = ntohs(eh->ether_type); 4953 if (etype == ETHERTYPE_VLAN) { 4954 const struct ether_vlan_header *evl; 4955 4956 hoff = sizeof(*evl); 4957 if (m_new->m_len < hoff) 4958 return; 4959 evl = mtod(m_new, const struct ether_vlan_header *); 4960 etype = ntohs(evl->evl_proto); 4961 } 4962 *l3proto = etype; 4963 4964 if (etype == ETHERTYPE_IP) 4965 *l4proto = hn_check_iplen(m_new, hoff); 4966 else 4967 *l4proto = IPPROTO_DONE; 4968 } 4969 4970 static int 4971 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4972 { 4973 struct sysctl_oid_list *child; 4974 struct sysctl_ctx_list *ctx; 4975 device_t dev = sc->hn_dev; 4976 #if defined(INET) || defined(INET6) 4977 int lroent_cnt; 4978 #endif 4979 int i; 4980 4981 /* 4982 * Create RXBUF for reception. 4983 * 4984 * NOTE: 4985 * - It is shared by all channels. 4986 * - A large enough buffer is allocated, certain version of NVSes 4987 * may further limit the usable space. 
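 * - It is handed over to the host, so it must be
 *   physically contiguous and page aligned; hence the
 *   contigmalloc() with PAGE_SIZE alignment below.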
4988 */ 4989 sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 4990 0ul, ~0ul, PAGE_SIZE, 0); 4991 if (sc->hn_rxbuf == NULL) { 4992 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4993 return (ENOMEM); 4994 } 4995 4996 sc->hn_rx_ring_cnt = ring_cnt; 4997 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4998 4999 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 5000 M_DEVBUF, M_WAITOK | M_ZERO); 5001 5002 #if defined(INET) || defined(INET6) 5003 lroent_cnt = hn_lro_entry_count; 5004 if (lroent_cnt < TCP_LRO_ENTRIES) 5005 lroent_cnt = TCP_LRO_ENTRIES; 5006 if (bootverbose) 5007 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 5008 #endif /* INET || INET6 */ 5009 5010 ctx = device_get_sysctl_ctx(dev); 5011 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 5012 5013 /* Create dev.hn.UNIT.rx sysctl tree */ 5014 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 5015 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5016 5017 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5018 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5019 5020 rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF, 5021 M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); 5022 if (rxr->hn_br == NULL) { 5023 device_printf(dev, "allocate bufring failed\n"); 5024 return (ENOMEM); 5025 } 5026 5027 if (hn_trust_hosttcp) 5028 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5029 if (hn_trust_hostudp) 5030 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5031 if (hn_trust_hostip) 5032 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5033 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5034 rxr->hn_ifp = sc->hn_ifp; 5035 if (i < sc->hn_tx_ring_cnt) 5036 rxr->hn_txr = &sc->hn_tx_ring[i]; 5037 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5038 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5039 rxr->hn_rx_idx = i; 5040 rxr->hn_rxbuf = sc->hn_rxbuf; 5041 5042 /* 5043 * Initialize LRO. 
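 *
 * Each RX ring gets its own LRO state.  The per-ring
 * length and ACK-count limits start from the driver
 * defaults and can be tuned via the lro_length_lim and
 * lro_ackcnt_lim sysctls.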
5044 */ 5045 #if defined(INET) || defined(INET6) 5046 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5047 hn_lro_mbufq_depth); 5048 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5049 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5050 #endif /* INET || INET6 */ 5051 5052 if (sc->hn_rx_sysctl_tree != NULL) { 5053 char name[16]; 5054 5055 /* 5056 * Create per RX ring sysctl tree: 5057 * dev.hn.UNIT.rx.RINGID 5058 */ 5059 snprintf(name, sizeof(name), "%d", i); 5060 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5061 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5062 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5063 5064 if (rxr->hn_rx_sysctl_tree != NULL) { 5065 SYSCTL_ADD_ULONG(ctx, 5066 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5067 OID_AUTO, "packets", 5068 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5069 "# of packets received"); 5070 SYSCTL_ADD_ULONG(ctx, 5071 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5072 OID_AUTO, "rss_pkts", 5073 CTLFLAG_RW | CTLFLAG_STATS, 5074 &rxr->hn_rss_pkts, 5075 "# of packets w/ RSS info received"); 5076 SYSCTL_ADD_ULONG(ctx, 5077 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5078 OID_AUTO, "rsc_pkts", 5079 CTLFLAG_RW | CTLFLAG_STATS, 5080 &rxr->hn_rsc_pkts, 5081 "# of RSC packets received"); 5082 SYSCTL_ADD_ULONG(ctx, 5083 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5084 OID_AUTO, "rsc_drop", 5085 CTLFLAG_RW | CTLFLAG_STATS, 5086 &rxr->hn_rsc_drop, 5087 "# of RSC fragments dropped"); 5088 SYSCTL_ADD_INT(ctx, 5089 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5090 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5091 &rxr->hn_pktbuf_len, 0, 5092 "Temporary channel packet buffer length"); 5093 } 5094 } 5095 } 5096 5097 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5098 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5099 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5100 hn_rx_stat_u64_sysctl, 5101 "LU", "LRO queued"); 5102 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5103 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5104 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5105 hn_rx_stat_u64_sysctl, 5106 "LU", "LRO flushed"); 5107 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5108 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5109 __offsetof(struct hn_rx_ring, hn_lro_tried), 5110 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5112 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5113 hn_lro_lenlim_sysctl, "IU", 5114 "Max # of data bytes to be aggregated by LRO"); 5115 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5116 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5117 hn_lro_ackcnt_sysctl, "I", 5118 "Max # of ACKs to be aggregated by LRO"); 5119 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5121 hn_trust_hcsum_sysctl, "I", 5122 "Trust tcp segment verification on host side, " 5123 "when csum info is missing"); 5124 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5125 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5126 hn_trust_hcsum_sysctl, "I", 5127 "Trust udp datagram verification on host side, " 5128 "when csum info is missing"); 5129 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5130 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5131 hn_trust_hcsum_sysctl, "I", 5132 "Trust ip packet verification on host side, " 5133 "when csum info is missing"); 5134 SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "csum_ip", 5135 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5136 __offsetof(struct hn_rx_ring, hn_csum_ip), 5137 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5138 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5139 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5140 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5141 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5142 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5143 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5144 __offsetof(struct hn_rx_ring, hn_csum_udp), 5145 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5146 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5147 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5148 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5149 hn_rx_stat_ulong_sysctl, "LU", 5150 "# of packets that we trust host's csum verification"); 5151 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5152 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5153 __offsetof(struct hn_rx_ring, hn_small_pkts), 5154 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5155 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5156 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5157 __offsetof(struct hn_rx_ring, hn_ack_failed), 5158 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5159 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5160 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5161 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5162 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5163 5164 return (0); 5165 } 5166 5167 static void 5168 hn_destroy_rx_data(struct hn_softc *sc) 5169 { 5170 int i; 5171 5172 if (sc->hn_rxbuf != NULL) { 5173 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5174 contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF); 5175 else 5176 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5177 sc->hn_rxbuf = NULL; 5178 } 5179 5180 if (sc->hn_rx_ring_cnt == 0) 5181 return; 5182 5183 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5184 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5185 5186 if (rxr->hn_br == NULL) 5187 continue; 5188 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5189 contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE, 5190 M_DEVBUF); 5191 } else { 5192 device_printf(sc->hn_dev, 5193 "%dth channel bufring is referenced", i); 5194 } 5195 rxr->hn_br = NULL; 5196 5197 #if defined(INET) || defined(INET6) 5198 tcp_lro_free(&rxr->hn_lro); 5199 #endif 5200 free(rxr->hn_pktbuf, M_DEVBUF); 5201 } 5202 free(sc->hn_rx_ring, M_DEVBUF); 5203 sc->hn_rx_ring = NULL; 5204 5205 sc->hn_rx_ring_cnt = 0; 5206 sc->hn_rx_ring_inuse = 0; 5207 } 5208 5209 static int 5210 hn_tx_ring_create(struct hn_softc *sc, int id) 5211 { 5212 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5213 device_t dev = sc->hn_dev; 5214 bus_dma_tag_t parent_dtag; 5215 int error, i; 5216 5217 txr->hn_sc = sc; 5218 txr->hn_tx_idx = id; 5219 5220 #ifndef HN_USE_TXDESC_BUFRING 5221 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5222 #endif 5223 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5224 5225 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5226 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5227 M_DEVBUF, M_WAITOK | M_ZERO); 5228 #ifndef HN_USE_TXDESC_BUFRING 5229 SLIST_INIT(&txr->hn_txlist); 5230 #else 5231 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5232 M_WAITOK, &txr->hn_tx_lock); 5233 #endif 5234 5235 if (hn_tx_taskq_mode 
== HN_TX_TASKQ_M_EVTTQ) { 5236 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5237 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5238 } else { 5239 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5240 } 5241 5242 #ifdef HN_IFSTART_SUPPORT 5243 if (hn_use_if_start) { 5244 txr->hn_txeof = hn_start_txeof; 5245 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5246 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5247 } else 5248 #endif 5249 { 5250 int br_depth; 5251 5252 txr->hn_txeof = hn_xmit_txeof; 5253 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5254 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5255 5256 br_depth = hn_get_txswq_depth(txr); 5257 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5258 M_WAITOK, &txr->hn_tx_lock); 5259 } 5260 5261 txr->hn_direct_tx_size = hn_direct_tx_size; 5262 5263 /* 5264 * Always schedule transmission instead of trying to do direct 5265 * transmission. This one gives the best performance so far. 5266 */ 5267 txr->hn_sched_tx = 1; 5268 5269 parent_dtag = bus_get_dma_tag(dev); 5270 5271 /* DMA tag for RNDIS packet messages. */ 5272 error = bus_dma_tag_create(parent_dtag, /* parent */ 5273 HN_RNDIS_PKT_ALIGN, /* alignment */ 5274 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5275 BUS_SPACE_MAXADDR, /* lowaddr */ 5276 BUS_SPACE_MAXADDR, /* highaddr */ 5277 NULL, NULL, /* filter, filterarg */ 5278 HN_RNDIS_PKT_LEN, /* maxsize */ 5279 1, /* nsegments */ 5280 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5281 0, /* flags */ 5282 NULL, /* lockfunc */ 5283 NULL, /* lockfuncarg */ 5284 &txr->hn_tx_rndis_dtag); 5285 if (error) { 5286 device_printf(dev, "failed to create rndis dmatag\n"); 5287 return error; 5288 } 5289 5290 /* DMA tag for data. */ 5291 error = bus_dma_tag_create(parent_dtag, /* parent */ 5292 1, /* alignment */ 5293 HN_TX_DATA_BOUNDARY, /* boundary */ 5294 BUS_SPACE_MAXADDR, /* lowaddr */ 5295 BUS_SPACE_MAXADDR, /* highaddr */ 5296 NULL, NULL, /* filter, filterarg */ 5297 HN_TX_DATA_MAXSIZE, /* maxsize */ 5298 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5299 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5300 0, /* flags */ 5301 NULL, /* lockfunc */ 5302 NULL, /* lockfuncarg */ 5303 &txr->hn_tx_data_dtag); 5304 if (error) { 5305 device_printf(dev, "failed to create data dmatag\n"); 5306 return error; 5307 } 5308 5309 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5310 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5311 5312 txd->txr = txr; 5313 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5314 STAILQ_INIT(&txd->agg_list); 5315 5316 /* 5317 * Allocate and load RNDIS packet message. 5318 */ 5319 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5320 (void **)&txd->rndis_pkt, 5321 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5322 &txd->rndis_pkt_dmap); 5323 if (error) { 5324 device_printf(dev, 5325 "failed to allocate rndis_packet_msg, %d\n", i); 5326 return error; 5327 } 5328 5329 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5330 txd->rndis_pkt_dmap, 5331 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5332 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5333 BUS_DMA_NOWAIT); 5334 if (error) { 5335 device_printf(dev, 5336 "failed to load rndis_packet_msg, %d\n", i); 5337 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5338 txd->rndis_pkt, txd->rndis_pkt_dmap); 5339 return error; 5340 } 5341 5342 /* DMA map for TX data. 
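 *
 * Unlike the RNDIS packet message above, only an empty
 * DMA map is created here; it is loaded with the mbuf
 * chain of each packet at transmit time.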
*/ 5343 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5344 &txd->data_dmap); 5345 if (error) { 5346 device_printf(dev, 5347 "failed to allocate tx data dmamap\n"); 5348 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5349 txd->rndis_pkt_dmap); 5350 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5351 txd->rndis_pkt, txd->rndis_pkt_dmap); 5352 return error; 5353 } 5354 5355 /* All set, put it to list */ 5356 txd->flags |= HN_TXD_FLAG_ONLIST; 5357 #ifndef HN_USE_TXDESC_BUFRING 5358 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5359 #else 5360 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5361 #endif 5362 } 5363 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5364 5365 if (sc->hn_tx_sysctl_tree != NULL) { 5366 struct sysctl_oid_list *child; 5367 struct sysctl_ctx_list *ctx; 5368 char name[16]; 5369 5370 /* 5371 * Create per TX ring sysctl tree: 5372 * dev.hn.UNIT.tx.RINGID 5373 */ 5374 ctx = device_get_sysctl_ctx(dev); 5375 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5376 5377 snprintf(name, sizeof(name), "%d", id); 5378 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5379 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5380 5381 if (txr->hn_tx_sysctl_tree != NULL) { 5382 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5383 5384 #ifdef HN_DEBUG 5385 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5386 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5387 "# of available TX descs"); 5388 #endif 5389 #ifdef HN_IFSTART_SUPPORT 5390 if (!hn_use_if_start) 5391 #endif 5392 { 5393 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5394 CTLFLAG_RD, &txr->hn_oactive, 0, 5395 "over active"); 5396 } 5397 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5398 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5399 "# of packets transmitted"); 5400 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5401 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5402 "# of sends"); 5403 } 5404 } 5405 5406 return 0; 5407 } 5408 5409 static void 5410 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5411 { 5412 struct hn_tx_ring *txr = txd->txr; 5413 5414 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5415 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5416 5417 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5418 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5419 txd->rndis_pkt_dmap); 5420 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5421 } 5422 5423 static void 5424 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5425 { 5426 5427 KASSERT(txd->refs == 0 || txd->refs == 1, 5428 ("invalid txd refs %d", txd->refs)); 5429 5430 /* Aggregated txds will be freed by their aggregating txd. */ 5431 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5432 int freed __diagused; 5433 5434 freed = hn_txdesc_put(txr, txd); 5435 KASSERT(freed, ("can't free txdesc")); 5436 } 5437 } 5438 5439 static void 5440 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5441 { 5442 int i; 5443 5444 if (txr->hn_txdesc == NULL) 5445 return; 5446 5447 /* 5448 * NOTE: 5449 * Because the freeing of aggregated txds will be deferred 5450 * to the aggregating txd, two passes are used here: 5451 * - The first pass GCes any pending txds. This GC is necessary, 5452 * since if the channels are revoked, hypervisor will not 5453 * deliver send-done for all pending txds. 5454 * - The second pass frees the busdma stuffs, i.e. after all txds 5455 * were freed. 
5456 */ 5457 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5458 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5459 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5460 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5461 5462 if (txr->hn_tx_data_dtag != NULL) 5463 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5464 if (txr->hn_tx_rndis_dtag != NULL) 5465 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5466 5467 #ifdef HN_USE_TXDESC_BUFRING 5468 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5469 #endif 5470 5471 free(txr->hn_txdesc, M_DEVBUF); 5472 txr->hn_txdesc = NULL; 5473 5474 if (txr->hn_mbuf_br != NULL) 5475 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5476 5477 #ifndef HN_USE_TXDESC_BUFRING 5478 mtx_destroy(&txr->hn_txlist_spin); 5479 #endif 5480 mtx_destroy(&txr->hn_tx_lock); 5481 } 5482 5483 static int 5484 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5485 { 5486 struct sysctl_oid_list *child; 5487 struct sysctl_ctx_list *ctx; 5488 int i; 5489 5490 /* 5491 * Create TXBUF for chimney sending. 5492 * 5493 * NOTE: It is shared by all channels. 5494 */ 5495 sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 5496 0ul, ~0ul, PAGE_SIZE, 0); 5497 if (sc->hn_chim == NULL) { 5498 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5499 return (ENOMEM); 5500 } 5501 5502 sc->hn_tx_ring_cnt = ring_cnt; 5503 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5504 5505 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5506 M_DEVBUF, M_WAITOK | M_ZERO); 5507 5508 ctx = device_get_sysctl_ctx(sc->hn_dev); 5509 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5510 5511 /* Create dev.hn.UNIT.tx sysctl tree */ 5512 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5513 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5514 5515 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5516 int error; 5517 5518 error = hn_tx_ring_create(sc, i); 5519 if (error) 5520 return error; 5521 } 5522 5523 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5524 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5525 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5526 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5527 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5528 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5529 __offsetof(struct hn_tx_ring, hn_send_failed), 5530 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5531 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5532 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5533 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5534 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5535 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5536 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5537 __offsetof(struct hn_tx_ring, hn_flush_failed), 5538 hn_tx_stat_ulong_sysctl, "LU", 5539 "# of packet transmission aggregation flush failure"); 5540 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5541 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5542 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5543 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5544 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5545 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5546 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5547 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5548 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5549 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 
5550 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5551 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5552 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5553 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5554 "# of total TX descs"); 5555 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5556 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5557 "Chimney send packet size upper boundary"); 5558 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5559 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5560 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5561 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5562 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5563 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5564 hn_tx_conf_int_sysctl, "I", 5565 "Size of the packet for direct transmission"); 5566 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5567 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5568 __offsetof(struct hn_tx_ring, hn_sched_tx), 5569 hn_tx_conf_int_sysctl, "I", 5570 "Always schedule transmission " 5571 "instead of doing direct transmission"); 5572 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5573 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5574 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5575 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5576 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5577 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5578 "Applied packet transmission aggregation size"); 5579 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5580 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5581 hn_txagg_pktmax_sysctl, "I", 5582 "Applied packet transmission aggregation packets"); 5583 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5584 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5585 hn_txagg_align_sysctl, "I", 5586 "Applied packet transmission aggregation alignment"); 5587 5588 return 0; 5589 } 5590 5591 static void 5592 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5593 { 5594 int i; 5595 5596 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5597 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5598 } 5599 5600 static void 5601 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5602 { 5603 if_t ifp = sc->hn_ifp; 5604 u_int hw_tsomax; 5605 int tso_minlen; 5606 5607 HN_LOCK_ASSERT(sc); 5608 5609 if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5610 return; 5611 5612 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5613 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5614 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5615 5616 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5617 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5618 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5619 5620 if (tso_maxlen < tso_minlen) 5621 tso_maxlen = tso_minlen; 5622 else if (tso_maxlen > IP_MAXPACKET) 5623 tso_maxlen = IP_MAXPACKET; 5624 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5625 tso_maxlen = sc->hn_ndis_tso_szmax; 5626 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5627 5628 if (hn_xpnt_vf_isready(sc)) { 5629 if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) 5630 hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); 5631 } 5632 if_sethwtsomax(ifp, hw_tsomax); 5633 if (bootverbose) 5634 if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); 5635 } 5636 5637 static void 5638 hn_fixup_tx_data(struct hn_softc *sc) 5639 { 5640 uint64_t csum_assist; 5641 int i; 5642 5643 hn_set_chim_size(sc, sc->hn_chim_szmax); 5644 if (hn_tx_chimney_size > 0 && 5645 
hn_tx_chimney_size < sc->hn_chim_szmax) 5646 hn_set_chim_size(sc, hn_tx_chimney_size); 5647 5648 csum_assist = 0; 5649 if (sc->hn_caps & HN_CAP_IPCS) 5650 csum_assist |= CSUM_IP; 5651 if (sc->hn_caps & HN_CAP_TCP4CS) 5652 csum_assist |= CSUM_IP_TCP; 5653 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5654 csum_assist |= CSUM_IP_UDP; 5655 if (sc->hn_caps & HN_CAP_TCP6CS) 5656 csum_assist |= CSUM_IP6_TCP; 5657 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5658 csum_assist |= CSUM_IP6_UDP; 5659 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5660 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5661 5662 if (sc->hn_caps & HN_CAP_HASHVAL) { 5663 /* 5664 * Support HASHVAL pktinfo on TX path. 5665 */ 5666 if (bootverbose) 5667 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5668 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5669 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5670 } 5671 } 5672 5673 static void 5674 hn_fixup_rx_data(struct hn_softc *sc) 5675 { 5676 5677 if (sc->hn_caps & HN_CAP_UDPHASH) { 5678 int i; 5679 5680 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5681 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5682 } 5683 } 5684 5685 static void 5686 hn_destroy_tx_data(struct hn_softc *sc) 5687 { 5688 int i; 5689 5690 if (sc->hn_chim != NULL) { 5691 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5692 contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF); 5693 } else { 5694 device_printf(sc->hn_dev, 5695 "chimney sending buffer is referenced"); 5696 } 5697 sc->hn_chim = NULL; 5698 } 5699 5700 if (sc->hn_tx_ring_cnt == 0) 5701 return; 5702 5703 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5704 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5705 5706 free(sc->hn_tx_ring, M_DEVBUF); 5707 sc->hn_tx_ring = NULL; 5708 5709 sc->hn_tx_ring_cnt = 0; 5710 sc->hn_tx_ring_inuse = 0; 5711 } 5712 5713 #ifdef HN_IFSTART_SUPPORT 5714 5715 static void 5716 hn_start_taskfunc(void *xtxr, int pending __unused) 5717 { 5718 struct hn_tx_ring *txr = xtxr; 5719 5720 mtx_lock(&txr->hn_tx_lock); 5721 hn_start_locked(txr, 0); 5722 mtx_unlock(&txr->hn_tx_lock); 5723 } 5724 5725 static int 5726 hn_start_locked(struct hn_tx_ring *txr, int len) 5727 { 5728 struct hn_softc *sc = txr->hn_sc; 5729 if_t ifp = sc->hn_ifp; 5730 int sched = 0; 5731 5732 KASSERT(hn_use_if_start, 5733 ("hn_start_locked is called, when if_start is disabled")); 5734 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5735 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5736 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5737 5738 if (__predict_false(txr->hn_suspended)) 5739 return (0); 5740 5741 if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5742 IFF_DRV_RUNNING) 5743 return (0); 5744 5745 while (!if_sendq_empty(ifp)) { 5746 struct hn_txdesc *txd; 5747 struct mbuf *m_head; 5748 int error; 5749 5750 m_head = if_dequeue(ifp); 5751 if (m_head == NULL) 5752 break; 5753 5754 if (len > 0 && m_head->m_pkthdr.len > len) { 5755 /* 5756 * This sending could be time consuming; let callers 5757 * dispatch this packet sending (and sending of any 5758 * following up packets) to tx taskqueue. 
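 *
 * The dequeued mbuf is pushed back onto the send queue
 * below and sched is returned, so the caller can enqueue
 * hn_tx_task to continue in taskqueue context.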
5759 */ 5760 if_sendq_prepend(ifp, m_head); 5761 sched = 1; 5762 break; 5763 } 5764 5765 #if defined(INET6) || defined(INET) 5766 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5767 m_head = hn_tso_fixup(m_head); 5768 if (__predict_false(m_head == NULL)) { 5769 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5770 continue; 5771 } 5772 } else if (m_head->m_pkthdr.csum_flags & 5773 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5774 m_head = hn_set_hlen(m_head); 5775 if (__predict_false(m_head == NULL)) { 5776 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5777 continue; 5778 } 5779 } 5780 #endif 5781 5782 txd = hn_txdesc_get(txr); 5783 if (txd == NULL) { 5784 txr->hn_no_txdescs++; 5785 if_sendq_prepend(ifp, m_head); 5786 if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); 5787 break; 5788 } 5789 5790 error = hn_encap(ifp, txr, txd, &m_head); 5791 if (error) { 5792 /* Both txd and m_head are freed */ 5793 KASSERT(txr->hn_agg_txd == NULL, 5794 ("encap failed w/ pending aggregating txdesc")); 5795 continue; 5796 } 5797 5798 if (txr->hn_agg_pktleft == 0) { 5799 if (txr->hn_agg_txd != NULL) { 5800 KASSERT(m_head == NULL, 5801 ("pending mbuf for aggregating txdesc")); 5802 error = hn_flush_txagg(ifp, txr); 5803 if (__predict_false(error)) { 5804 if_setdrvflagbits(ifp, 5805 IFF_DRV_OACTIVE, 0); 5806 break; 5807 } 5808 } else { 5809 KASSERT(m_head != NULL, ("mbuf was freed")); 5810 error = hn_txpkt(ifp, txr, txd); 5811 if (__predict_false(error)) { 5812 /* txd is freed, but m_head is not */ 5813 if_sendq_prepend(ifp, m_head); 5814 if_setdrvflagbits(ifp, 5815 IFF_DRV_OACTIVE, 0); 5816 break; 5817 } 5818 } 5819 } 5820 #ifdef INVARIANTS 5821 else { 5822 KASSERT(txr->hn_agg_txd != NULL, 5823 ("no aggregating txdesc")); 5824 KASSERT(m_head == NULL, 5825 ("pending mbuf for aggregating txdesc")); 5826 } 5827 #endif 5828 } 5829 5830 /* Flush pending aggerated transmission. */ 5831 if (txr->hn_agg_txd != NULL) 5832 hn_flush_txagg(ifp, txr); 5833 return (sched); 5834 } 5835 5836 static void 5837 hn_start(if_t ifp) 5838 { 5839 struct hn_softc *sc = if_getsoftc(ifp); 5840 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5841 5842 if (txr->hn_sched_tx) 5843 goto do_sched; 5844 5845 if (mtx_trylock(&txr->hn_tx_lock)) { 5846 int sched; 5847 5848 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5849 mtx_unlock(&txr->hn_tx_lock); 5850 if (!sched) 5851 return; 5852 } 5853 do_sched: 5854 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5855 } 5856 5857 static void 5858 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5859 { 5860 struct hn_tx_ring *txr = xtxr; 5861 5862 mtx_lock(&txr->hn_tx_lock); 5863 if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); 5864 hn_start_locked(txr, 0); 5865 mtx_unlock(&txr->hn_tx_lock); 5866 } 5867 5868 static void 5869 hn_start_txeof(struct hn_tx_ring *txr) 5870 { 5871 struct hn_softc *sc = txr->hn_sc; 5872 if_t ifp = sc->hn_ifp; 5873 5874 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5875 5876 if (txr->hn_sched_tx) 5877 goto do_sched; 5878 5879 if (mtx_trylock(&txr->hn_tx_lock)) { 5880 int sched; 5881 5882 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5883 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5884 mtx_unlock(&txr->hn_tx_lock); 5885 if (sched) { 5886 taskqueue_enqueue(txr->hn_tx_taskq, 5887 &txr->hn_tx_task); 5888 } 5889 } else { 5890 do_sched: 5891 /* 5892 * Release the OACTIVE earlier, with the hope, that 5893 * others could catch up. The task will clear the 5894 * flag again with the hn_tx_lock to avoid possible 5895 * races. 
5896 */ 5897 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5898 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5899 } 5900 } 5901 5902 #endif /* HN_IFSTART_SUPPORT */ 5903 5904 static int 5905 hn_xmit(struct hn_tx_ring *txr, int len) 5906 { 5907 struct hn_softc *sc = txr->hn_sc; 5908 if_t ifp = sc->hn_ifp; 5909 struct mbuf *m_head; 5910 int sched = 0; 5911 5912 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5913 #ifdef HN_IFSTART_SUPPORT 5914 KASSERT(hn_use_if_start == 0, 5915 ("hn_xmit is called, when if_start is enabled")); 5916 #endif 5917 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5918 5919 if (__predict_false(txr->hn_suspended)) 5920 return (0); 5921 5922 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5923 return (0); 5924 5925 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5926 struct hn_txdesc *txd; 5927 int error; 5928 5929 if (len > 0 && m_head->m_pkthdr.len > len) { 5930 /* 5931 * This sending could be time consuming; let callers 5932 * dispatch this packet sending (and sending of any 5933 * following up packets) to tx taskqueue. 5934 */ 5935 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5936 sched = 1; 5937 break; 5938 } 5939 5940 txd = hn_txdesc_get(txr); 5941 if (txd == NULL) { 5942 txr->hn_no_txdescs++; 5943 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5944 txr->hn_oactive = 1; 5945 break; 5946 } 5947 5948 error = hn_encap(ifp, txr, txd, &m_head); 5949 if (error) { 5950 /* Both txd and m_head are freed; discard */ 5951 KASSERT(txr->hn_agg_txd == NULL, 5952 ("encap failed w/ pending aggregating txdesc")); 5953 drbr_advance(ifp, txr->hn_mbuf_br); 5954 continue; 5955 } 5956 5957 if (txr->hn_agg_pktleft == 0) { 5958 if (txr->hn_agg_txd != NULL) { 5959 KASSERT(m_head == NULL, 5960 ("pending mbuf for aggregating txdesc")); 5961 error = hn_flush_txagg(ifp, txr); 5962 if (__predict_false(error)) { 5963 txr->hn_oactive = 1; 5964 break; 5965 } 5966 } else { 5967 KASSERT(m_head != NULL, ("mbuf was freed")); 5968 error = hn_txpkt(ifp, txr, txd); 5969 if (__predict_false(error)) { 5970 /* txd is freed, but m_head is not */ 5971 drbr_putback(ifp, txr->hn_mbuf_br, 5972 m_head); 5973 txr->hn_oactive = 1; 5974 break; 5975 } 5976 } 5977 } 5978 #ifdef INVARIANTS 5979 else { 5980 KASSERT(txr->hn_agg_txd != NULL, 5981 ("no aggregating txdesc")); 5982 KASSERT(m_head == NULL, 5983 ("pending mbuf for aggregating txdesc")); 5984 } 5985 #endif 5986 5987 /* Sent */ 5988 drbr_advance(ifp, txr->hn_mbuf_br); 5989 } 5990 5991 /* Flush pending aggerated transmission. */ 5992 if (txr->hn_agg_txd != NULL) 5993 hn_flush_txagg(ifp, txr); 5994 return (sched); 5995 } 5996 5997 static int 5998 hn_transmit(if_t ifp, struct mbuf *m) 5999 { 6000 struct hn_softc *sc = if_getsoftc(ifp); 6001 struct hn_tx_ring *txr; 6002 int error, idx = 0; 6003 6004 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6005 struct rm_priotracker pt; 6006 6007 rm_rlock(&sc->hn_vf_lock, &pt); 6008 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6009 struct mbuf *m_bpf = NULL; 6010 int obytes, omcast; 6011 6012 obytes = m->m_pkthdr.len; 6013 omcast = (m->m_flags & M_MCAST) != 0; 6014 6015 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6016 if (bpf_peers_present(if_getbpf(ifp))) { 6017 m_bpf = m_copypacket(m, M_NOWAIT); 6018 if (m_bpf == NULL) { 6019 /* 6020 * Failed to grab a shallow 6021 * copy; tap now. 
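 *
 * Otherwise tapping is deferred until after
 * if_transmit() to the VF, so the copy is handed
 * to BPF only if the transmit succeeded.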
6022 */ 6023 ETHER_BPF_MTAP(ifp, m); 6024 } 6025 } 6026 } else { 6027 ETHER_BPF_MTAP(ifp, m); 6028 } 6029 6030 error = if_transmit(sc->hn_vf_ifp, m); 6031 rm_runlock(&sc->hn_vf_lock, &pt); 6032 6033 if (m_bpf != NULL) { 6034 if (!error) 6035 ETHER_BPF_MTAP(ifp, m_bpf); 6036 m_freem(m_bpf); 6037 } 6038 6039 if (error == ENOBUFS) { 6040 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6041 } else if (error) { 6042 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6043 } else { 6044 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6045 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6046 if (omcast) { 6047 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6048 omcast); 6049 } 6050 } 6051 return (error); 6052 } 6053 rm_runlock(&sc->hn_vf_lock, &pt); 6054 } 6055 6056 #if defined(INET6) || defined(INET) 6057 /* 6058 * Perform TSO packet header fixup or get l2/l3 header length now, 6059 * since packet headers should be cache-hot. 6060 */ 6061 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6062 m = hn_tso_fixup(m); 6063 if (__predict_false(m == NULL)) { 6064 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6065 return EIO; 6066 } 6067 } else if (m->m_pkthdr.csum_flags & 6068 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6069 m = hn_set_hlen(m); 6070 if (__predict_false(m == NULL)) { 6071 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6072 return EIO; 6073 } 6074 } 6075 #endif 6076 6077 /* 6078 * Select the TX ring based on flowid 6079 */ 6080 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6081 #ifdef RSS 6082 uint32_t bid; 6083 6084 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6085 &bid) == 0) 6086 idx = bid % sc->hn_tx_ring_inuse; 6087 else 6088 #endif 6089 { 6090 #if defined(INET6) || defined(INET) 6091 int tcpsyn = 0; 6092 6093 if (m->m_pkthdr.len < 128 && 6094 (m->m_pkthdr.csum_flags & 6095 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6096 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6097 m = hn_check_tcpsyn(m, &tcpsyn); 6098 if (__predict_false(m == NULL)) { 6099 if_inc_counter(ifp, 6100 IFCOUNTER_OERRORS, 1); 6101 return (EIO); 6102 } 6103 } 6104 #else 6105 const int tcpsyn = 0; 6106 #endif 6107 if (tcpsyn) 6108 idx = 0; 6109 else 6110 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6111 } 6112 } 6113 txr = &sc->hn_tx_ring[idx]; 6114 6115 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6116 if (error) { 6117 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6118 return error; 6119 } 6120 6121 if (txr->hn_oactive) 6122 return 0; 6123 6124 if (txr->hn_sched_tx) 6125 goto do_sched; 6126 6127 if (mtx_trylock(&txr->hn_tx_lock)) { 6128 int sched; 6129 6130 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6131 mtx_unlock(&txr->hn_tx_lock); 6132 if (!sched) 6133 return 0; 6134 } 6135 do_sched: 6136 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6137 return 0; 6138 } 6139 6140 static void 6141 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6142 { 6143 struct mbuf *m; 6144 6145 mtx_lock(&txr->hn_tx_lock); 6146 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6147 m_freem(m); 6148 mtx_unlock(&txr->hn_tx_lock); 6149 } 6150 6151 static void 6152 hn_xmit_qflush(if_t ifp) 6153 { 6154 struct hn_softc *sc = if_getsoftc(ifp); 6155 struct rm_priotracker pt; 6156 int i; 6157 6158 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6159 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6160 if_qflush(ifp); 6161 6162 rm_rlock(&sc->hn_vf_lock, &pt); 6163 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6164 if_qflush(sc->hn_vf_ifp); 6165 rm_runlock(&sc->hn_vf_lock, &pt); 6166 } 6167 6168 static void 6169 hn_xmit_txeof(struct hn_tx_ring *txr) 6170 { 6171 6172 if 
(txr->hn_sched_tx) 6173 goto do_sched; 6174 6175 if (mtx_trylock(&txr->hn_tx_lock)) { 6176 int sched; 6177 6178 txr->hn_oactive = 0; 6179 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6180 mtx_unlock(&txr->hn_tx_lock); 6181 if (sched) { 6182 taskqueue_enqueue(txr->hn_tx_taskq, 6183 &txr->hn_tx_task); 6184 } 6185 } else { 6186 do_sched: 6187 /* 6188 * Release the oactive earlier, in the hope that 6189 * others could catch up. The task will clear the 6190 * oactive again with the hn_tx_lock to avoid possible 6191 * races. 6192 */ 6193 txr->hn_oactive = 0; 6194 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6195 } 6196 } 6197 6198 static void 6199 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6200 { 6201 struct hn_tx_ring *txr = xtxr; 6202 6203 mtx_lock(&txr->hn_tx_lock); 6204 hn_xmit(txr, 0); 6205 mtx_unlock(&txr->hn_tx_lock); 6206 } 6207 6208 static void 6209 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6210 { 6211 struct hn_tx_ring *txr = xtxr; 6212 6213 mtx_lock(&txr->hn_tx_lock); 6214 txr->hn_oactive = 0; 6215 hn_xmit(txr, 0); 6216 mtx_unlock(&txr->hn_tx_lock); 6217 } 6218 6219 static int 6220 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6221 { 6222 struct vmbus_chan_br cbr; 6223 struct hn_rx_ring *rxr; 6224 struct hn_tx_ring *txr = NULL; 6225 int idx, error; 6226 6227 idx = vmbus_chan_subidx(chan); 6228 6229 /* 6230 * Link this channel to RX/TX ring. 6231 */ 6232 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6233 ("invalid channel index %d, should be >= 0 && < %d", 6234 idx, sc->hn_rx_ring_inuse)); 6235 rxr = &sc->hn_rx_ring[idx]; 6236 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6237 ("RX ring %d already attached", idx)); 6238 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6239 rxr->hn_chan = chan; 6240 6241 if (bootverbose) { 6242 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6243 idx, vmbus_chan_id(chan)); 6244 } 6245 6246 if (idx < sc->hn_tx_ring_inuse) { 6247 txr = &sc->hn_tx_ring[idx]; 6248 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6249 ("TX ring %d already attached", idx)); 6250 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6251 6252 txr->hn_chan = chan; 6253 if (bootverbose) { 6254 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6255 idx, vmbus_chan_id(chan)); 6256 } 6257 } 6258 6259 /* Bind this channel to a proper CPU. */ 6260 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6261 6262 /* 6263 * Open this channel. 6264 */ 6265 cbr.cbr = rxr->hn_br; 6266 cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br); 6267 cbr.cbr_txsz = HN_TXBR_SIZE; 6268 cbr.cbr_rxsz = HN_RXBR_SIZE; 6269 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6270 if (error) { 6271 if (error == EISCONN) { 6272 if_printf(sc->hn_ifp, "bufring is connected after " 6273 "chan%u open failure\n", vmbus_chan_id(chan)); 6274 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6275 } else { 6276 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6277 vmbus_chan_id(chan), error); 6278 } 6279 } 6280 return (error); 6281 } 6282 6283 static void 6284 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6285 { 6286 struct hn_rx_ring *rxr; 6287 int idx, error; 6288 6289 idx = vmbus_chan_subidx(chan); 6290 6291 /* 6292 * Unlink this channel from the RX/TX ring.
6293 */ 6294 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6295 ("invalid channel index %d, should be >= 0 && < %d", 6296 idx, sc->hn_rx_ring_inuse)); 6297 rxr = &sc->hn_rx_ring[idx]; 6298 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6299 ("RX ring %d is not attached", idx)); 6300 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6301 6302 if (idx < sc->hn_tx_ring_inuse) { 6303 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6304 6305 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6306 ("TX ring %d is not attached", idx)); 6307 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6308 } 6309 6310 /* 6311 * Close this channel. 6312 * 6313 * NOTE: 6314 * Channel closing does _not_ destroy the target channel. 6315 */ 6316 error = vmbus_chan_close_direct(chan); 6317 if (error == EISCONN) { 6318 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6319 "after being closed\n", vmbus_chan_id(chan)); 6320 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6321 } else if (error) { 6322 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6323 vmbus_chan_id(chan), error); 6324 } 6325 } 6326 6327 static int 6328 hn_attach_subchans(struct hn_softc *sc) 6329 { 6330 struct vmbus_channel **subchans; 6331 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6332 int i, error = 0; 6333 6334 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6335 6336 /* Attach the sub-channels. */ 6337 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6338 for (i = 0; i < subchan_cnt; ++i) { 6339 int error1; 6340 6341 error1 = hn_chan_attach(sc, subchans[i]); 6342 if (error1) { 6343 error = error1; 6344 /* Move on; all channels will be detached later. */ 6345 } 6346 } 6347 vmbus_subchan_rel(subchans, subchan_cnt); 6348 6349 if (error) { 6350 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6351 } else { 6352 if (bootverbose) { 6353 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6354 subchan_cnt); 6355 } 6356 } 6357 return (error); 6358 } 6359 6360 static void 6361 hn_detach_allchans(struct hn_softc *sc) 6362 { 6363 struct vmbus_channel **subchans; 6364 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6365 int i; 6366 6367 if (subchan_cnt == 0) 6368 goto back; 6369 6370 /* Detach the sub-channels. */ 6371 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6372 for (i = 0; i < subchan_cnt; ++i) 6373 hn_chan_detach(sc, subchans[i]); 6374 vmbus_subchan_rel(subchans, subchan_cnt); 6375 6376 back: 6377 /* 6378 * Detach the primary channel, _after_ all sub-channels 6379 * are detached. 6380 */ 6381 hn_chan_detach(sc, sc->hn_prichan); 6382 6383 /* Wait for sub-channels to be destroyed, if any. */ 6384 vmbus_subchan_drain(sc->hn_prichan); 6385 6386 #ifdef INVARIANTS 6387 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6388 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6389 HN_RX_FLAG_ATTACHED) == 0, 6390 ("%dth RX ring is still attached", i)); 6391 } 6392 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6393 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6394 HN_TX_FLAG_ATTACHED) == 0, 6395 ("%dth TX ring is still attached", i)); 6396 } 6397 #endif 6398 } 6399 6400 static int 6401 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6402 { 6403 struct vmbus_channel **subchans; 6404 int nchan, rxr_cnt, error; 6405 6406 nchan = *nsubch + 1; 6407 if (nchan == 1) { 6408 /* 6409 * Multiple RX/TX rings are not requested. 6410 */ 6411 *nsubch = 0; 6412 return (0); 6413 } 6414 6415 /* 6416 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6417 * table entries.
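* If the query fails, RSS is simply not used and only the primary channel is requested.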
6418 */ 6419 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6420 if (error) { 6421 /* No RSS; this is benign. */ 6422 *nsubch = 0; 6423 return (0); 6424 } 6425 if (bootverbose) { 6426 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6427 rxr_cnt, nchan); 6428 } 6429 6430 if (nchan > rxr_cnt) 6431 nchan = rxr_cnt; 6432 if (nchan == 1) { 6433 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6434 *nsubch = 0; 6435 return (0); 6436 } 6437 6438 /* 6439 * Allocate sub-channels from NVS. 6440 */ 6441 *nsubch = nchan - 1; 6442 error = hn_nvs_alloc_subchans(sc, nsubch); 6443 if (error || *nsubch == 0) { 6444 /* Failed to allocate sub-channels. */ 6445 *nsubch = 0; 6446 return (0); 6447 } 6448 6449 /* 6450 * Wait for all sub-channels to become ready before moving on. 6451 */ 6452 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6453 vmbus_subchan_rel(subchans, *nsubch); 6454 return (0); 6455 } 6456 6457 static bool 6458 hn_synth_attachable(const struct hn_softc *sc) 6459 { 6460 int i; 6461 6462 if (sc->hn_flags & HN_FLAG_ERRORS) 6463 return (false); 6464 6465 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6466 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6467 6468 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6469 return (false); 6470 } 6471 return (true); 6472 } 6473 6474 /* 6475 * Make sure that the RX filter is zero after the successful 6476 * RNDIS initialization. 6477 * 6478 * NOTE: 6479 * Under certain conditions on certain versions of Hyper-V, 6480 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6481 * after the successful RNDIS initialization, which breaks 6482 * the assumption of any following code (well, it breaks the 6483 * RNDIS API contract actually). Clear the RNDIS rxfilter 6484 * explicitly, drain packets sneaking through, and drain the 6485 * interrupt taskqueues scheduled due to the stealth packets. 6486 */ 6487 static void 6488 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6489 { 6490 6491 hn_disable_rx(sc); 6492 hn_drain_rxtx(sc, nchan); 6493 } 6494 6495 static int 6496 hn_synth_attach(struct hn_softc *sc, int mtu) 6497 { 6498 #define ATTACHED_NVS 0x0002 6499 #define ATTACHED_RNDIS 0x0004 6500 6501 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6502 int error, nsubch, nchan = 1, i, rndis_inited; 6503 uint32_t old_caps, attached = 0; 6504 6505 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6506 ("synthetic parts were attached")); 6507 6508 if (!hn_synth_attachable(sc)) 6509 return (ENXIO); 6510 6511 /* Save capabilities for later verification. */ 6512 old_caps = sc->hn_caps; 6513 sc->hn_caps = 0; 6514 6515 /* Clear RSS stuffs. */ 6516 sc->hn_rss_ind_size = 0; 6517 sc->hn_rss_hash = 0; 6518 sc->hn_rss_hcap = 0; 6519 6520 /* 6521 * Attach the primary channel _before_ attaching NVS and RNDIS. 6522 */ 6523 error = hn_chan_attach(sc, sc->hn_prichan); 6524 if (error) 6525 goto failed; 6526 6527 /* 6528 * Attach NVS. 6529 */ 6530 error = hn_nvs_attach(sc, mtu); 6531 if (error) 6532 goto failed; 6533 attached |= ATTACHED_NVS; 6534 6535 /* 6536 * Attach RNDIS _after_ NVS is attached. 6537 */ 6538 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6539 if (rndis_inited) 6540 attached |= ATTACHED_RNDIS; 6541 if (error) 6542 goto failed; 6543 6544 /* 6545 * Make sure capabilities are not changed. 
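* A mismatch can only be detected on re-attach; on the first attach the device is not fully attached yet, so this check is skipped.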
6546 */ 6547 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6548 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6549 old_caps, sc->hn_caps); 6550 error = ENXIO; 6551 goto failed; 6552 } 6553 6554 /* 6555 * Allocate sub-channels for multi-TX/RX rings. 6556 * 6557 * NOTE: 6558 * The # of RX rings that can be used is equivalent to the # of 6559 * channels to be requested. 6560 */ 6561 nsubch = sc->hn_rx_ring_cnt - 1; 6562 error = hn_synth_alloc_subchans(sc, &nsubch); 6563 if (error) 6564 goto failed; 6565 /* NOTE: _Full_ synthetic parts detach is required now. */ 6566 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6567 6568 /* 6569 * Set the # of TX/RX rings that could be used according to 6570 * the # of channels that NVS offered. 6571 */ 6572 nchan = nsubch + 1; 6573 hn_set_ring_inuse(sc, nchan); 6574 if (nchan == 1) { 6575 /* Only the primary channel can be used; done */ 6576 goto back; 6577 } 6578 6579 /* 6580 * Attach the sub-channels. 6581 * 6582 * NOTE: hn_set_ring_inuse() _must_ have been called. 6583 */ 6584 error = hn_attach_subchans(sc); 6585 if (error) 6586 goto failed; 6587 6588 /* 6589 * Configure RSS key and indirect table _after_ all sub-channels 6590 * are attached. 6591 */ 6592 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6593 /* 6594 * RSS key is not set yet; set it to the default RSS key. 6595 */ 6596 if (bootverbose) 6597 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6598 #ifdef RSS 6599 rss_getkey(rss->rss_key); 6600 #else 6601 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6602 #endif 6603 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6604 } 6605 6606 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6607 /* 6608 * RSS indirect table is not set yet; set it up in round- 6609 * robin fashion. 6610 */ 6611 if (bootverbose) { 6612 if_printf(sc->hn_ifp, "setup default RSS indirect " 6613 "table\n"); 6614 } 6615 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6616 uint32_t subidx; 6617 6618 #ifdef RSS 6619 subidx = rss_get_indirection_to_bucket(i); 6620 #else 6621 subidx = i; 6622 #endif 6623 rss->rss_ind[i] = subidx % nchan; 6624 } 6625 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6626 } else { 6627 /* 6628 * # of usable channels may be changed, so we have to 6629 * make sure that all entries in RSS indirect table 6630 * are valid. 6631 * 6632 * NOTE: hn_set_ring_inuse() _must_ have been called. 6633 */ 6634 hn_rss_ind_fixup(sc); 6635 } 6636 6637 sc->hn_rss_hash = sc->hn_rss_hcap; 6638 if ((sc->hn_flags & HN_FLAG_RXVF) || 6639 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6640 /* NOTE: Don't reconfigure RSS here; it is done immediately below. */ 6641 hn_vf_rss_fixup(sc, false); 6642 } 6643 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6644 if (error) 6645 goto failed; 6646 back: 6647 /* 6648 * Fixup transmission aggregation setup. 6649 */ 6650 hn_set_txagg(sc); 6651 hn_rndis_init_fixat(sc, nchan); 6652 return (0); 6653 6654 failed: 6655 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6656 hn_rndis_init_fixat(sc, nchan); 6657 hn_synth_detach(sc); 6658 } else { 6659 if (attached & ATTACHED_RNDIS) { 6660 hn_rndis_init_fixat(sc, nchan); 6661 hn_rndis_detach(sc); 6662 } 6663 if (attached & ATTACHED_NVS) 6664 hn_nvs_detach(sc); 6665 hn_chan_detach(sc, sc->hn_prichan); 6666 /* Restore old capabilities. */ 6667 sc->hn_caps = old_caps; 6668 } 6669 return (error); 6670 6671 #undef ATTACHED_RNDIS 6672 #undef ATTACHED_NVS 6673 } 6674 6675 /* 6676 * NOTE: 6677 * The interface must have been suspended through hn_suspend(), before 6678 * this function gets called.
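* Otherwise RX/TX could still be in flight while RNDIS, NVS and the channels are being torn down.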
6679 */ 6680 static void 6681 hn_synth_detach(struct hn_softc *sc) 6682 { 6683 6684 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6685 ("synthetic parts were not attached")); 6686 6687 /* Detach the RNDIS first. */ 6688 hn_rndis_detach(sc); 6689 6690 /* Detach NVS. */ 6691 hn_nvs_detach(sc); 6692 6693 /* Detach all of the channels. */ 6694 hn_detach_allchans(sc); 6695 6696 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6697 /* 6698 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6699 */ 6700 int error; 6701 6702 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6703 sc->hn_rxbuf_gpadl); 6704 if (error) { 6705 if_printf(sc->hn_ifp, 6706 "rxbuf gpadl disconn failed: %d\n", error); 6707 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6708 } 6709 sc->hn_rxbuf_gpadl = 0; 6710 } 6711 6712 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6713 /* 6714 * Host is post-Win2016, disconnect chimney sending buffer from 6715 * primary channel here. 6716 */ 6717 int error; 6718 6719 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6720 sc->hn_chim_gpadl); 6721 if (error) { 6722 if_printf(sc->hn_ifp, 6723 "chim gpadl disconn failed: %d\n", error); 6724 sc->hn_flags |= HN_FLAG_CHIM_REF; 6725 } 6726 sc->hn_chim_gpadl = 0; 6727 } 6728 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6729 } 6730 6731 static void 6732 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6733 { 6734 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6735 ("invalid ring count %d", ring_cnt)); 6736 6737 if (sc->hn_tx_ring_cnt > ring_cnt) 6738 sc->hn_tx_ring_inuse = ring_cnt; 6739 else 6740 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6741 sc->hn_rx_ring_inuse = ring_cnt; 6742 6743 #ifdef RSS 6744 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6745 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6746 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6747 rss_getnumbuckets()); 6748 } 6749 #endif 6750 6751 if (bootverbose) { 6752 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6753 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6754 } 6755 } 6756 6757 static void 6758 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6759 { 6760 6761 /* 6762 * NOTE: 6763 * The TX bufring will not be drained by the hypervisor, 6764 * if the primary channel is revoked. 6765 */ 6766 while (!vmbus_chan_rx_empty(chan) || 6767 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6768 !vmbus_chan_tx_empty(chan))) 6769 pause("waitch", 1); 6770 vmbus_chan_intr_drain(chan); 6771 } 6772 6773 static void 6774 hn_disable_rx(struct hn_softc *sc) 6775 { 6776 6777 /* 6778 * Disable RX by clearing RX filter forcefully. 6779 */ 6780 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6781 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6782 6783 /* 6784 * Give RNDIS enough time to flush all pending data packets. 6785 */ 6786 pause("waitrx", (200 * hz) / 1000); 6787 } 6788 6789 /* 6790 * NOTE: 6791 * RX/TX _must_ have been suspended/disabled, before this function 6792 * is called. 6793 */ 6794 static void 6795 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6796 { 6797 struct vmbus_channel **subch = NULL; 6798 int nsubch; 6799 6800 /* 6801 * Drain RX/TX bufrings and interrupts. 
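* The sub-channels are drained first; the primary channel is drained last.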
6802 */ 6803 nsubch = nchan - 1; 6804 if (nsubch > 0) 6805 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6806 6807 if (subch != NULL) { 6808 int i; 6809 6810 for (i = 0; i < nsubch; ++i) 6811 hn_chan_drain(sc, subch[i]); 6812 } 6813 hn_chan_drain(sc, sc->hn_prichan); 6814 6815 if (subch != NULL) 6816 vmbus_subchan_rel(subch, nsubch); 6817 } 6818 6819 static void 6820 hn_suspend_data(struct hn_softc *sc) 6821 { 6822 struct hn_tx_ring *txr; 6823 int i; 6824 6825 HN_LOCK_ASSERT(sc); 6826 6827 /* 6828 * Suspend TX. 6829 */ 6830 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6831 txr = &sc->hn_tx_ring[i]; 6832 6833 mtx_lock(&txr->hn_tx_lock); 6834 txr->hn_suspended = 1; 6835 mtx_unlock(&txr->hn_tx_lock); 6836 /* No one is able to send more packets now. */ 6837 6838 /* 6839 * Wait for all pending sends to finish. 6840 * 6841 * NOTE: 6842 * We will _not_ receive all pending send-dones, if the 6843 * primary channel is revoked. 6844 */ 6845 while (hn_tx_ring_pending(txr) && 6846 !vmbus_chan_is_revoked(sc->hn_prichan)) 6847 pause("hnwtx", 1 /* 1 tick */); 6848 } 6849 6850 /* 6851 * Disable RX. 6852 */ 6853 hn_disable_rx(sc); 6854 6855 /* 6856 * Drain RX/TX. 6857 */ 6858 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6859 6860 /* 6861 * Drain any pending TX tasks. 6862 * 6863 * NOTE: 6864 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6865 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6866 */ 6867 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6868 txr = &sc->hn_tx_ring[i]; 6869 6870 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6871 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6872 } 6873 } 6874 6875 static void 6876 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6877 { 6878 6879 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6880 } 6881 6882 static void 6883 hn_suspend_mgmt(struct hn_softc *sc) 6884 { 6885 struct task task; 6886 6887 HN_LOCK_ASSERT(sc); 6888 6889 /* 6890 * Make sure that hn_mgmt_taskq0 can no longer be accessed 6891 * through hn_mgmt_taskq. 6892 */ 6893 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6894 vmbus_chan_run_task(sc->hn_prichan, &task); 6895 6896 /* 6897 * Make sure that all pending management tasks are completed. 6898 */ 6899 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6900 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6901 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6902 } 6903 6904 static void 6905 hn_suspend(struct hn_softc *sc) 6906 { 6907 6908 /* Disable polling. */ 6909 hn_polling(sc, 0); 6910 6911 /* 6912 * If the non-transparent mode VF is activated, the synthetic 6913 * device is receiving packets, so the data path of the 6914 * synthetic device must be suspended. 6915 */ 6916 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 6917 (sc->hn_flags & HN_FLAG_RXVF)) 6918 hn_suspend_data(sc); 6919 hn_suspend_mgmt(sc); 6920 } 6921 6922 static void 6923 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6924 { 6925 int i; 6926 6927 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6928 ("invalid TX ring count %d", tx_ring_cnt)); 6929 6930 for (i = 0; i < tx_ring_cnt; ++i) { 6931 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6932 6933 mtx_lock(&txr->hn_tx_lock); 6934 txr->hn_suspended = 0; 6935 mtx_unlock(&txr->hn_tx_lock); 6936 } 6937 } 6938 6939 static void 6940 hn_resume_data(struct hn_softc *sc) 6941 { 6942 int i; 6943 6944 HN_LOCK_ASSERT(sc); 6945 6946 /* 6947 * Re-enable RX.
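* hn_rxfilter_config() restores a proper RX filter; hn_disable_rx() forced it to NDIS_PACKET_TYPE_NONE during suspend.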
6948 */ 6949 hn_rxfilter_config(sc); 6950 6951 /* 6952 * Make sure to clear suspend status on "all" TX rings, 6953 * since hn_tx_ring_inuse can be changed after 6954 * hn_suspend_data(). 6955 */ 6956 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6957 6958 #ifdef HN_IFSTART_SUPPORT 6959 if (!hn_use_if_start) 6960 #endif 6961 { 6962 /* 6963 * Flush unused drbrs, since hn_tx_ring_inuse may be 6964 * reduced. 6965 */ 6966 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6967 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6968 } 6969 6970 /* 6971 * Kick start TX. 6972 */ 6973 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6974 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6975 6976 /* 6977 * Use txeof task, so that any pending oactive can be 6978 * cleared properly. 6979 */ 6980 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6981 } 6982 } 6983 6984 static void 6985 hn_resume_mgmt(struct hn_softc *sc) 6986 { 6987 6988 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6989 6990 /* 6991 * Kick off network change detection, if it was pending. 6992 * If no network change was pending, start link status 6993 * checks, which is more lightweight than network change 6994 * detection. 6995 */ 6996 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6997 hn_change_network(sc); 6998 else 6999 hn_update_link_status(sc); 7000 } 7001 7002 static void 7003 hn_resume(struct hn_softc *sc) 7004 { 7005 7006 /* 7007 * If the non-transparent mode VF is activated, the synthetic 7008 * device have to receive packets, so the data path of the 7009 * synthetic device must be resumed. 7010 */ 7011 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 7012 (sc->hn_flags & HN_FLAG_RXVF)) 7013 hn_resume_data(sc); 7014 7015 /* 7016 * Don't resume link status change if VF is attached/activated. 7017 * - In the non-transparent VF mode, the synthetic device marks 7018 * link down until the VF is deactivated; i.e. VF is down. 7019 * - In transparent VF mode, VF's media status is used until 7020 * the VF is detached. 7021 */ 7022 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7023 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7024 hn_resume_mgmt(sc); 7025 7026 /* 7027 * Re-enable polling if this interface is running and 7028 * the polling is requested. 7029 */ 7030 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7031 hn_polling(sc, sc->hn_pollhz); 7032 } 7033 7034 static void 7035 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7036 { 7037 const struct rndis_status_msg *msg; 7038 int ofs; 7039 7040 if (dlen < sizeof(*msg)) { 7041 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7042 return; 7043 } 7044 msg = data; 7045 7046 switch (msg->rm_status) { 7047 case RNDIS_STATUS_MEDIA_CONNECT: 7048 case RNDIS_STATUS_MEDIA_DISCONNECT: 7049 hn_update_link_status(sc); 7050 break; 7051 7052 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7053 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7054 /* Not really useful; ignore. 
*/ 7055 break; 7056 7057 case RNDIS_STATUS_NETWORK_CHANGE: 7058 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7059 if (dlen < ofs + msg->rm_stbuflen || 7060 msg->rm_stbuflen < sizeof(uint32_t)) { 7061 if_printf(sc->hn_ifp, "network changed\n"); 7062 } else { 7063 uint32_t change; 7064 7065 memcpy(&change, ((const uint8_t *)msg) + ofs, 7066 sizeof(change)); 7067 if_printf(sc->hn_ifp, "network changed, change %u\n", 7068 change); 7069 } 7070 hn_change_network(sc); 7071 break; 7072 7073 default: 7074 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7075 msg->rm_status); 7076 break; 7077 } 7078 } 7079 7080 static int 7081 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7082 { 7083 const struct rndis_pktinfo *pi = info_data; 7084 uint32_t mask = 0; 7085 7086 while (info_dlen != 0) { 7087 const void *data; 7088 uint32_t dlen; 7089 7090 if (__predict_false(info_dlen < sizeof(*pi))) 7091 return (EINVAL); 7092 if (__predict_false(info_dlen < pi->rm_size)) 7093 return (EINVAL); 7094 info_dlen -= pi->rm_size; 7095 7096 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7097 return (EINVAL); 7098 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7099 return (EINVAL); 7100 dlen = pi->rm_size - pi->rm_pktinfooffset; 7101 data = pi->rm_data; 7102 7103 if (pi->rm_internal == 1) { 7104 switch (pi->rm_type) { 7105 case NDIS_PKTINFO_IT_PKTINFO_ID: 7106 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7107 return (EINVAL); 7108 info->pktinfo_id = 7109 (const struct packet_info_id *)data; 7110 mask |= HN_RXINFO_PKTINFO_ID; 7111 break; 7112 7113 default: 7114 goto next; 7115 } 7116 } else { 7117 switch (pi->rm_type) { 7118 case NDIS_PKTINFO_TYPE_VLAN: 7119 if (__predict_false(dlen 7120 < NDIS_VLAN_INFO_SIZE)) 7121 return (EINVAL); 7122 info->vlan_info = (const uint32_t *)data; 7123 mask |= HN_RXINFO_VLAN; 7124 break; 7125 7126 case NDIS_PKTINFO_TYPE_CSUM: 7127 if (__predict_false(dlen 7128 < NDIS_RXCSUM_INFO_SIZE)) 7129 return (EINVAL); 7130 info->csum_info = (const uint32_t *)data; 7131 mask |= HN_RXINFO_CSUM; 7132 break; 7133 7134 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7135 if (__predict_false(dlen 7136 < HN_NDIS_HASH_VALUE_SIZE)) 7137 return (EINVAL); 7138 info->hash_value = (const uint32_t *)data; 7139 mask |= HN_RXINFO_HASHVAL; 7140 break; 7141 7142 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7143 if (__predict_false(dlen 7144 < HN_NDIS_HASH_INFO_SIZE)) 7145 return (EINVAL); 7146 info->hash_info = (const uint32_t *)data; 7147 mask |= HN_RXINFO_HASHINF; 7148 break; 7149 7150 default: 7151 goto next; 7152 } 7153 } 7154 7155 if (mask == HN_RXINFO_ALL) { 7156 /* All found; done */ 7157 break; 7158 } 7159 next: 7160 pi = (const struct rndis_pktinfo *) 7161 ((const uint8_t *)pi + pi->rm_size); 7162 } 7163 7164 /* 7165 * Final fixup. 7166 * - If there is no hash value, invalidate the hash info. 
7167 */ 7168 if ((mask & HN_RXINFO_HASHVAL) == 0) 7169 info->hash_info = NULL; 7170 return (0); 7171 } 7172 7173 static __inline bool 7174 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7175 { 7176 7177 if (off < check_off) { 7178 if (__predict_true(off + len <= check_off)) 7179 return (false); 7180 } else if (off > check_off) { 7181 if (__predict_true(check_off + check_len <= off)) 7182 return (false); 7183 } 7184 return (true); 7185 } 7186 7187 static __inline void 7188 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7189 uint32_t len, struct hn_rxinfo *info) 7190 { 7191 uint32_t cnt = rxr->rsc.cnt; 7192 7193 if (cnt) { 7194 rxr->rsc.pktlen += len; 7195 } else { 7196 rxr->rsc.vlan_info = info->vlan_info; 7197 rxr->rsc.csum_info = info->csum_info; 7198 rxr->rsc.hash_info = info->hash_info; 7199 rxr->rsc.hash_value = info->hash_value; 7200 rxr->rsc.pktlen = len; 7201 } 7202 7203 rxr->rsc.frag_data[cnt] = data; 7204 rxr->rsc.frag_len[cnt] = len; 7205 rxr->rsc.cnt++; 7206 } 7207 7208 static void 7209 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7210 { 7211 const struct rndis_packet_msg *pkt; 7212 struct hn_rxinfo info; 7213 int data_off, pktinfo_off, data_len, pktinfo_len; 7214 bool rsc_more = false; 7215 7216 /* 7217 * Check length. 7218 */ 7219 if (__predict_false(dlen < sizeof(*pkt))) { 7220 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7221 return; 7222 } 7223 pkt = data; 7224 7225 if (__predict_false(dlen < pkt->rm_len)) { 7226 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7227 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7228 return; 7229 } 7230 if (__predict_false(pkt->rm_len < 7231 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7232 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7233 "msglen %u, data %u, oob %u, pktinfo %u\n", 7234 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7235 pkt->rm_pktinfolen); 7236 return; 7237 } 7238 if (__predict_false(pkt->rm_datalen == 0)) { 7239 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7240 return; 7241 } 7242 7243 /* 7244 * Check offsets. 7245 */ 7246 #define IS_OFFSET_INVALID(ofs) \ 7247 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7248 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7249 7250 /* XXX Hyper-V does not meet the data offset alignment requirement */ 7251 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7252 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7253 "data offset %u\n", pkt->rm_dataoffset); 7254 return; 7255 } 7256 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7257 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7258 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7259 "oob offset %u\n", pkt->rm_oobdataoffset); 7260 return; 7261 } 7262 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7263 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7264 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7265 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7266 return; 7267 } 7268 7269 #undef IS_OFFSET_INVALID 7270 7271 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7272 data_len = pkt->rm_datalen; 7273 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7274 pktinfo_len = pkt->rm_pktinfolen; 7275 7276 /* 7277 * Check OOB coverage.
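* OOB data is not consumed; it is only validated against the message length and the data/pktinfo regions.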
7278 */ 7279 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7280 int oob_off, oob_len; 7281 7282 if_printf(rxr->hn_ifp, "got oobdata\n"); 7283 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7284 oob_len = pkt->rm_oobdatalen; 7285 7286 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7287 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7288 "oob overflow, msglen %u, oob abs %d len %d\n", 7289 pkt->rm_len, oob_off, oob_len); 7290 return; 7291 } 7292 7293 /* 7294 * Check against data. 7295 */ 7296 if (hn_rndis_check_overlap(oob_off, oob_len, 7297 data_off, data_len)) { 7298 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7299 "oob overlaps data, oob abs %d len %d, " 7300 "data abs %d len %d\n", 7301 oob_off, oob_len, data_off, data_len); 7302 return; 7303 } 7304 7305 /* 7306 * Check against pktinfo. 7307 */ 7308 if (pktinfo_len != 0 && 7309 hn_rndis_check_overlap(oob_off, oob_len, 7310 pktinfo_off, pktinfo_len)) { 7311 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7312 "oob overlaps pktinfo, oob abs %d len %d, " 7313 "pktinfo abs %d len %d\n", 7314 oob_off, oob_len, pktinfo_off, pktinfo_len); 7315 return; 7316 } 7317 } 7318 7319 /* 7320 * Check per-packet-info coverage and find useful per-packet-info. 7321 */ 7322 info.vlan_info = NULL; 7323 info.csum_info = NULL; 7324 info.hash_info = NULL; 7325 info.pktinfo_id = NULL; 7326 7327 if (__predict_true(pktinfo_len != 0)) { 7328 bool overlap; 7329 int error; 7330 7331 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7332 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7333 "pktinfo overflow, msglen %u, " 7334 "pktinfo abs %d len %d\n", 7335 pkt->rm_len, pktinfo_off, pktinfo_len); 7336 return; 7337 } 7338 7339 /* 7340 * Check packet info coverage. 7341 */ 7342 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7343 data_off, data_len); 7344 if (__predict_false(overlap)) { 7345 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7346 "pktinfo overlap data, pktinfo abs %d len %d, " 7347 "data abs %d len %d\n", 7348 pktinfo_off, pktinfo_len, data_off, data_len); 7349 return; 7350 } 7351 7352 /* 7353 * Find useful per-packet-info. 
7354 */ 7355 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7356 pktinfo_len, &info); 7357 if (__predict_false(error)) { 7358 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7359 "pktinfo\n"); 7360 return; 7361 } 7362 } 7363 7364 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7365 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7366 "data overflow, msglen %u, data abs %d len %d\n", 7367 pkt->rm_len, data_off, data_len); 7368 return; 7369 } 7370 7371 /* Identify RSC fragments, drop invalid packets */ 7372 if ((info.pktinfo_id != NULL) && 7373 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7374 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7375 rxr->rsc.cnt = 0; 7376 rxr->hn_rsc_pkts++; 7377 } else if (rxr->rsc.cnt == 0) 7378 goto drop; 7379 7380 rsc_more = true; 7381 7382 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7383 rsc_more = false; 7384 7385 if (rsc_more && rxr->rsc.is_last) 7386 goto drop; 7387 } else { 7388 rxr->rsc.cnt = 0; 7389 } 7390 7391 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7392 goto drop; 7393 7394 /* Store data in per rx ring structure */ 7395 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7396 data_len, &info); 7397 7398 if (rsc_more) 7399 return; 7400 7401 hn_rxpkt(rxr); 7402 rxr->rsc.cnt = 0; 7403 return; 7404 drop: 7405 rxr->hn_rsc_drop++; 7406 return; 7407 } 7408 7409 static __inline void 7410 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7411 { 7412 const struct rndis_msghdr *hdr; 7413 7414 if (__predict_false(dlen < sizeof(*hdr))) { 7415 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7416 return; 7417 } 7418 hdr = data; 7419 7420 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7421 /* Hot data path. */ 7422 hn_rndis_rx_data(rxr, data, dlen); 7423 /* Done! */ 7424 return; 7425 } 7426 7427 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7428 hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); 7429 else 7430 hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); 7431 } 7432 7433 static void 7434 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7435 { 7436 const struct hn_nvs_hdr *hdr; 7437 7438 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7439 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7440 return; 7441 } 7442 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7443 7444 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7445 /* Useless; ignore */ 7446 return; 7447 } 7448 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7449 } 7450 7451 static void 7452 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7453 const struct vmbus_chanpkt_hdr *pkt) 7454 { 7455 struct hn_nvs_sendctx *sndc; 7456 7457 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7458 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7459 VMBUS_CHANPKT_DATALEN(pkt)); 7460 /* 7461 * NOTE: 7462 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7463 * its callback. 
7464 */ 7465 } 7466 7467 static void 7468 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7469 const struct vmbus_chanpkt_hdr *pkthdr) 7470 { 7471 struct epoch_tracker et; 7472 const struct vmbus_chanpkt_rxbuf *pkt; 7473 const struct hn_nvs_hdr *nvs_hdr; 7474 int count, i, hlen; 7475 7476 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7477 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7478 return; 7479 } 7480 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7481 7482 /* Make sure that this is a RNDIS message. */ 7483 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7484 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7485 nvs_hdr->nvs_type); 7486 return; 7487 } 7488 7489 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7490 if (__predict_false(hlen < sizeof(*pkt))) { 7491 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7492 return; 7493 } 7494 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7495 7496 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7497 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7498 pkt->cp_rxbuf_id); 7499 return; 7500 } 7501 7502 count = pkt->cp_rxbuf_cnt; 7503 if (__predict_false(hlen < 7504 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7505 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7506 return; 7507 } 7508 7509 NET_EPOCH_ENTER(et); 7510 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7511 for (i = 0; i < count; ++i) { 7512 int ofs, len; 7513 7514 ofs = pkt->cp_rxbuf[i].rb_ofs; 7515 len = pkt->cp_rxbuf[i].rb_len; 7516 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7517 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7518 "ofs %d, len %d\n", i, ofs, len); 7519 continue; 7520 } 7521 7522 rxr->rsc.is_last = (i == (count - 1)); 7523 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7524 } 7525 NET_EPOCH_EXIT(et); 7526 7527 /* 7528 * Ack the consumed RXBUF associated w/ this channel packet, 7529 * so that this RXBUF can be recycled by the hypervisor. 7530 */ 7531 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7532 } 7533 7534 static void 7535 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7536 uint64_t tid) 7537 { 7538 struct hn_nvs_rndis_ack ack; 7539 int retries, error; 7540 7541 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7542 ack.nvs_status = HN_NVS_STATUS_OK; 7543 7544 retries = 0; 7545 again: 7546 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7547 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7548 if (__predict_false(error == EAGAIN)) { 7549 /* 7550 * NOTE: 7551 * This should _not_ happen in real world, since the 7552 * consumption of the TX bufring from the TX path is 7553 * controlled. 7554 */ 7555 if (rxr->hn_ack_failed == 0) 7556 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7557 rxr->hn_ack_failed++; 7558 retries++; 7559 if (retries < 10) { 7560 DELAY(100); 7561 goto again; 7562 } 7563 /* RXBUF leaks! */ 7564 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7565 } 7566 } 7567 7568 static void 7569 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7570 { 7571 struct hn_rx_ring *rxr = xrxr; 7572 struct hn_softc *sc = if_getsoftc(rxr->hn_ifp); 7573 7574 for (;;) { 7575 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7576 int error, pktlen; 7577 7578 pktlen = rxr->hn_pktbuf_len; 7579 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7580 if (__predict_false(error == ENOBUFS)) { 7581 void *nbuf; 7582 int nlen; 7583 7584 /* 7585 * Expand channel packet buffer. 
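* The buffer size is doubled until it is large enough to hold the pending channel packet.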
7586 * 7587 * XXX 7588 * Use M_WAITOK here, since allocation failure 7589 * is fatal. 7590 */ 7591 nlen = rxr->hn_pktbuf_len * 2; 7592 while (nlen < pktlen) 7593 nlen *= 2; 7594 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7595 7596 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7597 rxr->hn_pktbuf_len, nlen); 7598 7599 free(rxr->hn_pktbuf, M_DEVBUF); 7600 rxr->hn_pktbuf = nbuf; 7601 rxr->hn_pktbuf_len = nlen; 7602 /* Retry! */ 7603 continue; 7604 } else if (__predict_false(error == EAGAIN)) { 7605 /* No more channel packets; done! */ 7606 break; 7607 } 7608 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7609 7610 switch (pkt->cph_type) { 7611 case VMBUS_CHANPKT_TYPE_COMP: 7612 hn_nvs_handle_comp(sc, chan, pkt); 7613 break; 7614 7615 case VMBUS_CHANPKT_TYPE_RXBUF: 7616 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7617 break; 7618 7619 case VMBUS_CHANPKT_TYPE_INBAND: 7620 hn_nvs_handle_notify(sc, pkt); 7621 break; 7622 7623 default: 7624 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7625 pkt->cph_type); 7626 break; 7627 } 7628 } 7629 hn_chan_rollup(rxr, rxr->hn_txr); 7630 } 7631 7632 static void 7633 hn_sysinit(void *arg __unused) 7634 { 7635 int i; 7636 7637 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7638 7639 #ifdef HN_IFSTART_SUPPORT 7640 /* 7641 * Don't use ifnet.if_start if transparent VF mode is requested; 7642 * mainly due to the IFF_DRV_OACTIVE flag. 7643 */ 7644 if (hn_xpnt_vf && hn_use_if_start) { 7645 hn_use_if_start = 0; 7646 printf("hn: transparent VF mode, if_transmit will be used " 7647 "instead of if_start\n"); 7648 } 7649 #endif 7650 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7651 printf("hn: invalid transparent VF attach " 7652 "wait timeout %d, reset to %d\n", 7653 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7654 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7655 } 7656 7657 /* 7658 * Initialize VF map. 7659 */ 7660 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7661 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7662 hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF, 7663 M_WAITOK | M_ZERO); 7664 7665 /* 7666 * Fix the # of TX taskqueues. 7667 */ 7668 if (hn_tx_taskq_cnt <= 0) 7669 hn_tx_taskq_cnt = 1; 7670 else if (hn_tx_taskq_cnt > mp_ncpus) 7671 hn_tx_taskq_cnt = mp_ncpus; 7672 7673 /* 7674 * Fix the TX taskqueue mode.
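* Unknown modes are reset to independent per-device taskqueues.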
7675 */ 7676 switch (hn_tx_taskq_mode) { 7677 case HN_TX_TASKQ_M_INDEP: 7678 case HN_TX_TASKQ_M_GLOBAL: 7679 case HN_TX_TASKQ_M_EVTTQ: 7680 break; 7681 default: 7682 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7683 break; 7684 } 7685 7686 if (vm_guest != VM_GUEST_HV) 7687 return; 7688 7689 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7690 return; 7691 7692 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7693 M_DEVBUF, M_WAITOK); 7694 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7695 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7696 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7697 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7698 "hn tx%d", i); 7699 } 7700 } 7701 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7702 7703 static void 7704 hn_sysuninit(void *arg __unused) 7705 { 7706 7707 if (hn_tx_taskque != NULL) { 7708 int i; 7709 7710 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7711 taskqueue_free(hn_tx_taskque[i]); 7712 free(hn_tx_taskque, M_DEVBUF); 7713 } 7714 7715 if (hn_vfmap != NULL) 7716 free(hn_vfmap, M_DEVBUF); 7717 rm_destroy(&hn_vfmap_lock); 7718 7719 counter_u64_free(hn_udpcs_fixup); 7720 } 7721 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7722