/*
 * Copyright (c) 2008-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

/*
 * NOTE: the original angle-bracket system header names were lost; the list
 * below is reconstructed from the interfaces used in this file.
 */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/ksynch.h>
#include <sys/kstat.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>
#include <sys/ethernet.h>
#include <sys/pattr.h>
#include <sys/byteorder.h>
#include <inet/ip.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"
#include "efx.h"

/* RXQ flush response timeout (in microseconds) */
#define	SFXGE_RX_QFLUSH_USEC	(2000000)

/* RXQ flush tries in case of failure */
#define	SFXGE_RX_QFLUSH_TRIES	(5)

/* RXQ default packet buffer preallocation (number of packet buffers) */
#define	SFXGE_RX_QPREALLOC	(0)

/* Receive packet DMA attributes */
static ddi_device_acc_attr_t sfxge_rx_packet_devacc = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rx_packet_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version */
	0,			/* dma_attr_addr_lo */
	0xffffffffffffffffull,	/* dma_attr_addr_hi */
	0xffffffffffffffffull,	/* dma_attr_count_max */
	SFXGE_CPU_CACHE_SIZE,	/* dma_attr_align */
	0xffffffff,		/* dma_attr_burstsizes */
	1,			/* dma_attr_minxfer */
	0xffffffffffffffffull,	/* dma_attr_maxxfer */
	0xffffffffffffffffull,	/* dma_attr_seg */
	1,			/* dma_attr_sgllen */
	1,			/* dma_attr_granular */
	0			/* dma_attr_flags */
};

/* Receive queue DMA attributes */
static ddi_device_acc_attr_t sfxge_rxq_devacc = {
	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rxq_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version */
	0,			/* dma_attr_addr_lo */
	0xffffffffffffffffull,	/* dma_attr_addr_hi */
	0xffffffffffffffffull,	/* dma_attr_count_max */
	EFX_BUF_SIZE,		/* dma_attr_align */
	0xffffffff,		/* dma_attr_burstsizes */
	1,			/* dma_attr_minxfer */
	0xffffffffffffffffull,	/* dma_attr_maxxfer */
	0xffffffffffffffffull,	/* dma_attr_seg */
	1,			/* dma_attr_sgllen */
	1,			/* dma_attr_granular */
	0			/* dma_attr_flags */
};

/* Forward declaration */
static void sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc);
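/*
 * Receive packets are kmem cache objects. The constructor allocates the
 * DMA handle once per object, so the handle persists for the lifetime of
 * the cached object instead of being recreated on every packet allocation.
 */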
static int
sfxge_rx_packet_ctor(void *buf, void *arg, int kmflags)
{
	sfxge_rx_packet_t *srpp = buf;
	sfxge_t *sp = arg;
	dev_info_t *dip = sp->s_dip;
	int err;

	ASSERT3U(sizeof (srpp->__srp_u1.__srp_s1), <=,
	    sizeof (srpp->__srp_u1.__srp_pad));
	ASSERT3U(sizeof (srpp->__srp_u2.__srp_s2), <=,
	    sizeof (srpp->__srp_u2.__srp_pad));

	bzero(buf, sizeof (sfxge_rx_packet_t));

	/* Allocate a DMA handle */
	err = ddi_dma_alloc_handle(dip, &sfxge_rx_packet_dma_attr,
	    (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
	    NULL, &(srpp->srp_dma_handle));
	if (err != DDI_SUCCESS)
		goto fail1;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, err);

	SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);

	return (-1);
}

static void
sfxge_rx_packet_dtor(void *buf, void *arg)
{
	sfxge_rx_packet_t *srpp = buf;

	_NOTE(ARGUNUSED(arg))

	/* Free the DMA handle */
	ddi_dma_free_handle(&(srpp->srp_dma_handle));
	srpp->srp_dma_handle = NULL;

	SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
}

static int
sfxge_rx_qctor(void *buf, void *arg, int kmflags)
{
	sfxge_rxq_t *srp = buf;
	efsys_mem_t *esmp = &(srp->sr_mem);
	sfxge_t *sp = arg;
	sfxge_dma_buffer_attr_t dma_attr;
	sfxge_rx_fpp_t *srfppp;
	int nprealloc;
	unsigned int id;
	int rc;

	/* Compile-time structure layout checks */
	EFX_STATIC_ASSERT(sizeof (srp->__sr_u1.__sr_s1) <=
	    sizeof (srp->__sr_u1.__sr_pad));
	EFX_STATIC_ASSERT(sizeof (srp->__sr_u2.__sr_s2) <=
	    sizeof (srp->__sr_u2.__sr_pad));
	EFX_STATIC_ASSERT(sizeof (srp->__sr_u3.__sr_s3) <=
	    sizeof (srp->__sr_u3.__sr_pad));

	bzero(buf, sizeof (sfxge_rxq_t));

	srp->sr_sp = sp;

	dma_attr.sdba_dip = sp->s_dip;
	dma_attr.sdba_dattrp = &sfxge_rxq_dma_attr;
	dma_attr.sdba_callback = DDI_DMA_SLEEP;
	dma_attr.sdba_length = EFX_RXQ_SIZE(sp->s_rxq_size);
	dma_attr.sdba_memflags = DDI_DMA_CONSISTENT;
	dma_attr.sdba_devaccp = &sfxge_rxq_devacc;
	dma_attr.sdba_bindflags = DDI_DMA_READ | DDI_DMA_CONSISTENT;
	dma_attr.sdba_maxcookies = 1;
	dma_attr.sdba_zeroinit = B_FALSE;

	if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
		goto fail1;

	/* Allocate some buffer table entries */
	if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_RXQ_NBUFS(sp->s_rxq_size),
	    &(srp->sr_id))) != 0)
		goto fail2;

	/* Allocate the context array */
	if ((srp->sr_srpp = kmem_zalloc(sizeof (sfxge_rx_packet_t *) *
	    sp->s_rxq_size, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail3;
	}

	/* Allocate the flow table */
	if ((srp->sr_flow = kmem_zalloc(sizeof (sfxge_rx_flow_t) *
	    SFXGE_MAX_FLOW, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail4;
	}

	srp->sr_srfpp = &(srp->sr_srfp);
	srp->sr_rto = drv_usectohz(200000);

	srp->sr_mpp = &(srp->sr_mp);

	/* Initialize the free packet pool */
	srfppp = &(srp->sr_fpp);
	if ((srfppp->srfpp_putp = kmem_zalloc(SFXGE_CPU_CACHE_SIZE *
	    SFXGE_RX_FPP_NSLOTS, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail5;
	}
	for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
		sfxge_rx_fpp_putlist_t *putp;
		size_t off;

		off = id * SFXGE_CPU_CACHE_SIZE;
		putp = (void *)(srfppp->srfpp_putp + off);
		putp->srfpl_putp = NULL;
		putp->srfpl_putpp = &(putp->srfpl_putp);
		mutex_init(&(putp->srfpl_lock), NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
	}

	cv_init(&(srp->sr_flush_kv), NULL, CV_DRIVER, NULL);

	/* Preallocate some packets on the free packet pool */
	nprealloc = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_prealloc_pkt_buffers", SFXGE_RX_QPREALLOC);
	sfxge_rx_qpreallocate(srp, nprealloc);

	return (0);

fail5:
	DTRACE_PROBE(fail5);
	srp->sr_mpp = NULL;

	srp->sr_rto = 0;
	srp->sr_srfpp = NULL;

	/* Free the flow table */
	kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
	    SFXGE_MAX_FLOW);
	srp->sr_flow = NULL;

fail4:
	DTRACE_PROBE(fail4);

	/* Free the context array */
	kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
	    sp->s_rxq_size);
	srp->sr_srpp = NULL;

fail3:
	DTRACE_PROBE(fail3);

	/* Free the buffer table entries */
	sfxge_sram_buf_tbl_free(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));
	srp->sr_id = 0;

fail2:
	DTRACE_PROBE(fail2);

	/* Remove dma setup */
	sfxge_dma_buffer_destroy(esmp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	srp->sr_sp = NULL;

	SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);

	return (-1);
}

static void
sfxge_rx_qdtor(void *buf, void *arg)
{
	sfxge_rxq_t *srp = buf;
	efsys_mem_t *esmp = &(srp->sr_mem);
	sfxge_t *sp = srp->sr_sp;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int id;

	_NOTE(ARGUNUSED(arg))

	cv_destroy(&(srp->sr_flush_kv));

	/* Tear down the free packet pool */
	for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
		sfxge_rx_fpp_putlist_t *putp;
		size_t off;

		off = id * SFXGE_CPU_CACHE_SIZE;
		putp = (void *)(srfppp->srfpp_putp + off);
		putp->srfpl_putpp = NULL;
		mutex_destroy(&(putp->srfpl_lock));

		SFXGE_OBJ_CHECK(putp, sfxge_rx_fpp_putlist_t);
	}
	kmem_free(srfppp->srfpp_putp, SFXGE_CPU_CACHE_SIZE *
	    SFXGE_RX_FPP_NSLOTS);
	srfppp->srfpp_putp = NULL;

	srp->sr_mpp = NULL;

	srp->sr_rto = 0;
	srp->sr_srfpp = NULL;

	/* Free the flow table */
	kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
	    SFXGE_MAX_FLOW);
	srp->sr_flow = NULL;

	/* Free the context array */
	kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
	    sp->s_rxq_size);
	srp->sr_srpp = NULL;

	/* Free the buffer table entries */
	sfxge_sram_buf_tbl_free(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));
	srp->sr_id = 0;

	/* Tear down dma setup */
	sfxge_dma_buffer_destroy(esmp);

	SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
}
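/*
 * Free packet pool (FPP): recycled packets are returned on one of
 * SFXGE_RX_FPP_NSLOTS put lists, selected by the caller's CPU and padded
 * to SFXGE_CPU_CACHE_SIZE so that each slot occupies its own cache line.
 * The put lists are periodically "swizzled" onto a single get list under
 * the event queue lock; srfpp_loaned counts packets still out with the
 * stack.
 */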
/* Note: This function takes ownership of *srpp. */
static inline void
sfxge_rx_qfpp_put(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	mblk_t *mp = srpp->srp_mp;
	unsigned int id;
	size_t off;
	sfxge_rx_fpp_putlist_t *putp;

	ASSERT3P(mp->b_next, ==, NULL);
	ASSERT3P(mp->b_prev, ==, NULL);

	id = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
	off = id * SFXGE_CPU_CACHE_SIZE;

	ASSERT3P(srpp->srp_putp, ==, srfppp->srfpp_putp);
	putp = (void *)(srpp->srp_putp + off);

	mutex_enter(&(putp->srfpl_lock));
	putp->srfpl_count++;
	*putp->srfpl_putpp = mp;
	putp->srfpl_putpp = &(mp->b_next);
	mutex_exit(&(putp->srfpl_lock));
}

static unsigned int
sfxge_rx_qfpp_swizzle(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int start;
	unsigned int id;
	mblk_t *p;
	mblk_t **pp;
	unsigned int count;
	unsigned int loaned;

	ASSERT(mutex_owned(&(sep->se_lock)));

	/* We want to access the put list for the current CPU last */
	id = start = (CPU->cpu_seqid + 1) & SFXGE_RX_FPP_MASK;

	do {
		sfxge_rx_fpp_putlist_t *putp;
		size_t off;

		off = id * SFXGE_CPU_CACHE_SIZE;
		id = (id + 1) & SFXGE_RX_FPP_MASK;

		putp = (void *)(srfppp->srfpp_putp + off);

		/* Acquire the put list */
		mutex_enter(&(putp->srfpl_lock));
		p = putp->srfpl_putp;
		pp = putp->srfpl_putpp;
		count = putp->srfpl_count;
		putp->srfpl_putp = NULL;
		putp->srfpl_putpp = &(putp->srfpl_putp);
		putp->srfpl_count = 0;
		mutex_exit(&(putp->srfpl_lock));

		if (p == NULL)
			continue;

		/* Add the list to the head of the get list */
		*pp = srfppp->srfpp_get;
		srfppp->srfpp_get = p;

		/* Adjust the counters */
		ASSERT3U(srfppp->srfpp_loaned, >=, count);
		srfppp->srfpp_loaned -= count;
		srfppp->srfpp_count += count;

#if 0
		/* NOTE: this probe is disabled because it is expensive!! */
		DTRACE_PROBE2(count,
		    unsigned int, (id - 1) & SFXGE_RX_FPP_MASK,
		    unsigned int, count);
#endif
	} while (id != start);

	/* Return the number of packets yet to appear in the put list */
	loaned = srfppp->srfpp_loaned;

	return (loaned);
}
#define	DB_FRTNP(mp)	((mp)->b_datap->db_frtnp)

static void
sfxge_rx_qfpp_empty(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rx_fpp_t *srfppp;
	mblk_t *mp;

	mutex_enter(&(sep->se_lock));
	srfppp = &(srp->sr_fpp);

	/* Swizzle put list to get list */
	(void) sfxge_rx_qfpp_swizzle(srp);
	ASSERT3U(srfppp->srfpp_loaned, ==, 0);

	mp = srfppp->srfpp_get;
	srfppp->srfpp_get = NULL;

	/* Free the remainder */
	while (mp != NULL) {
		mblk_t *next;
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;

		next = mp->b_next;
		mp->b_next = NULL;

		ASSERT3U(srfppp->srfpp_count, >, 0);
		srfppp->srfpp_count--;

		freep = DB_FRTNP(mp);
		/*
		 * ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
		 * is implied by the srpp test below
		 */
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, mp);
		ASSERT3P(mp->b_cont, ==, NULL);
		srpp->srp_recycle = B_FALSE;

		freeb(mp);

		mp = next;
	}
	ASSERT3U(srfppp->srfpp_count, ==, 0);

	srfppp->srfpp_min = 0;

	mutex_exit(&(sep->se_lock));
}

/*
 * This is an estimate of all memory consumed per RX packet;
 * it can be inaccurate, but sp->s_rx_pkt_mem_alloc mustn't drift.
 */
static uint64_t
sfxge_rx_pkt_mem_approx(const sfxge_rx_packet_t *srpp)
{
	return (srpp->srp_mblksize + sizeof (mblk_t) + sizeof (dblk_t) +
	    sizeof (sfxge_rx_packet_t));
}

static void
sfxge_rx_qpacket_destroy(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
	sfxge_t *sp = srp->sr_sp;
	int64_t delta = sfxge_rx_pkt_mem_approx(srpp);

	ASSERT(!(srpp->srp_recycle));
	ASSERT3P(srpp->srp_mp, ==, NULL);

	srpp->srp_off = 0;
	srpp->srp_thp = NULL;
	srpp->srp_iphp = NULL;
	srpp->srp_etherhp = NULL;
	srpp->srp_size = 0;
	srpp->srp_flags = 0;

	bzero(&(srpp->srp_free), sizeof (frtn_t));

	srpp->srp_mblksize = 0;
	srpp->srp_base = NULL;

	/* Unbind the DMA memory from the DMA handle */
	srpp->srp_addr = 0;
	(void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

	/* Free the DMA memory */
	srpp->srp_base = NULL;
	ddi_dma_mem_free(&(srpp->srp_acc_handle));
	srpp->srp_acc_handle = NULL;

	srpp->srp_putp = NULL;
	srpp->srp_srp = NULL;

	kmem_cache_free(sp->s_rpc, srpp);
	if (sp->s_rx_pkt_mem_max)
		atomic_add_64(&sp->s_rx_pkt_mem_alloc, -delta);
}
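/*
 * desballoc free callback: runs synchronously from freeb() when the stack
 * releases a loaned packet. If recycling is enabled, a replacement mblk is
 * attached to the same buffer and the packet goes back on the free packet
 * pool; otherwise the packet is destroyed.
 */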
static void
sfxge_rx_qpacket_free(void *arg)
{
	sfxge_rx_packet_t *srpp = arg;
	sfxge_rxq_t *srp = srpp->srp_srp;

	/*
	 * WARNING "man -s 9f esballoc" states:
	 * => runs sync from the thread calling freeb()
	 * => must not sleep, or access data structures that could be freed
	 */

	/* Check whether we want to recycle the receive packets */
	if (srpp->srp_recycle) {
		frtn_t *freep;
		mblk_t *mp;
		size_t size;

		freep = &(srpp->srp_free);
		ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
		ASSERT3P(freep->free_arg, ==, (caddr_t)srpp);

		/*
		 * Allocate a matching mblk_t before the current one is
		 * freed.
		 */
		size = srpp->srp_mblksize;
		if ((mp = desballoc(srpp->srp_base, size, BPRI_HI,
		    freep)) != NULL) {
			srpp->srp_mp = mp;

			/* NORMAL recycled case */
			sfxge_rx_qfpp_put(srp, srpp);
			return;
		}
	}

	srpp->srp_mp = NULL;

	sfxge_rx_qpacket_destroy(srp, srpp);
}

static sfxge_rx_packet_t *
sfxge_rx_qpacket_create(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	sfxge_rx_packet_t *srpp;
	size_t size;
	caddr_t base;
	size_t unit;
	ddi_dma_cookie_t dmac;
	unsigned int ncookies;
	frtn_t *freep;
	mblk_t *mp;
	int err;
	int rc;

	size = sp->s_rx_buffer_size;

	if (sp->s_rx_pkt_mem_max &&
	    (sp->s_rx_pkt_mem_alloc + size >= sp->s_rx_pkt_mem_max)) {
		DTRACE_PROBE(rx_pkt_mem_max);
		srp->sr_kstat.srk_rx_pkt_mem_limit++;
		return (NULL);
	}

	/* Allocate a new packet */
	if ((srpp = kmem_cache_alloc(sp->s_rpc, KM_NOSLEEP)) == NULL) {
		srp->sr_kstat.srk_kcache_alloc_nomem++;
		rc = ENOMEM;
		goto fail1;
	}

	srpp->srp_srp = srp;
	srpp->srp_putp = srfppp->srfpp_putp;

	/* Allocate some DMA memory */
	err = ddi_dma_mem_alloc(srpp->srp_dma_handle, size,
	    &sfxge_rx_packet_devacc, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    NULL, &base, &unit, &(srpp->srp_acc_handle));
	switch (err) {
	case DDI_SUCCESS:
		break;

	case DDI_FAILURE:
		srp->sr_kstat.srk_dma_alloc_nomem++;
		rc = ENOMEM;
		goto fail2;

	default:
		srp->sr_kstat.srk_dma_alloc_fail++;
		rc = EFAULT;
		goto fail2;
	}

	/* Adjust the buffer to align the start of the DMA area correctly */
	base += sp->s_rx_buffer_align;
	size -= sp->s_rx_buffer_align;

	/* Bind the DMA memory to the DMA handle */
	err = ddi_dma_addr_bind_handle(srpp->srp_dma_handle, NULL,
	    base, size, DDI_DMA_READ | DDI_DMA_STREAMING,
	    DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
	switch (err) {
	case DDI_DMA_MAPPED:
		break;

	case DDI_DMA_INUSE:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = EEXIST;
		goto fail3;

	case DDI_DMA_NORESOURCES:
		srp->sr_kstat.srk_dma_bind_nomem++;
		rc = ENOMEM;
		goto fail3;

	case DDI_DMA_NOMAPPING:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = ENOTSUP;
		goto fail3;

	case DDI_DMA_TOOBIG:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = EFBIG;
		goto fail3;

	default:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = EFAULT;
		goto fail3;
	}
	ASSERT3U(ncookies, ==, 1);

	srpp->srp_addr = dmac.dmac_laddress;

	srpp->srp_base = (unsigned char *)base;
	srpp->srp_mblksize = size;

	/*
	 * Allocate a STREAMS block: We use size 1 so that the allocator will
	 * use the first (and smallest) dblk cache.
	 */
	freep = &(srpp->srp_free);
	freep->free_func = sfxge_rx_qpacket_free;
	freep->free_arg = (caddr_t)srpp;

	if ((mp = desballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
		srp->sr_kstat.srk_desballoc_fail++;
		rc = ENOMEM;
		goto fail4;
	}

	srpp->srp_mp = mp;
	srpp->srp_recycle = B_TRUE;

	if (sp->s_rx_pkt_mem_max) {
		int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
		atomic_add_64(&sp->s_rx_pkt_mem_alloc, delta);
	}

	return (srpp);

fail4:
	DTRACE_PROBE(fail4);

	bzero(&(srpp->srp_free), sizeof (frtn_t));

	srpp->srp_mblksize = 0;
	srpp->srp_base = NULL;

	/* Unbind the DMA memory from the DMA handle */
	srpp->srp_addr = 0;
	(void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

fail3:
	DTRACE_PROBE(fail3);

	/* Free the DMA memory */
	ddi_dma_mem_free(&(srpp->srp_acc_handle));
	srpp->srp_acc_handle = NULL;

fail2:
	DTRACE_PROBE(fail2);

	srpp->srp_putp = NULL;
	srpp->srp_srp = NULL;

	kmem_cache_free(sp->s_rpc, srpp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (NULL);
}
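/*
 * Descriptors are posted to the hardware ring in batches of up to
 * SFXGE_REFILL_BATCH addresses per efx_rx_qpost() call, followed by a
 * single doorbell push (efx_rx_qpush()) once the batch loop completes.
 */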
#define	SFXGE_REFILL_BATCH	64

/* Try to refill the RX descriptor ring from the associated free pkt pool */
static void
sfxge_rx_qrefill(sfxge_rxq_t *srp, unsigned int target)
{
	sfxge_t *sp = srp->sr_sp;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
	mblk_t *mp;
	int ntodo;
	unsigned int count;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;

	prefetch_read_many(sp->s_enp);
	prefetch_read_many(srp->sr_erp);

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = srp->sr_added - srp->sr_completed;
	ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
	ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
	ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

	if (ntodo == 0)
		goto out;

	(void) sfxge_rx_qfpp_swizzle(srp);

	mp = srfppp->srfpp_get;
	count = srfppp->srfpp_count;
	mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

	batch = 0;
	while (ntodo-- > 0) {
		mblk_t *next;
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;
		unsigned int id;

		if (mp == NULL)
			break;

		next = mp->b_next;
		mp->b_next = NULL;

		if (next != NULL)
			prefetch_read_many(next);

		freep = DB_FRTNP(mp);
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, mp);

		/* The MTU may have changed since the packet was allocated */
		if (MBLKSIZE(mp) != mblksize) {
			srpp->srp_recycle = B_FALSE;

			freeb(mp);

			--count;
			mp = next;
			continue;
		}

		srpp->srp_off = 0;
		srpp->srp_thp = NULL;
		srpp->srp_iphp = NULL;
		srpp->srp_etherhp = NULL;
		srpp->srp_size = 0;
		srpp->srp_flags = EFX_DISCARD;

		id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
		ASSERT(srp->sr_srpp[id] == NULL);
		srp->sr_srpp[id] = srpp;

		addr[batch++] = srpp->srp_addr;
		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
			    srp->sr_completed, srp->sr_added);
			srp->sr_added += batch;
			batch = 0;
		}

		--count;
		mp = next;
	}

	srfppp->srfpp_get = mp;
	srfppp->srfpp_count = count;

	if (batch != 0) {
		efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
		    srp->sr_completed, srp->sr_added);
		srp->sr_added += batch;
	}

	efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);

out:
	if (srfppp->srfpp_count < srfppp->srfpp_min)
		srfppp->srfpp_min = srfppp->srfpp_count;
}

/* Preallocate packets and put them in the free packet pool */
static void
sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc)
{
	sfxge_rx_fpp_t *srfppp = &((srp)->sr_fpp);

	srfppp->srfpp_lowat = nprealloc;
	while (nprealloc-- > 0) {
		sfxge_rx_packet_t *srpp;

		if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
			break;
		sfxge_rx_qfpp_put(srp, srpp);
	}
}
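/*
 * Unlike sfxge_rx_qrefill(), which reuses packets from the free packet
 * pool, this path allocates brand new packets (DMA memory and all) to top
 * up the descriptor ring when the pool runs dry.
 */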
/* Try to refill the RX descriptor ring by allocating new packets */
static void
sfxge_rx_qfill(sfxge_rxq_t *srp, unsigned int target)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
	mblk_t *mp = NULL;

	prefetch_read_many(sp->s_enp);
	prefetch_read_many(srp->sr_erp);

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = srp->sr_added - srp->sr_completed;
	ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
	ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
	ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

	if (ntodo == 0)
		return;

	mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

	batch = 0;
	while (ntodo-- > 0) {
		sfxge_rx_packet_t *srpp;
		unsigned int id;

		if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
			break;

		mp = srpp->srp_mp;

		ASSERT3U(MBLKSIZE(mp), ==, mblksize);

		ASSERT3U(srpp->srp_off, ==, 0);
		ASSERT3P(srpp->srp_thp, ==, NULL);
		ASSERT3P(srpp->srp_iphp, ==, NULL);
		ASSERT3P(srpp->srp_etherhp, ==, NULL);
		ASSERT3U(srpp->srp_size, ==, 0);

		srpp->srp_flags = EFX_DISCARD;

		id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
		ASSERT(srp->sr_srpp[id] == NULL);
		srp->sr_srpp[id] = srpp;

		addr[batch++] = srpp->srp_addr;
		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
			    srp->sr_completed, srp->sr_added);
			srp->sr_added += batch;
			batch = 0;
		}
	}

	if (batch != 0) {
		efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
		    srp->sr_completed, srp->sr_added);
		srp->sr_added += batch;
	}

	efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);
}

void
sfxge_rx_qfpp_trim(sfxge_rxq_t *srp)
{
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	mblk_t *p;
	mblk_t **pp;
	int count;

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		goto done;

	/* Make sure the queue is full */
	sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

	/* The refill may have emptied the pool */
	if (srfppp->srfpp_min == 0)
		goto done;

	/* Don't trim below the pool's low water mark */
	if (srfppp->srfpp_count <= srfppp->srfpp_lowat)
		goto done;

	ASSERT(srfppp->srfpp_min <= srfppp->srfpp_count);

	/* Trim to the largest of srfppp->srfpp_min and srfppp->srfpp_lowat */
	if (srfppp->srfpp_lowat > srfppp->srfpp_min)
		count = srfppp->srfpp_count - srfppp->srfpp_lowat;
	else
		count = srfppp->srfpp_count - srfppp->srfpp_min;

	/* Walk the get list */
	pp = &(srfppp->srfpp_get);
	while (--count >= 0) {
		ASSERT(pp);
		p = *pp;
		ASSERT(p != NULL);

		pp = &(p->b_next);
	}
	ASSERT(pp);
	p = *pp;

	/* Truncate the get list */
	*pp = NULL;

	/* Free the remainder */
	while (p != NULL) {
		mblk_t *next;
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;

		next = p->b_next;
		p->b_next = NULL;

		ASSERT3U(srfppp->srfpp_min, >, 0);
		srfppp->srfpp_min--;
		srfppp->srfpp_count--;

		freep = DB_FRTNP(p);
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, p);

		srpp->srp_recycle = B_FALSE;

		freeb(p);

		p = next;
	}

done:
	srfppp->srfpp_min = srfppp->srfpp_count;
}
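/*
 * Periodic timeout(9f) handler: rather than trimming the pool directly
 * (which would require taking the event queue lock in timer context), it
 * posts a software "magic" event to the RXQ's event queue, so that the
 * trim runs in event processing context, and then reschedules itself.
 */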
static void
sfxge_rx_qpoll(void *arg)
{
	sfxge_rxq_t *srp = arg;
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	uint16_t magic;

	/*
	 * man timeout(9f) states that this code should adhere to the
	 * same requirements as a softirq handler - DO NOT BLOCK
	 */

	/*
	 * Post an event to the event queue to cause the free packet pool
	 * to be trimmed if it is oversize.
	 */
	magic = SFXGE_MAGIC_RX_QFPP_TRIM | index;

#if defined(DEBUG)
	/* This is guaranteed due to the start/stop order of rx and ev */
	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
#else
	/*
	 * Bug22691 WORKAROUND:
	 * This handler has been observed in the field to be invoked for a
	 * queue in the INITIALIZED state, which should never happen.
	 * Until the mechanism for this is properly understood, add defensive
	 * checks.
	 */
	if ((sep->se_state != SFXGE_EVQ_STARTED) ||
	    (srp->sr_state != SFXGE_RXQ_STARTED) ||
	    (!sep->se_eep)) {
		dev_err(sp->s_dip, CE_WARN, SFXGE_CMN_ERR
		    "RXQ[%d] bad state in sfxge_rx_qpoll %d %d %p",
		    index, sep->se_state, srp->sr_state, sep->se_eep);
		return;
	}
#endif
	efx_ev_qpost(sep->se_eep, magic);

	srp->sr_tid = timeout(sfxge_rx_qpoll, srp,
	    drv_usectohz(sp->s_rxq_poll_usec));
}

static void
sfxge_rx_qpoll_start(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];

	ASSERT(mutex_owned(&(sep->se_lock)));
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

	/* Schedule a poll */
	ASSERT3P(srp->sr_tid, ==, 0);
	srp->sr_tid = timeout(sfxge_rx_qpoll, srp, 0);
}

static void
sfxge_rx_qpoll_stop(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	timeout_id_t tid;

	ASSERT(mutex_owned(&(sep->se_lock)));
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

	/*
	 * Cancel the qpoll timer. Care is needed as this function
	 * can race with sfxge_rx_qpoll() for timeout id updates.
	 *
	 * Do not hold locks used by any timeout(9f) handlers across
	 * calls to untimeout(9f) as this will deadlock.
	 */
	tid = 0;
	while ((srp->sr_tid != 0) && (srp->sr_tid != tid)) {
		tid = srp->sr_tid;
		(void) untimeout(tid);
	}
	srp->sr_tid = 0;
}

static int
sfxge_rx_kstat_update(kstat_t *ksp, int rw)
{
	sfxge_rxq_t *srp = ksp->ks_private;
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	kstat_named_t *knp;
	int rc;

	if (rw != KSTAT_READ) {
		rc = EACCES;
		goto fail1;
	}

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		goto done;

	knp = ksp->ks_data;
	/* NB pointer post-increment below */
	knp++->value.ui32 = srp->sr_kstat.srk_rx_pkt_mem_limit;
	knp++->value.ui32 = srp->sr_kstat.srk_kcache_alloc_nomem;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_nomem;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_fail;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_nomem;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_fail;
	knp++->value.ui32 = srp->sr_kstat.srk_desballoc_fail;
	knp++->value.ui32 = srp->sr_kstat.srk_rxq_empty_discard;

done:
	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_rx_kstat_init(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	dev_info_t *dip = sp->s_dip;
	char name[MAXNAMELEN];
	kstat_t *ksp;
	kstat_named_t *knp;
	int rc;

	/* Create the set */
	(void) snprintf(name, MAXNAMELEN - 1, "%s_rxq%04d",
	    ddi_driver_name(dip), index);

	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
	    ddi_get_instance(dip), name, "rxq", KSTAT_TYPE_NAMED,
	    SFXGE_RX_NSTATS, 0)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}

	srp->sr_ksp = ksp;

	ksp->ks_update = sfxge_rx_kstat_update;
	ksp->ks_private = srp;
	ksp->ks_lock = &(sep->se_lock);

	/* Initialise the named stats */
	knp = ksp->ks_data;
	kstat_named_init(knp, "rx_pkt_mem_limit", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "kcache_alloc_nomem", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_alloc_nomem", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_alloc_fail", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_bind_nomem", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_bind_fail", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "desballoc_fail", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "rxq_empty_discard", KSTAT_DATA_UINT32);

	kstat_install(ksp);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}
static int
sfxge_rx_qinit(sfxge_t *sp, unsigned int index)
{
	sfxge_rxq_t *srp;
	int rc;

	ASSERT3U(index, <, SFXGE_RX_SCALE_MAX);

	if ((srp = kmem_cache_alloc(sp->s_rqc, KM_SLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_UNINITIALIZED);

	srp->sr_index = index;
	sp->s_srp[index] = srp;

	if ((rc = sfxge_rx_kstat_init(srp)) != 0)
		goto fail2;

	srp->sr_state = SFXGE_RXQ_INITIALIZED;

	return (0);

fail2:
	DTRACE_PROBE(fail2);
	kmem_cache_free(sp->s_rqc, srp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_rx_qstart(sfxge_t *sp, unsigned int index)
{
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rxq_t *srp;
	efsys_mem_t *esmp;
	efx_nic_t *enp;
	unsigned int level;
	int rc;

	mutex_enter(&(sep->se_lock));
	srp = sp->s_srp[index];
	enp = sp->s_enp;
	esmp = &(srp->sr_mem);

	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

	/* Zero the memory */
	bzero(esmp->esm_base, EFX_RXQ_SIZE(sp->s_rxq_size));

	/* Program the buffer table */
	if ((rc = sfxge_sram_buf_tbl_set(sp, srp->sr_id, esmp,
	    EFX_RXQ_NBUFS(sp->s_rxq_size))) != 0)
		goto fail1;

	/* Create the receive queue */
	if ((rc = efx_rx_qcreate(enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sp->s_rxq_size, srp->sr_id, sep->se_eep,
	    &(srp->sr_erp))) != 0)
		goto fail2;

	/* Enable the receive queue */
	efx_rx_qenable(srp->sr_erp);

	/* Set the water marks */
	srp->sr_hiwat = EFX_RXQ_LIMIT(sp->s_rxq_size) * 9 / 10;
	srp->sr_lowat = srp->sr_hiwat / 2;

	srp->sr_state = SFXGE_RXQ_STARTED;
	srp->sr_flush = SFXGE_FLUSH_INACTIVE;

	sfxge_rx_qpoll_start(srp);

	/* Try to fill the queue from the pool */
	sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

	/*
	 * If there were insufficient buffers in the pool to reach at least
	 * a batch then allocate some.
	 */
	level = srp->sr_added - srp->sr_completed;
	if (level < SFXGE_RX_BATCH)
		sfxge_rx_qfill(srp, SFXGE_RX_BATCH);

	mutex_exit(&(sep->se_lock));

	return (0);

fail2:
	DTRACE_PROBE(fail2);

	/* Clear entries from the buffer table */
	sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(sep->se_lock));

	return (rc);
}
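/*
 * Software receive-side TCP coalescing (LRO). Packets belonging to the
 * same TCP flow are chained via b_cont into a single large segment before
 * delivery. When a flow is completed, the IP total length and the first
 * TCP header's ack/window/flags are patched up to describe the merged
 * segment, and it is appended to the queue's delivery chain.
 */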
static void
sfxge_rx_qflow_complete(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp)
{
	mblk_t *mp;
	struct ether_header *etherhp;
	struct ip *iphp;
	struct tcphdr *thp;

	if (srfp->srf_mp == NULL)
		return;

	mp = srfp->srf_mp;
	etherhp = srfp->srf_etherhp;
	iphp = srfp->srf_iphp;
	thp = srfp->srf_last_thp;

	ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
	    sizeof (struct ether_vlan_header) :
	    sizeof (struct ether_header)) +
	    srfp->srf_len, ==, msgdsize(mp));

	ASSERT3U(srfp->srf_len & 0xffff, ==, srfp->srf_len);
	iphp->ip_len = htons(srfp->srf_len);

	srfp->srf_first_thp->th_ack = thp->th_ack;
	srfp->srf_first_thp->th_win = thp->th_win;
	srfp->srf_first_thp->th_flags = thp->th_flags;

	DTRACE_PROBE2(flow_complete, uint32_t, srfp->srf_tag,
	    size_t, srfp->srf_len);

	srfp->srf_mp = NULL;
	srfp->srf_len = 0;

	ASSERT(mp->b_next == NULL);
	*(srp->sr_mpp) = mp;
	srp->sr_mpp = &(mp->b_next);
}
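/*
 * Append a packet to a flow, enforcing a TCP-like slow start: segments
 * must arrive strictly in sequence, and only once srf_count reaches
 * SFXGE_SLOW_START does merging begin. The merged segment size limit then
 * scales with the in-order count, from 4KB up to a maximum of 64KB.
 */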
static boolean_t
sfxge_rx_qflow_add(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp,
    sfxge_rx_packet_t *srpp, clock_t now)
{
	sfxge_t *sp = srp->sr_sp;
	struct ether_header *etherhp = srpp->srp_etherhp;
	struct ip *iphp = srpp->srp_iphp;
	struct tcphdr *thp = srpp->srp_thp;
	size_t off = srpp->srp_off;
	size_t size = (size_t)(srpp->srp_size);
	mblk_t *mp = srpp->srp_mp;
	uint32_t seq;
	unsigned int shift;

	ASSERT3U(MBLKL(mp), ==, off + size);
	ASSERT3U(DB_CKSUMFLAGS(mp), ==,
	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK | HCK_IPV4_HDRCKSUM);

	seq = htonl(thp->th_seq);

	/*
	 * If the time between this segment and the last is greater than RTO
	 * then consider this a new flow.
	 */
	if (now - srfp->srf_lbolt > srp->sr_rto) {
		srfp->srf_count = 1;
		srfp->srf_seq = seq + size;

		goto fail1;
	}

	if (seq != srfp->srf_seq) {
		if (srfp->srf_count > SFXGE_SLOW_START)
			srfp->srf_count = SFXGE_SLOW_START;

		srfp->srf_count >>= 1;

		srfp->srf_count++;
		srfp->srf_seq = seq + size;

		goto fail2;
	}

	/* Update the in-order segment count and sequence number */
	srfp->srf_count++;
	srfp->srf_seq = seq + size;

	/* Don't merge across pure ACK, URG, SYN or RST segments */
	if (size == 0 || thp->th_flags & (TH_URG | TH_SYN | TH_RST) ||
	    thp->th_urp != 0)
		goto fail3;

	/*
	 * If the in-order segment count has not yet reached the slow-start
	 * threshold then we cannot coalesce.
	 */
	if (srfp->srf_count < SFXGE_SLOW_START)
		goto fail4;

	/* Scale up the packet size from 4k (the maximum being 64k) */
	ASSERT3U(srfp->srf_count, >=, SFXGE_SLOW_START);
	shift = MIN(srfp->srf_count - SFXGE_SLOW_START + 12, 16);
	if (srfp->srf_len + size >= (1 << shift))
		sfxge_rx_qflow_complete(srp, srfp);

	ASSERT(mp->b_cont == NULL);

	if (srfp->srf_mp == NULL) {
		/* First packet in this flow */
		srfp->srf_etherhp = etherhp;
		srfp->srf_iphp = iphp;
		srfp->srf_first_thp = srfp->srf_last_thp = thp;

		ASSERT3P(mp->b_cont, ==, NULL);
		srfp->srf_mp = mp;
		srfp->srf_mpp = &(mp->b_cont);

		srfp->srf_len = ntohs(iphp->ip_len);

		/*
		 * If the flow is not already in the list of occupied flows
		 * then add it.
		 */
		if (srfp->srf_next == NULL &&
		    srp->sr_srfpp != &(srfp->srf_next)) {
			*(srp->sr_srfpp) = srfp;
			srp->sr_srfpp = &(srfp->srf_next);
		}
	} else {
		/* Later packet in this flow - skip TCP header */
		srfp->srf_last_thp = thp;

		mp->b_rptr += off;
		ASSERT3U(MBLKL(mp), ==, size);

		ASSERT3P(mp->b_cont, ==, NULL);
		*(srfp->srf_mpp) = mp;
		srfp->srf_mpp = &(mp->b_cont);

		srfp->srf_len += size;

		ASSERT(srfp->srf_next != NULL ||
		    srp->sr_srfpp == &(srfp->srf_next));
	}

	DTRACE_PROBE2(flow_add, uint32_t, srfp->srf_tag, size_t, size);

	/*
	 * Try to align coalesced segments on push boundaries, unless they
	 * are too frequent.
	 */
	if (sp->s_rx_coalesce_mode == SFXGE_RX_COALESCE_ALLOW_PUSH &&
	    thp->th_flags & TH_PUSH)
		sfxge_rx_qflow_complete(srp, srfp);

	srfp->srf_lbolt = now;
	return (B_TRUE);

fail4:
fail3:
fail2:
fail1:
	sfxge_rx_qflow_complete(srp, srfp);

	srfp->srf_lbolt = now;
	return (B_FALSE);
}
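/*
 * Walk the chain of received packets, assigning each coalescable TCP
 * packet to a flow. Flows live in a hash-indexed table: the hash comes
 * from the hardware prefix when present, otherwise it is computed in
 * software, and a non-zero tag derived from it detects stale or aliased
 * table entries. Packets that cannot be coalesced are passed through
 * unchanged.
 */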
void
sfxge_rx_qpacket_coalesce(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	clock_t now;
	mblk_t *mp;
	sfxge_rx_flow_t *srfp;

	ASSERT(sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF);

	now = ddi_get_lbolt();

	mp = srp->sr_mp;

	srp->sr_mp = NULL;
	srp->sr_mpp = &(srp->sr_mp);

	/* Start with the last flow to be appended to */
	srfp = *(srp->sr_srfpp);

	while (mp != NULL) {
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;
		struct ether_header *etherhp;
		struct ip *iphp;
		struct tcphdr *thp;
		size_t off;
		size_t size;
		uint16_t ether_tci;
		uint32_t hash;
		uint32_t tag;
		mblk_t *next;
		sfxge_packet_type_t pkt_type;
		uint16_t sport, dport;

		next = mp->b_next;
		mp->b_next = NULL;

		if (next != NULL)
			prefetch_read_many(next);

		freep = DB_FRTNP(mp);
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, mp);

		/* If the packet is not TCP then we cannot coalesce it */
		if (~(srpp->srp_flags) & EFX_PKT_TCP)
			goto reject;

		/*
		 * If the packet is not fully checksummed then we cannot
		 * coalesce it.
		 */
		if (~(srpp->srp_flags) & (EFX_CKSUM_TCPUDP | EFX_CKSUM_IPV4))
			goto reject;

		/* Parse the TCP header */
		pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp, &off,
		    &size, &sport, &dport);
		ASSERT(pkt_type == SFXGE_PACKET_TYPE_IPV4_TCP);
		ASSERT(etherhp != NULL);
		ASSERT(iphp != NULL);
		ASSERT(thp != NULL);
		ASSERT(off != 0);

		if ((iphp->ip_off & ~htons(IP_DF)) != 0)
			goto reject;

		if (etherhp->ether_type == htons(ETHERTYPE_VLAN)) {
			struct ether_vlan_header *ethervhp;

			ethervhp = (struct ether_vlan_header *)etherhp;
			ether_tci = ethervhp->ether_tci;
		} else {
			ether_tci = 0;
		}

		/*
		 * Make sure any minimum length padding is stripped
		 * before we try to add the packet to a flow.
		 */
		ASSERT3U(sp->s_rx_prefix_size + MBLKL(mp), ==,
		    (size_t)(srpp->srp_size));
		ASSERT3U(sp->s_rx_prefix_size + off + size, <=,
		    (size_t)(srpp->srp_size));

		if (sp->s_rx_prefix_size + off + size <
		    (size_t)(srpp->srp_size))
			mp->b_wptr = mp->b_rptr + off + size;

		/*
		 * If there is no current flow, or the segment does not match
		 * the current flow then we must attempt to look up the
		 * correct flow in the table.
		 */
		if (srfp == NULL)
			goto lookup;

		if (srfp->srf_saddr != iphp->ip_src.s_addr ||
		    srfp->srf_daddr != iphp->ip_dst.s_addr)
			goto lookup;

		if (srfp->srf_sport != thp->th_sport ||
		    srfp->srf_dport != thp->th_dport)
			goto lookup;

		if (srfp->srf_tci != ether_tci)
			goto lookup;

add:
		ASSERT(srfp != NULL);

		srpp->srp_etherhp = etherhp;
		srpp->srp_iphp = iphp;
		srpp->srp_thp = thp;
		srpp->srp_off = off;

		ASSERT3U(size, <, (1 << 16));
		srpp->srp_size = (uint16_t)size;

		/* Try to append the packet to the flow */
		if (!sfxge_rx_qflow_add(srp, srfp, srpp, now))
			goto reject;

		mp = next;
		continue;

lookup:
		/*
		 * If there is a prefix area then read the hash from that,
		 * otherwise calculate it.
		 */
		if (sp->s_rx_prefix_size != 0) {
			hash = efx_psuedo_hdr_hash_get(sp->s_enp,
			    EFX_RX_HASHALG_TOEPLITZ, DB_BASE(mp));
		} else {
			SFXGE_TCP_HASH(sp,
			    &iphp->ip_src.s_addr, thp->th_sport,
			    &iphp->ip_dst.s_addr, thp->th_dport,
			    hash);
		}

		srfp = &(srp->sr_flow[(hash >> 6) % SFXGE_MAX_FLOW]);
		tag = hash + 1; /* Make sure it's not zero */

		/*
		 * If the flow we have found does not match the hash then
		 * it may be an unused flow, or it may be stale.
		 */
		if (tag != srfp->srf_tag) {
			if (srfp->srf_count != 0) {
				if (now - srfp->srf_lbolt <= srp->sr_rto)
					goto reject;
			}

			if (srfp->srf_mp != NULL)
				goto reject;

			/* Start a new flow */
			ASSERT(srfp->srf_next == NULL);

			srfp->srf_tag = tag;

			srfp->srf_saddr = iphp->ip_src.s_addr;
			srfp->srf_daddr = iphp->ip_dst.s_addr;
			srfp->srf_sport = thp->th_sport;
			srfp->srf_dport = thp->th_dport;
			srfp->srf_tci = ether_tci;

			srfp->srf_count = 0;
			srfp->srf_seq = ntohl(thp->th_seq);

			srfp->srf_lbolt = now;
			goto add;
		}

		/*
		 * If the flow we have found does match the hash then it could
		 * still be an alias.
		 */
		if (srfp->srf_saddr != iphp->ip_src.s_addr ||
		    srfp->srf_daddr != iphp->ip_dst.s_addr)
			goto reject;

		if (srfp->srf_sport != thp->th_sport ||
		    srfp->srf_dport != thp->th_dport)
			goto reject;

		if (srfp->srf_tci != ether_tci)
			goto reject;

		goto add;

reject:
		*(srp->sr_mpp) = mp;
		srp->sr_mpp = &(mp->b_next);

		mp = next;
	}
}
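/*
 * Process completed receive descriptors: for each packet, sync the DMA
 * buffer, strip the hardware prefix, set hcksum metadata and chain it for
 * delivery. Packets four slots ahead are prefetched to hide cache misses.
 * Every delivered packet is counted in srfpp_loaned until its free
 * callback returns it to the pool.
 */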
void
sfxge_rx_qcomplete(sfxge_rxq_t *srp, boolean_t eop)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	unsigned int completed;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int level;

	ASSERT(mutex_owned(&(sep->se_lock)));

	ASSERT(srp->sr_mp == NULL);
	ASSERT(srp->sr_mpp == &(srp->sr_mp));

	completed = srp->sr_completed;
	while (completed != srp->sr_pending) {
		unsigned int id;
		sfxge_rx_packet_t *srpp;
		mblk_t *mp;
		size_t size;
		uint16_t flags;
		int rc;

		id = completed++ & (sp->s_rxq_size - 1);

		if (srp->sr_pending - completed >= 4) {
			unsigned int prefetch;

			prefetch = (id + 4) & (sp->s_rxq_size - 1);

			srpp = srp->sr_srpp[prefetch];
			ASSERT(srpp != NULL);

			mp = srpp->srp_mp;
			prefetch_read_many(mp->b_datap);
		} else if (completed == srp->sr_pending) {
			prefetch_read_many(srp->sr_mp);
		}

		srpp = srp->sr_srpp[id];
		ASSERT(srpp != NULL);

		srp->sr_srpp[id] = NULL;

		mp = srpp->srp_mp;
		ASSERT(mp->b_cont == NULL);

		/* when called from sfxge_rx_qstop() */
		if (srp->sr_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (srpp->srp_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Make the data visible to the kernel */
		rc = ddi_dma_sync(srpp->srp_dma_handle, 0,
		    sp->s_rx_buffer_size, DDI_DMA_SYNC_FORKERNEL);
		ASSERT3P(rc, ==, DDI_SUCCESS);

		/* Read the length from the pseudo header if required */
		if (srpp->srp_flags & EFX_PKT_PREFIX_LEN) {
			rc = efx_psuedo_hdr_pkt_length_get(sp->s_enp,
			    mp->b_rptr, &srpp->srp_size);
			ASSERT3P(rc, ==, 0);
			srpp->srp_size += sp->s_rx_prefix_size;
		}

		/* Set up the packet length */
		ASSERT3P(mp->b_rptr, ==, DB_BASE(mp));
		mp->b_rptr += sp->s_rx_prefix_size;

		prefetch_read_many(mp->b_rptr);

		ASSERT3P(mp->b_wptr, ==, DB_BASE(mp));
		mp->b_wptr += (size_t)(srpp->srp_size);
		ASSERT3P(mp->b_wptr, <=, DB_LIM(mp));

		/* Calculate the maximum packet size */
		size = sp->s_mtu;
		size += (srpp->srp_flags & EFX_PKT_VLAN_TAGGED) ?
		    sizeof (struct ether_vlan_header) :
		    sizeof (struct ether_header);

		if (MBLKL(mp) > size)
			goto discard;

		/* Check for loopback packets */
		if (!(srpp->srp_flags & EFX_PKT_IPV4) &&
		    !(srpp->srp_flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = (struct ether_header *)(mp->b_rptr);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				DTRACE_PROBE(loopback);

				srp->sr_loopback++;
				goto discard;
			}
		}

		/* Set up the checksum information */
		flags = 0;

		if (srpp->srp_flags & EFX_CKSUM_IPV4) {
			ASSERT(srpp->srp_flags & EFX_PKT_IPV4);
			flags |= HCK_IPV4_HDRCKSUM;
		}

		if (srpp->srp_flags & EFX_CKSUM_TCPUDP) {
			ASSERT(srpp->srp_flags & EFX_PKT_TCP ||
			    srpp->srp_flags & EFX_PKT_UDP);
			flags |= HCK_FULLCKSUM | HCK_FULLCKSUM_OK;
		}

		DB_CKSUMSTART(mp) = 0;
		DB_CKSUMSTUFF(mp) = 0;
		DB_CKSUMEND(mp) = 0;
		DB_CKSUMFLAGS(mp) = flags;
		DB_CKSUM16(mp) = 0;

		/* Add the packet to the tail of the chain */
		srfppp->srfpp_loaned++;

		ASSERT(mp->b_next == NULL);
		*(srp->sr_mpp) = mp;
		srp->sr_mpp = &(mp->b_next);

		continue;

discard:
		/* Return the packet to the pool */
		srfppp->srfpp_loaned++;
		freeb(mp);	/* Equivalent to freemsg() as b_cont == 0 */
	}
	srp->sr_completed = completed;

	/* Attempt to coalesce any TCP packets */
	if (sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF)
		sfxge_rx_qpacket_coalesce(srp);

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (srp->sr_srfp != NULL && eop) {
		sfxge_rx_flow_t *srfp;

		srfp = srp->sr_srfp;

		srp->sr_srfp = NULL;
		srp->sr_srfpp = &(srp->sr_srfp);

		do {
			sfxge_rx_flow_t *next;

			next = srfp->srf_next;
			srfp->srf_next = NULL;

			sfxge_rx_qflow_complete(srp, srfp);

			srfp = next;
		} while (srfp != NULL);
	}

	level = srp->sr_pushed - srp->sr_completed;

	/* If there are any packets then pass them up the stack */
	if (srp->sr_mp != NULL) {
		mblk_t *mp;

		mp = srp->sr_mp;

		srp->sr_mp = NULL;
		srp->sr_mpp = &(srp->sr_mp);

		if (level == 0) {
			/* Try to refill ASAP */
			sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
			level = srp->sr_pushed - srp->sr_completed;
		}

		/*
		 * If the RXQ is still empty, discard and recycle the
		 * current entry to ensure that the ring always
		 * contains at least one descriptor. This ensures that
		 * the next hardware RX will trigger an event
		 * (possibly delayed by interrupt moderation) and
		 * trigger another refill/fill attempt.
		 *
		 * Note this drops a complete LRO fragment from the
		 * start of the batch.
		 *
		 * Note also that copymsgchain() does not help with
		 * resource starvation here, unless we are short of DMA
		 * mappings.
		 */
		if (level == 0) {
			mblk_t *nmp;

			srp->sr_kstat.srk_rxq_empty_discard++;
			DTRACE_PROBE1(rxq_empty_discard, int, index);

			nmp = mp->b_next;
			if (nmp)
				sfxge_gld_rx_post(sp, index, nmp);

			/* as level == 0 will swizzle/rxpost below */
			freemsg(mp);
		} else {
			sfxge_gld_rx_post(sp, index, mp);
		}
	}

	/* Top up the queue if necessary */
	if (level < srp->sr_hiwat) {
		sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

		level = srp->sr_added - srp->sr_completed;
		if (level < srp->sr_lowat)
			sfxge_rx_qfill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
	}
}

void
sfxge_rx_qflush_done(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	boolean_t flush_pending;

	ASSERT(mutex_owned(&(sep->se_lock)));

	/*
	 * Flush successful: wakeup sfxge_rx_qstop() if flush is pending.
	 *
	 * A delayed flush event received after RxQ stop has timed out
	 * will be ignored, as then the flush state will not be PENDING
	 * (see SFCbug22989).
	 */
	flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
	srp->sr_flush = SFXGE_FLUSH_DONE;
	if (flush_pending)
		cv_broadcast(&(srp->sr_flush_kv));
}

void
sfxge_rx_qflush_failed(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	boolean_t flush_pending;

	ASSERT(mutex_owned(&(sep->se_lock)));

	/*
	 * Flush failed: wakeup sfxge_rx_qstop() if flush is pending.
	 *
	 * A delayed flush event received after RxQ stop has timed out
	 * will be ignored, as then the flush state will not be PENDING
	 * (see SFCbug22989).
	 */
	flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
	srp->sr_flush = SFXGE_FLUSH_FAILED;
	if (flush_pending)
		cv_broadcast(&(srp->sr_flush_kv));
}
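/*
 * Stop an RXQ: cancel the poll timer, mark the queue INITIALIZED so that
 * completions are discarded, then request a hardware flush and wait for
 * the flush event (retrying up to SFXGE_RX_QFLUSH_TRIES times). Finally
 * the remaining descriptors are drained by faking completion of
 * everything that was added.
 */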
static void
sfxge_rx_qstop(sfxge_t *sp, unsigned int index)
{
	dev_info_t *dip = sp->s_dip;
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rxq_t *srp;
	clock_t timeout;
	unsigned int flush_tries = SFXGE_RX_QFLUSH_TRIES;
	int rc;

	ASSERT(mutex_owned(&(sp->s_state_lock)));

	mutex_enter(&(sep->se_lock));

	srp = sp->s_srp[index];
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

	sfxge_rx_qpoll_stop(srp);

	/* Further packets are discarded by sfxge_rx_qcomplete() */
	srp->sr_state = SFXGE_RXQ_INITIALIZED;

	if (sp->s_hw_err != SFXGE_HW_OK) {
		/*
		 * Flag indicates possible hardware failure.
		 * Attempt flush but do not wait for it to complete.
		 */
		srp->sr_flush = SFXGE_FLUSH_DONE;
		(void) efx_rx_qflush(srp->sr_erp);
	}

	/* Wait up to 2s for queue flushing to complete */
	timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_RX_QFLUSH_USEC);

	while (srp->sr_flush != SFXGE_FLUSH_DONE && flush_tries-- > 0) {
		if ((rc = efx_rx_qflush(srp->sr_erp)) != 0) {
			if (rc == EALREADY)
				srp->sr_flush = SFXGE_FLUSH_DONE;
			else
				srp->sr_flush = SFXGE_FLUSH_FAILED;
			break;
		}
		srp->sr_flush = SFXGE_FLUSH_PENDING;
		if (cv_timedwait(&(srp->sr_flush_kv), &(sep->se_lock),
		    timeout) < 0) {
			/* Timeout waiting for successful or failed flush */
			dev_err(dip, CE_NOTE, SFXGE_CMN_ERR
			    "rxq[%d] flush timeout", index);
			break;
		}
	}

	if (srp->sr_flush == SFXGE_FLUSH_FAILED)
		dev_err(dip, CE_NOTE, SFXGE_CMN_ERR
		    "rxq[%d] flush failed", index);

	DTRACE_PROBE1(flush, sfxge_flush_state_t, srp->sr_flush);
	srp->sr_flush = SFXGE_FLUSH_DONE;

	/* Destroy the receive queue */
	efx_rx_qdestroy(srp->sr_erp);
	srp->sr_erp = NULL;

	/* Clear entries from the buffer table */
	sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));

	/*
	 * Free any unused RX packets which had descriptors on the RXQ.
	 * Packets will be discarded as state != STARTED.
	 */
	srp->sr_pending = srp->sr_added;
	sfxge_rx_qcomplete(srp, B_TRUE);

	ASSERT3U(srp->sr_completed, ==, srp->sr_pending);

	srp->sr_added = 0;
	srp->sr_pushed = 0;
	srp->sr_pending = 0;
	srp->sr_completed = 0;
	srp->sr_loopback = 0;

	srp->sr_lowat = 0;
	srp->sr_hiwat = 0;

	mutex_exit(&(sep->se_lock));
}

static void
sfxge_rx_kstat_fini(sfxge_rxq_t *srp)
{
	kstat_delete(srp->sr_ksp);
	srp->sr_ksp = NULL;
}

static void
sfxge_rx_qfini(sfxge_t *sp, unsigned int index)
{
	sfxge_rxq_t *srp = sp->s_srp[index];

	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);

	sp->s_srp[index] = NULL;
	srp->sr_state = SFXGE_RXQ_UNINITIALIZED;

	sfxge_rx_kstat_fini(srp);

	/* Empty the pool */
	sfxge_rx_qfpp_empty(srp);

	srp->sr_index = 0;

	kmem_cache_free(sp->s_rqc, srp);
}

static int
sfxge_rx_scale_kstat_update(kstat_t *ksp, int rw)
{
	sfxge_t *sp = ksp->ks_private;
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	kstat_named_t *knp;
	unsigned int index;
	unsigned int entry;
	unsigned int *freq;
	int rc;

	ASSERT(mutex_owned(&(srsp->srs_lock)));

	if (rw != KSTAT_READ) {
		rc = EACCES;
		goto fail1;
	}

	if ((freq = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
	    KM_NOSLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail2;
	}

	for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
		index = srsp->srs_tbl[entry];

		freq[index]++;
	}

	knp = ksp->ks_data;
	for (index = 0; index < sip->si_nalloc; index++) {
		knp->value.ui64 = freq[index];
		knp++;
	}

	knp->value.ui64 = srsp->srs_count;

	kmem_free(freq, sizeof (unsigned int) * sip->si_nalloc);

	return (0);

fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}
static int
sfxge_rx_scale_kstat_init(sfxge_t *sp)
{
	dev_info_t *dip = sp->s_dip;
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	char name[MAXNAMELEN];
	kstat_t *ksp;
	kstat_named_t *knp;
	unsigned int index;
	int rc;

	/* Create the set */
	(void) snprintf(name, MAXNAMELEN - 1, "%s_rss",
	    ddi_driver_name(dip));

	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
	    ddi_get_instance(dip), name, "rss", KSTAT_TYPE_NAMED,
	    sip->si_nalloc + 1, 0)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}

	srsp->srs_ksp = ksp;

	ksp->ks_update = sfxge_rx_scale_kstat_update;
	ksp->ks_private = sp;
	ksp->ks_lock = &(srsp->srs_lock);

	/* Initialise the named stats */
	knp = ksp->ks_data;
	for (index = 0; index < sip->si_nalloc; index++) {
		char name[MAXNAMELEN];

		(void) snprintf(name, MAXNAMELEN - 1, "evq%04d_count",
		    index);
		kstat_named_init(knp, name, KSTAT_DATA_UINT64);
		knp++;
	}

	kstat_named_init(knp, "scale", KSTAT_DATA_UINT64);

	kstat_install(ksp);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static void
sfxge_rx_scale_kstat_fini(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

	/* Destroy the set */
	kstat_delete(srsp->srs_ksp);
	srsp->srs_ksp = NULL;
}

unsigned int
sfxge_rx_scale_prop_get(sfxge_t *sp)
{
	int rx_scale;

	rx_scale = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_scale_count", SFXGE_RX_SCALE_MAX);

	/* Zero and all negative values select the number of logical CPUs */
	if (rx_scale <= 0)
		rx_scale = ncpus;

	return (rx_scale);
}

static int
sfxge_rx_scale_init(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	int rc;

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_UNINITIALIZED);

	/* Create tables for CPU, core, cache and chip counts */
	srsp->srs_cpu = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);

	mutex_init(&(srsp->srs_lock), NULL, MUTEX_DRIVER, NULL);

	/* We need at least one event queue */
	srsp->srs_count = sfxge_rx_scale_prop_get(sp);
	if (srsp->srs_count > sip->si_nalloc)
		srsp->srs_count = sip->si_nalloc;
	if (srsp->srs_count < 1)
		srsp->srs_count = 1;

	/* Set up the kstats */
	if ((rc = sfxge_rx_scale_kstat_init(sp)) != 0)
		goto fail1;

	srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);
	mutex_destroy(&(srsp->srs_lock));

	return (rc);
}
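/*
 * Rebuild the RSS indirection table. Candidate event queues are rated by
 * the global per-CPU contention table (sfxge_cpu, protected by cpu_lock),
 * the least contended queue is chosen for each of srs_count slots, and
 * the choices are then expanded round-robin into the
 * SFXGE_RX_SCALE_MAX-entry table programmed into the hardware.
 */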
void
sfxge_rx_scale_update(void *arg)
{
	sfxge_t *sp = arg;
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip;
	processorid_t id;
	unsigned int count;
	unsigned int *tbl;
	unsigned int *rating;
	unsigned int entry;
	int rc;

	mutex_enter(&(srsp->srs_lock));

	if (srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
		rc = EFAULT;
		goto fail1;
	}

	if ((tbl = kmem_zalloc(sizeof (unsigned int) * SFXGE_RX_SCALE_MAX,
	    KM_NOSLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail2;
	}

	sip = &(sp->s_intr);
	if ((rating = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
	    KM_NOSLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail3;
	}

	mutex_enter(&cpu_lock);

	/*
	 * Subtract any current CPU, core, cache and chip usage from the
	 * global contention tables.
	 */
	for (id = 0; id < NCPU; id++) {
		ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
		sfxge_cpu[id] -= srsp->srs_cpu[id];
		srsp->srs_cpu[id] = 0;
	}

	ASSERT(srsp->srs_count != 0);

	/* Choose as many event queues as we need */
	for (count = 0; count < srsp->srs_count; count++) {
		unsigned int index;
		sfxge_evq_t *sep;
		unsigned int choice;
		unsigned int choice_rating;

		bzero(rating, sizeof (unsigned int) * sip->si_nalloc);

		/*
		 * Rate each event queue on its global level of CPU
		 * contention.
		 */
		for (index = 0; index < sip->si_nalloc; index++) {
			sep = sp->s_sep[index];

			id = sep->se_cpu_id;
			rating[index] += sfxge_cpu[id];
		}

		/* Choose the queue with the lowest CPU contention */
		choice = 0;
		choice_rating = rating[0];

		for (index = 1; index < sip->si_nalloc; index++) {
			if (rating[index] < choice_rating) {
				choice = index;
				choice_rating = rating[index];
			}
		}

		/* Add our choice to the condensed RSS table */
		tbl[count] = choice;

		/* Add information to the global contention tables */
		sep = sp->s_sep[choice];

		id = sep->se_cpu_id;
		srsp->srs_cpu[id]++;
		sfxge_cpu[id]++;
	}

	mutex_exit(&cpu_lock);

	/* Build the expanded RSS table */
	count = 0;
	for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
		unsigned int index;

		index = tbl[count];
		count = (count + 1) % srsp->srs_count;

		srsp->srs_tbl[entry] = index;
	}

	/* Program the expanded RSS table into the hardware */
	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
	    SFXGE_RX_SCALE_MAX);

	mutex_exit(&(srsp->srs_lock));

	kmem_free(rating, sizeof (unsigned int) * sip->si_nalloc);
	kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);

	return;

fail3:
	DTRACE_PROBE(fail3);
	kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);

fail2:
	DTRACE_PROBE(fail2);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(srsp->srs_lock));
}

static int
sfxge_rx_scale_start(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	int rc;

	mutex_enter(&(srsp->srs_lock));

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);

	/* Clear down the RSS table */
	bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
	    SFXGE_RX_SCALE_MAX);

	if ((rc = sfxge_toeplitz_hash_init(sp)) != 0)
		goto fail1;

	srsp->srs_state = SFXGE_RX_SCALE_STARTED;

	mutex_exit(&(srsp->srs_lock));

	/* sfxge_t->s_state_lock held */
	(void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
	    DDI_SLEEP);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(srsp->srs_lock));

	return (rc);
}

int
sfxge_rx_scale_count_get(sfxge_t *sp, unsigned int *countp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	int rc;

	mutex_enter(&(srsp->srs_lock));

	if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
	    srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
		rc = ENOTSUP;
		goto fail1;
	}

	*countp = srsp->srs_count;

	mutex_exit(&(srsp->srs_lock));

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(srsp->srs_lock));

	return (rc);
}

int
sfxge_rx_scale_count_set(sfxge_t *sp, unsigned int count)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	int dispatch = 1;
	int rc;

	if (count < 1 || count > sip->si_nalloc) {
		rc = EINVAL;
		goto fail1;
	}

	mutex_enter(&(srsp->srs_lock));

	if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
	    srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
		rc = ENOTSUP;
		goto fail2;
	}

	srsp->srs_count = count;

	if (srsp->srs_state != SFXGE_RX_SCALE_STARTED)
		dispatch = 0;

	mutex_exit(&(srsp->srs_lock));

	if (dispatch)
		/* no locks held */
		(void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update,
		    sp, DDI_SLEEP);

	return (0);

fail2:
	DTRACE_PROBE(fail2);
	mutex_exit(&(srsp->srs_lock));

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}
static void
sfxge_rx_scale_stop(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	processorid_t id;

	mutex_enter(&(srsp->srs_lock));

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_STARTED);

	srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;

	mutex_enter(&cpu_lock);

	/*
	 * Subtract any current CPU, core, cache and chip usage from the
	 * global contention tables.
	 */
	for (id = 0; id < NCPU; id++) {
		ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
		sfxge_cpu[id] -= srsp->srs_cpu[id];
		srsp->srs_cpu[id] = 0;
	}

	mutex_exit(&cpu_lock);

	/* Clear down the RSS table */
	bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
	    SFXGE_RX_SCALE_MAX);

	mutex_exit(&(srsp->srs_lock));
}

static void
sfxge_rx_scale_fini(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);

	srsp->srs_state = SFXGE_RX_SCALE_UNINITIALIZED;

	/* Tear down the kstats */
	sfxge_rx_scale_kstat_fini(sp);

	srsp->srs_count = 0;

	mutex_destroy(&(srsp->srs_lock));

	/* Destroy tables */
	kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
	srsp->srs_cpu = NULL;

	sfxge_toeplitz_hash_fini(sp);
}

int
sfxge_rx_init(sfxge_t *sp)
{
	sfxge_intr_t *sip = &(sp->s_intr);
	char name[MAXNAMELEN];
	int index;
	int rc;

	if (sip->si_state == SFXGE_INTR_UNINITIALIZED) {
		rc = EINVAL;
		goto fail1;
	}

	if ((rc = sfxge_rx_scale_init(sp)) != 0)
		goto fail2;

	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_rx_packet_cache",
	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

	sp->s_rpc = kmem_cache_create(name, sizeof (sfxge_rx_packet_t),
	    SFXGE_CPU_CACHE_SIZE, sfxge_rx_packet_ctor, sfxge_rx_packet_dtor,
	    NULL, sp, NULL, 0);
	ASSERT(sp->s_rpc != NULL);

	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_rxq_cache",
	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

	sp->s_rqc = kmem_cache_create(name, sizeof (sfxge_rxq_t),
	    SFXGE_CPU_CACHE_SIZE, sfxge_rx_qctor, sfxge_rx_qdtor,
	    NULL, sp, NULL, 0);
	ASSERT(sp->s_rqc != NULL);

	sp->s_rx_pkt_mem_max = ddi_prop_get_int64(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_pkt_mem_max", 0); /* disabled */

	/* Initialize the receive queue(s) */
	for (index = 0; index < sip->si_nalloc; index++) {
		if ((rc = sfxge_rx_qinit(sp, index)) != 0)
			goto fail3;
	}

	sp->s_rx_coalesce_mode = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_coalesce_mode", SFXGE_RX_COALESCE_OFF);

	return (0);

fail3:
	DTRACE_PROBE(fail3);

	/* Tear down the receive queue(s) */
	while (--index >= 0)
		sfxge_rx_qfini(sp, index);

	kmem_cache_destroy(sp->s_rqc);
	sp->s_rqc = NULL;

	kmem_cache_destroy(sp->s_rpc);
	sp->s_rpc = NULL;

	sfxge_rx_scale_fini(sp);

fail2:
	DTRACE_PROBE(fail2);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}
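/*
 * Start the receive path. The packet buffer size and start alignment are
 * derived from the MTU and the hardware prefix: the buffer start is
 * offset so that the IP header ends up 32-bit aligned (and, for LFSR
 * hash insertion, so that the hash bytes share a cache line with the
 * headers), and the buffer end is padded for RX DMA end alignment.
 */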
int
sfxge_rx_start(sfxge_t *sp)
{
	sfxge_mac_t *smp = &(sp->s_mac);
	sfxge_intr_t *sip;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align;
	int index;
	int rc;

	mutex_enter(&(smp->sm_lock));

	/* Calculate the receive packet buffer size and alignment */
	sp->s_rx_buffer_size = EFX_MAC_PDU(sp->s_mtu);

	encp = efx_nic_cfg_get(sp->s_enp);

	/* Packet buffer allocations are cache line aligned */
	EFSYS_ASSERT3U(encp->enc_rx_buf_align_start, <=,
	    SFXGE_CPU_CACHE_SIZE);

	if (sp->s_family == EFX_FAMILY_HUNTINGTON) {
		sp->s_rx_prefix_size = encp->enc_rx_prefix_size;

		hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);

		/* Ensure IP headers are 32bit aligned */
		sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
		sp->s_rx_buffer_size += sp->s_rx_buffer_align;

	} else if (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT) {
		sp->s_rx_prefix_size = encp->enc_rx_prefix_size;

		/*
		 * Place the start of the buffer a prefix length minus 2
		 * before the start of a cache line. This ensures that the
		 * last two bytes of the prefix (which is where the LFSR hash
		 * is located) are in the same cache line as the headers, and
		 * the IP header is 32-bit aligned.
		 */
		sp->s_rx_buffer_align =
		    SFXGE_CPU_CACHE_SIZE - (encp->enc_rx_prefix_size - 2);
		sp->s_rx_buffer_size += sp->s_rx_buffer_align;

	} else {
		sp->s_rx_prefix_size = 0;

		/*
		 * Place the start of the buffer 2 bytes after a cache line
		 * boundary so that the headers fit into the cache line and
		 * the IP header is 32-bit aligned.
		 */
		hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);

		sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
		sp->s_rx_buffer_size += sp->s_rx_buffer_align;
	}

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sp->s_rx_buffer_size = P2ROUNDUP(sp->s_rx_buffer_size, align);

	/* Initialize the receive module */
	if ((rc = efx_rx_init(sp->s_enp)) != 0)
		goto fail1;

	mutex_exit(&(smp->sm_lock));

	if ((rc = sfxge_rx_scale_start(sp)) != 0)
		goto fail2;

	/* Start the receive queue(s) */
	sip = &(sp->s_intr);
	for (index = 0; index < sip->si_nalloc; index++) {
		if ((rc = sfxge_rx_qstart(sp, index)) != 0)
			goto fail3;
	}

	ASSERT3U(sp->s_srp[0]->sr_state, ==, SFXGE_RXQ_STARTED);
	/* It is sufficient to have Rx scale initialized */
	ASSERT3U(sp->s_rx_scale.srs_state, ==, SFXGE_RX_SCALE_STARTED);
	rc = efx_mac_filter_default_rxq_set(sp->s_enp, sp->s_srp[0]->sr_erp,
	    sp->s_rx_scale.srs_count > 1);
	if (rc != 0)
		goto fail4;

	return (0);

fail4:
	DTRACE_PROBE(fail4);

fail3:
	DTRACE_PROBE(fail3);

	/* Stop the receive queue(s) */
	while (--index >= 0)
		sfxge_rx_qstop(sp, index);

	sfxge_rx_scale_stop(sp);

fail2:
	DTRACE_PROBE(fail2);

	mutex_enter(&(smp->sm_lock));

	/* Tear down the receive module */
	efx_rx_fini(sp->s_enp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(smp->sm_lock));

	return (rc);
}

void
sfxge_rx_coalesce_mode_get(sfxge_t *sp, sfxge_rx_coalesce_mode_t *modep)
{
	*modep = sp->s_rx_coalesce_mode;
}

int
sfxge_rx_coalesce_mode_set(sfxge_t *sp, sfxge_rx_coalesce_mode_t mode)
{
	int rc;

	switch (mode) {
	case SFXGE_RX_COALESCE_OFF:
	case SFXGE_RX_COALESCE_DISALLOW_PUSH:
	case SFXGE_RX_COALESCE_ALLOW_PUSH:
		break;

	default:
		rc = EINVAL;
		goto fail1;
	}

	sp->s_rx_coalesce_mode = mode;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

void
sfxge_rx_stop(sfxge_t *sp)
{
	sfxge_mac_t *smp = &(sp->s_mac);
	sfxge_intr_t *sip = &(sp->s_intr);
	efx_nic_t *enp = sp->s_enp;
	int index;

	ASSERT(mutex_owned(&(sp->s_state_lock)));

	efx_mac_filter_default_rxq_clear(enp);

	/* Stop the receive queue(s) */
	index = sip->si_nalloc;
	while (--index >= 0) {
		/* TBD: Flush RXQs in parallel; HW has limit + may need retry */
		sfxge_rx_qstop(sp, index);
	}

	sfxge_rx_scale_stop(sp);

	mutex_enter(&(smp->sm_lock));

	/* Tear down the receive module */
	efx_rx_fini(enp);

	sp->s_rx_buffer_align = 0;
	sp->s_rx_prefix_size = 0;
	sp->s_rx_buffer_size = 0;

	mutex_exit(&(smp->sm_lock));
}
unsigned int
sfxge_rx_loaned(sfxge_t *sp)
{
	sfxge_intr_t *sip = &(sp->s_intr);
	int index;
	unsigned int loaned;

	ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);

	loaned = 0;
	for (index = 0; index < sip->si_nalloc; index++) {
		sfxge_rxq_t *srp = sp->s_srp[index];
		sfxge_evq_t *sep = sp->s_sep[srp->sr_index];

		mutex_enter(&(sep->se_lock));

		loaned += sfxge_rx_qfpp_swizzle(srp);

		mutex_exit(&(sep->se_lock));
	}

	return (loaned);
}

void
sfxge_rx_fini(sfxge_t *sp)
{
	sfxge_intr_t *sip = &(sp->s_intr);
	int index;

	ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);

	sp->s_rx_coalesce_mode = SFXGE_RX_COALESCE_OFF;

	/* Tear down the receive queue(s) */
	index = sip->si_nalloc;
	while (--index >= 0)
		sfxge_rx_qfini(sp, index);

	ASSERT3U(sp->s_rx_pkt_mem_alloc, ==, 0);

	kmem_cache_destroy(sp->s_rqc);
	sp->s_rqc = NULL;

	kmem_cache_destroy(sp->s_rpc);
	sp->s_rpc = NULL;

	sfxge_rx_scale_fini(sp);
}