/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>
#include <sys/pattr.h>
#include <sys/dlpi.h>

#include <sys/mac_provider.h>

#include <sys/random.h>

#include <mlxcx.h>

/*
 * Allocate the DMA memory backing a work queue: first the entry ring, whose
 * size is derived from the send- or receive-queue size shift property
 * according to mlwq_type, then the doorbell record.
 *
 * The queue must not already have MLXCX_WQ_ALLOC set. On success the flag is
 * set and B_TRUE is returned. On failure a warning is logged, anything
 * allocated by this call is freed again, and B_FALSE is returned.
 */
boolean_t
mlxcx_wq_alloc_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz;

	VERIFY0(mlwq->mlwq_state & MLXCX_WQ_ALLOC);

	/* Receive and send queue entries might be different sizes. */
	switch (mlwq->mlwq_type) {
	case MLXCX_WQ_TYPE_SENDQ:
		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_sq_size_shift;
		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
		sz = mlwq->mlwq_nents * sizeof (mlxcx_sendq_ent_t);
		break;
	case MLXCX_WQ_TYPE_RECVQ:
		mlwq->mlwq_entshift = mlxp->mlx_props.mldp_rq_size_shift;
		mlwq->mlwq_nents = (1 << mlwq->mlwq_entshift);
		sz = mlwq->mlwq_nents * sizeof (mlxcx_recvq_ent_t);
		break;
	default:
		VERIFY(0);
		return (B_FALSE);
	}
	/* The ring must occupy a whole number of hardware pages. */
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ memory");
		return (B_FALSE);
	}

	/*
	 * Just set the first pointer in the union. Yes, this is a strict
	 * aliasing violation. No, I don't care.
	 */
	mlwq->mlwq_send_ent = (mlxcx_sendq_ent_t *)mlwq->mlwq_dma.mxdb_va;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_workq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlwq->mlwq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate WQ doorbell memory");
		/* Undo the ring allocation made above. */
		mlxcx_dma_free(&mlwq->mlwq_dma);
		mlwq->mlwq_send_ent = NULL;
		return (B_FALSE);
	}

	mlwq->mlwq_doorbell =
	    (mlxcx_workq_doorbell_t *)mlwq->mlwq_doorbell_dma.mxdb_va;

	mlwq->mlwq_state |= MLXCX_WQ_ALLOC;

	return (B_TRUE);
}

/*
 * Release the DMA memory obtained by mlxcx_wq_alloc_dma() and clear
 * MLXCX_WQ_ALLOC.
 *
 * The queue must currently be allocated, and if it was ever created it must
 * already have been destroyed; both conditions are VERIFYed.
 */
void
mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	VERIFY(mlwq->mlwq_state & MLXCX_WQ_ALLOC);
	if (mlwq->mlwq_state & MLXCX_WQ_CREATED)
		VERIFY(mlwq->mlwq_state & MLXCX_WQ_DESTROYED);

	mlxcx_dma_free(&mlwq->mlwq_dma);
	mlwq->mlwq_send_ent = NULL;
	mlxcx_dma_free(&mlwq->mlwq_doorbell_dma);
	mlwq->mlwq_doorbell = NULL;

	/* Clear the work queue flag checked on entry, not the CQ one. */
	mlwq->mlwq_state &= ~MLXCX_WQ_ALLOC;
}

/*
 * Allocate the DMA memory backing a completion queue with (1 << ent_shift)
 * entries, plus its doorbell record. Every entry is initialised with the
 * invalid opcode and the initial owner value before the doorbell is
 * allocated.
 *
 * The CQ must not already have MLXCX_CQ_ALLOC set. On success the flag is
 * set atomically and B_TRUE is returned. On failure a warning is logged,
 * anything allocated by this call is freed again, and B_FALSE is returned.
 */
static boolean_t
mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    uint_t ent_shift)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	/* Check the completion queue flag that is set on success below. */
	VERIFY0(mlcq->mlcq_state & MLXCX_CQ_ALLOC);

	mlcq->mlcq_entshift = ent_shift;
	mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
	sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
	/* The ring must occupy a whole number of hardware pages. */
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ memory");
		return (B_FALSE);
	}

	mlcq->mlcq_ent = (mlxcx_completionq_ent_t *)mlcq->mlcq_dma.mxdb_va;

	for (i = 0; i < mlcq->mlcq_nents; ++i) {
		mlcq->mlcq_ent[i].mlcqe_opcode = MLXCX_CQE_OP_INVALID;
		mlcq->mlcq_ent[i].mlcqe_owner = MLXCX_CQE_OWNER_INIT;
	}

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_qdbell_attr(mlxp, &attr);
	sz = sizeof (mlxcx_completionq_doorbell_t);
	ret = mlxcx_dma_alloc(mlxp, &mlcq->mlcq_doorbell_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate CQ doorbell memory");
		/* Undo the ring allocation made above. */
		mlxcx_dma_free(&mlcq->mlcq_dma);
		mlcq->mlcq_ent = NULL;
		return (B_FALSE);
	}

	mlcq->mlcq_doorbell =
	    (mlxcx_completionq_doorbell_t *)mlcq->mlcq_doorbell_dma.mxdb_va;

	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ALLOC);

	return (B_TRUE);
}

/*
 * Release the DMA memory obtained by mlxcx_cq_alloc_dma() and atomically
 * clear MLXCX_CQ_ALLOC.
 *
 * The CQ must currently be allocated, and if it was ever created it must
 * already have been destroyed; both conditions are VERIFYed.
 */
static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
	if (mlcq->mlcq_state & MLXCX_CQ_CREATED)
		VERIFY(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

	mlxcx_dma_free(&mlcq->mlcq_dma);
	mlcq->mlcq_ent = NULL;
	mlxcx_dma_free(&mlcq->mlcq_doorbell_dma);
	mlcq->mlcq_doorbell = NULL;

	atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ALLOC);
}

/*
 * Tear down a single work queue: flag it as being torn down, remove it from
 * mlxp->mlx_wqs, stop and destroy the hardware queue if it exists, release
 * its DMA memory, detach it from its completion queue and destroy its mutex.
 *
 * Failures of the stop/destroy commands are logged and otherwise ignored.
 * The queue's mlwq_cq pointer is dereferenced unconditionally, so the queue
 * must have a completion queue attached.
 */
void
mlxcx_wq_teardown(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	mlxcx_completion_queue_t *mlcq;

	/*
	 * If something is holding the lock on a long operation like a
	 * refill, setting this flag asks them to exit early if possible.
	 */
	atomic_or_uint(&mlwq->mlwq_state, MLXCX_WQ_TEARDOWN);

	mutex_enter(&mlwq->mlwq_mtx);

	list_remove(&mlxp->mlx_wqs, mlwq);

	if ((mlwq->mlwq_state & MLXCX_WQ_CREATED) &&
	    !(mlwq->mlwq_state & MLXCX_WQ_DESTROYED)) {
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    mlwq->mlwq_state & MLXCX_WQ_STARTED &&
		    !mlxcx_cmd_stop_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to stop "
			    "send queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_RECVQ &&
		    !mlxcx_cmd_destroy_rq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "recv queue num %x", mlwq->mlwq_num);
		}
		if (mlwq->mlwq_type == MLXCX_WQ_TYPE_SENDQ &&
		    !mlxcx_cmd_destroy_sq(mlxp, mlwq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "send queue num %x", mlwq->mlwq_num);
		}
	}
	if (mlwq->mlwq_state & MLXCX_WQ_ALLOC) {
		mlxcx_wq_rele_dma(mlxp, mlwq);
	}
	mlcq = mlwq->mlwq_cq;

	/* These will be released by mlxcx_teardown_bufs() */
	mlwq->mlwq_bufs = NULL;
	mlwq->mlwq_foreign_bufs = NULL;

	mutex_exit(&mlwq->mlwq_mtx);

	/*
	 * The WQ mutex was dropped above so that the CQ mutex can be taken
	 * first here, the same order the setup functions use.
	 */
	mutex_enter(&mlcq->mlcq_mtx);
	mutex_enter(&mlwq->mlwq_mtx);
	ASSERT3P(mlcq->mlcq_wq, ==, mlwq);
	mlcq->mlcq_wq = NULL;
	mutex_exit(&mlwq->mlwq_mtx);
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlwq->mlwq_mtx);
}

/*
 * Tear down and free a completion queue: flag it as being torn down, remove
 * it from mlxp->mlx_cqs, destroy the hardware CQ if it exists, release its
 * DMA memory, return any buffers still on its lists, remove it from its
 * event queue's AVL tree, then destroy its mutexes and lists and free the
 * structure itself. mlcq must not be used after this returns.
 *
 * A failure of the destroy command is logged and otherwise ignored.
 */
void
mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
	mlxcx_event_queue_t *mleq;
	mlxcx_buffer_t *b;

	/*
	 * If something is holding the lock on a long operation like polling
	 * which we're going to abort anyway, this flag asks them to exit
	 * early if possible.
	 */
	atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_TEARDOWN);

	mutex_enter(&mlcq->mlcq_mtx);

	list_remove(&mlxp->mlx_cqs, mlcq);
	if ((mlcq->mlcq_state & MLXCX_CQ_CREATED) &&
	    !(mlcq->mlcq_state & MLXCX_CQ_DESTROYED)) {
		if (!mlxcx_cmd_destroy_cq(mlxp, mlcq)) {
			mlxcx_warn(mlxp, "failed to destroy "
			    "completion queue num %u",
			    mlcq->mlcq_num);
		}
	}
	if (mlcq->mlcq_state & MLXCX_CQ_ALLOC) {
		mlxcx_cq_rele_dma(mlxp, mlcq);
	}
	/*
	 * If we're on an EQ AVL tree, then we need to grab
	 * the EQ's mutex to take it off. The ISR always takes
	 * EQ mutex before CQ mutex, so we have to let go of
	 * the CQ mutex then come back again.
	 *
	 * The ISR is expected to bail out if it tries to touch this CQ
	 * now, since the teardown flag was set above.
	 */
	if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
		mleq = mlcq->mlcq_eq;
	} else {
		mleq = NULL;
	}

	/* Return any outstanding buffers to the free pool. */
	while ((b = list_remove_head(&mlcq->mlcq_buffers)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_enter(&mlcq->mlcq_bufbmtx);
	while ((b = list_remove_head(&mlcq->mlcq_buffers_b)) != NULL) {
		mlxcx_buf_return_chain(mlxp, b, B_FALSE);
	}
	mutex_exit(&mlcq->mlcq_bufbmtx);

	/*
	 * Since the interrupt handlers take the EQ lock before the CQ one,
	 * we must do the same here. That means letting go of the lock
	 * for a brief window here (we'll double-check the state when we
	 * get back in).
	 */
	mutex_exit(&mlcq->mlcq_mtx);

	if (mleq != NULL) {
		mutex_enter(&mleq->mleq_mtx);
		mutex_enter(&mlcq->mlcq_mtx);
		/*
		 * Double-check the state, we let go of the
		 * mutex briefly.
		 */
		if (mlcq->mlcq_state & MLXCX_CQ_EQAVL) {
			avl_remove(&mleq->mleq_cqs, mlcq);
			atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_EQAVL);
		}
		mutex_exit(&mlcq->mlcq_mtx);
		mutex_exit(&mleq->mleq_mtx);
	}

	/* Only these flags may remain set by the time the CQ is freed. */
	mutex_enter(&mlcq->mlcq_mtx);
	ASSERT0(mlcq->mlcq_state & ~(MLXCX_CQ_CREATED | MLXCX_CQ_DESTROYED |
	    MLXCX_CQ_TEARDOWN | MLXCX_CQ_ARMED));
	mutex_exit(&mlcq->mlcq_mtx);

	mutex_destroy(&mlcq->mlcq_mtx);
	mutex_destroy(&mlcq->mlcq_arm_mtx);
	mutex_destroy(&mlcq->mlcq_bufbmtx);
	list_destroy(&mlcq->mlcq_buffers);
	list_destroy(&mlcq->mlcq_buffers_b);
	kmem_free(mlcq, sizeof (mlxcx_completion_queue_t));
}

/*
 * Allocate and initialise a completion queue with (1 << ent_shift) entries,
 * create it in hardware, add it to eq's AVL tree of CQs and arm it.
 *
 * On success the new CQ is stored in *cqp and B_TRUE is returned. On failure
 * B_FALSE is returned and *cqp is not written; the partially set up CQ is
 * not freed here but stays on mlxp->mlx_cqs (presumably to be reclaimed by a
 * later teardown pass -- confirm against the caller of this file's setup
 * functions).
 */
static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
    mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
	mlxcx_completion_queue_t *cq;

	cq = kmem_zalloc(sizeof (mlxcx_completion_queue_t), KM_SLEEP);
	mutex_init(&cq->mlcq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_init(&cq->mlcq_arm_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_init(&cq->mlcq_bufbmtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&cq->mlcq_buffers, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_cq_entry));
	list_create(&cq->mlcq_buffers_b, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_cq_entry));

	cq->mlcq_mlx = mlxp;
	list_insert_tail(&mlxp->mlx_cqs, cq);

	mutex_enter(&cq->mlcq_mtx);

	if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
		mutex_exit(&cq->mlcq_mtx);
		return (B_FALSE);
	}

	cq->mlcq_bufhwm = cq->mlcq_nents - MLXCX_CQ_HWM_GAP;
	cq->mlcq_buflwm = cq->mlcq_nents - MLXCX_CQ_LWM_GAP;

	cq->mlcq_uar = &mlxp->mlx_uar;
	cq->mlcq_eq = eq;

	cq->mlcq_cqemod_period_usec = mlxp->mlx_props.mldp_cqemod_period_usec;
	cq->mlcq_cqemod_count = mlxp->mlx_props.mldp_cqemod_count;

	if (!mlxcx_cmd_create_cq(mlxp, cq)) {
		mutex_exit(&cq->mlcq_mtx);
		return (B_FALSE);
	}

	mutex_exit(&cq->mlcq_mtx);

	/* Lock order here is EQ, then CQ arm mutex, then CQ mutex. */
	mutex_enter(&eq->mleq_mtx);
	mutex_enter(&cq->mlcq_arm_mtx);
	mutex_enter(&cq->mlcq_mtx);
	ASSERT0(cq->mlcq_state & MLXCX_CQ_EQAVL);
	avl_add(&eq->mleq_cqs, cq);
	atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_EQAVL);
	mlxcx_arm_cq(mlxp, cq);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&cq->mlcq_arm_mtx);
	mutex_exit(&eq->mleq_mtx);

	*cqp = cq;
	return (B_TRUE);
}

/*
 * Initialise wq as a receive queue bound to cq: set up its mutex, put it on
 * mlxp->mlx_wqs, create its buffer shard, allocate its DMA memory, create
 * the hardware RQ, and finally record it as cq's work queue.
 *
 * Returns B_FALSE if the DMA allocation or the create command fails. In
 * that case nothing is undone here: wq stays on mlxp->mlx_wqs with its mutex
 * initialised.
 */
static boolean_t
mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
    mlxcx_work_queue_t *wq)
{
	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));

	list_insert_tail(&mlxp->mlx_wqs, wq);

	mutex_enter(&wq->mlwq_mtx);

	wq->mlwq_mlx = mlxp;
	wq->mlwq_type = MLXCX_WQ_TYPE_RECVQ;
	wq->mlwq_cq = cq;
	wq->mlwq_pd = &mlxp->mlx_pd;
	wq->mlwq_uar = &mlxp->mlx_uar;

	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);

	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_rq(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

	mutex_exit(&wq->mlwq_mtx);

	/* Re-take the locks CQ first, then WQ, to link the two together. */
	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);
	ASSERT3P(cq->mlcq_wq, ==, NULL);
	cq->mlcq_wq = wq;
	mutex_exit(&wq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);

	return (B_TRUE);
}

/*
 * Initialise wq as a send queue bound to cq and tis: set up its mutex, put
 * it on mlxp->mlx_wqs, create its two buffer shards (regular and foreign),
 * allocate its DMA memory, create the hardware SQ, and finally record it as
 * cq's work queue.
 *
 * The port's minimum inline requirement must not exceed L2 inlining, which
 * is the mode always selected here. Returns B_FALSE if the DMA allocation
 * or the create command fails; nothing is undone here in that case.
 */
static boolean_t
mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
    mlxcx_tis_t *tis, mlxcx_work_queue_t *wq)
{
	mutex_init(&wq->mlwq_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));

	list_insert_tail(&mlxp->mlx_wqs, wq);

	mutex_enter(&wq->mlwq_mtx);

	wq->mlwq_mlx = mlxp;
	wq->mlwq_type = MLXCX_WQ_TYPE_SENDQ;
	wq->mlwq_cq = cq;
	wq->mlwq_pd = &mlxp->mlx_pd;
	wq->mlwq_uar = &mlxp->mlx_uar;
	wq->mlwq_tis = tis;

	wq->mlwq_bufs = mlxcx_mlbs_create(mlxp);
	wq->mlwq_foreign_bufs = mlxcx_mlbs_create(mlxp);

	VERIFY3U(port->mlp_wqe_min_inline, <=, MLXCX_ETH_INLINE_L2);
	wq->mlwq_inline_mode = MLXCX_ETH_INLINE_L2;

	if (!mlxcx_wq_alloc_dma(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_sq(mlxp, wq)) {
		mutex_exit(&wq->mlwq_mtx);
		return (B_FALSE);
	}

	wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
	wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;

	mutex_exit(&wq->mlwq_mtx);

	/* Re-take the locks CQ first, then WQ, to link the two together. */
	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&wq->mlwq_mtx);
	ASSERT3P(cq->mlcq_wq, ==, NULL);
	cq->mlcq_wq = wq;
	mutex_exit(&wq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);

	return (B_TRUE);
}

/*
 * Before we tear down the queues associated with the rx group,
 * flag each cq as being torn down and wake up any tasks.
 *
 * The flag is set and the buffer shard's mlbs_free_nonempty condition
 * variable is broadcast while holding the shard's mutex. Work queues with
 * no completion queue attached are skipped.
 */
static void
mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	mlxcx_buf_shard_t *s;
	uint_t i;

	mutex_enter(&g->mlg_mtx);

	for (i = 0; i < g->mlg_nwqs; ++i) {
		wq = &g->mlg_wqs[i];
		cq = wq->mlwq_cq;
		if (cq != NULL) {
			s = wq->mlwq_bufs;
			mutex_enter(&s->mlbs_mtx);
			atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
		}
	}

	mutex_exit(&g->mlg_mtx);
}

/*
 * Tear down a receive ring group. Each stage is guarded by the matching
 * bit in g->mlg_state and clears that bit when done:
 *
 *   MLXCX_GROUP_FLOWS    flow entries and the VLAN and hash flow tables
 *   MLXCX_GROUP_RUNNING  stop started RQs and destroy the refill taskq
 *   MLXCX_GROUP_TIRTIS   destroy the group's TIRs
 *   MLXCX_GROUP_RQT      destroy the RQ table and free its memory
 *
 * Then every work queue and its completion queue is torn down, the work
 * queue array is freed and the group mutex is destroyed. The port mutex and
 * the group mutex are held, in that order, for the duration. Command
 * failures are logged (or ignored for flow entry deletion) and do not stop
 * the teardown.
 */
void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_table_t *ft;
	uint_t i;

	mutex_enter(&g->mlg_port->mlp_mtx);
	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_FLOWS) {
		mlxcx_remove_all_umcast_entries(mlxp, g->mlg_port, g);

		if (g->mlg_rx_vlan_ft != NULL)
			mlxcx_remove_all_vlan_entries(mlxp, g);

		/*
		 * Only the first rx group owns the port-level broadcast and
		 * promisc flow entries.
		 */
		if (g == &mlxp->mlx_rx_groups[0]) {
			ft = g->mlg_port->mlp_rx_flow;
			mutex_enter(&ft->mlft_mtx);

			fg = g->mlg_port->mlp_bcast;
			fe = list_head(&fg->mlfg_entries);
			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
				(void) mlxcx_cmd_delete_flow_table_entry(
				    mlxp, fe);
			}

			fg = g->mlg_port->mlp_promisc;
			fe = list_head(&fg->mlfg_entries);
			if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
				(void) mlxcx_cmd_delete_flow_table_entry(
				    mlxp, fe);
			}

			mutex_exit(&ft->mlft_mtx);
		}

		if (g->mlg_rx_vlan_ft != NULL) {
			/*
			 * The table mutex is taken here and not released by
			 * this function; presumably
			 * mlxcx_teardown_flow_table() consumes it -- confirm
			 * against its definition.
			 */
			mutex_enter(&g->mlg_rx_vlan_ft->mlft_mtx);
			ASSERT(list_is_empty(&g->mlg_rx_vlans));
			fg = g->mlg_rx_vlan_def_fg;
			if (fg != NULL) {
				fe = list_head(&fg->mlfg_entries);
				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
					(void)
					    mlxcx_cmd_delete_flow_table_entry(
					    mlxp, fe);
				}
			}
			fg = g->mlg_rx_vlan_promisc_fg;
			if (fg != NULL) {
				fe = list_head(&fg->mlfg_entries);
				if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
					(void)
					    mlxcx_cmd_delete_flow_table_entry(
					    mlxp, fe);
				}
			}
			mlxcx_teardown_flow_table(mlxp, g->mlg_rx_vlan_ft);
			list_destroy(&g->mlg_rx_vlans);

			g->mlg_rx_vlan_ft = NULL;
		}

		/* As above, the mutex is handed to the teardown call. */
		mutex_enter(&g->mlg_rx_hash_ft->mlft_mtx);
		mlxcx_teardown_flow_table(mlxp, g->mlg_rx_hash_ft);
		g->mlg_rx_hash_ft = NULL;

		avl_destroy(&g->mlg_rx_macs);
		g->mlg_state &= ~MLXCX_GROUP_FLOWS;
	}

	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
		for (i = 0; i < g->mlg_nwqs; ++i) {
			wq = &g->mlg_wqs[i];
			mutex_enter(&wq->mlwq_mtx);
			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
			    !mlxcx_cmd_stop_rq(mlxp, wq)) {
				mlxcx_warn(mlxp, "failed to stop rq %x",
				    wq->mlwq_num);
			}
			mutex_exit(&wq->mlwq_mtx);
		}
		taskq_destroy(g->mlg_refill_tq);
		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
	}

	if (g->mlg_state & MLXCX_GROUP_TIRTIS) {
		for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
			mlxcx_tir_t *tir = &g->mlg_tir[i];
			if (tir->mltir_state & MLXCX_TIR_CREATED &&
			    !(tir->mltir_state & MLXCX_TIR_DESTROYED)) {
				if (!mlxcx_cmd_destroy_tir(mlxp, tir)) {
					mlxcx_warn(mlxp,
					    "failed to destroy tir %u "
					    "for rx ring", tir->mltir_num);
				}
			}
		}
		g->mlg_state &= ~MLXCX_GROUP_TIRTIS;
	}

	if (g->mlg_state & MLXCX_GROUP_RQT) {
		if (g->mlg_rqt->mlrqt_state & MLXCX_RQT_CREATED &&
		    !(g->mlg_rqt->mlrqt_state & MLXCX_RQT_DESTROYED)) {
			if (!mlxcx_cmd_destroy_rqt(mlxp, g->mlg_rqt)) {
				mlxcx_warn(mlxp, "failed to destroy rqt %u "
				    "for rx ring", g->mlg_rqt->mlrqt_num);
			}
		}
		/*
		 * The table and its rq array are allocated before
		 * MLXCX_GROUP_RQT is set, and the hardware create can fail
		 * afterwards. Free the memory whenever the group flag is
		 * set, not only when the hardware object exists, so that a
		 * failed setup does not leak it.
		 */
		kmem_free(g->mlg_rqt->mlrqt_rq, g->mlg_rqt->mlrqt_rq_size);
		g->mlg_rqt->mlrqt_rq = NULL;
		kmem_free(g->mlg_rqt, sizeof (mlxcx_rqtable_t));
		g->mlg_rqt = NULL;
		g->mlg_state &= ~MLXCX_GROUP_RQT;
	}

	for (i = 0; i < g->mlg_nwqs; ++i) {
		wq = &g->mlg_wqs[i];
		/* Grab the CQ first; the WQ teardown clears the linkage. */
		cq = wq->mlwq_cq;
		mlxcx_wq_teardown(mlxp, wq);
		if (cq != NULL)
			mlxcx_cq_teardown(mlxp, cq);
	}
	kmem_free(g->mlg_wqs, g->mlg_wqs_size);
	g->mlg_wqs = NULL;
	g->mlg_state &= ~MLXCX_GROUP_WQS;

	mutex_exit(&g->mlg_mtx);
	mutex_exit(&g->mlg_port->mlp_mtx);

	mutex_destroy(&g->mlg_mtx);

	g->mlg_state &= ~MLXCX_GROUP_INIT;
	ASSERT3S(g->mlg_state, ==, 0);
}

/*
 * Tear down a transmit ring group: stop and tear down every send queue and
 * its completion queue, free the work queue array, destroy the group's TIS
 * if it exists, and destroy the group mutex. Command failures are logged and
 * do not stop the teardown. All state bits are expected to be clear at the
 * end.
 */
void
mlxcx_teardown_tx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_work_queue_t *wq;
	mlxcx_completion_queue_t *cq;
	uint_t i;

	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_WQS) {
		for (i = 0; i < g->mlg_nwqs; ++i) {
			wq = &g->mlg_wqs[i];
			mutex_enter(&wq->mlwq_mtx);
			cq = wq->mlwq_cq;
			if (wq->mlwq_state & MLXCX_WQ_STARTED &&
			    !mlxcx_cmd_stop_sq(mlxp, wq)) {
				mlxcx_warn(mlxp, "failed to stop sq %x",
				    wq->mlwq_num);
			}
			mutex_exit(&wq->mlwq_mtx);
			mlxcx_wq_teardown(mlxp, wq);
			if (cq != NULL)
				mlxcx_cq_teardown(mlxp, cq);
		}
		g->mlg_state &= ~MLXCX_GROUP_RUNNING;
		kmem_free(g->mlg_wqs, g->mlg_wqs_size);
		g->mlg_wqs = NULL;
		g->mlg_state &= ~MLXCX_GROUP_WQS;
	}

	if ((g->mlg_state & MLXCX_GROUP_TIRTIS) &&
	    g->mlg_tis.mltis_state & MLXCX_TIS_CREATED &&
	    !(g->mlg_tis.mltis_state & MLXCX_TIS_DESTROYED)) {
		if (!mlxcx_cmd_destroy_tis(mlxp, &g->mlg_tis)) {
			mlxcx_warn(mlxp, "failed to destroy tis %u for tx ring",
			    g->mlg_tis.mltis_num);
		}
	}
	g->mlg_state &= ~MLXCX_GROUP_TIRTIS;

	mutex_exit(&g->mlg_mtx);
	mutex_destroy(&g->mlg_mtx);
	g->mlg_state &= ~MLXCX_GROUP_INIT;
	ASSERT3S(g->mlg_state, ==, 0);
}

/*
 * Tear down every initialised rx and tx ring group and free both group
 * arrays. All rx groups are quiesced in a first pass before any of them is
 * torn down in a second pass. Groups without MLXCX_GROUP_INIT set are
 * skipped.
 */
void
mlxcx_teardown_groups(mlxcx_t *mlxp)
{
	mlxcx_ring_group_t *g;
	uint_t i;

	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		g = &mlxp->mlx_rx_groups[i];
		if (!(g->mlg_state & MLXCX_GROUP_INIT))
			continue;
		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
		mlxcx_quiesce_rx_cqs(mlxp, g);
	}

	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		g = &mlxp->mlx_rx_groups[i];
		if (!(g->mlg_state & MLXCX_GROUP_INIT))
			continue;
		mlxcx_teardown_rx_group(mlxp, g);
	}

	kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
	mlxp->mlx_rx_groups = NULL;

	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
		g = &mlxp->mlx_tx_groups[i];
		if (!(g->mlg_state & MLXCX_GROUP_INIT))
			continue;
		ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
		mlxcx_teardown_tx_group(mlxp, g);
	}

	kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
	mlxp->mlx_tx_groups = NULL;
}

/*
 * Set up a receive ring group, which must start with no state bits set.
 * In order, this:
 *
 *   - initialises the group and sizes it from the small- or large-group
 *     ring count property, depending on the group's index;
 *   - allocates the work queue array and the RQ table;
 *   - creates one CQ and RQ per ring (if one fails, the group is shrunk to
 *     the rings that did succeed);
 *   - creates the RQ table and one TIR per role in hardware;
 *   - builds the RSS hash flow table, with one single-entry flow group per
 *     TIR role, each forwarding to that role's TIR;
 *   - builds the VLAN flow table whose entries forward to the hash table.
 *
 * Returns B_FALSE as soon as any required step fails. Nothing is undone on
 * failure; the state bits set so far record what
 * mlxcx_teardown_rx_group() has to clean up.
 */
boolean_t
mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_event_queue_t *eq;
	mlxcx_completion_queue_t *cq;
	mlxcx_work_queue_t *rq;
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	uint_t ent_shift;
	uint_t i, j;

	ASSERT3S(g->mlg_state, ==, 0);

	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	mutex_enter(&g->mlg_mtx);
	g->mlg_mlx = mlxp;
	g->mlg_type = MLXCX_GROUP_RX;
	g->mlg_port = &mlxp->mlx_ports[0];
	g->mlg_state |= MLXCX_GROUP_INIT;

	/* The first mldp_rx_ngroups_large groups get the larger ring count. */
	g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_small_group;
	i = g - &mlxp->mlx_rx_groups[0];
	if (i < mlxp->mlx_props.mldp_rx_ngroups_large)
		g->mlg_nwqs = mlxp->mlx_props.mldp_rx_nrings_per_large_group;

	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
	g->mlg_state |= MLXCX_GROUP_WQS;

	/* Size the RQ table to a power of two, at least 2, covering nwqs. */
	g->mlg_rqt = kmem_zalloc(sizeof (mlxcx_rqtable_t), KM_SLEEP);
	g->mlg_rqt->mlrqt_max = 2;
	while (g->mlg_rqt->mlrqt_max < g->mlg_nwqs)
		g->mlg_rqt->mlrqt_max <<= 1;
	g->mlg_rqt->mlrqt_rq_size = g->mlg_rqt->mlrqt_max *
	    sizeof (mlxcx_work_queue_t *);
	g->mlg_rqt->mlrqt_rq = kmem_zalloc(g->mlg_rqt->mlrqt_rq_size, KM_SLEEP);
	g->mlg_state |= MLXCX_GROUP_RQT;

	for (i = 0; i < g->mlg_nwqs; ++i) {
		/*
		 * Walk the EQs from mlx_next_eq, wrapping back to
		 * mlx_intr_cq0, until one that accepts RX completions is
		 * found.
		 */
		eq = NULL;
		while (eq == NULL) {
			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
			    eq->mleq_type != MLXCX_EQ_TYPE_RX) {
				/* Try the next one */
				eq = NULL;
			}
		}

		/*
		 * A single completion is indicated for each rq entry as
		 * it is used. So, the number of cq entries never needs
		 * to be larger than the rq.
		 */
		ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
		    mlxp->mlx_props.mldp_rq_size_shift);
		if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
			/* Carry on with the rings set up so far. */
			g->mlg_nwqs = i;
			break;
		}

		cq->mlcq_stats = &g->mlg_port->mlp_stats;

		rq = &g->mlg_wqs[i];
		if (!mlxcx_rq_setup(mlxp, cq, rq)) {
			/* Carry on with the rings set up so far. */
			g->mlg_nwqs = i;
			break;
		}
		g->mlg_rqt->mlrqt_rq[g->mlg_rqt->mlrqt_used++] = rq;
		g->mlg_rqt->mlrqt_state |= MLXCX_RQT_DIRTY;
		rq->mlwq_group = g;
	}
	if (g->mlg_nwqs == 0) {
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_create_rqt(mlxp, g->mlg_rqt)) {
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	for (i = 0; i < MLXCX_TIRS_PER_GROUP; ++i) {
		mlxcx_tir_t *tir = &g->mlg_tir[i];
		tir->mltir_tdom = &mlxp->mlx_tdom;
		/* Dispatch type: direct to ring 0, or hashed via the RQT. */
		switch (i) {
		case MLXCX_TIR_ROLE_OTHER:
			tir->mltir_type = MLXCX_TIR_DIRECT;
			tir->mltir_rq = &g->mlg_wqs[0];
			break;
		case MLXCX_TIR_ROLE_IPv4:
		case MLXCX_TIR_ROLE_IPv6:
		case MLXCX_TIR_ROLE_TCPv4:
		case MLXCX_TIR_ROLE_TCPv6:
		case MLXCX_TIR_ROLE_UDPv4:
		case MLXCX_TIR_ROLE_UDPv6:
			tir->mltir_type = MLXCX_TIR_INDIRECT;
			tir->mltir_rqtable = g->mlg_rqt;
			tir->mltir_hash_fn = MLXCX_TIR_HASH_TOEPLITZ;
			(void) random_get_pseudo_bytes(tir->mltir_toeplitz_key,
			    sizeof (tir->mltir_toeplitz_key));
			break;
		}
		/* Layer 3 hash inputs. */
		switch (i) {
		case MLXCX_TIR_ROLE_OTHER:
			break;
		case MLXCX_TIR_ROLE_IPv4:
		case MLXCX_TIR_ROLE_TCPv4:
		case MLXCX_TIR_ROLE_UDPv4:
			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv4;
			tir->mltir_hash_fields =
			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
			break;
		case MLXCX_TIR_ROLE_IPv6:
		case MLXCX_TIR_ROLE_TCPv6:
		case MLXCX_TIR_ROLE_UDPv6:
			tir->mltir_l3_type = MLXCX_RX_HASH_L3_IPv6;
			tir->mltir_hash_fields =
			    MLXCX_RX_HASH_SRC_IP | MLXCX_RX_HASH_DST_IP;
			break;
		}
		/* Layer 4 hash inputs, added on top of the layer 3 ones. */
		switch (i) {
		case MLXCX_TIR_ROLE_OTHER:
		case MLXCX_TIR_ROLE_IPv4:
		case MLXCX_TIR_ROLE_IPv6:
			break;
		case MLXCX_TIR_ROLE_TCPv4:
		case MLXCX_TIR_ROLE_TCPv6:
			tir->mltir_l4_type = MLXCX_RX_HASH_L4_TCP;
			tir->mltir_hash_fields |=
			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
			break;
		case MLXCX_TIR_ROLE_UDPv4:
		case MLXCX_TIR_ROLE_UDPv6:
			tir->mltir_l4_type = MLXCX_RX_HASH_L4_UDP;
			tir->mltir_hash_fields |=
			    MLXCX_RX_HASH_L4_SPORT | MLXCX_RX_HASH_L4_DPORT;
			break;
		}

		if (!mlxcx_cmd_create_tir(mlxp, tir)) {
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}

		g->mlg_state |= MLXCX_GROUP_TIRTIS;
	}

	/*
	 * Flow table: our RX hashing breakout table for RSS
	 */

	g->mlg_rx_hash_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
	    KM_SLEEP));
	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	avl_create(&g->mlg_rx_macs, mlxcx_grmac_compare,
	    sizeof (mlxcx_group_mac_t),
	    offsetof(mlxcx_group_mac_t, mlgm_group_entry));
	g->mlg_state |= MLXCX_GROUP_FLOWS;

	mutex_enter(&ft->mlft_mtx);

	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
	ft->mlft_level = 2;
	ft->mlft_port = g->mlg_port;
	ft->mlft_entshift = MLXCX_RX_HASH_FT_SIZE_SHIFT;
	ft->mlft_nents = (1 << ft->mlft_entshift);
	ASSERT3U(ft->mlft_nents, >=, MLXCX_TIRS_PER_GROUP);
	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
	    offsetof(mlxcx_flow_group_t, mlfg_entry));

	for (j = 0; j < ft->mlft_nents; ++j) {
		ft->mlft_ent[j].mlfe_table = ft;
		ft->mlft_ent[j].mlfe_index = j;
	}

	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* IPv6 + UDP -> UDPv6 TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 6;
	fe->mlfe_ip_proto = IPPROTO_UDP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv6];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* IPv4 + UDP -> UDPv4 TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 4;
	fe->mlfe_ip_proto = IPPROTO_UDP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_UDPv4];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* IPv6 + TCP -> TCPv6 TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 6;
	fe->mlfe_ip_proto = IPPROTO_TCP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv6];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* IPv4 + TCP -> TCPv4 TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER | MLXCX_FLOW_MATCH_IP_PROTO;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 4;
	fe->mlfe_ip_proto = IPPROTO_TCP;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_TCPv4];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* Any other IPv6 -> IPv6 TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 6;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv6];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* Any other IPv4 -> IPv4 TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_IP_VER;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_ip_version = 4;
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_IPv4];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* Everything else (no match mask) -> OTHER TIR */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_tir =
	    &g->mlg_tir[MLXCX_TIR_ROLE_OTHER];
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	mutex_exit(&ft->mlft_mtx);

	/*
	 * Flow table: the VLAN breakout table for doing VLAN filtering after
	 * we've matched a MAC address.
	 */

	g->mlg_rx_vlan_ft = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
	    KM_SLEEP));
	mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&g->mlg_rx_vlans, sizeof (mlxcx_group_vlan_t),
	    offsetof(mlxcx_group_vlan_t, mlgv_entry));

	mutex_enter(&ft->mlft_mtx);

	ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
	ft->mlft_level = 1;
	ft->mlft_port = g->mlg_port;
	ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_vlan_size_shift;
	ft->mlft_nents = (1 << ft->mlft_entshift);
	ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
	ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
	list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
	    offsetof(mlxcx_flow_group_t, mlfg_entry));

	/* Every VLAN table entry forwards to this group's hash table. */
	for (j = 0; j < ft->mlft_nents; ++j) {
		fe = &ft->mlft_ent[j];
		fe->mlfe_table = ft;
		fe->mlfe_index = j;
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
	}

	if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/* First group is all actual matched VLANs */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	g->mlg_rx_vlan_fg = fg;
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	/* Leave two entries for the default and promisc groups below. */
	fg->mlfg_size = ft->mlft_nents - 2;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VLAN;
	fg->mlfg_mask |= MLXCX_FLOW_MATCH_VID;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/*
	 * Then the "default" entry which we enable when we have no VLAN IDs
	 * added to the group (we start with this enabled).
	 */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	g->mlg_rx_vlan_def_fg = fg;
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	/*
	 * Finally, the promisc entry which points at the *hash ft* from the
	 * default group. We only enable this when we have promisc on.
	 */
	fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
	g->mlg_rx_vlan_promisc_fg = fg;
	list_insert_tail(&ft->mlft_groups, fg);
	fg->mlfg_table = ft;
	fg->mlfg_size = 1;
	if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	fe = list_head(&fg->mlfg_entries);
	/* Overwrite the destination set in the loop above. */
	fe->mlfe_ndest = 1;
	fe->mlfe_dest[0].mlfed_flow = mlxp->mlx_rx_groups[0].mlg_rx_hash_ft;

	mutex_exit(&ft->mlft_mtx);

	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

/*
 * Start a single receive ring. The group is started first if it is not
 * already running. If the ring is already started this is a no-op that
 * returns B_TRUE. Otherwise the RQ is started in hardware, its buffer shard
 * is marked ready and populated with up to one and a half times
 * mlwq_nents buffers, and the ring is refilled.
 *
 * Returns B_FALSE if the group or the RQ could not be started. Running out
 * of buffers part way through population is not treated as a failure.
 */
boolean_t
mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    mlxcx_work_queue_t *rq)
{
	uint_t j;
	mlxcx_buffer_t *b;
	mlxcx_completion_queue_t *cq;

	mutex_enter(&g->mlg_mtx);
	/*
	 * Sadly, even though MAC has the mgi_start callback, it is not always
	 * called -- in particular when we are being managed under an aggr, the
	 * mgi_start callback will only ever be called on the default group.
	 *
	 * So instead of asserting about the group state here, we have to
	 * check it and call group start if needed.
	 */
	if (!(g->mlg_state & MLXCX_GROUP_RUNNING)) {
		/* mlxcx_rx_group_start() takes the group mutex itself. */
		mutex_exit(&g->mlg_mtx);
		if (!mlxcx_rx_group_start(mlxp, g))
			return (B_FALSE);
		mutex_enter(&g->mlg_mtx);
	}
	ASSERT(g->mlg_state & MLXCX_GROUP_RUNNING);

	cq = rq->mlwq_cq;
	ASSERT(cq != NULL);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&rq->mlwq_mtx);

	if (rq->mlwq_state & MLXCX_WQ_STARTED) {
		mutex_exit(&rq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_TRUE);
	}

	if (!mlxcx_cmd_start_rq(mlxp, rq)) {
		mutex_exit(&rq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	ASSERT(rq->mlwq_state & MLXCX_WQ_STARTED);

	ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS);
	rq->mlwq_state |= MLXCX_WQ_BUFFERS;

	mlxcx_shard_ready(rq->mlwq_bufs);

	/* One buffer per ring entry, then half as many again. */
	for (j = 0; j < rq->mlwq_nents; ++j) {
		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (j = 0; j < rq->mlwq_nents / 2; ++j) {
		if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}

	mlxcx_rq_refill(mlxp, rq);

	mutex_exit(&rq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

/*
 * Start a receive group: mark it running, create its refill taskq and, for
 * the first rx group only, point the port's broadcast and promisc flow
 * entries at this group's hash flow table (only the broadcast entry is
 * actually programmed here).
 *
 * Returns B_TRUE immediately if the group is already running. On failure
 * the running flag is cleared again, the taskq is destroyed if it was
 * created, and B_FALSE is returned.
 */
boolean_t
mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	char tq_name[TASKQ_NAMELEN];

	mutex_enter(&g->mlg_mtx);

	if (g->mlg_state & MLXCX_GROUP_RUNNING) {
		mutex_exit(&g->mlg_mtx);
		return (B_TRUE);
	}

	ASSERT0(g->mlg_state & MLXCX_GROUP_RUNNING);

	g->mlg_state |= MLXCX_GROUP_RUNNING;

	(void) snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
	    g - &mlxp->mlx_rx_groups[0]);

	/*
	 * Create one refill taskq per group with one thread per work queue.
	 * The refill task may block waiting for resources, so by effectively
	 * having one thread per work queue we avoid work queues blocking each
	 * other.
	 */
	if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
	    g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
		mlxcx_warn(mlxp, "failed to create rq refill task queue");
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	if (g == &mlxp->mlx_rx_groups[0]) {
		ft = g->mlg_port->mlp_rx_flow;
		mutex_enter(&ft->mlft_mtx);

		/*
		 * Broadcast and promisc entries go directly to group 0's
		 * RSS hash fanout flow table. They bypass VLAN filtering.
		 */
		fg = g->mlg_port->mlp_bcast;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			g->mlg_state &= ~MLXCX_GROUP_RUNNING;
			taskq_destroy(g->mlg_refill_tq);
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}

		fg = g->mlg_port->mlp_promisc;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
		/*
		 * Don't actually set the promisc entry until promisc is
		 * enabled.
		 */

		mutex_exit(&ft->mlft_mtx);
	}

	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

/*
 * Set up a transmit ring group, which must start with no state bits set:
 * initialise the group, allocate its work queue array, create its TIS, and
 * create one CQ and SQ per ring.
 *
 * Returns B_FALSE as soon as any step fails. Nothing is undone on failure;
 * the state bits set so far record what mlxcx_teardown_tx_group() has to
 * clean up. The group mutex is released on every return path.
 */
boolean_t
mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_event_queue_t *eq;
	mlxcx_completion_queue_t *cq;
	mlxcx_work_queue_t *sq;
	uint_t i;

	ASSERT3S(g->mlg_state, ==, 0);

	mutex_init(&g->mlg_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	g->mlg_state |= MLXCX_GROUP_INIT;
	mutex_enter(&g->mlg_mtx);

	g->mlg_mlx = mlxp;
	g->mlg_type = MLXCX_GROUP_TX;
	g->mlg_port = &mlxp->mlx_ports[0];

	g->mlg_nwqs = mlxp->mlx_props.mldp_tx_nrings_per_group;
	g->mlg_wqs_size = g->mlg_nwqs * sizeof (mlxcx_work_queue_t);
	g->mlg_wqs = kmem_zalloc(g->mlg_wqs_size, KM_SLEEP);
	g->mlg_state |= MLXCX_GROUP_WQS;

	g->mlg_tis.mltis_tdom = &mlxp->mlx_tdom;

	if (!mlxcx_cmd_create_tis(mlxp, &g->mlg_tis)) {
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}

	g->mlg_state |= MLXCX_GROUP_TIRTIS;

	for (i = 0; i < g->mlg_nwqs; ++i) {
		/*
		 * Walk the EQs from mlx_next_eq, wrapping back to
		 * mlx_intr_cq0, until one that accepts TX completions is
		 * found.
		 */
		eq = NULL;
		while (eq == NULL) {
			eq = &mlxp->mlx_eqs[mlxp->mlx_next_eq++];
			if (mlxp->mlx_next_eq >= mlxp->mlx_intr_count)
				mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
			if (eq->mleq_type != MLXCX_EQ_TYPE_ANY &&
			    eq->mleq_type != MLXCX_EQ_TYPE_TX) {
				/* Try the next one */
				eq = NULL;
			}
		}

		if (!mlxcx_cq_setup(mlxp, eq, &cq,
		    mlxp->mlx_props.mldp_cq_size_shift)) {
			/* Don't return with the group mutex still held. */
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}

		cq->mlcq_stats = &g->mlg_port->mlp_stats;

		sq = &g->mlg_wqs[i];
		if (!mlxcx_sq_setup(mlxp, g->mlg_port, cq, &g->mlg_tis, sq)) {
			mutex_exit(&g->mlg_mtx);
			return (B_FALSE);
		}
		sq->mlwq_group = g;
	}

	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

/*
 * Start a single transmit ring. If the ring is already started this is a
 * no-op that returns B_TRUE. Otherwise its foreign buffer shard is
 * populated with up to one and a half times mlwq_nents buffers and its
 * regular shard with up to mlwq_nents, both shards are marked ready, the SQ
 * is started in hardware, the group is marked running and a NOP is queued
 * on the ring.
 *
 * Returns B_FALSE if the SQ could not be started. Running out of buffers
 * part way through population is not treated as a failure.
 */
boolean_t
mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    mlxcx_work_queue_t *sq)
{
	uint_t i;
	mlxcx_buffer_t *b;
	mlxcx_completion_queue_t *cq;

	mutex_enter(&g->mlg_mtx);

	cq = sq->mlwq_cq;
	ASSERT(cq != NULL);

	mutex_enter(&cq->mlcq_mtx);
	mutex_enter(&sq->mlwq_mtx);
	if (sq->mlwq_state & MLXCX_WQ_STARTED) {
		mutex_exit(&sq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_TRUE);
	}

	ASSERT0(sq->mlwq_state & MLXCX_WQ_BUFFERS);
	/* Foreign buffers: one per ring entry, then half as many again. */
	for (i = 0; i < sq->mlwq_nents; ++i) {
		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	for (i = 0; i < sq->mlwq_nents / 2; ++i) {
		if (!mlxcx_buf_create_foreign(mlxp, sq->mlwq_foreign_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	/* Regular buffers: one per ring entry. */
	for (i = 0; i < sq->mlwq_nents; ++i) {
		if (!mlxcx_buf_create(mlxp, sq->mlwq_bufs, &b))
			break;
		mlxcx_buf_return(mlxp, b);
	}
	sq->mlwq_state |= MLXCX_WQ_BUFFERS;

	mlxcx_shard_ready(sq->mlwq_bufs);
	mlxcx_shard_ready(sq->mlwq_foreign_bufs);

	if (!mlxcx_cmd_start_sq(mlxp, sq)) {
		mutex_exit(&sq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
		mutex_exit(&g->mlg_mtx);
		return (B_FALSE);
	}
	g->mlg_state |= MLXCX_GROUP_RUNNING;

	(void) mlxcx_sq_add_nop(mlxp, sq);

	mutex_exit(&sq->mlwq_mtx);
	mutex_exit(&cq->mlcq_mtx);
	mutex_exit(&g->mlg_mtx);

	return (B_TRUE);
}

static boolean_t
mlxcx_sq_ring_dbell(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t first)
{
	uint_t idx;
	mlxcx_bf_t *bf;
	ddi_fm_error_t err;
	uint_t try = 0;

	ASSERT3U(mlwq->mlwq_type, ==, MLXCX_WQ_TYPE_SENDQ);
ASSERT(mutex_owned(&mlwq->mlwq_mtx));

	/* Publish the current producer counter in the doorbell record. */
	mlwq->mlwq_doorbell->mlwqd_send_counter = to_be16(mlwq->mlwq_pc);

	ASSERT(mlwq->mlwq_cq != NULL);
	ASSERT(mlwq->mlwq_cq->mlcq_eq != NULL);
	/* Pick the bf slot from the interrupt index of the EQ behind our CQ. */
	idx = mlwq->mlwq_cq->mlcq_eq->mleq_intr_index & MLXCX_BF_PER_UAR_MASK;
	bf = &mlwq->mlwq_uar->mlu_bf[idx];

retry:
	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		/* Clear the DMA error and try again while tries remain. */
		if (try++ < mlxcx_doorbell_tries) {
			ddi_fm_dma_err_clear(
			    mlwq->mlwq_doorbell_dma.mxdb_dma_handle,
			    DDI_FME_VERSION);
			goto retry;
		} else {
			goto err;
		}
	}

	mlxcx_put64(mlxp, bf->mbf_even, from_be64(
	    mlwq->mlwq_bf_ent[first].mlsqbf_qwords[0]));
	ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status == DDI_FM_OK)
		return (B_TRUE);
	/*
	 * Register access failed. The retry counter is shared with the DMA
	 * error path above; when it runs out we fall through to err.
	 */
	if (try++ < mlxcx_doorbell_tries) {
		ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
		goto retry;
	}

err:
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
	return (B_FALSE);
}

/*
 * Queue a single NOP work queue entry on send queue mlwq and ring the
 * doorbell for it.
 *
 * The entry requests a completion (MLXCX_SQE_CQE_ALWAYS) and uses no
 * fencing. The caller must hold mlwq->mlwq_mtx. Returns B_FALSE if a DMA
 * error is reported after syncing the entry or if the doorbell could not
 * be rung; the producer counter and mlwq_wqebb_used have already been
 * advanced by then.
 */
boolean_t
mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	uint_t index, start_pc;
	mlxcx_sendq_ent_t *ent0;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));

	/* mlwq_nents is a power of two, so this masks pc into the ring. */
	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	start_pc = mlwq->mlwq_pc;
	++mlwq->mlwq_pc;
	/*
	 * This counter is manipulated in the interrupt handler, which
	 * does not hold the mlwq_mtx, hence the atomic.
*/
	atomic_inc_64(&mlwq->mlwq_wqebb_used);

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(start_pc);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_NONE);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	ent0->mlsqe_control.mlcs_ds = 1;

	/* Sync just this one entry, located by its byte offset in the ring. */
	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
	    sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	if (!mlxcx_sq_ring_dbell(mlxp, mlwq, index)) {
		return (B_FALSE);
	}
	return (B_TRUE);
}

/*
 * Build a SEND work queue entry for the buffer chain headed by b0, place
 * it on send queue mlwq and ring the doorbell.
 *
 * inlinehdrs/inlinelen give header bytes copied inline into the entry;
 * inlinelen must not exceed the size of the inline header field. chkflags
 * carries HCK_* checksum offload requests. One data segment is written per
 * DMA cookie of b0 and of every buffer on its mlb_tx_chain, spilling into
 * additional ring entries when the first one is full.
 *
 * The caller must hold mlwq->mlwq_mtx; b0 must be its own tx head and be
 * in state MLXCX_BUFFER_ON_WQ. Returns B_FALSE if the ring does not have
 * room for the extra entries, on a DMA error, or if the doorbell could not
 * be rung.
 */
boolean_t
mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
    mlxcx_buffer_t *b0)
{
	uint_t index, first, ents;
	mlxcx_completion_queue_t *cq;
	mlxcx_sendq_ent_t *ent0;
	mlxcx_sendq_extra_ent_t *ent;
	mlxcx_wqe_data_seg_t *seg;
	uint_t ptri, nptr;
	const ddi_dma_cookie_t *c;
	size_t rem;
	uint64_t wqebb_used;
	mlxcx_buffer_t *b;
	ddi_fm_error_t err;
	boolean_t rv;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	ASSERT3P(b0->mlb_tx_head, ==, b0);
	ASSERT3U(b0->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	cq = mlwq->mlwq_cq;

	index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
	ent0 = &mlwq->mlwq_send_ent[index];
	b0->mlb_wqe_index = mlwq->mlwq_pc;
	/* ents counts ring entries used so far; first remembers the start. */
	ents = 1;

	first = index;

	bzero(ent0, sizeof (mlxcx_sendq_ent_t));
	ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
	ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
	ent0->mlsqe_control.mlcs_wqe_index = to_be16(b0->mlb_wqe_index);

	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_FENCE_MODE, MLXCX_SQE_FENCE_WAIT_OTHERS);
	set_bits8(&ent0->mlsqe_control.mlcs_flags,
	    MLXCX_SQE_COMPLETION_MODE, MLXCX_SQE_CQE_ALWAYS);

	VERIFY3U(inlinelen, <=, sizeof (ent0->mlsqe_eth.mles_inline_headers));
	set_bits16(&ent0->mlsqe_eth.mles_szflags,
	    MLXCX_SQE_ETH_INLINE_HDR_SZ, inlinelen);
	if (inlinelen > 0) {
		bcopy(inlinehdrs, ent0->mlsqe_eth.mles_inline_headers,
		    inlinelen);
	}

	/*
	 * Start the ds count at the size, in MLXCX_WQE_OCTOWORD units, of
	 * everything that precedes the data segments in the entry. It is
	 * incremented once per data segment below.
	 */
	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
	    MLXCX_WQE_OCTOWORD;

	if (chkflags & HCK_IPV4_HDRCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L3_CHECKSUM);
	}
	if (chkflags & HCK_FULLCKSUM) {
		ASSERT(mlxp->mlx_caps->mlc_checksum);
		set_bit8(&ent0->mlsqe_eth.mles_csflags,
		    MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
	}

	/*
	 * mlwq_wqebb_used is only incremented whilst holding
	 * the mlwq_mtx mutex, but it is decremented (atomically) in
	 * the interrupt context *not* under mlwq_mtx mutex.
	 * So, now take a snapshot of the number of used wqes which will
	 * be a consistent maximum we can use whilst iterating through
	 * the buffers and DMA cookies.
	 */
	wqebb_used = mlwq->mlwq_wqebb_used;

	b = b0;
	ptri = 0;
	nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
	seg = ent0->mlsqe_data;
	while (b != NULL) {
		/* rem is the number of payload bytes of b still to describe. */
		rem = b->mlb_used;

		c = NULL;
		while (rem > 0 &&
		    (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
			if (ptri >= nptr) {
				/*
				 * The current entry has no segment slots left.
				 * Move on to the next ring entry, unless that
				 * would fill the ring, in which case give up.
				 */
				if ((ents + wqebb_used) >= mlwq->mlwq_nents)
					return (B_FALSE);

				index = (mlwq->mlwq_pc + ents) &
				    (mlwq->mlwq_nents - 1);
				ent = &mlwq->mlwq_send_extra_ent[index];
				++ents;

				seg = ent->mlsqe_data;
				ptri = 0;
				nptr = sizeof (ent->mlsqe_data) /
				    sizeof (mlxcx_wqe_data_seg_t);
			}

			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
			/* Never describe more bytes than the buffer uses. */
			if (c->dmac_size > rem) {
				seg->mlds_byte_count = to_be32(rem);
				rem = 0;
			} else {
				seg->mlds_byte_count = to_be32(c->dmac_size);
				rem -= c->dmac_size;
			}
			seg->mlds_address = to_be64(c->dmac_laddress);
			++seg;
			++ptri;
			++ent0->mlsqe_control.mlcs_ds;

			ASSERT3U(ent0->mlsqe_control.mlcs_ds, <=,
			    MLXCX_SQE_MAX_DS);
		}

		/* After the head, walk the buffers chained on b0. */
		if (b == b0) {
			b = list_head(&b0->mlb_tx_chain);
		} else {
			b = list_next(&b0->mlb_tx_chain, b);
		}
	}

	b0->mlb_wqebbs = ents;
	mlwq->mlwq_pc += ents;
atomic_add_64(&mlwq->mlwq_wqebb_used, ents);

	/* Mark the remaining segment slots of the last entry as unused. */
	for (; ptri < nptr; ++ptri, ++seg) {
		seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
		seg->mlds_byte_count = to_be32(0);
		seg->mlds_address = to_be64(0);
	}

	/*
	 * Make sure the workqueue entry is flushed out before updating
	 * the doorbell.
	 * If the ring has wrapped, we need to flush the front and back.
	 */
	if ((first + ents) > mlwq->mlwq_nents) {
		/* Entries from first up to the end of the ring. */
		uint_t sync_cnt = mlwq->mlwq_nents - first;

		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
		    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
		    sync_cnt * sizeof (mlxcx_sendq_ent_t),
		    DDI_DMA_SYNC_FORDEV));

		/* The remainder starts again at the front of the ring. */
		ent0 = &mlwq->mlwq_send_ent[0];
		ents -= sync_cnt;
	}

	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
	ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}

	/*
	 * Hold the bufmtx whilst ringing the doorbell, to prevent
	 * the buffer from being moved to another list, so we can
	 * safely remove it should the ring fail.
*/
	mutex_enter(&cq->mlcq_bufbmtx);

	list_insert_tail(&cq->mlcq_buffers_b, b0);
	/* Count the buffer only if the doorbell went through. */
	if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
		atomic_inc_64(&cq->mlcq_bufcnt);
	} else {
		list_remove(&cq->mlcq_buffers_b, b0);
	}

	mutex_exit(&cq->mlcq_bufbmtx);

	return (rv);
}

/*
 * Post a single buffer to receive queue mlwq. This is a convenience
 * wrapper around mlxcx_rq_add_buffers() with a count of one, so the same
 * locking requirements and return value apply.
 */
boolean_t
mlxcx_rq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    mlxcx_buffer_t *buf)
{
	return (mlxcx_rq_add_buffers(mlxp, mlwq, &buf, 1));
}

/*
 * Post nbufs buffers from the array bufs to receive queue mlwq and update
 * the doorbell record.
 *
 * Each array slot is set to NULL as its buffer is taken, so after a
 * failure the non-NULL slots are the buffers that were not consumed. Every
 * buffer must be in state MLXCX_BUFFER_ON_WQ. The caller must hold both
 * mlwq->mlwq_mtx and the mlcq_mtx of the associated completion queue.
 *
 * Returns B_FALSE as soon as a DMA error is reported after one of the
 * syncs, B_TRUE otherwise.
 */
boolean_t
mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
    mlxcx_buffer_t **bufs, size_t nbufs)
{
	uint_t index;
	mlxcx_recvq_ent_t *ent;
	mlxcx_completion_queue_t *cq;
	mlxcx_wqe_data_seg_t *seg;
	uint_t bi, ptri;
	const ddi_dma_cookie_t *c;
	mlxcx_buffer_t *buf;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	cq = mlwq->mlwq_cq;
	ASSERT(mutex_owned(&cq->mlcq_mtx));

	for (bi = 0; bi < nbufs; ++bi) {
		/* Take the buffer and clear its slot in the caller's array. */
		buf = bufs[bi];
		bufs[bi] = NULL;
		ASSERT3U(buf->mlb_state, ==, MLXCX_BUFFER_ON_WQ);

		index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
		ent = &mlwq->mlwq_recv_ent[index];
		buf->mlb_wqe_index = mlwq->mlwq_pc;
		buf->mlb_wqebbs = 1;

		++mlwq->mlwq_pc;
		atomic_inc_64(&mlwq->mlwq_wqebb_used);

		/* Track the buffer on the CQ's list under mlcq_bufbmtx. */
		mutex_enter(&cq->mlcq_bufbmtx);
		list_insert_tail(&cq->mlcq_buffers, buf);
		atomic_inc_64(&cq->mlcq_bufcnt);
		mutex_exit(&cq->mlcq_bufbmtx);

		ASSERT3U(buf->mlb_dma.mxdb_ncookies, <=, MLXCX_RECVQ_MAX_PTRS);
		/* One scatter pointer per DMA cookie of the buffer. */
		ptri = 0;
		c = NULL;
		while ((c = mlxcx_dma_cookie_iter(&buf->mlb_dma, c)) != NULL) {
			seg = &ent->mlrqe_data[ptri++];
			seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
			seg->mlds_byte_count = to_be32(c->dmac_size);
			seg->mlds_address = to_be64(c->dmac_laddress);
		}
		/*
		 * Fill any unused scatter pointers with the special null
		 * value.
		 */
		for (; ptri < MLXCX_RECVQ_MAX_PTRS; ++ptri) {
			seg = &ent->mlrqe_data[ptri];
			seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
			seg->mlds_byte_count = to_be32(0);
			seg->mlds_address = to_be64(0);
		}

		/*
		 * Make sure the workqueue entry is flushed out before updating
		 * the doorbell.
*/
		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
		    (uintptr_t)ent - (uintptr_t)mlwq->mlwq_recv_ent,
		    sizeof (mlxcx_recvq_ent_t), DDI_DMA_SYNC_FORDEV));
		ddi_fm_dma_err_get(mlwq->mlwq_dma.mxdb_dma_handle, &err,
		    DDI_FME_VERSION);
		if (err.fme_status != DDI_FM_OK) {
			/*
			 * This buffer and all earlier ones are already on the
			 * CQ list; only later slots of bufs remain non-NULL.
			 */
			return (B_FALSE);
		}
	}

	/* Publish the new producer counter in the doorbell record. */
	mlwq->mlwq_doorbell->mlwqd_recv_counter = to_be16(mlwq->mlwq_pc);
	/*
	 * Flush the CQ doorbell as well so that HW knows how many
	 * completions we've consumed.
	 */
	MLXCX_DMA_SYNC(cq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(cq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	MLXCX_DMA_SYNC(mlwq->mlwq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
	ddi_fm_dma_err_get(mlwq->mlwq_doorbell_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		return (B_FALSE);
	}
	return (B_TRUE);
}

/*
 * Task function that refills the receive work queue passed as arg.
 * mlxcx_rq_refill() dispatches it on the refill taskq of the ring group.
 *
 * Each pass waits on the mlbs_free_nonempty condition variable of the
 * buffer shard until there is a reason to proceed and then, unless the
 * shard is draining or the CQ is being torn down, calls mlxcx_rq_refill().
 * The loop repeats while the CQ still holds fewer than
 * MLXCX_RQ_REFILL_STEP buffers; otherwise MLXCX_WQ_REFILLING is cleared
 * and the task ends.
 */
static void
mlxcx_rq_refill_task(void *arg)
{
	mlxcx_work_queue_t *wq = arg;
	mlxcx_completion_queue_t *cq = wq->mlwq_cq;
	mlxcx_t *mlxp = wq->mlwq_mlx;
	mlxcx_buf_shard_t *s = wq->mlwq_bufs;
	boolean_t refill, draining;

	do {
		/*
		 * Wait here until one of 3 conditions:
		 * 1. The shard is draining, or
		 * 2. There are buffers on the free list, or
		 * 3. The WQ is being shut down.
*/
		mutex_enter(&s->mlbs_mtx);
		while (s->mlbs_state != MLXCX_SHARD_DRAINING &&
		    list_is_empty(&s->mlbs_free) &&
		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) {
			cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
		}

		/* Sample the shard state before dropping the shard lock. */
		draining = (s->mlbs_state == MLXCX_SHARD_DRAINING);
		mutex_exit(&s->mlbs_mtx);

		/* mlxcx_rq_refill() requires the CQ and WQ locks, CQ first. */
		mutex_enter(&cq->mlcq_mtx);
		mutex_enter(&wq->mlwq_mtx);

		if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
			refill = B_FALSE;
			wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
		} else {
			mlxcx_rq_refill(mlxp, wq);

			/* Keep going while the CQ is still short of buffers. */
			if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
				refill = B_TRUE;
			} else {
				refill = B_FALSE;
				wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
			}
		}

		mutex_exit(&wq->mlwq_mtx);
		mutex_exit(&cq->mlcq_mtx);
	} while (refill);
}

/*
 * Top up receive queue mlwq with buffers taken from its buffer shard.
 *
 * The caller must hold both mlwq->mlwq_mtx and the mlcq_mtx of the
 * associated completion queue, and the WQ must have MLXCX_WQ_BUFFERS set.
 * Nothing is done if the WQ is not started, if the CQ is being torn down,
 * or if the CQ already tracks at least
 * mlwq_nents - 2 * MLXCX_RQ_REFILL_STEP buffers.
 *
 * Buffers are taken in batches of up to MLXCX_RQ_REFILL_STEP and posted
 * with mlxcx_rq_add_buffers() until the shortfall measured on entry has
 * been made up. If a batch comes back empty and the buffer count seen on
 * entry was below MLXCX_RQ_REFILL_STEP, mlxcx_rq_refill_task() is
 * dispatched (once, guarded by MLXCX_WQ_REFILLING) to wait for buffers.
 */
void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
	size_t target, current, want, done, n;
	mlxcx_completion_queue_t *cq;
	mlxcx_ring_group_t *g;
	mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
	uint_t i;

	ASSERT(mutex_owned(&mlwq->mlwq_mtx));
	cq = mlwq->mlwq_cq;
	ASSERT(mutex_owned(&cq->mlcq_mtx));

	ASSERT(mlwq->mlwq_state & MLXCX_WQ_BUFFERS);

	target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP;
	/* cq was already loaded above; this second assignment is redundant. */
	cq = mlwq->mlwq_cq;

	if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0)
		return;

	if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0)
		return;

	current = cq->mlcq_bufcnt;

	/* Do not bother unless at least one full step is missing. */
	if (current >= target - MLXCX_RQ_REFILL_STEP)
		return;

	want = target - current;
	done = 0;

	while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
		n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
		if (n == 0) {
			/*
			 * We didn't get any buffers from the free queue.
			 * It might not be an issue, schedule a taskq
			 * to wait for free buffers if the completion
			 * queue is low.
*/
			if (current < MLXCX_RQ_REFILL_STEP &&
			    (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
				/* The flag keeps us to one pending task. */
				mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
				g = mlwq->mlwq_group;
				taskq_dispatch_ent(g->mlg_refill_tq,
				    mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
				    &mlwq->mlwq_tqe);
			}

			return;
		}

		/* Teardown began meanwhile: give the batch back unused. */
		if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) {
			for (i = 0; i < n; ++i)
				mlxcx_buf_return(mlxp, b[i]);
			return;
		}
		if (!mlxcx_rq_add_buffers(mlxp, mlwq, b, n)) {
			/*
			 * mlxcx_rq_add_buffers NULLs out the buffers as it
			 * enqueues them, so any that are non-NULL we have to
			 * free now. The others now belong to the WQ, even if
			 * we failed.
			 */
			for (i = 0; i < n; ++i) {
				if (b[i] != NULL) {
					mlxcx_buf_return(mlxp, b[i]);
				}
			}
			return;
		}
		done += n;
	}
}

/*
 * Return a short constant name for completion queue error syndrome sy, for
 * use in error reports. Values without a case below map to UNKNOWN.
 */
static const char *
mlxcx_cq_err_syndrome_string(mlxcx_cq_error_syndrome_t sy)
{
	switch (sy) {
	case MLXCX_CQ_ERR_LOCAL_LENGTH:
		return ("LOCAL_LENGTH");
	case MLXCX_CQ_ERR_LOCAL_QP_OP:
		return ("LOCAL_QP_OP");
	case MLXCX_CQ_ERR_LOCAL_PROTECTION:
		return ("LOCAL_PROTECTION");
	case MLXCX_CQ_ERR_WR_FLUSHED:
		return ("WR_FLUSHED");
	case MLXCX_CQ_ERR_MEM_WINDOW_BIND:
		return ("MEM_WINDOW_BIND");
	case MLXCX_CQ_ERR_BAD_RESPONSE:
		return ("BAD_RESPONSE");
	case MLXCX_CQ_ERR_LOCAL_ACCESS:
		return ("LOCAL_ACCESS");
	case MLXCX_CQ_ERR_XPORT_RETRY_CTR:
		return ("XPORT_RETRY_CTR");
	case MLXCX_CQ_ERR_RNR_RETRY_CTR:
		return ("RNR_RETRY_CTR");
	case MLXCX_CQ_ERR_ABORTED:
		return ("ABORTED");
	default:
		return ("UNKNOWN");
	}
}

/*
 * Post an ereport for the error completion entry ent seen on completion
 * queue mlcq, then report the service as degraded.
 *
 * The ereport class is MLXCX_FM_SERVICE_MLXCX followed by .cqe.err, and
 * the payload carries the syndrome (name and number), the vendor
 * syndrome, the WQE counter, the work queue type and the CQ and WQ
 * numbers. Nothing at all is done if the instance is not ereport capable.
 */
static void
mlxcx_fm_cqe_ereport(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_error_ent_t *ent)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];
	const char *name = mlxcx_cq_err_syndrome_string(ent->mlcqee_syndrome);

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    MLXCX_FM_SERVICE_MLXCX, "cqe.err");
	ena = fm_ena_generate(0, FM_ENA_FMT1);

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    "syndrome", DATA_TYPE_STRING, name,
	    "syndrome_num", DATA_TYPE_UINT8, ent->mlcqee_syndrome,
	    "vendor_syndrome", DATA_TYPE_UINT8,
	    ent->mlcqee_vendor_error_syndrome,
	    "wqe_counter", DATA_TYPE_UINT16,
	    from_be16(ent->mlcqee_wqe_counter),
	    "wq_type", DATA_TYPE_STRING,
	    (mlcq->mlcq_wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) ? "send": "recv",
	    "cq_num", DATA_TYPE_UINT32, mlcq->mlcq_num,
	    "wq_num", DATA_TYPE_UINT32, mlcq->mlcq_wq->mlwq_num,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
}

/*
 * Process completion entry ent for the transmit buffer chain buf.
 *
 * On every path the chain is handed back with mlxcx_buf_return_chain()
 * using keepmp B_FALSE. An MLXCX_CQE_OP_REQ_ERR entry additionally posts
 * an ereport and runs mlxcx_check_sq() under the work queue lock. Entries
 * with an unexpected opcode, WQE opcode or format are logged.
 *
 * The caller must hold mlcq->mlcq_mtx.
 */
void
mlxcx_tx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
	ASSERT(mutex_owned(&mlcq->mlcq_mtx));
	if (ent->mlcqe_opcode == MLXCX_CQE_OP_REQ_ERR) {
		/* Error entries are read through the error entry layout. */
		mlxcx_completionq_error_ent_t *eent =
		    (mlxcx_completionq_error_ent_t *)ent;
		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
		mlxcx_check_sq(mlxp, mlcq->mlcq_wq);
		mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
		return;
	}

	if (ent->mlcqe_opcode != MLXCX_CQE_OP_REQ) {
		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		return;
	}

	if (ent->mlcqe_send_wqe_opcode != MLXCX_WQE_OP_SEND) {
		mlxcx_warn(mlxp, "!got weird cq wqe opcode: %x",
		    ent->mlcqe_send_wqe_opcode);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		return;
	}

	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
		mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
		return;
	}

	mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
}

/*
 * Process completion entry ent for receive buffer buf and turn the buffer
 * into an mblk.
 *
 * Returns the mblk of the buffer, with its write pointer set from the
 * byte count in the entry and checksum flags attached when the entry
 * reports them as good. Returns NULL, after handing the buffer back with
 * mlxcx_buf_return(), for error entries, entries with an unexpected
 * opcode or format, DMA errors, and when the buffer cannot be loaned.
 *
 * The caller must hold mlcq->mlcq_mtx.
 */
mblk_t *
mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
    mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
	uint32_t chkflags = 0;
	uint_t wqe_index;
	ddi_fm_error_t err;

	ASSERT(mutex_owned(&mlcq->mlcq_mtx));

	if (ent->mlcqe_opcode == MLXCX_CQE_OP_RESP_ERR) {
		/* Error entries are read through the error entry layout. */
		mlxcx_completionq_error_ent_t *eent =
		    (mlxcx_completionq_error_ent_t *)ent;
		mlxcx_fm_cqe_ereport(mlxp, mlcq, eent);
		mlxcx_buf_return(mlxp, buf);
		mutex_enter(&mlcq->mlcq_wq->mlwq_mtx);
		mlxcx_check_rq(mlxp, mlcq->mlcq_wq);
mutex_exit(&mlcq->mlcq_wq->mlwq_mtx);
		return (NULL);
	}

	if (ent->mlcqe_opcode != MLXCX_CQE_OP_RESP) {
		mlxcx_warn(mlxp, "!got weird cq opcode: %x", ent->mlcqe_opcode);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	if (ent->mlcqe_format != MLXCX_CQE_FORMAT_BASIC) {
		mlxcx_warn(mlxp, "!got weird cq format: %x", ent->mlcqe_format);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	/* Fold the drop count carried by this entry into the port stats. */
	if (ent->mlcqe_rx_drop_counter > 0) {
		atomic_add_64(&mlcq->mlcq_stats->mlps_rx_drops,
		    ent->mlcqe_rx_drop_counter);
	}

	/* Make the received data visible to the CPU before it is used. */
	MLXCX_DMA_SYNC(buf->mlb_dma, DDI_DMA_SYNC_FORCPU);
	ddi_fm_dma_err_get(buf->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(buf->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	/*
	 * mlxcx_buf_loan() will set mlb_wqe_index to zero.
	 * Remember it for later.
	 */
	wqe_index = buf->mlb_wqe_index;

	if (!mlxcx_buf_loan(mlxp, buf)) {
		mlxcx_buf_return(mlxp, buf);
		return (NULL);
	}

	buf->mlb_mp->b_next = NULL;
	buf->mlb_mp->b_cont = NULL;
	buf->mlb_mp->b_wptr = buf->mlb_mp->b_rptr +
	    from_be32(ent->mlcqe_byte_cnt);

	/* Translate the checksum status bits of the entry into HCK_* flags. */
	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L4_OK)) {
		chkflags |= HCK_FULLCKSUM_OK;
	}
	if (get_bit8(ent->mlcqe_csflags, MLXCX_CQE_CSFLAGS_L3_OK)) {
		chkflags |= HCK_IPV4_HDRCKSUM_OK;
	}
	if (chkflags != 0) {
		mac_hcksum_set(buf->mlb_mp, 0, 0, 0,
		    from_be16(ent->mlcqe_checksum), chkflags);
	}

	/*
	 * Don't check if a refill is needed on every single completion,
	 * since checking involves taking the RQ lock.
	 */
	if ((wqe_index & 0x7) == 0) {
		mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
		ASSERT(wq != NULL);
		mutex_enter(&wq->mlwq_mtx);
		if (!(wq->mlwq_state & MLXCX_WQ_TEARDOWN))
			mlxcx_rq_refill(mlxp, wq);
		mutex_exit(&wq->mlwq_mtx);
	}

	return (buf->mlb_mp);
}

/*
 * Free routine that mlxcx_buf_create() installs in mlb_frtn for the mblk
 * it allocates with desballoc(); arg is the owning mlxcx_buffer_t.
 *
 * Clears the mblk pointer of the buffer and, if the buffer is in state
 * MLXCX_BUFFER_ON_LOAN, hands the buffer back with mlxcx_buf_return().
 */
static void
mlxcx_buf_mp_return(caddr_t arg)
{
	mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg;
	mlxcx_t *mlxp = b->mlb_mlx;

	/* The mblk has been used now, so NULL it out.
*/
	b->mlb_mp = NULL;

	if (b->mlb_state == MLXCX_BUFFER_ON_LOAN)
		mlxcx_buf_return(mlxp, b);
}

/*
 * Allocate a buffer with its own DMA memory for shard and store it in *bp.
 *
 * The DMA memory is sized from the MTU of port 0 and allocated through
 * mlxcx_dma_alloc_offset() with an offset argument of 2. An mblk wrapping
 * that memory is allocated with desballoc(), with mlxcx_buf_mp_return() as
 * its free routine. The desballoc() result is not checked here;
 * mlxcx_buf_loan() allocates the mblk again if mlb_mp is NULL.
 *
 * Returns B_FALSE, leaving *bp untouched, if the DMA allocation fails.
 */
boolean_t
mlxcx_buf_create(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_FALSE;

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc_offset(mlxp, &b->mlb_dma, &attr, &acc,
	    B_FALSE, mlxp->mlx_ports[0].mlp_mtu, 2, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
	b->mlb_frtn.free_arg = (caddr_t)b;
	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);

	*bp = b;

	return (B_TRUE);
}

/*
 * Allocate a foreign buffer for shard and store it in *bp.
 *
 * A foreign buffer gets no DMA memory of its own: only mlxcx_dma_init() is
 * called on its DMA state here. mlxcx_bind_or_copy_mblk() later binds such
 * buffers to mblk data with mlxcx_dma_bind_mblk().
 *
 * Returns B_FALSE, leaving *bp untouched, if mlxcx_dma_init() fails.
 */
boolean_t
mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
    mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b;
	ddi_dma_attr_t attr;
	boolean_t ret;

	b = kmem_cache_alloc(mlxp->mlx_bufs_cache, KM_SLEEP);
	b->mlb_shard = shard;
	b->mlb_foreign = B_TRUE;

	mlxcx_dma_buf_attr(mlxp, &attr);

	ret = mlxcx_dma_init(mlxp, &b->mlb_dma, &attr, B_TRUE);
	if (!ret) {
		kmem_cache_free(mlxp->mlx_bufs_cache, b);
		return (B_FALSE);
	}

	*bp = b;

	return (B_TRUE);
}

/*
 * Take one buffer from the free list of the foreign buffer shard of wq,
 * moving it to the busy list in state MLXCX_BUFFER_ON_WQ.
 *
 * Returns NULL if the shard is not in state MLXCX_SHARD_READY or if its
 * free list is empty.
 */
static mlxcx_buffer_t *
mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		ASSERT(b->mlb_foreign);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

/*
 * Copy sz bytes starting at rptr into a buffer taken from the regular
 * shard of wq and sync it for the device.
 *
 * If a DMA error is reported after the sync, the buffer is returned and
 * the copy is retried with another buffer until the attempt count exceeds
 * MLXCX_BUF_BIND_MAX_ATTEMTPS. Returns the filled buffer, or NULL if no
 * buffer is available or the retries are exhausted.
 */
static mlxcx_buffer_t *
mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
{
	ddi_fm_error_t err;
	mlxcx_buffer_t *b;
	uint_t attempts = 0;

copyb:
	if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
		return (NULL);

	/* The size check is an assertion only, so DEBUG builds only. */
	ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
	bcopy(rptr, b->mlb_dma.mxdb_va, sz);

	MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);

	ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
	    DDI_FME_VERSION);
	if (err.fme_status != DDI_FM_OK) {
		ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
		    DDI_FME_VERSION);
		mlxcx_buf_return(mlxp, b);
		if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
			return (NULL);
		}
		goto copyb;
	}

	return (b);
}

/*
 * Produce a transmit buffer for the data of the single mblk mp, skipping
 * its first off bytes.
 *
 * Data shorter than the mldp_tx_bind_threshold property is copied with
 * mlxcx_copy_data(). Longer data is DMA bound to a foreign buffer with
 * mlxcx_dma_bind_mblk(), falling back to a copy if the bind fails.
 * Returns NULL if no buffer could be obtained.
 */
static mlxcx_buffer_t *
mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mp, size_t off)
{
	mlxcx_buffer_t *b;
	uint8_t *rptr;
	size_t sz;
	boolean_t ret;

	rptr = mp->b_rptr;
	sz = MBLKL(mp);

#ifdef DEBUG
	if (off > 0) {
		ASSERT3U(off, <, sz);
	}
#endif

	rptr += off;
	sz -= off;

	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
	} else {
		b = mlxcx_buf_take_foreign(mlxp, wq);
		if (b == NULL)
			return (NULL);

		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
		    B_FALSE);

		if (!ret) {
			/* Binding failed; fall back to copying the data. */
			mlxcx_buf_return(mlxp, b);

			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
		}
	}

	return (b);
}

/*
 * Turn the mblk chain mpb into a chain of transmit buffers, skipping the
 * first off bytes of the first mblk, and store the head buffer in *bp.
 *
 * One buffer is produced per mblk; all buffers after the first are put in
 * state MLXCX_BUFFER_ON_CHAIN and linked on the mlb_tx_chain of the head.
 * If the buffers need more than MLXCX_SQE_MAX_PTRS DMA cookies in total,
 * the chain is handed back and the message is pulled up into one mblk,
 * which is used in its place; mpb is then freed.
 *
 * Returns the number of buffers in the chain, or 0 on failure. On failure
 * the buffers taken so far are handed back with keepmp set to B_TRUE and
 * mpb is not freed here.
 */
uint_t
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
    mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
{
	mlxcx_buffer_t *b, *b0 = NULL;
	boolean_t first = B_TRUE;
	mblk_t *mp;
	size_t offset = off;
	size_t ncookies = 0;
	uint_t count = 0;

	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
	    mp = mp->b_cont) {
		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
		if (b == NULL)
			goto failed;

		ncookies += b->mlb_dma.mxdb_ncookies;

		if (first)
			b0 = b;

		if (!first)
			b->mlb_state = MLXCX_BUFFER_ON_CHAIN;

		b->mlb_tx_mp = mp;
		b->mlb_tx_head = b0;
		b->mlb_used = MBLKL(mp) - offset;

		if (!first)
			list_insert_tail(&b0->mlb_tx_chain, b);
		first = B_FALSE;
		/* The offset only applies to the first mblk. */
		offset = 0;

		count++;
	}

	/*
	 * The chain of mblks has resulted in too many cookies for
	 * a single message. This is unusual, so take the hit to tidy
	 * up, do a pullup to a single mblk and allocate the requisite
	 * buf.
*/
	if (ncookies > MLXCX_SQE_MAX_PTRS) {
		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
		    mblk_t *, mpb, size_t, ncookies);

		/* Hand the buffers back but keep the original mblks. */
		if (b0 != NULL)
			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

		if ((mp = msgpullup(mpb, -1)) == NULL)
			return (0);

		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
		if (b0 == NULL) {
			freemsg(mp);
			return (0);
		}
		/* The pulled-up copy replaces the original message. */
		freemsg(mpb);

		b0->mlb_tx_mp = mp;
		b0->mlb_tx_head = b0;
		b0->mlb_used = MBLKL(mp) - off;

		count = 1;
	}

	*bp = b0;

	return (count);

failed:
	if (b0 != NULL)
		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);

	return (0);
}

/*
 * Take one buffer from the free list of the regular buffer shard of wq,
 * moving it to the busy list in state MLXCX_BUFFER_ON_WQ.
 *
 * Returns NULL if the shard is not in state MLXCX_SHARD_READY or if its
 * free list is empty.
 */
mlxcx_buffer_t *
mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
	mlxcx_buffer_t *b;
	mlxcx_buf_shard_t *s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (NULL);
	}

	if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
	}
	mutex_exit(&s->mlbs_mtx);

	return (b);
}

/*
 * Take up to nbufs buffers from the free list of the regular buffer shard
 * of wq under a single hold of the shard lock, storing them in bp.
 *
 * Each buffer taken is moved to the busy list in state MLXCX_BUFFER_ON_WQ.
 * Returns the number of buffers stored, which is 0 if the shard is not in
 * state MLXCX_SHARD_READY or has no free buffers.
 */
size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp,
    size_t nbufs)
{
	mlxcx_buffer_t *b;
	size_t done = 0;
	mlxcx_buf_shard_t *s;

	s = wq->mlwq_bufs;

	mutex_enter(&s->mlbs_mtx);
	if (s->mlbs_state != MLXCX_SHARD_READY) {
		mutex_exit(&s->mlbs_mtx);
		return (0);
	}

	while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
		ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
		b->mlb_state = MLXCX_BUFFER_ON_WQ;
		list_insert_tail(&s->mlbs_busy, b);
		bp[done++] = b;
	}
	mutex_exit(&s->mlbs_mtx);
	return (done);
}

/*
 * Move buffer b from state MLXCX_BUFFER_ON_WQ to MLXCX_BUFFER_ON_LOAN and
 * from the busy list of its shard to the loaned list.
 *
 * If the buffer has no mblk, one is allocated first with desballoc();
 * B_FALSE is returned, with the buffer state unchanged, if that fails.
 * mlb_wqe_index is reset to zero.
 */
boolean_t
mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	if (b->mlb_mp == NULL) {
		b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
		    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
		if (b->mlb_mp == NULL)
			return (B_FALSE);
	}

	b->mlb_state = MLXCX_BUFFER_ON_LOAN;
	b->mlb_wqe_index = 0;

	mutex_enter(&s->mlbs_mtx);
	list_remove(&s->mlbs_busy, b);
	list_insert_tail(&s->mlbs_loaned, b);
	mutex_exit(&s->mlbs_mtx);

	return (B_TRUE);
}

/*
 * Hand back buffer b0 together with every buffer on its mlb_tx_chain.
 *
 * If b0 is not the head of a chain (its mlb_tx_head is not b0 itself) it
 * is simply handed back on its own. With keepmp set, the mblk and head
 * pointers of b0 are cleared first, so that mlxcx_buf_return() does not
 * free the mblk.
 */
void
mlxcx_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0, boolean_t keepmp)
{
	mlxcx_buffer_t *b;

	if (b0->mlb_tx_head != b0) {
		mlxcx_buf_return(mlxp, b0);
		return;
	}

	/* mlxcx_buf_return() unlinks b from the chain, so this terminates. */
	while ((b = list_head(&b0->mlb_tx_chain)) != NULL) {
		mlxcx_buf_return(mlxp, b);
	}
	if (keepmp) {
		b0->mlb_tx_mp = NULL;
		b0->mlb_tx_head = NULL;
	}
	mlxcx_buf_return(mlxp, b0);
}

/*
 * Hand buffer b back to the free list of its shard.
 *
 * The buffer is reset to state MLXCX_BUFFER_FREE, a bound foreign buffer
 * is unbound, and the buffer is removed from whichever list its previous
 * state implies before being put on the free list. A loaned buffer whose
 * shard is draining is destroyed instead. Waiters on mlbs_free_nonempty
 * are woken in both cases. If b was the head of a transmit chain, its
 * mblk is freed once the shard lock has been dropped.
 *
 * The buffer must not already be in state MLXCX_BUFFER_FREE.
 */
void
mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	/* Snapshot the fields that are about to be reset below. */
	mlxcx_buffer_state_t oldstate = b->mlb_state;
	mlxcx_buffer_t *txhead = b->mlb_tx_head;
	mlxcx_buf_shard_t *s = b->mlb_shard;
	mblk_t *mp = b->mlb_tx_mp;

	VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
	ASSERT3P(b->mlb_mlx, ==, mlxp);

	/*
	 * The mlbs_mtx held below is a heavily contended lock, so it is
	 * imperative we do as much of the buffer clean up outside the lock
	 * as is possible.
	 */
	b->mlb_state = MLXCX_BUFFER_FREE;
	b->mlb_wqe_index = 0;
	b->mlb_tx_head = NULL;
	b->mlb_tx_mp = NULL;
	b->mlb_used = 0;
	b->mlb_wqebbs = 0;
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	if (b->mlb_foreign) {
		if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
			mlxcx_dma_unbind(mlxp, &b->mlb_dma);
		}
	}

	mutex_enter(&s->mlbs_mtx);
	switch (oldstate) {
	case MLXCX_BUFFER_INIT:
		/* A new buffer is not on any shard list yet. */
		break;
	case MLXCX_BUFFER_ON_WQ:
		list_remove(&s->mlbs_busy, b);
		break;
	case MLXCX_BUFFER_ON_LOAN:
		ASSERT(!b->mlb_foreign);
		list_remove(&s->mlbs_loaned, b);
		if (s->mlbs_state == MLXCX_SHARD_DRAINING) {
			/*
			 * When we're draining, Eg during mac_stop(),
			 * we destroy the buffer immediately rather than
			 * recycling it. Otherwise we risk leaving it
			 * on the free list and leaking it.
			 */
			list_insert_tail(&s->mlbs_free, b);
			mlxcx_buf_destroy(mlxp, b);
			/*
			 * Teardown might be waiting for loaned list to empty.
*/
			cv_broadcast(&s->mlbs_free_nonempty);
			mutex_exit(&s->mlbs_mtx);
			return;
		}
		break;
	case MLXCX_BUFFER_FREE:
		/* Already excluded by the VERIFY3U at the top. */
		VERIFY(0);
		break;
	case MLXCX_BUFFER_ON_CHAIN:
		/* Unlink from the chain of the head as well as the busy list. */
		ASSERT(txhead != NULL);
		list_remove(&txhead->mlb_tx_chain, b);
		list_remove(&s->mlbs_busy, b);
		break;
	}

	list_insert_tail(&s->mlbs_free, b);
	cv_broadcast(&s->mlbs_free_nonempty);

	mutex_exit(&s->mlbs_mtx);

	/*
	 * For TX chain heads, free the mblk_t after we let go of the lock.
	 * This might be a borrowed buf that we in turn loaned to MAC, in which
	 * case calling freemsg() on it will re-enter this very function -- so
	 * we better not be holding the lock!
	 */
	if (txhead == b)
		freemsg(mp);
}

/*
 * Destroy buffer b and give it back to the buffer kmem cache.
 *
 * The buffer must be in state MLXCX_BUFFER_FREE or MLXCX_BUFFER_INIT; a
 * free buffer is first taken off the free list of its shard. Its mblk, if
 * it has one, is freed and its DMA resources are released. The caller
 * must hold the mlbs_mtx of the shard.
 */
void
mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b)
{
	mlxcx_buf_shard_t *s = b->mlb_shard;

	VERIFY(b->mlb_state == MLXCX_BUFFER_FREE ||
	    b->mlb_state == MLXCX_BUFFER_INIT);
	ASSERT(mutex_owned(&s->mlbs_mtx));

	if (b->mlb_state == MLXCX_BUFFER_FREE)
		list_remove(&s->mlbs_free, b);

	/*
	 * This is going back to the kmem cache, so it needs to be set up in
	 * the same way we expect a new buffer to come out (state INIT, other
	 * fields NULL'd)
	 */
	b->mlb_state = MLXCX_BUFFER_INIT;
	b->mlb_shard = NULL;
	if (b->mlb_mp != NULL) {
		/*
		 * The assertion below expects freeb() to have cleared mlb_mp.
		 * mlxcx_buf_mp_return() does clear it; presumably it runs as
		 * the free routine of this mblk -- TODO confirm.
		 */
		freeb(b->mlb_mp);
		ASSERT(b->mlb_mp == NULL);
	}
	mlxcx_dma_free(&b->mlb_dma);
	ASSERT(list_is_empty(&b->mlb_tx_chain));

	kmem_cache_free(mlxp->mlx_bufs_cache, b);
}

/*
 * Put buffer shard s into state MLXCX_SHARD_READY, the state in which the
 * mlxcx_buf_take*() functions hand out buffers from it.
 */
void
mlxcx_shard_ready(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_READY;
	mutex_exit(&s->mlbs_mtx);
}

/*
 * Put buffer shard s into state MLXCX_SHARD_DRAINING and wake every thread
 * waiting on its mlbs_free_nonempty condition variable, so that waiters
 * such as mlxcx_rq_refill_task() notice the change.
 */
void
mlxcx_shard_draining(mlxcx_buf_shard_t *s)
{
	mutex_enter(&s->mlbs_mtx);
	s->mlbs_state = MLXCX_SHARD_DRAINING;
	cv_broadcast(&s->mlbs_free_nonempty);
	mutex_exit(&s->mlbs_mtx);
}