1 /*- 2 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 * $FreeBSD$ 26 */ 27 28 #include "opt_rss.h" 29 #include "opt_ratelimit.h" 30 31 #include <dev/mlx5/mlx5_en/en.h> 32 33 #ifdef RATELIMIT 34 35 static int mlx5e_rl_open_workers(struct mlx5e_priv *); 36 static void mlx5e_rl_close_workers(struct mlx5e_priv *); 37 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS); 38 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x, 39 struct sysctl_oid *, const char *name, const char *desc); 40 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, 41 struct sysctl_oid *node, const char *name, const char *desc); 42 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value); 43 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value); 44 static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify; 45 static if_snd_tag_query_t mlx5e_rl_snd_tag_query; 46 static if_snd_tag_free_t mlx5e_rl_snd_tag_free; 47 48 static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = { 49 .snd_tag_modify = mlx5e_rl_snd_tag_modify, 50 .snd_tag_query = mlx5e_rl_snd_tag_query, 51 .snd_tag_free = mlx5e_rl_snd_tag_free, 52 .type = IF_SND_TAG_TYPE_RATE_LIMIT 53 }; 54 55 static void 56 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl, 57 struct mlx5e_sq_param *param) 58 { 59 void *sqc = param->sqc; 60 void *wq = MLX5_ADDR_OF(sqc, sqc, wq); 61 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); 62 63 MLX5_SET(wq, wq, log_wq_sz, log_sq_size); 64 MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); 65 MLX5_SET(wq, wq, pd, rl->priv->pdn); 66 67 param->wq.linear = 1; 68 } 69 70 static void 71 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl, 72 struct mlx5e_cq_param *param) 73 { 74 void *cqc = param->cqc; 75 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); 76 77 MLX5_SET(cqc, cqc, log_cq_size, log_sq_size); 78 MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs); 79 MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts); 80 MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index); 81 82 switch (rl->param.tx_coalesce_mode) { 83 case 0: 84 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); 85 break; 86 default: 87 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe)) 88 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); 89 else 90 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); 91 break; 92 } 93 } 94 95 static void 96 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl, 97 struct mlx5e_rl_channel_param *cparam) 98 { 99 memset(cparam, 0, sizeof(*cparam)); 100 101 mlx5e_rl_build_sq_param(rl, &cparam->sq); 102 mlx5e_rl_build_cq_param(rl, &cparam->cq); 103 } 104 105 static int 106 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, 107 struct mlx5e_sq_param *param, int ix) 108 { 109 struct mlx5_core_dev *mdev = priv->mdev; 110 void *sqc = param->sqc; 111 void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); 112 int err; 113 114 /* Create DMA descriptor TAG */ 115 if ((err = -bus_dma_tag_create( 116 bus_get_dma_tag(mdev->pdev->dev.bsddev), 117 1, /* any alignment */ 118 0, /* no boundary */ 119 BUS_SPACE_MAXADDR, /* lowaddr */ 120 BUS_SPACE_MAXADDR, /* highaddr */ 121 NULL, NULL, /* filter, filterarg */ 122 MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ 123 MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ 124 MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 125 0, /* flags */ 126 NULL, NULL, /* lockfunc, lockfuncarg */ 127 &sq->dma_tag))) 128 goto done; 129 130 sq->mkey_be = cpu_to_be32(priv->mr.key); 131 sq->ifp = priv->ifp; 132 sq->priv = priv; 133 134 err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, 135 &sq->wq_ctrl); 136 if (err) 137 goto err_free_dma_tag; 138 139 sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; 140 141 err = mlx5e_alloc_sq_db(sq); 142 if (err) 143 goto err_sq_wq_destroy; 144 145 mlx5e_update_sq_inline(sq); 146 147 return (0); 148 149 err_sq_wq_destroy: 150 mlx5_wq_destroy(&sq->wq_ctrl); 151 err_free_dma_tag: 152 bus_dma_tag_destroy(sq->dma_tag); 153 done: 154 return (err); 155 } 156 157 static void 158 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq) 159 { 160 161 mlx5e_free_sq_db(sq); 162 mlx5_wq_destroy(&sq->wq_ctrl); 163 bus_dma_tag_destroy(sq->dma_tag); 164 } 165 166 static int 167 mlx5e_rl_query_sq(struct mlx5e_sq *sq) 168 { 169 void *out; 170 int inlen; 171 int err; 172 173 inlen = MLX5_ST_SZ_BYTES(query_sq_out); 174 out = mlx5_vzalloc(inlen); 175 if (!out) 176 return -ENOMEM; 177 178 err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out); 179 if (err) 180 goto out; 181 182 sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle); 183 184 out: 185 kvfree(out); 186 return err; 187 } 188 189 static int 190 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, 191 struct mlx5e_sq_param *param, int ix) 192 { 193 int err; 194 195 err = mlx5e_rl_create_sq(priv, sq, param, ix); 196 if (err) 197 return (err); 198 199 err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn); 200 if (err) 201 goto err_destroy_sq; 202 203 err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); 204 if (err) 205 goto err_disable_sq; 206 207 if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) { 208 err = mlx5e_rl_query_sq(sq); 209 if (err) { 210 mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for" 211 "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err); 212 sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE; 213 } 214 } else 215 sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE; 216 217 WRITE_ONCE(sq->running, 1); 218 219 return (0); 220 221 err_disable_sq: 222 mlx5e_disable_sq(sq); 223 err_destroy_sq: 224 mlx5e_rl_destroy_sq(sq); 225 226 return (err); 227 } 228 229 static void 230 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq) 231 { 232 mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF); 233 mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF); 234 235 callout_init_mtx(&sq->cev_callout, &sq->lock, 0); 236 237 sq->cev_factor = priv->rl.param.tx_completion_fact; 238 239 /* ensure the TX completion event factor is not zero */ 240 if (sq->cev_factor == 0) 241 sq->cev_factor = 1; 242 } 243 244 static int 245 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix, 246 struct mlx5e_rl_channel_param *cparam, 247 struct mlx5e_sq *volatile *ppsq) 248 { 249 struct mlx5e_priv *priv = rlw->priv; 250 struct mlx5e_sq *sq; 251 int err; 252 253 sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO); 254 255 /* init mutexes */ 256 mlx5e_rl_chan_mtx_init(priv, sq); 257 258 /* open TX completion queue */ 259 err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq, 260 &mlx5e_tx_cq_comp, eq_ix); 261 if (err) 262 goto err_free; 263 264 err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix); 265 if (err) 266 goto err_close_tx_cq; 267 268 /* store TX channel pointer */ 269 *ppsq = sq; 270 271 /* poll TX queue initially */ 272 sq->cq.mcq.comp(&sq->cq.mcq, NULL); 273 274 return (0); 275 276 err_close_tx_cq: 277 mlx5e_close_cq(&sq->cq); 278 279 err_free: 280 /* destroy mutexes */ 281 mtx_destroy(&sq->lock); 282 mtx_destroy(&sq->comp_lock); 283 free(sq, M_MLX5EN); 284 atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL); 285 return (err); 286 } 287 288 static void 289 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq) 290 { 291 struct mlx5e_sq *sq = *ppsq; 292 293 /* check if channel is already closed */ 294 if (sq == NULL) 295 return; 296 /* ensure channel pointer is no longer used */ 297 *ppsq = NULL; 298 299 /* teardown and destroy SQ */ 300 mlx5e_drain_sq(sq); 301 mlx5e_disable_sq(sq); 302 mlx5e_rl_destroy_sq(sq); 303 304 /* close CQ */ 305 mlx5e_close_cq(&sq->cq); 306 307 /* destroy mutexes */ 308 mtx_destroy(&sq->lock); 309 mtx_destroy(&sq->comp_lock); 310 311 free(sq, M_MLX5EN); 312 } 313 314 static void 315 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl) 316 { 317 /* 318 * Limit the maximum distance between completion events to 319 * half of the currently set TX queue size. 320 * 321 * The maximum number of queue entries a single IP packet can 322 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. 323 * 324 * The worst case max value is then given as below: 325 */ 326 uint64_t max = rl->param.tx_queue_size / 327 (2 * MLX5_SEND_WQE_MAX_WQEBBS); 328 329 /* 330 * Update the maximum completion factor value in case the 331 * tx_queue_size field changed. Ensure we don't overflow 332 * 16-bits. 333 */ 334 if (max < 1) 335 max = 1; 336 else if (max > 65535) 337 max = 65535; 338 rl->param.tx_completion_fact_max = max; 339 340 /* 341 * Verify that the current TX completion factor is within the 342 * given limits: 343 */ 344 if (rl->param.tx_completion_fact < 1) 345 rl->param.tx_completion_fact = 1; 346 else if (rl->param.tx_completion_fact > max) 347 rl->param.tx_completion_fact = max; 348 } 349 350 static int 351 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index) 352 { 353 struct mlx5e_priv *priv = sq->priv; 354 struct mlx5_core_dev *mdev = priv->mdev; 355 356 void *in; 357 void *sqc; 358 int inlen; 359 int err; 360 361 inlen = MLX5_ST_SZ_BYTES(modify_sq_in); 362 in = mlx5_vzalloc(inlen); 363 if (in == NULL) 364 return (-ENOMEM); 365 366 sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); 367 368 MLX5_SET(modify_sq_in, in, sqn, sq->sqn); 369 MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY); 370 MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); 371 MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); 372 MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); 373 374 err = mlx5_core_modify_sq(mdev, in, inlen); 375 376 kvfree(in); 377 378 return (err); 379 } 380 381 /* 382 * This function will search the configured rate limit table for the 383 * best match to avoid that a single socket based application can 384 * allocate all the available hardware rates. If the user selected 385 * rate deviates too much from the closes rate available in the rate 386 * limit table, unlimited rate will be selected. 387 */ 388 static uint64_t 389 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) 390 { 391 uint64_t distance = -1ULL; 392 uint64_t diff; 393 uint64_t retval = 0; /* unlimited */ 394 uint64_t x; 395 396 /* search for closest rate */ 397 for (x = 0; x != rl->param.tx_rates_def; x++) { 398 uint64_t rate = rl->rate_limit_table[x]; 399 if (rate == 0) 400 continue; 401 402 if (rate > user_rate) 403 diff = rate - user_rate; 404 else 405 diff = user_rate - rate; 406 407 /* check if distance is smaller than previous rate */ 408 if (diff < distance) { 409 distance = diff; 410 retval = rate; 411 } 412 } 413 414 /* range check for multiplication below */ 415 if (user_rate > rl->param.tx_limit_max) 416 user_rate = rl->param.tx_limit_max; 417 418 /* fallback to unlimited, if rate deviates too much */ 419 if (distance > howmany(user_rate * 420 rl->param.tx_allowed_deviation, 1000ULL)) 421 retval = 0; 422 423 return (retval); 424 } 425 426 static int 427 mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle, 428 struct mlx5e_rl_channel *sq_channel) 429 { 430 const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), 431 MLX5_SEND_WQE_DS); 432 struct mlx5e_tx_qos_remap_wqe *wqe; 433 int pi; 434 435 mtx_lock(&iq->lock); 436 pi = mlx5e_iq_get_producer_index(iq); 437 if (pi < 0) { 438 mtx_unlock(&iq->lock); 439 return (-ENOMEM); 440 } 441 wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi); 442 443 memset(wqe, 0, sizeof(*wqe)); 444 445 wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle); 446 wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle); 447 448 wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | 449 MLX5_OPCODE_QOS_REMAP); 450 wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); 451 wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8); 452 wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; 453 454 /* copy data for doorbell */ 455 memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); 456 457 iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); 458 iq->data[pi].p_refcount = &sq_channel->refcount; 459 atomic_add_int(iq->data[pi].p_refcount, 1); 460 iq->pc += iq->data[pi].num_wqebbs; 461 462 mlx5e_iq_notify_hw(iq); 463 464 mtx_unlock(&iq->lock); 465 466 return (0); /* success */ 467 } 468 469 static int 470 mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index, 471 struct mlx5e_rl_channel *sq_channel) 472 { 473 struct mlx5e_channel *iq_channel; 474 u32 scq_handle; 475 u32 sq_handle; 476 int error; 477 478 /* Specific SQ remap operations should be handled by same IQ */ 479 iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels]; 480 481 sq_handle = sq->queue_handle; 482 scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index); 483 484 if (sq_handle == MLX5_INVALID_QUEUE_HANDLE || 485 scq_handle == MLX5_INVALID_QUEUE_HANDLE) 486 error = -1; 487 else 488 error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle, 489 sq_handle, sq_channel); 490 491 return (error); 492 } 493 494 /* 495 * This function sets the requested rate for a rate limit channel, in 496 * bits per second. The requested rate will be filtered through the 497 * find best rate function above. 498 */ 499 static int 500 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, 501 struct mlx5e_rl_channel *channel, uint64_t rate) 502 { 503 struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; 504 struct mlx5e_sq *sq; 505 uint64_t temp; 506 uint16_t index; 507 uint16_t burst; 508 int error; 509 bool use_sq_remap; 510 511 if (rate != 0) { 512 MLX5E_RL_WORKER_UNLOCK(rlw); 513 514 MLX5E_RL_RLOCK(rl); 515 516 /* get current burst size in bytes */ 517 temp = rl->param.tx_burst_size * 518 MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu); 519 520 /* limit burst size to 64K currently */ 521 if (temp > 65535) 522 temp = 65535; 523 burst = temp; 524 525 /* find best rate */ 526 rate = mlx5e_rl_find_best_rate_locked(rl, rate); 527 528 MLX5E_RL_RUNLOCK(rl); 529 530 if (rate == 0) { 531 /* rate doesn't exist, fallback to unlimited */ 532 index = 0; 533 rate = 0; 534 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); 535 } else { 536 /* get a reference on the new rate */ 537 error = -mlx5_rl_add_rate(rlw->priv->mdev, 538 howmany(rate, 1000), burst, &index); 539 540 if (error != 0) { 541 /* adding rate failed, fallback to unlimited */ 542 index = 0; 543 rate = 0; 544 atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); 545 } 546 } 547 MLX5E_RL_WORKER_LOCK(rlw); 548 } else { 549 index = 0; 550 burst = 0; /* default */ 551 } 552 553 /* paced <--> non-paced transitions must go via FW */ 554 use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) && 555 channel->last_rate != 0 && rate != 0; 556 557 /* atomically swap rates */ 558 temp = channel->last_rate; 559 channel->last_rate = rate; 560 rate = temp; 561 562 /* atomically swap burst size */ 563 temp = channel->last_burst; 564 channel->last_burst = burst; 565 burst = temp; 566 567 MLX5E_RL_WORKER_UNLOCK(rlw); 568 /* put reference on the old rate, if any */ 569 if (rate != 0) { 570 mlx5_rl_remove_rate(rlw->priv->mdev, 571 howmany(rate, 1000), burst); 572 } 573 574 /* set new rate, if SQ is running */ 575 sq = channel->sq; 576 if (sq != NULL && READ_ONCE(sq->running) != 0) { 577 if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) { 578 while (atomic_load_int(&channel->refcount) != 0 && 579 rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR && 580 pci_channel_offline(rlw->priv->mdev->pdev) == 0) 581 pause("W", 1); 582 error = mlx5e_rl_modify_sq(sq, index); 583 if (error != 0) 584 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); 585 } 586 } else 587 error = 0; 588 589 MLX5E_RL_WORKER_LOCK(rlw); 590 591 return (-error); 592 } 593 594 static void 595 mlx5e_rl_worker(void *arg) 596 { 597 struct thread *td; 598 struct mlx5e_rl_worker *rlw = arg; 599 struct mlx5e_rl_channel *channel; 600 struct mlx5e_priv *priv; 601 unsigned ix; 602 uint64_t x; 603 int error; 604 605 /* set thread priority */ 606 td = curthread; 607 608 thread_lock(td); 609 sched_prio(td, PI_SWI(SWI_NET)); 610 thread_unlock(td); 611 612 priv = rlw->priv; 613 614 /* compute completion vector */ 615 ix = (rlw - priv->rl.workers) % 616 priv->mdev->priv.eq_table.num_comp_vectors; 617 618 /* TODO bind to CPU */ 619 620 /* open all the SQs */ 621 MLX5E_RL_WORKER_LOCK(rlw); 622 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { 623 struct mlx5e_rl_channel *channel = rlw->channels + x; 624 625 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS) 626 if (channel->state == MLX5E_RL_ST_FREE) 627 continue; 628 #endif 629 MLX5E_RL_WORKER_UNLOCK(rlw); 630 631 MLX5E_RL_RLOCK(&priv->rl); 632 error = mlx5e_rl_open_channel(rlw, ix, 633 &priv->rl.chan_param, &channel->sq); 634 MLX5E_RL_RUNLOCK(&priv->rl); 635 636 MLX5E_RL_WORKER_LOCK(rlw); 637 if (error != 0) { 638 mlx5_en_err(priv->ifp, 639 "mlx5e_rl_open_channel failed: %d\n", error); 640 break; 641 } 642 mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate); 643 } 644 while (1) { 645 if (STAILQ_FIRST(&rlw->process_head) == NULL) { 646 /* check if we are tearing down */ 647 if (rlw->worker_done != 0) 648 break; 649 cv_wait(&rlw->cv, &rlw->mtx); 650 } 651 /* check if we are tearing down */ 652 if (rlw->worker_done != 0) 653 break; 654 channel = STAILQ_FIRST(&rlw->process_head); 655 if (channel != NULL) { 656 STAILQ_REMOVE_HEAD(&rlw->process_head, entry); 657 658 switch (channel->state) { 659 case MLX5E_RL_ST_MODIFY: 660 channel->state = MLX5E_RL_ST_USED; 661 MLX5E_RL_WORKER_UNLOCK(rlw); 662 663 /* create channel by demand */ 664 if (channel->sq == NULL) { 665 MLX5E_RL_RLOCK(&priv->rl); 666 error = mlx5e_rl_open_channel(rlw, ix, 667 &priv->rl.chan_param, &channel->sq); 668 MLX5E_RL_RUNLOCK(&priv->rl); 669 670 if (error != 0) { 671 mlx5_en_err(priv->ifp, 672 "mlx5e_rl_open_channel failed: %d\n", error); 673 } else { 674 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL); 675 } 676 } else { 677 mlx5e_resume_sq(channel->sq); 678 } 679 680 MLX5E_RL_WORKER_LOCK(rlw); 681 /* convert from bytes/s to bits/s and set new rate */ 682 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 683 channel->new_rate * 8ULL); 684 if (error != 0) { 685 mlx5_en_err(priv->ifp, 686 "mlx5e_rlw_channel_set_rate_locked failed: %d\n", 687 error); 688 } 689 break; 690 691 case MLX5E_RL_ST_DESTROY: 692 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); 693 if (error != 0) { 694 mlx5_en_err(priv->ifp, 695 "mlx5e_rlw_channel_set_rate_locked failed: %d\n", 696 error); 697 } 698 if (channel->sq != NULL) { 699 /* 700 * Make sure all packets are 701 * transmitted before SQ is 702 * returned to free list: 703 */ 704 MLX5E_RL_WORKER_UNLOCK(rlw); 705 mlx5e_drain_sq(channel->sq); 706 MLX5E_RL_WORKER_LOCK(rlw); 707 } 708 /* put the channel back into the free list */ 709 STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry); 710 channel->state = MLX5E_RL_ST_FREE; 711 atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL); 712 break; 713 default: 714 /* NOP */ 715 break; 716 } 717 } 718 } 719 720 /* close all the SQs */ 721 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { 722 struct mlx5e_rl_channel *channel = rlw->channels + x; 723 724 /* update the initial rate */ 725 channel->init_rate = channel->last_rate; 726 727 /* make sure we free up the rate resource */ 728 mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); 729 730 if (channel->sq != NULL) { 731 MLX5E_RL_WORKER_UNLOCK(rlw); 732 mlx5e_rl_close_channel(&channel->sq); 733 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL); 734 MLX5E_RL_WORKER_LOCK(rlw); 735 } 736 } 737 738 rlw->worker_done = 0; 739 cv_broadcast(&rlw->cv); 740 MLX5E_RL_WORKER_UNLOCK(rlw); 741 742 kthread_exit(); 743 } 744 745 static int 746 mlx5e_rl_open_tis(struct mlx5e_priv *priv) 747 { 748 struct mlx5_core_dev *mdev = priv->mdev; 749 u32 in[MLX5_ST_SZ_DW(create_tis_in)]; 750 void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); 751 752 memset(in, 0, sizeof(in)); 753 754 MLX5_SET(tisc, tisc, prio, 0); 755 MLX5_SET(tisc, tisc, transport_domain, priv->tdn); 756 757 return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn)); 758 } 759 760 static void 761 mlx5e_rl_close_tis(struct mlx5e_priv *priv) 762 { 763 mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0); 764 } 765 766 static void 767 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param, 768 struct mlx5_core_dev *mdev) 769 { 770 /* ratelimit workers */ 771 param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors; 772 param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS; 773 774 /* range check */ 775 if (param->tx_worker_threads_def == 0 || 776 param->tx_worker_threads_def > param->tx_worker_threads_max) 777 param->tx_worker_threads_def = param->tx_worker_threads_max; 778 779 /* ratelimit channels */ 780 param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS / 781 param->tx_worker_threads_def; 782 param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS; 783 784 /* range check */ 785 if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER) 786 param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER; 787 788 /* set default burst size */ 789 param->tx_burst_size = 4; /* MTUs */ 790 791 /* 792 * Set maximum burst size 793 * 794 * The burst size is multiplied by the MTU and clamped to the 795 * range 0 ... 65535 bytes inclusivly before fed into the 796 * firmware. 797 * 798 * NOTE: If the burst size or MTU is changed only ratelimit 799 * connections made after the change will use the new burst 800 * size. 801 */ 802 param->tx_burst_size_max = 255; 803 804 /* get firmware rate limits in 1000bit/s and convert them to bit/s */ 805 param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL; 806 param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL; 807 808 /* ratelimit table size */ 809 param->tx_rates_max = mdev->priv.rl_table.max_size; 810 811 /* range check */ 812 if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES) 813 param->tx_rates_max = MLX5E_RL_MAX_TX_RATES; 814 815 /* set default number of rates */ 816 param->tx_rates_def = param->tx_rates_max; 817 818 /* set maximum allowed rate deviation */ 819 if (param->tx_limit_max != 0) { 820 /* 821 * Make sure the deviation multiplication doesn't 822 * overflow unsigned 64-bit: 823 */ 824 param->tx_allowed_deviation_max = -1ULL / 825 param->tx_limit_max; 826 } 827 /* set default rate deviation */ 828 param->tx_allowed_deviation = 50; /* 5.0% */ 829 830 /* channel parameters */ 831 param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); 832 param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT; 833 param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT; 834 param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT; 835 param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT; 836 } 837 838 static const char *mlx5e_rl_params_desc[] = { 839 MLX5E_RL_PARAMS(MLX5E_STATS_DESC) 840 }; 841 842 static const char *mlx5e_rl_table_params_desc[] = { 843 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC) 844 }; 845 846 static const char *mlx5e_rl_stats_desc[] = { 847 MLX5E_RL_STATS(MLX5E_STATS_DESC) 848 }; 849 850 int 851 mlx5e_rl_init(struct mlx5e_priv *priv) 852 { 853 struct mlx5e_rl_priv_data *rl = &priv->rl; 854 struct sysctl_oid *node; 855 struct sysctl_oid *stats; 856 char buf[64]; 857 uint64_t i; 858 uint64_t j; 859 int error; 860 861 /* check if there is support for packet pacing */ 862 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) 863 return (0); 864 865 rl->priv = priv; 866 867 sysctl_ctx_init(&rl->ctx); 868 869 sx_init(&rl->rl_sxlock, "ratelimit-sxlock"); 870 871 /* open own TIS domain for ratelimit SQs */ 872 error = mlx5e_rl_open_tis(priv); 873 if (error) 874 goto done; 875 876 /* setup default value for parameters */ 877 mlx5e_rl_set_default_params(&rl->param, priv->mdev); 878 879 /* update the completion factor */ 880 mlx5e_rl_sync_tx_completion_fact(rl); 881 882 /* create root node */ 883 node = SYSCTL_ADD_NODE(&rl->ctx, 884 SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, 885 "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support"); 886 887 if (node != NULL) { 888 /* create SYSCTLs */ 889 for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) { 890 mlx5e_rl_sysctl_add_u64_oid(rl, 891 MLX5E_RL_PARAMS_INDEX(arg[i]), 892 node, mlx5e_rl_params_desc[2 * i], 893 mlx5e_rl_params_desc[2 * i + 1]); 894 } 895 896 stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node), 897 OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 898 "Rate limiting statistics"); 899 if (stats != NULL) { 900 /* create SYSCTLs */ 901 for (i = 0; i != MLX5E_RL_STATS_NUM; i++) { 902 mlx5e_rl_sysctl_add_stats_u64_oid(rl, i, 903 stats, mlx5e_rl_stats_desc[2 * i], 904 mlx5e_rl_stats_desc[2 * i + 1]); 905 } 906 } 907 } 908 909 /* allocate workers array */ 910 rl->workers = malloc(sizeof(rl->workers[0]) * 911 rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO); 912 913 /* allocate rate limit array */ 914 rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) * 915 rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO); 916 917 if (node != NULL) { 918 /* create more SYSCTls */ 919 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 920 "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD | 921 CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table, 922 "A", "Show table of all configured TX rates"); 923 924 /* try to fetch rate table from kernel environment */ 925 for (i = 0; i != rl->param.tx_rates_def; i++) { 926 /* compute path for tunable */ 927 snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d", 928 device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i); 929 if (TUNABLE_QUAD_FETCH(buf, &j)) 930 mlx5e_rl_tx_limit_add(rl, j); 931 } 932 933 /* setup rate table sysctls */ 934 for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) { 935 mlx5e_rl_sysctl_add_u64_oid(rl, 936 MLX5E_RL_PARAMS_INDEX(table_arg[i]), 937 node, mlx5e_rl_table_params_desc[2 * i], 938 mlx5e_rl_table_params_desc[2 * i + 1]); 939 } 940 } 941 942 for (j = 0; j < rl->param.tx_worker_threads_def; j++) { 943 struct mlx5e_rl_worker *rlw = rl->workers + j; 944 945 rlw->priv = priv; 946 947 cv_init(&rlw->cv, "mlx5-worker-cv"); 948 mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF); 949 STAILQ_INIT(&rlw->index_list_head); 950 STAILQ_INIT(&rlw->process_head); 951 952 rlw->channels = malloc(sizeof(rlw->channels[0]) * 953 rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO); 954 955 MLX5E_RL_WORKER_LOCK(rlw); 956 for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) { 957 struct mlx5e_rl_channel *channel = rlw->channels + i; 958 channel->worker = rlw; 959 STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry); 960 } 961 MLX5E_RL_WORKER_UNLOCK(rlw); 962 } 963 964 PRIV_LOCK(priv); 965 error = mlx5e_rl_open_workers(priv); 966 PRIV_UNLOCK(priv); 967 968 if (error != 0) { 969 mlx5_en_err(priv->ifp, 970 "mlx5e_rl_open_workers failed: %d\n", error); 971 } 972 973 return (0); 974 975 done: 976 sysctl_ctx_free(&rl->ctx); 977 sx_destroy(&rl->rl_sxlock); 978 return (error); 979 } 980 981 static int 982 mlx5e_rl_open_workers(struct mlx5e_priv *priv) 983 { 984 struct mlx5e_rl_priv_data *rl = &priv->rl; 985 struct thread *rl_thread = NULL; 986 struct proc *rl_proc = NULL; 987 uint64_t j; 988 int error; 989 990 if (priv->gone || rl->opened) 991 return (-EINVAL); 992 993 MLX5E_RL_WLOCK(rl); 994 /* compute channel parameters once */ 995 mlx5e_rl_build_channel_param(rl, &rl->chan_param); 996 MLX5E_RL_WUNLOCK(rl); 997 998 for (j = 0; j < rl->param.tx_worker_threads_def; j++) { 999 struct mlx5e_rl_worker *rlw = rl->workers + j; 1000 1001 /* start worker thread */ 1002 error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread, 1003 RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j); 1004 if (error != 0) { 1005 mlx5_en_err(rl->priv->ifp, 1006 "kproc_kthread_add failed: %d\n", error); 1007 rlw->worker_done = 1; 1008 } 1009 } 1010 1011 rl->opened = 1; 1012 1013 return (0); 1014 } 1015 1016 static void 1017 mlx5e_rl_close_workers(struct mlx5e_priv *priv) 1018 { 1019 struct mlx5e_rl_priv_data *rl = &priv->rl; 1020 uint64_t y; 1021 1022 if (rl->opened == 0) 1023 return; 1024 1025 /* tear down worker threads simultaneously */ 1026 for (y = 0; y < rl->param.tx_worker_threads_def; y++) { 1027 struct mlx5e_rl_worker *rlw = rl->workers + y; 1028 1029 /* tear down worker before freeing SQs */ 1030 MLX5E_RL_WORKER_LOCK(rlw); 1031 if (rlw->worker_done == 0) { 1032 rlw->worker_done = 1; 1033 cv_broadcast(&rlw->cv); 1034 } else { 1035 /* XXX thread not started */ 1036 rlw->worker_done = 0; 1037 } 1038 MLX5E_RL_WORKER_UNLOCK(rlw); 1039 } 1040 1041 /* wait for worker threads to exit */ 1042 for (y = 0; y < rl->param.tx_worker_threads_def; y++) { 1043 struct mlx5e_rl_worker *rlw = rl->workers + y; 1044 1045 /* tear down worker before freeing SQs */ 1046 MLX5E_RL_WORKER_LOCK(rlw); 1047 while (rlw->worker_done != 0) 1048 cv_wait(&rlw->cv, &rlw->mtx); 1049 MLX5E_RL_WORKER_UNLOCK(rlw); 1050 } 1051 1052 rl->opened = 0; 1053 } 1054 1055 static void 1056 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl) 1057 { 1058 unsigned x; 1059 1060 MLX5E_RL_WLOCK(rl); 1061 for (x = 0; x != rl->param.tx_rates_def; x++) 1062 rl->rate_limit_table[x] = 0; 1063 MLX5E_RL_WUNLOCK(rl); 1064 } 1065 1066 void 1067 mlx5e_rl_cleanup(struct mlx5e_priv *priv) 1068 { 1069 struct mlx5e_rl_priv_data *rl = &priv->rl; 1070 uint64_t y; 1071 1072 /* check if there is support for packet pacing */ 1073 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) 1074 return; 1075 1076 /* TODO check if there is support for packet pacing */ 1077 1078 sysctl_ctx_free(&rl->ctx); 1079 1080 PRIV_LOCK(priv); 1081 mlx5e_rl_close_workers(priv); 1082 PRIV_UNLOCK(priv); 1083 1084 mlx5e_rl_reset_rates(rl); 1085 1086 /* close TIS domain */ 1087 mlx5e_rl_close_tis(priv); 1088 1089 for (y = 0; y < rl->param.tx_worker_threads_def; y++) { 1090 struct mlx5e_rl_worker *rlw = rl->workers + y; 1091 1092 cv_destroy(&rlw->cv); 1093 mtx_destroy(&rlw->mtx); 1094 free(rlw->channels, M_MLX5EN); 1095 } 1096 free(rl->rate_limit_table, M_MLX5EN); 1097 free(rl->workers, M_MLX5EN); 1098 sx_destroy(&rl->rl_sxlock); 1099 } 1100 1101 static void 1102 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw, 1103 struct mlx5e_rl_channel *channel) 1104 { 1105 STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry); 1106 cv_broadcast(&rlw->cv); 1107 } 1108 1109 static void 1110 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) 1111 { 1112 if (channel == NULL) 1113 return; 1114 1115 MLX5E_RL_WORKER_LOCK(rlw); 1116 switch (channel->state) { 1117 case MLX5E_RL_ST_MODIFY: 1118 channel->state = MLX5E_RL_ST_DESTROY; 1119 break; 1120 case MLX5E_RL_ST_USED: 1121 channel->state = MLX5E_RL_ST_DESTROY; 1122 mlx5e_rlw_queue_channel_locked(rlw, channel); 1123 break; 1124 default: 1125 break; 1126 } 1127 MLX5E_RL_WORKER_UNLOCK(rlw); 1128 } 1129 1130 static int 1131 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) 1132 { 1133 1134 MLX5E_RL_WORKER_LOCK(rlw); 1135 channel->new_rate = rate; 1136 switch (channel->state) { 1137 case MLX5E_RL_ST_USED: 1138 channel->state = MLX5E_RL_ST_MODIFY; 1139 mlx5e_rlw_queue_channel_locked(rlw, channel); 1140 break; 1141 default: 1142 break; 1143 } 1144 MLX5E_RL_WORKER_UNLOCK(rlw); 1145 1146 return (0); 1147 } 1148 1149 static int 1150 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, 1151 union if_snd_tag_query_params *params) 1152 { 1153 int retval; 1154 1155 MLX5E_RL_WORKER_LOCK(rlw); 1156 switch (channel->state) { 1157 case MLX5E_RL_ST_USED: 1158 params->rate_limit.max_rate = channel->last_rate; 1159 params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); 1160 retval = 0; 1161 break; 1162 case MLX5E_RL_ST_MODIFY: 1163 params->rate_limit.max_rate = channel->last_rate; 1164 params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); 1165 retval = EBUSY; 1166 break; 1167 default: 1168 retval = EINVAL; 1169 break; 1170 } 1171 MLX5E_RL_WORKER_UNLOCK(rlw); 1172 1173 return (retval); 1174 } 1175 1176 static int 1177 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw, 1178 struct mlx5e_rl_channel **pchannel) 1179 { 1180 struct mlx5e_rl_channel *channel; 1181 int retval = ENOMEM; 1182 1183 MLX5E_RL_WORKER_LOCK(rlw); 1184 /* Check for available channel in free list */ 1185 if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) { 1186 retval = 0; 1187 /* Remove head index from available list */ 1188 STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry); 1189 channel->state = MLX5E_RL_ST_USED; 1190 atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL); 1191 } else { 1192 atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL); 1193 } 1194 MLX5E_RL_WORKER_UNLOCK(rlw); 1195 1196 *pchannel = channel; 1197 #ifdef RATELIMIT_DEBUG 1198 mlx5_en_info(rlw->priv->ifp, 1199 "Channel pointer for rate limit connection is %p\n", channel); 1200 #endif 1201 return (retval); 1202 } 1203 1204 int 1205 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp, 1206 union if_snd_tag_alloc_params *params, 1207 struct m_snd_tag **ppmt) 1208 { 1209 struct mlx5e_rl_channel *channel; 1210 struct mlx5e_rl_worker *rlw; 1211 struct mlx5e_priv *priv; 1212 int error; 1213 1214 priv = ifp->if_softc; 1215 1216 /* check if there is support for packet pacing or if device is going away */ 1217 if (!MLX5_CAP_GEN(priv->mdev, qos) || 1218 !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone || 1219 params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT) 1220 return (EOPNOTSUPP); 1221 1222 /* compute worker thread this TCP connection belongs to */ 1223 rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) % 1224 priv->rl.param.tx_worker_threads_def); 1225 1226 error = mlx5e_find_available_tx_ring_index(rlw, &channel); 1227 if (error != 0) 1228 goto done; 1229 1230 error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate); 1231 if (error != 0) { 1232 mlx5e_rl_free(rlw, channel); 1233 goto done; 1234 } 1235 1236 /* store pointer to mbuf tag */ 1237 MPASS(channel->tag.refcount == 0); 1238 m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw); 1239 *ppmt = &channel->tag; 1240 done: 1241 return (error); 1242 } 1243 1244 1245 static int 1246 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) 1247 { 1248 struct mlx5e_rl_channel *channel = 1249 container_of(pmt, struct mlx5e_rl_channel, tag); 1250 1251 return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate)); 1252 } 1253 1254 static int 1255 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) 1256 { 1257 struct mlx5e_rl_channel *channel = 1258 container_of(pmt, struct mlx5e_rl_channel, tag); 1259 1260 return (mlx5e_rl_query(channel->worker, channel, params)); 1261 } 1262 1263 static void 1264 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt) 1265 { 1266 struct mlx5e_rl_channel *channel = 1267 container_of(pmt, struct mlx5e_rl_channel, tag); 1268 1269 mlx5e_rl_free(channel->worker, channel); 1270 } 1271 1272 static int 1273 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS) 1274 { 1275 struct mlx5e_rl_priv_data *rl = arg1; 1276 struct mlx5e_priv *priv = rl->priv; 1277 struct sbuf sbuf; 1278 unsigned x; 1279 int error; 1280 1281 error = sysctl_wire_old_buffer(req, 0); 1282 if (error != 0) 1283 return (error); 1284 1285 PRIV_LOCK(priv); 1286 1287 sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req); 1288 1289 sbuf_printf(&sbuf, 1290 "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n" 1291 "\t" "--------------------------------------------\n"); 1292 1293 MLX5E_RL_RLOCK(rl); 1294 for (x = 0; x != rl->param.tx_rates_def; x++) { 1295 if (rl->rate_limit_table[x] == 0) 1296 continue; 1297 1298 sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n", 1299 x, (unsigned)rl->param.tx_burst_size, 1300 (long long)rl->rate_limit_table[x]); 1301 } 1302 MLX5E_RL_RUNLOCK(rl); 1303 1304 error = sbuf_finish(&sbuf); 1305 sbuf_delete(&sbuf); 1306 1307 PRIV_UNLOCK(priv); 1308 1309 return (error); 1310 } 1311 1312 static int 1313 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl) 1314 { 1315 uint64_t x; 1316 uint64_t y; 1317 1318 MLX5E_RL_WLOCK(rl); 1319 /* compute channel parameters once */ 1320 mlx5e_rl_build_channel_param(rl, &rl->chan_param); 1321 MLX5E_RL_WUNLOCK(rl); 1322 1323 for (y = 0; y != rl->param.tx_worker_threads_def; y++) { 1324 struct mlx5e_rl_worker *rlw = rl->workers + y; 1325 1326 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { 1327 struct mlx5e_rl_channel *channel; 1328 struct mlx5e_sq *sq; 1329 1330 channel = rlw->channels + x; 1331 sq = channel->sq; 1332 1333 if (sq == NULL) 1334 continue; 1335 1336 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) { 1337 mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq, 1338 rl->param.tx_coalesce_usecs, 1339 rl->param.tx_coalesce_pkts, 1340 rl->param.tx_coalesce_mode); 1341 } else { 1342 mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq, 1343 rl->param.tx_coalesce_usecs, 1344 rl->param.tx_coalesce_pkts); 1345 } 1346 } 1347 } 1348 return (0); 1349 } 1350 1351 void 1352 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl) 1353 { 1354 uint64_t x; 1355 uint64_t y; 1356 1357 for (y = 0; y != rl->param.tx_worker_threads_def; y++) { 1358 struct mlx5e_rl_worker *rlw = rl->workers + y; 1359 1360 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { 1361 struct mlx5e_rl_channel *channel; 1362 struct mlx5e_sq *sq; 1363 1364 channel = rlw->channels + x; 1365 sq = channel->sq; 1366 1367 if (sq == NULL) 1368 continue; 1369 1370 mtx_lock(&sq->lock); 1371 mlx5e_update_sq_inline(sq); 1372 mtx_unlock(&sq->lock); 1373 } 1374 } 1375 } 1376 1377 static int 1378 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value) 1379 { 1380 unsigned x; 1381 int error; 1382 1383 if (value < 1000 || 1384 mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0) 1385 return (EINVAL); 1386 1387 MLX5E_RL_WLOCK(rl); 1388 error = ENOMEM; 1389 1390 /* check if rate already exists */ 1391 for (x = 0; x != rl->param.tx_rates_def; x++) { 1392 if (rl->rate_limit_table[x] != value) 1393 continue; 1394 error = EEXIST; 1395 break; 1396 } 1397 1398 /* check if there is a free rate entry */ 1399 if (x == rl->param.tx_rates_def) { 1400 for (x = 0; x != rl->param.tx_rates_def; x++) { 1401 if (rl->rate_limit_table[x] != 0) 1402 continue; 1403 rl->rate_limit_table[x] = value; 1404 error = 0; 1405 break; 1406 } 1407 } 1408 MLX5E_RL_WUNLOCK(rl); 1409 1410 return (error); 1411 } 1412 1413 static int 1414 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value) 1415 { 1416 unsigned x; 1417 int error; 1418 1419 if (value == 0) 1420 return (EINVAL); 1421 1422 MLX5E_RL_WLOCK(rl); 1423 1424 /* check if rate already exists */ 1425 for (x = 0; x != rl->param.tx_rates_def; x++) { 1426 if (rl->rate_limit_table[x] != value) 1427 continue; 1428 /* free up rate */ 1429 rl->rate_limit_table[x] = 0; 1430 break; 1431 } 1432 1433 /* check if there is a free rate entry */ 1434 if (x == rl->param.tx_rates_def) 1435 error = ENOENT; 1436 else 1437 error = 0; 1438 MLX5E_RL_WUNLOCK(rl); 1439 1440 return (error); 1441 } 1442 1443 static int 1444 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS) 1445 { 1446 struct mlx5e_rl_priv_data *rl = arg1; 1447 struct mlx5e_priv *priv = rl->priv; 1448 unsigned mode_modify; 1449 unsigned was_opened; 1450 uint64_t value; 1451 uint64_t old; 1452 int error; 1453 1454 PRIV_LOCK(priv); 1455 1456 MLX5E_RL_RLOCK(rl); 1457 value = rl->param.arg[arg2]; 1458 MLX5E_RL_RUNLOCK(rl); 1459 1460 if (req != NULL) { 1461 old = value; 1462 error = sysctl_handle_64(oidp, &value, 0, req); 1463 if (error || req->newptr == NULL || 1464 value == rl->param.arg[arg2]) 1465 goto done; 1466 } else { 1467 old = 0; 1468 error = 0; 1469 } 1470 1471 /* check if device is gone */ 1472 if (priv->gone) { 1473 error = ENXIO; 1474 goto done; 1475 } 1476 was_opened = rl->opened; 1477 mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); 1478 1479 switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) { 1480 case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def): 1481 if (value > rl->param.tx_worker_threads_max) 1482 value = rl->param.tx_worker_threads_max; 1483 else if (value < 1) 1484 value = 1; 1485 1486 /* store new value */ 1487 rl->param.arg[arg2] = value; 1488 break; 1489 1490 case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def): 1491 if (value > rl->param.tx_channels_per_worker_max) 1492 value = rl->param.tx_channels_per_worker_max; 1493 else if (value < 1) 1494 value = 1; 1495 1496 /* store new value */ 1497 rl->param.arg[arg2] = value; 1498 break; 1499 1500 case MLX5E_RL_PARAMS_INDEX(tx_rates_def): 1501 if (value > rl->param.tx_rates_max) 1502 value = rl->param.tx_rates_max; 1503 else if (value < 1) 1504 value = 1; 1505 1506 /* store new value */ 1507 rl->param.arg[arg2] = value; 1508 break; 1509 1510 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs): 1511 /* range check */ 1512 if (value < 1) 1513 value = 0; 1514 else if (value > MLX5E_FLD_MAX(cqc, cq_period)) 1515 value = MLX5E_FLD_MAX(cqc, cq_period); 1516 1517 /* store new value */ 1518 rl->param.arg[arg2] = value; 1519 1520 /* check to avoid down and up the network interface */ 1521 if (was_opened) 1522 error = mlx5e_rl_refresh_channel_params(rl); 1523 break; 1524 1525 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts): 1526 /* import TX coal pkts */ 1527 if (value < 1) 1528 value = 0; 1529 else if (value > MLX5E_FLD_MAX(cqc, cq_max_count)) 1530 value = MLX5E_FLD_MAX(cqc, cq_max_count); 1531 1532 /* store new value */ 1533 rl->param.arg[arg2] = value; 1534 1535 /* check to avoid down and up the network interface */ 1536 if (was_opened) 1537 error = mlx5e_rl_refresh_channel_params(rl); 1538 break; 1539 1540 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode): 1541 /* network interface must be down */ 1542 if (was_opened != 0 && mode_modify == 0) 1543 mlx5e_rl_close_workers(priv); 1544 1545 /* import TX coalesce mode */ 1546 if (value != 0) 1547 value = 1; 1548 1549 /* store new value */ 1550 rl->param.arg[arg2] = value; 1551 1552 /* restart network interface, if any */ 1553 if (was_opened != 0) { 1554 if (mode_modify == 0) 1555 mlx5e_rl_open_workers(priv); 1556 else 1557 error = mlx5e_rl_refresh_channel_params(rl); 1558 } 1559 break; 1560 1561 case MLX5E_RL_PARAMS_INDEX(tx_queue_size): 1562 /* network interface must be down */ 1563 if (was_opened) 1564 mlx5e_rl_close_workers(priv); 1565 1566 /* import TX queue size */ 1567 if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) 1568 value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); 1569 else if (value > priv->params_ethtool.tx_queue_size_max) 1570 value = priv->params_ethtool.tx_queue_size_max; 1571 1572 /* store actual TX queue size */ 1573 value = 1ULL << order_base_2(value); 1574 1575 /* store new value */ 1576 rl->param.arg[arg2] = value; 1577 1578 /* verify TX completion factor */ 1579 mlx5e_rl_sync_tx_completion_fact(rl); 1580 1581 /* restart network interface, if any */ 1582 if (was_opened) 1583 mlx5e_rl_open_workers(priv); 1584 break; 1585 1586 case MLX5E_RL_PARAMS_INDEX(tx_completion_fact): 1587 /* network interface must be down */ 1588 if (was_opened) 1589 mlx5e_rl_close_workers(priv); 1590 1591 /* store new value */ 1592 rl->param.arg[arg2] = value; 1593 1594 /* verify parameter */ 1595 mlx5e_rl_sync_tx_completion_fact(rl); 1596 1597 /* restart network interface, if any */ 1598 if (was_opened) 1599 mlx5e_rl_open_workers(priv); 1600 break; 1601 1602 case MLX5E_RL_PARAMS_INDEX(tx_limit_add): 1603 error = mlx5e_rl_tx_limit_add(rl, value); 1604 break; 1605 1606 case MLX5E_RL_PARAMS_INDEX(tx_limit_clr): 1607 error = mlx5e_rl_tx_limit_clr(rl, value); 1608 break; 1609 1610 case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation): 1611 /* range check */ 1612 if (value > rl->param.tx_allowed_deviation_max) 1613 value = rl->param.tx_allowed_deviation_max; 1614 else if (value < rl->param.tx_allowed_deviation_min) 1615 value = rl->param.tx_allowed_deviation_min; 1616 1617 MLX5E_RL_WLOCK(rl); 1618 rl->param.arg[arg2] = value; 1619 MLX5E_RL_WUNLOCK(rl); 1620 break; 1621 1622 case MLX5E_RL_PARAMS_INDEX(tx_burst_size): 1623 /* range check */ 1624 if (value > rl->param.tx_burst_size_max) 1625 value = rl->param.tx_burst_size_max; 1626 else if (value < rl->param.tx_burst_size_min) 1627 value = rl->param.tx_burst_size_min; 1628 1629 MLX5E_RL_WLOCK(rl); 1630 rl->param.arg[arg2] = value; 1631 MLX5E_RL_WUNLOCK(rl); 1632 break; 1633 1634 default: 1635 break; 1636 } 1637 done: 1638 PRIV_UNLOCK(priv); 1639 return (error); 1640 } 1641 1642 static void 1643 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, 1644 struct sysctl_oid *node, const char *name, const char *desc) 1645 { 1646 /* 1647 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will 1648 * take care of loading default sysctl value from the kernel 1649 * environment, if any: 1650 */ 1651 if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) { 1652 /* read-only SYSCTLs */ 1653 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 1654 name, CTLTYPE_U64 | CTLFLAG_RD | 1655 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); 1656 } else { 1657 if (strstr(name, "_def") != 0) { 1658 #ifdef RATELIMIT_DEBUG 1659 /* tunable read-only advanced SYSCTLs */ 1660 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 1661 name, CTLTYPE_U64 | CTLFLAG_RDTUN | 1662 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); 1663 #endif 1664 } else { 1665 /* read-write SYSCTLs */ 1666 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 1667 name, CTLTYPE_U64 | CTLFLAG_RWTUN | 1668 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); 1669 } 1670 } 1671 } 1672 1673 static void 1674 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, 1675 struct sysctl_oid *node, const char *name, const char *desc) 1676 { 1677 /* read-only SYSCTLs */ 1678 SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, 1679 CTLFLAG_RD, &rl->stats.arg[x], 0, desc); 1680 } 1681 1682 #else 1683 1684 int 1685 mlx5e_rl_init(struct mlx5e_priv *priv) 1686 { 1687 1688 return (0); 1689 } 1690 1691 void 1692 mlx5e_rl_cleanup(struct mlx5e_priv *priv) 1693 { 1694 /* NOP */ 1695 } 1696 1697 #endif /* RATELIMIT */ 1698