1 /*- 2 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 #include "opt_rss.h" 27 #include "opt_ratelimit.h" 28 29 #include <dev/mlx5/mlx5_en/en.h> 30 31 #ifdef RATELIMIT 32 33 static int mlx5e_rl_open_workers(struct mlx5e_priv *); 34 static void mlx5e_rl_close_workers(struct mlx5e_priv *); 35 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS); 36 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x, 37 struct sysctl_oid *, const char *name, const char *desc); 38 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, 39 struct sysctl_oid *node, const char *name, const char *desc); 40 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value); 41 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value); 42 static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify; 43 static if_snd_tag_query_t mlx5e_rl_snd_tag_query; 44 static if_snd_tag_free_t mlx5e_rl_snd_tag_free; 45 46 static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = { 47 .snd_tag_modify = mlx5e_rl_snd_tag_modify, 48 .snd_tag_query = mlx5e_rl_snd_tag_query, 49 .snd_tag_free = mlx5e_rl_snd_tag_free, 50 .type = IF_SND_TAG_TYPE_RATE_LIMIT 51 }; 52 53 static void 54 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl, 55 struct mlx5e_sq_param *param) 56 { 57 void *sqc = param->sqc; 58 void *wq = MLX5_ADDR_OF(sqc, sqc, wq); 59 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); 60 61 MLX5_SET(wq, wq, log_wq_sz, log_sq_size); 62 MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); 63 MLX5_SET(wq, wq, pd, rl->priv->pdn); 64 65 param->wq.linear = 1; 66 } 67 68 static void 69 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl, 70 struct mlx5e_cq_param *param) 71 { 72 void *cqc = param->cqc; 73 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); 74 75 MLX5_SET(cqc, cqc, log_cq_size, log_sq_size); 76 MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs); 77 MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts); 78 MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index); 79 80 switch (rl->param.tx_coalesce_mode) { 81 case 0: 82 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); 83 break; 84 default: 85 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe)) 86 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); 87 else 88 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); 89 break; 90 } 91 } 92 93 static void 94 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl, 95 struct mlx5e_rl_channel_param *cparam) 96 { 97 memset(cparam, 0, sizeof(*cparam)); 98 99 mlx5e_rl_build_sq_param(rl, &cparam->sq); 100 mlx5e_rl_build_cq_param(rl, &cparam->cq); 101 } 102 103 static int 104 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, 105 struct mlx5e_sq_param *param, int ix) 106 { 107 struct mlx5_core_dev *mdev = priv->mdev; 108 void *sqc = param->sqc; 109 void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); 110 int err; 111 112 /* Create DMA descriptor TAG */ 113 if ((err = -bus_dma_tag_create( 114 bus_get_dma_tag(mdev->pdev->dev.bsddev), 115 1, /* any alignment */ 116 0, /* no boundary */ 117 BUS_SPACE_MAXADDR, /* lowaddr */ 118 BUS_SPACE_MAXADDR, /* highaddr */ 119 NULL, NULL, /* filter, filterarg */ 120 MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ 121 MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ 122 MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 123 0, /* flags */ 124 NULL, NULL, /* lockfunc, lockfuncarg */ 125 &sq->dma_tag))) 126 goto done; 127 128 sq->mkey_be = cpu_to_be32(priv->mr.key); 129 sq->ifp = priv->ifp; 130 sq->priv = priv; 131 132 err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, 133 &sq->wq_ctrl); 134 if (err) 135 goto err_free_dma_tag; 136 137 sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; 138 139 err = mlx5e_alloc_sq_db(sq); 140 if (err) 141 goto err_sq_wq_destroy; 142 143 mlx5e_update_sq_inline(sq); 144 145 return (0); 146 147 err_sq_wq_destroy: 148 mlx5_wq_destroy(&sq->wq_ctrl); 149 err_free_dma_tag: 150 bus_dma_tag_destroy(sq->dma_tag); 151 done: 152 return (err); 153 } 154 155 static void 156 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq) 157 { 158 159 mlx5e_free_sq_db(sq); 160 mlx5_wq_destroy(&sq->wq_ctrl); 161 bus_dma_tag_destroy(sq->dma_tag); 162 } 163 164 static int 165 mlx5e_rl_query_sq(struct mlx5e_sq *sq) 166 { 167 void *out; 168 int inlen; 169 int err; 170 171 inlen = MLX5_ST_SZ_BYTES(query_sq_out); 172 out = mlx5_vzalloc(inlen); 173 if (!out) 174 return -ENOMEM; 175 176 err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out); 177 if (err) 178 goto out; 179 180 sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle); 181 182 out: 183 kvfree(out); 184 return err; 185 } 186 187 static int 188 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, 189 struct mlx5e_sq_param *param, int ix) 190 { 191 int err; 192 193 err = mlx5e_rl_create_sq(priv, sq, param, ix); 194 if (err) 195 return (err); 196 197 err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn); 198 if (err) 199 goto err_destroy_sq; 200 201 err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); 202 if (err) 203 goto err_disable_sq; 204 205 if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) { 206 err = mlx5e_rl_query_sq(sq); 207 if (err) { 208 mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for" 209 "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err); 210 sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE; 211 } 212 } else 213 sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE; 214 215 WRITE_ONCE(sq->running, 1); 216 217 return (0); 218 219 err_disable_sq: 220 mlx5e_disable_sq(sq); 221 err_destroy_sq: 222 mlx5e_rl_destroy_sq(sq); 223 224 return (err); 225 } 226 227 static void 228 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq) 229 { 230 mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF); 231 mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF); 232 233 callout_init_mtx(&sq->cev_callout, &sq->lock, 0); 234 235 sq->cev_factor = priv->rl.param.tx_completion_fact; 236 237 /* ensure the TX completion event factor is not zero */ 238 if (sq->cev_factor == 0) 239 sq->cev_factor = 1; 240 } 241 242 static int 243 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix, 244 struct mlx5e_rl_channel_param *cparam, 245 struct mlx5e_sq *volatile *ppsq) 246 { 247 struct mlx5e_priv *priv = rlw->priv; 248 struct mlx5e_sq *sq; 249 int err; 250 251 sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO); 252 253 /* init mutexes */ 254 mlx5e_rl_chan_mtx_init(priv, sq); 255 256 /* open TX completion queue */ 257 err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq, 258 &mlx5e_tx_cq_comp, eq_ix); 259 if (err) 260 goto err_free; 261 262 err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix); 263 if (err) 264 goto err_close_tx_cq; 265 266 /* store TX channel pointer */ 267 *ppsq = sq; 268 269 /* poll TX queue initially */ 270 sq->cq.mcq.comp(&sq->cq.mcq, NULL); 271 272 return (0); 273 274 err_close_tx_cq: 275 mlx5e_close_cq(&sq->cq); 276 277 err_free: 278 /* destroy mutexes */ 279 mtx_destroy(&sq->lock); 280 mtx_destroy(&sq->comp_lock); 281 free(sq, M_MLX5EN); 282 atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL); 283 return (err); 284 } 285 286 static void 287 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq) 288 { 289 struct mlx5e_sq *sq = *ppsq; 290 291 /* check if channel is already closed */ 292 if (sq == NULL) 293 return; 294 /* ensure channel pointer is no longer used */ 295 *ppsq = NULL; 296 297 /* teardown and destroy SQ */ 298 mlx5e_drain_sq(sq); 299 mlx5e_disable_sq(sq); 300 mlx5e_rl_destroy_sq(sq); 301 302 /* close CQ */ 303 mlx5e_close_cq(&sq->cq); 304 305 /* destroy mutexes */ 306 mtx_destroy(&sq->lock); 307 mtx_destroy(&sq->comp_lock); 308 309 free(sq, M_MLX5EN); 310 } 311 312 static void 313 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl) 314 { 315 /* 316 * Limit the maximum distance between completion events to 317 * half of the currently set TX queue size. 318 * 319 * The maximum number of queue entries a single IP packet can 320 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. 321 * 322 * The worst case max value is then given as below: 323 */ 324 uint64_t max = rl->param.tx_queue_size / 325 (2 * MLX5_SEND_WQE_MAX_WQEBBS); 326 327 /* 328 * Update the maximum completion factor value in case the 329 * tx_queue_size field changed. Ensure we don't overflow 330 * 16-bits. 331 */ 332 if (max < 1) 333 max = 1; 334 else if (max > 65535) 335 max = 65535; 336 rl->param.tx_completion_fact_max = max; 337 338 /* 339 * Verify that the current TX completion factor is within the 340 * given limits: 341 */ 342 if (rl->param.tx_completion_fact < 1) 343 rl->param.tx_completion_fact = 1; 344 else if (rl->param.tx_completion_fact > max) 345 rl->param.tx_completion_fact = max; 346 } 347 348 static int 349 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index) 350 { 351 struct mlx5e_priv *priv = sq->priv; 352 struct mlx5_core_dev *mdev = priv->mdev; 353 354 void *in; 355 void *sqc; 356 int inlen; 357 int err; 358 359 inlen = MLX5_ST_SZ_BYTES(modify_sq_in); 360 in = mlx5_vzalloc(inlen); 361 if (in == NULL) 362 return (-ENOMEM); 363 364 sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); 365 366 MLX5_SET(modify_sq_in, in, sqn, sq->sqn); 367 MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY); 368 MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); 369 MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); 370 MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); 371 372 err = mlx5_core_modify_sq(mdev, in, inlen); 373 374 kvfree(in); 375 376 return (err); 377 } 378 379 /* 380 * This function will search the configured rate limit table for the 381 * best match to avoid that a single socket based application can 382 * allocate all the available hardware rates. If the user selected 383 * rate deviates too much from the closes rate available in the rate 384 * limit table, unlimited rate will be selected. 385 */ 386 static uint64_t 387 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) 388 { 389 uint64_t distance = -1ULL; 390 uint64_t diff; 391 uint64_t retval = 0; /* unlimited */ 392 uint64_t x; 393 394 /* search for closest rate */ 395 for (x = 0; x != rl->param.tx_rates_def; x++) { 396 uint64_t rate = rl->rate_limit_table[x]; 397 if (rate == 0) 398 continue; 399 400 if (rate > user_rate) 401 diff = rate - user_rate; 402 else 403 diff = user_rate - rate; 404 405 /* check if distance is smaller than previous rate */ 406 if (diff < distance) { 407 distance = diff; 408 retval = rate; 409 } 410 } 411 412 /* range check for multiplication below */ 413 if (user_rate > rl->param.tx_limit_max) 414 user_rate = rl->param.tx_limit_max; 415 416 /* fallback to unlimited, if rate deviates too much */ 417 if (distance > howmany(user_rate * 418 rl->param.tx_allowed_deviation, 1000ULL)) 419 retval = 0; 420 421 return (retval); 422 } 423 424 static int 425 mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle, 426 struct mlx5e_rl_channel *sq_channel) 427 { 428 const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), 429 MLX5_SEND_WQE_DS); 430 struct mlx5e_tx_qos_remap_wqe *wqe; 431 int pi; 432 433 mtx_lock(&iq->lock); 434 pi = mlx5e_iq_get_producer_index(iq); 435 if (pi < 0) { 436 mtx_unlock(&iq->lock); 437 return (-ENOMEM); 438 } 439 wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi); 440 441 memset(wqe, 0, sizeof(*wqe)); 442 443 wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle); 444 wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle); 445 446 wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | 447 MLX5_OPCODE_QOS_REMAP); 448 wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); 449 wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8); 450 wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; 451 452 /* copy data for doorbell */ 453 memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); 454 455 iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); 456 iq->data[pi].p_refcount = &sq_channel->refcount; 457 atomic_add_int(iq->data[pi].p_refcount, 1); 458 iq->pc += iq->data[pi].num_wqebbs; 459 460 mlx5e_iq_notify_hw(iq); 461 462 mtx_unlock(&iq->lock); 463 464 return (0); /* success */ 465 } 466 467 static int 468 mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index, 469 struct mlx5e_rl_channel *sq_channel) 470 { 471 struct mlx5e_channel *iq_channel; 472 u32 scq_handle; 473 u32 sq_handle; 474 int error; 475 476 /* Specific SQ remap operations should be handled by same IQ */ 477 iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels]; 478 479 sq_handle = sq->queue_handle; 480 scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index); 481 482 if (sq_handle == MLX5_INVALID_QUEUE_HANDLE || 483 scq_handle == MLX5_INVALID_QUEUE_HANDLE) 484 error = -1; 485 else 486 error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle, 487 sq_handle, sq_channel); 488 489 return (error); 490 } 491 492 /* 493 * This function sets the requested rate for a rate limit channel, in 494 * bits per second. The requested rate will be filtered through the 495 * find best rate function above. 496 */ 497 static int 498 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, 499 struct mlx5e_rl_channel *channel, uint64_t rate) 500 { 501 struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; 502 struct mlx5e_sq *sq; 503 uint64_t temp; 504 uint16_t index; 505 uint16_t burst; 506 int error; 507 bool use_sq_remap; 508 509 if (rate != 0) { 510 MLX5E_RL_WORKER_UNLOCK(rlw); 511 512 MLX5E_RL_RLOCK(rl); 513 514 /* get current burst size in bytes */ 515 temp = rl->param.tx_burst_size * 516 MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp)); 517 518 /* limit burst size to 64K currently */ 519 if (temp > 65535) 520 temp = 65535; 521 burst = temp; 522 523 /* find best rate */ 524 rate = mlx5e_rl_find_best_rate_locked(rl, rate); 525 526 MLX5E_RL_RUNLOCK(rl); 527 528 if (rate == 0) { 529 /* rate doesn't exist, fallback to unlimited */ 530 index = 0; 531 rate = 0; 532 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); 533 } else { 534 /* get a reference on the new rate */ 535 error = -mlx5_rl_add_rate(rlw->priv->mdev, 536 howmany(rate, 1000), burst, &index); 537 538 if (error != 0) { 539 /* adding rate failed, fallback to unlimited */ 540 index = 0; 541 rate = 0; 542 atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); 543 } 544 } 545 MLX5E_RL_WORKER_LOCK(rlw); 546 } else { 547 index = 0; 548 burst = 0; /* default */ 549 } 550 551 /* paced <--> non-paced transitions must go via FW */ 552 use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) && 553 channel->last_rate != 0 && rate != 0; 554 555 /* atomically swap rates */ 556 temp = channel->last_rate; 557 channel->last_rate = rate; 558 rate = temp; 559 560 /* atomically swap burst size */ 561 temp = channel->last_burst; 562 channel->last_burst = burst; 563 burst = temp; 564 565 MLX5E_RL_WORKER_UNLOCK(rlw); 566 /* put reference on the old rate, if any */ 567 if (rate != 0) { 568 mlx5_rl_remove_rate(rlw->priv->mdev, 569 howmany(rate, 1000), burst); 570 } 571 572 /* set new rate, if SQ is running */ 573 sq = channel->sq; 574 if (sq != NULL && READ_ONCE(sq->running) != 0) { 575 if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) { 576 while (atomic_load_int(&channel->refcount) != 0 && 577 rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR && 578 pci_channel_offline(rlw->priv->mdev->pdev) == 0) 579 pause("W", 1); 580 error = mlx5e_rl_modify_sq(sq, index); 581 if (error != 0) 582 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); 583 } 584 } else 585 error = 0; 586 587 MLX5E_RL_WORKER_LOCK(rlw); 588 589 return (-error); 590 } 591 592 static void 593 mlx5e_rl_worker(void *arg) 594 { 595 struct thread *td; 596 struct mlx5e_rl_worker *rlw = arg; 597 struct mlx5e_rl_channel *channel; 598 struct mlx5e_priv *priv; 599 unsigned ix; 600 uint64_t x; 601 int error; 602 603 /* set thread priority */ 604 td = curthread; 605 606 thread_lock(td); 607 sched_prio(td, PI_SWI(SWI_NET)); 608 thread_unlock(td); 609 610 priv = rlw->priv; 611 612 /* compute completion vector */ 613 ix = (rlw - priv->rl.workers) % 614 priv->mdev->priv.eq_table.num_comp_vectors; 615 616 /* TODO bind to CPU */ 617 618 /* open all the SQs */ 619 MLX5E_RL_WORKER_LOCK(rlw); 620 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { 621 struct mlx5e_rl_channel *channel = rlw->channels + x; 622 623 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS) 624 if (channel->state == MLX5E_RL_ST_FREE) 625 continue; 626 #endif 627 MLX5E_RL_WORKER_UNLOCK(rlw); 628 629 MLX5E_RL_RLOCK(&priv->rl); 630 error = mlx5e_rl_open_channel(rlw, ix, 631 &priv->rl.chan_param, &channel->sq); 632 MLX5E_RL_RUNLOCK(&priv->rl); 633 634 MLX5E_RL_WORKER_LOCK(rlw); 635 if (error != 0) { 636 mlx5_en_err(priv->ifp, 637 "mlx5e_rl_open_channel failed: %d\n", error); 638 break; 639 } 640 mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate); 641 } 642 while (1) { 643 if (STAILQ_FIRST(&rlw->process_head) == NULL) { 644 /* check if we are tearing down */ 645 if (rlw->worker_done != 0) 646 break; 647 cv_wait(&rlw->cv, &rlw->mtx); 648 } 649 /* check if we are tearing down */ 650 if (rlw->worker_done != 0) 651 break; 652 channel = STAILQ_FIRST(&rlw->process_head); 653 if (channel != NULL) { 654 STAILQ_REMOVE_HEAD(&rlw->process_head, entry); 655 656 switch (channel->state) { 657 case MLX5E_RL_ST_MODIFY: 658 channel->state = MLX5E_RL_ST_USED; 659 MLX5E_RL_WORKER_UNLOCK(rlw); 660 661 /* create channel by demand */ 662 if (channel->sq == NULL) { 663 MLX5E_RL_RLOCK(&priv->rl); 664 error = mlx5e_rl_open_channel(rlw, ix, 665 &priv->rl.chan_param, &channel->sq); 666 MLX5E_RL_RUNLOCK(&priv->rl); 667 668 if (error != 0) { 669 mlx5_en_err(priv->ifp, 670 "mlx5e_rl_open_channel failed: %d\n", error); 671 } else { 672 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL); 673 } 674 } else { 675 mlx5e_resume_sq(channel->sq); 676 } 677 678 MLX5E_RL_WORKER_LOCK(rlw); 679 /* convert from bytes/s to bits/s and set new rate */ 680 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 681 channel->new_rate * 8ULL); 682 if (error != 0) { 683 mlx5_en_err(priv->ifp, 684 "mlx5e_rlw_channel_set_rate_locked failed: %d\n", 685 error); 686 } 687 break; 688 689 case MLX5E_RL_ST_DESTROY: 690 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); 691 if (error != 0) { 692 mlx5_en_err(priv->ifp, 693 "mlx5e_rlw_channel_set_rate_locked failed: %d\n", 694 error); 695 } 696 if (channel->sq != NULL) { 697 /* 698 * Make sure all packets are 699 * transmitted before SQ is 700 * returned to free list: 701 */ 702 MLX5E_RL_WORKER_UNLOCK(rlw); 703 mlx5e_drain_sq(channel->sq); 704 MLX5E_RL_WORKER_LOCK(rlw); 705 } 706 /* put the channel back into the free list */ 707 STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry); 708 channel->state = MLX5E_RL_ST_FREE; 709 atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL); 710 break; 711 default: 712 /* NOP */ 713 break; 714 } 715 } 716 } 717 718 /* close all the SQs */ 719 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { 720 struct mlx5e_rl_channel *channel = rlw->channels + x; 721 722 /* update the initial rate */ 723 channel->init_rate = channel->last_rate; 724 725 /* make sure we free up the rate resource */ 726 mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); 727 728 if (channel->sq != NULL) { 729 MLX5E_RL_WORKER_UNLOCK(rlw); 730 mlx5e_rl_close_channel(&channel->sq); 731 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL); 732 MLX5E_RL_WORKER_LOCK(rlw); 733 } 734 } 735 736 rlw->worker_done = 0; 737 cv_broadcast(&rlw->cv); 738 MLX5E_RL_WORKER_UNLOCK(rlw); 739 740 kthread_exit(); 741 } 742 743 static int 744 mlx5e_rl_open_tis(struct mlx5e_priv *priv) 745 { 746 struct mlx5_core_dev *mdev = priv->mdev; 747 u32 in[MLX5_ST_SZ_DW(create_tis_in)]; 748 void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); 749 750 memset(in, 0, sizeof(in)); 751 752 MLX5_SET(tisc, tisc, prio, 0); 753 MLX5_SET(tisc, tisc, transport_domain, priv->tdn); 754 755 return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn)); 756 } 757 758 static void 759 mlx5e_rl_close_tis(struct mlx5e_priv *priv) 760 { 761 mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0); 762 } 763 764 static void 765 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param, 766 struct mlx5_core_dev *mdev) 767 { 768 /* ratelimit workers */ 769 param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors; 770 param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS; 771 772 /* range check */ 773 if (param->tx_worker_threads_def == 0 || 774 param->tx_worker_threads_def > param->tx_worker_threads_max) 775 param->tx_worker_threads_def = param->tx_worker_threads_max; 776 777 /* ratelimit channels */ 778 param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS / 779 param->tx_worker_threads_def; 780 param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS; 781 782 /* range check */ 783 if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER) 784 param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER; 785 786 /* set default burst size */ 787 param->tx_burst_size = 4; /* MTUs */ 788 789 /* 790 * Set maximum burst size 791 * 792 * The burst size is multiplied by the MTU and clamped to the 793 * range 0 ... 65535 bytes inclusivly before fed into the 794 * firmware. 795 * 796 * NOTE: If the burst size or MTU is changed only ratelimit 797 * connections made after the change will use the new burst 798 * size. 799 */ 800 param->tx_burst_size_max = 255; 801 802 /* get firmware rate limits in 1000bit/s and convert them to bit/s */ 803 param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL; 804 param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL; 805 806 /* ratelimit table size */ 807 param->tx_rates_max = mdev->priv.rl_table.max_size; 808 809 /* range check */ 810 if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES) 811 param->tx_rates_max = MLX5E_RL_MAX_TX_RATES; 812 813 /* set default number of rates */ 814 param->tx_rates_def = param->tx_rates_max; 815 816 /* set maximum allowed rate deviation */ 817 if (param->tx_limit_max != 0) { 818 /* 819 * Make sure the deviation multiplication doesn't 820 * overflow unsigned 64-bit: 821 */ 822 param->tx_allowed_deviation_max = -1ULL / 823 param->tx_limit_max; 824 } 825 /* set default rate deviation */ 826 param->tx_allowed_deviation = 50; /* 5.0% */ 827 828 /* channel parameters */ 829 param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); 830 param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT; 831 param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT; 832 param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT; 833 param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT; 834 } 835 836 static const char *mlx5e_rl_params_desc[] = { 837 MLX5E_RL_PARAMS(MLX5E_STATS_DESC) 838 }; 839 840 static const char *mlx5e_rl_table_params_desc[] = { 841 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC) 842 }; 843 844 static const char *mlx5e_rl_stats_desc[] = { 845 MLX5E_RL_STATS(MLX5E_STATS_DESC) 846 }; 847 848 int 849 mlx5e_rl_init(struct mlx5e_priv *priv) 850 { 851 struct mlx5e_rl_priv_data *rl = &priv->rl; 852 struct sysctl_oid *node; 853 struct sysctl_oid *stats; 854 char buf[64]; 855 uint64_t i; 856 uint64_t j; 857 int error; 858 859 /* check if there is support for packet pacing */ 860 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) 861 return (0); 862 863 rl->priv = priv; 864 865 sysctl_ctx_init(&rl->ctx); 866 867 sx_init(&rl->rl_sxlock, "ratelimit-sxlock"); 868 869 /* open own TIS domain for ratelimit SQs */ 870 error = mlx5e_rl_open_tis(priv); 871 if (error) 872 goto done; 873 874 /* setup default value for parameters */ 875 mlx5e_rl_set_default_params(&rl->param, priv->mdev); 876 877 /* update the completion factor */ 878 mlx5e_rl_sync_tx_completion_fact(rl); 879 880 /* create root node */ 881 node = SYSCTL_ADD_NODE(&rl->ctx, 882 SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, 883 "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support"); 884 885 if (node != NULL) { 886 /* create SYSCTLs */ 887 for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) { 888 mlx5e_rl_sysctl_add_u64_oid(rl, 889 MLX5E_RL_PARAMS_INDEX(arg[i]), 890 node, mlx5e_rl_params_desc[2 * i], 891 mlx5e_rl_params_desc[2 * i + 1]); 892 } 893 894 stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node), 895 OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 896 "Rate limiting statistics"); 897 if (stats != NULL) { 898 /* create SYSCTLs */ 899 for (i = 0; i != MLX5E_RL_STATS_NUM; i++) { 900 mlx5e_rl_sysctl_add_stats_u64_oid(rl, i, 901 stats, mlx5e_rl_stats_desc[2 * i], 902 mlx5e_rl_stats_desc[2 * i + 1]); 903 } 904 } 905 } 906 907 /* allocate workers array */ 908 rl->workers = malloc(sizeof(rl->workers[0]) * 909 rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO); 910 911 /* allocate rate limit array */ 912 rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) * 913 rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO); 914 915 if (node != NULL) { 916 /* create more SYSCTls */ 917 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 918 "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD | 919 CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table, 920 "A", "Show table of all configured TX rates"); 921 922 /* try to fetch rate table from kernel environment */ 923 for (i = 0; i != rl->param.tx_rates_def; i++) { 924 /* compute path for tunable */ 925 snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d", 926 device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i); 927 if (TUNABLE_QUAD_FETCH(buf, &j)) 928 mlx5e_rl_tx_limit_add(rl, j); 929 } 930 931 /* setup rate table sysctls */ 932 for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) { 933 mlx5e_rl_sysctl_add_u64_oid(rl, 934 MLX5E_RL_PARAMS_INDEX(table_arg[i]), 935 node, mlx5e_rl_table_params_desc[2 * i], 936 mlx5e_rl_table_params_desc[2 * i + 1]); 937 } 938 } 939 940 for (j = 0; j < rl->param.tx_worker_threads_def; j++) { 941 struct mlx5e_rl_worker *rlw = rl->workers + j; 942 943 rlw->priv = priv; 944 945 cv_init(&rlw->cv, "mlx5-worker-cv"); 946 mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF); 947 STAILQ_INIT(&rlw->index_list_head); 948 STAILQ_INIT(&rlw->process_head); 949 950 rlw->channels = malloc(sizeof(rlw->channels[0]) * 951 rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO); 952 953 MLX5E_RL_WORKER_LOCK(rlw); 954 for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) { 955 struct mlx5e_rl_channel *channel = rlw->channels + i; 956 channel->worker = rlw; 957 STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry); 958 } 959 MLX5E_RL_WORKER_UNLOCK(rlw); 960 } 961 962 PRIV_LOCK(priv); 963 error = mlx5e_rl_open_workers(priv); 964 PRIV_UNLOCK(priv); 965 966 if (error != 0) { 967 mlx5_en_err(priv->ifp, 968 "mlx5e_rl_open_workers failed: %d\n", error); 969 } 970 971 return (0); 972 973 done: 974 sysctl_ctx_free(&rl->ctx); 975 sx_destroy(&rl->rl_sxlock); 976 return (error); 977 } 978 979 static int 980 mlx5e_rl_open_workers(struct mlx5e_priv *priv) 981 { 982 struct mlx5e_rl_priv_data *rl = &priv->rl; 983 struct thread *rl_thread = NULL; 984 struct proc *rl_proc = NULL; 985 uint64_t j; 986 int error; 987 988 if (priv->gone || rl->opened) 989 return (-EINVAL); 990 991 MLX5E_RL_WLOCK(rl); 992 /* compute channel parameters once */ 993 mlx5e_rl_build_channel_param(rl, &rl->chan_param); 994 MLX5E_RL_WUNLOCK(rl); 995 996 for (j = 0; j < rl->param.tx_worker_threads_def; j++) { 997 struct mlx5e_rl_worker *rlw = rl->workers + j; 998 999 /* start worker thread */ 1000 error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread, 1001 RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j); 1002 if (error != 0) { 1003 mlx5_en_err(rl->priv->ifp, 1004 "kproc_kthread_add failed: %d\n", error); 1005 rlw->worker_done = 1; 1006 } 1007 } 1008 1009 rl->opened = 1; 1010 1011 return (0); 1012 } 1013 1014 static void 1015 mlx5e_rl_close_workers(struct mlx5e_priv *priv) 1016 { 1017 struct mlx5e_rl_priv_data *rl = &priv->rl; 1018 uint64_t y; 1019 1020 if (rl->opened == 0) 1021 return; 1022 1023 /* tear down worker threads simultaneously */ 1024 for (y = 0; y < rl->param.tx_worker_threads_def; y++) { 1025 struct mlx5e_rl_worker *rlw = rl->workers + y; 1026 1027 /* tear down worker before freeing SQs */ 1028 MLX5E_RL_WORKER_LOCK(rlw); 1029 if (rlw->worker_done == 0) { 1030 rlw->worker_done = 1; 1031 cv_broadcast(&rlw->cv); 1032 } else { 1033 /* XXX thread not started */ 1034 rlw->worker_done = 0; 1035 } 1036 MLX5E_RL_WORKER_UNLOCK(rlw); 1037 } 1038 1039 /* wait for worker threads to exit */ 1040 for (y = 0; y < rl->param.tx_worker_threads_def; y++) { 1041 struct mlx5e_rl_worker *rlw = rl->workers + y; 1042 1043 /* tear down worker before freeing SQs */ 1044 MLX5E_RL_WORKER_LOCK(rlw); 1045 while (rlw->worker_done != 0) 1046 cv_wait(&rlw->cv, &rlw->mtx); 1047 MLX5E_RL_WORKER_UNLOCK(rlw); 1048 } 1049 1050 rl->opened = 0; 1051 } 1052 1053 static void 1054 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl) 1055 { 1056 unsigned x; 1057 1058 MLX5E_RL_WLOCK(rl); 1059 for (x = 0; x != rl->param.tx_rates_def; x++) 1060 rl->rate_limit_table[x] = 0; 1061 MLX5E_RL_WUNLOCK(rl); 1062 } 1063 1064 void 1065 mlx5e_rl_cleanup(struct mlx5e_priv *priv) 1066 { 1067 struct mlx5e_rl_priv_data *rl = &priv->rl; 1068 uint64_t y; 1069 1070 /* check if there is support for packet pacing */ 1071 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) 1072 return; 1073 1074 /* TODO check if there is support for packet pacing */ 1075 1076 sysctl_ctx_free(&rl->ctx); 1077 1078 PRIV_LOCK(priv); 1079 mlx5e_rl_close_workers(priv); 1080 PRIV_UNLOCK(priv); 1081 1082 mlx5e_rl_reset_rates(rl); 1083 1084 /* close TIS domain */ 1085 mlx5e_rl_close_tis(priv); 1086 1087 for (y = 0; y < rl->param.tx_worker_threads_def; y++) { 1088 struct mlx5e_rl_worker *rlw = rl->workers + y; 1089 1090 cv_destroy(&rlw->cv); 1091 mtx_destroy(&rlw->mtx); 1092 free(rlw->channels, M_MLX5EN); 1093 } 1094 free(rl->rate_limit_table, M_MLX5EN); 1095 free(rl->workers, M_MLX5EN); 1096 sx_destroy(&rl->rl_sxlock); 1097 } 1098 1099 static void 1100 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw, 1101 struct mlx5e_rl_channel *channel) 1102 { 1103 STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry); 1104 cv_broadcast(&rlw->cv); 1105 } 1106 1107 static void 1108 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) 1109 { 1110 if (channel == NULL) 1111 return; 1112 1113 MLX5E_RL_WORKER_LOCK(rlw); 1114 switch (channel->state) { 1115 case MLX5E_RL_ST_MODIFY: 1116 channel->state = MLX5E_RL_ST_DESTROY; 1117 break; 1118 case MLX5E_RL_ST_USED: 1119 channel->state = MLX5E_RL_ST_DESTROY; 1120 mlx5e_rlw_queue_channel_locked(rlw, channel); 1121 break; 1122 default: 1123 break; 1124 } 1125 MLX5E_RL_WORKER_UNLOCK(rlw); 1126 } 1127 1128 static int 1129 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) 1130 { 1131 1132 MLX5E_RL_WORKER_LOCK(rlw); 1133 channel->new_rate = rate; 1134 switch (channel->state) { 1135 case MLX5E_RL_ST_USED: 1136 channel->state = MLX5E_RL_ST_MODIFY; 1137 mlx5e_rlw_queue_channel_locked(rlw, channel); 1138 break; 1139 default: 1140 break; 1141 } 1142 MLX5E_RL_WORKER_UNLOCK(rlw); 1143 1144 return (0); 1145 } 1146 1147 static int 1148 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, 1149 union if_snd_tag_query_params *params) 1150 { 1151 int retval; 1152 1153 MLX5E_RL_WORKER_LOCK(rlw); 1154 switch (channel->state) { 1155 case MLX5E_RL_ST_USED: 1156 params->rate_limit.max_rate = channel->last_rate; 1157 params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); 1158 retval = 0; 1159 break; 1160 case MLX5E_RL_ST_MODIFY: 1161 params->rate_limit.max_rate = channel->last_rate; 1162 params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); 1163 retval = EBUSY; 1164 break; 1165 default: 1166 retval = EINVAL; 1167 break; 1168 } 1169 MLX5E_RL_WORKER_UNLOCK(rlw); 1170 1171 return (retval); 1172 } 1173 1174 static int 1175 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw, 1176 struct mlx5e_rl_channel **pchannel) 1177 { 1178 struct mlx5e_rl_channel *channel; 1179 int retval = ENOMEM; 1180 1181 MLX5E_RL_WORKER_LOCK(rlw); 1182 /* Check for available channel in free list */ 1183 if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) { 1184 retval = 0; 1185 /* Remove head index from available list */ 1186 STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry); 1187 channel->state = MLX5E_RL_ST_USED; 1188 atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL); 1189 } else { 1190 atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL); 1191 } 1192 MLX5E_RL_WORKER_UNLOCK(rlw); 1193 1194 *pchannel = channel; 1195 #ifdef RATELIMIT_DEBUG 1196 mlx5_en_info(rlw->priv->ifp, 1197 "Channel pointer for rate limit connection is %p\n", channel); 1198 #endif 1199 return (retval); 1200 } 1201 1202 int 1203 mlx5e_rl_snd_tag_alloc(if_t ifp, 1204 union if_snd_tag_alloc_params *params, 1205 struct m_snd_tag **ppmt) 1206 { 1207 struct mlx5e_rl_channel *channel; 1208 struct mlx5e_rl_worker *rlw; 1209 struct mlx5e_priv *priv; 1210 int error; 1211 1212 priv = if_getsoftc(ifp); 1213 1214 /* check if there is support for packet pacing or if device is going away */ 1215 if (!MLX5_CAP_GEN(priv->mdev, qos) || 1216 !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone || 1217 params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT) 1218 return (EOPNOTSUPP); 1219 1220 /* compute worker thread this TCP connection belongs to */ 1221 rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) % 1222 priv->rl.param.tx_worker_threads_def); 1223 1224 error = mlx5e_find_available_tx_ring_index(rlw, &channel); 1225 if (error != 0) 1226 goto done; 1227 1228 error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate); 1229 if (error != 0) { 1230 mlx5e_rl_free(rlw, channel); 1231 goto done; 1232 } 1233 1234 /* store pointer to mbuf tag */ 1235 MPASS(channel->tag.refcount == 0); 1236 m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw); 1237 *ppmt = &channel->tag; 1238 done: 1239 return (error); 1240 } 1241 1242 1243 static int 1244 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) 1245 { 1246 struct mlx5e_rl_channel *channel = 1247 container_of(pmt, struct mlx5e_rl_channel, tag); 1248 1249 return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate)); 1250 } 1251 1252 static int 1253 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) 1254 { 1255 struct mlx5e_rl_channel *channel = 1256 container_of(pmt, struct mlx5e_rl_channel, tag); 1257 1258 return (mlx5e_rl_query(channel->worker, channel, params)); 1259 } 1260 1261 static void 1262 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt) 1263 { 1264 struct mlx5e_rl_channel *channel = 1265 container_of(pmt, struct mlx5e_rl_channel, tag); 1266 1267 mlx5e_rl_free(channel->worker, channel); 1268 } 1269 1270 static int 1271 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS) 1272 { 1273 struct mlx5e_rl_priv_data *rl = arg1; 1274 struct mlx5e_priv *priv = rl->priv; 1275 struct sbuf sbuf; 1276 unsigned x; 1277 int error; 1278 1279 error = sysctl_wire_old_buffer(req, 0); 1280 if (error != 0) 1281 return (error); 1282 1283 PRIV_LOCK(priv); 1284 1285 sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req); 1286 1287 sbuf_printf(&sbuf, 1288 "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n" 1289 "\t" "--------------------------------------------\n"); 1290 1291 MLX5E_RL_RLOCK(rl); 1292 for (x = 0; x != rl->param.tx_rates_def; x++) { 1293 if (rl->rate_limit_table[x] == 0) 1294 continue; 1295 1296 sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n", 1297 x, (unsigned)rl->param.tx_burst_size, 1298 (long long)rl->rate_limit_table[x]); 1299 } 1300 MLX5E_RL_RUNLOCK(rl); 1301 1302 error = sbuf_finish(&sbuf); 1303 sbuf_delete(&sbuf); 1304 1305 PRIV_UNLOCK(priv); 1306 1307 return (error); 1308 } 1309 1310 static int 1311 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl) 1312 { 1313 uint64_t x; 1314 uint64_t y; 1315 1316 MLX5E_RL_WLOCK(rl); 1317 /* compute channel parameters once */ 1318 mlx5e_rl_build_channel_param(rl, &rl->chan_param); 1319 MLX5E_RL_WUNLOCK(rl); 1320 1321 for (y = 0; y != rl->param.tx_worker_threads_def; y++) { 1322 struct mlx5e_rl_worker *rlw = rl->workers + y; 1323 1324 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { 1325 struct mlx5e_rl_channel *channel; 1326 struct mlx5e_sq *sq; 1327 1328 channel = rlw->channels + x; 1329 sq = channel->sq; 1330 1331 if (sq == NULL) 1332 continue; 1333 1334 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) { 1335 mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq, 1336 rl->param.tx_coalesce_usecs, 1337 rl->param.tx_coalesce_pkts, 1338 rl->param.tx_coalesce_mode); 1339 } else { 1340 mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq, 1341 rl->param.tx_coalesce_usecs, 1342 rl->param.tx_coalesce_pkts); 1343 } 1344 } 1345 } 1346 return (0); 1347 } 1348 1349 void 1350 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl) 1351 { 1352 uint64_t x; 1353 uint64_t y; 1354 1355 for (y = 0; y != rl->param.tx_worker_threads_def; y++) { 1356 struct mlx5e_rl_worker *rlw = rl->workers + y; 1357 1358 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { 1359 struct mlx5e_rl_channel *channel; 1360 struct mlx5e_sq *sq; 1361 1362 channel = rlw->channels + x; 1363 sq = channel->sq; 1364 1365 if (sq == NULL) 1366 continue; 1367 1368 mtx_lock(&sq->lock); 1369 mlx5e_update_sq_inline(sq); 1370 mtx_unlock(&sq->lock); 1371 } 1372 } 1373 } 1374 1375 static int 1376 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value) 1377 { 1378 unsigned x; 1379 int error; 1380 1381 if (value < 1000 || 1382 mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0) 1383 return (EINVAL); 1384 1385 MLX5E_RL_WLOCK(rl); 1386 error = ENOMEM; 1387 1388 /* check if rate already exists */ 1389 for (x = 0; x != rl->param.tx_rates_def; x++) { 1390 if (rl->rate_limit_table[x] != value) 1391 continue; 1392 error = EEXIST; 1393 break; 1394 } 1395 1396 /* check if there is a free rate entry */ 1397 if (x == rl->param.tx_rates_def) { 1398 for (x = 0; x != rl->param.tx_rates_def; x++) { 1399 if (rl->rate_limit_table[x] != 0) 1400 continue; 1401 rl->rate_limit_table[x] = value; 1402 error = 0; 1403 break; 1404 } 1405 } 1406 MLX5E_RL_WUNLOCK(rl); 1407 1408 return (error); 1409 } 1410 1411 static int 1412 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value) 1413 { 1414 unsigned x; 1415 int error; 1416 1417 if (value == 0) 1418 return (EINVAL); 1419 1420 MLX5E_RL_WLOCK(rl); 1421 1422 /* check if rate already exists */ 1423 for (x = 0; x != rl->param.tx_rates_def; x++) { 1424 if (rl->rate_limit_table[x] != value) 1425 continue; 1426 /* free up rate */ 1427 rl->rate_limit_table[x] = 0; 1428 break; 1429 } 1430 1431 /* check if there is a free rate entry */ 1432 if (x == rl->param.tx_rates_def) 1433 error = ENOENT; 1434 else 1435 error = 0; 1436 MLX5E_RL_WUNLOCK(rl); 1437 1438 return (error); 1439 } 1440 1441 static int 1442 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS) 1443 { 1444 struct mlx5e_rl_priv_data *rl = arg1; 1445 struct mlx5e_priv *priv = rl->priv; 1446 unsigned mode_modify; 1447 unsigned was_opened; 1448 uint64_t value; 1449 int error; 1450 1451 PRIV_LOCK(priv); 1452 1453 MLX5E_RL_RLOCK(rl); 1454 value = rl->param.arg[arg2]; 1455 MLX5E_RL_RUNLOCK(rl); 1456 1457 if (req != NULL) { 1458 error = sysctl_handle_64(oidp, &value, 0, req); 1459 if (error || req->newptr == NULL || 1460 value == rl->param.arg[arg2]) 1461 goto done; 1462 } else { 1463 error = 0; 1464 } 1465 1466 /* check if device is gone */ 1467 if (priv->gone) { 1468 error = ENXIO; 1469 goto done; 1470 } 1471 was_opened = rl->opened; 1472 mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); 1473 1474 switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) { 1475 case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def): 1476 if (value > rl->param.tx_worker_threads_max) 1477 value = rl->param.tx_worker_threads_max; 1478 else if (value < 1) 1479 value = 1; 1480 1481 /* store new value */ 1482 rl->param.arg[arg2] = value; 1483 break; 1484 1485 case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def): 1486 if (value > rl->param.tx_channels_per_worker_max) 1487 value = rl->param.tx_channels_per_worker_max; 1488 else if (value < 1) 1489 value = 1; 1490 1491 /* store new value */ 1492 rl->param.arg[arg2] = value; 1493 break; 1494 1495 case MLX5E_RL_PARAMS_INDEX(tx_rates_def): 1496 if (value > rl->param.tx_rates_max) 1497 value = rl->param.tx_rates_max; 1498 else if (value < 1) 1499 value = 1; 1500 1501 /* store new value */ 1502 rl->param.arg[arg2] = value; 1503 break; 1504 1505 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs): 1506 /* range check */ 1507 if (value < 1) 1508 value = 0; 1509 else if (value > MLX5E_FLD_MAX(cqc, cq_period)) 1510 value = MLX5E_FLD_MAX(cqc, cq_period); 1511 1512 /* store new value */ 1513 rl->param.arg[arg2] = value; 1514 1515 /* check to avoid down and up the network interface */ 1516 if (was_opened) 1517 error = mlx5e_rl_refresh_channel_params(rl); 1518 break; 1519 1520 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts): 1521 /* import TX coal pkts */ 1522 if (value < 1) 1523 value = 0; 1524 else if (value > MLX5E_FLD_MAX(cqc, cq_max_count)) 1525 value = MLX5E_FLD_MAX(cqc, cq_max_count); 1526 1527 /* store new value */ 1528 rl->param.arg[arg2] = value; 1529 1530 /* check to avoid down and up the network interface */ 1531 if (was_opened) 1532 error = mlx5e_rl_refresh_channel_params(rl); 1533 break; 1534 1535 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode): 1536 /* network interface must be down */ 1537 if (was_opened != 0 && mode_modify == 0) 1538 mlx5e_rl_close_workers(priv); 1539 1540 /* import TX coalesce mode */ 1541 if (value != 0) 1542 value = 1; 1543 1544 /* store new value */ 1545 rl->param.arg[arg2] = value; 1546 1547 /* restart network interface, if any */ 1548 if (was_opened != 0) { 1549 if (mode_modify == 0) 1550 mlx5e_rl_open_workers(priv); 1551 else 1552 error = mlx5e_rl_refresh_channel_params(rl); 1553 } 1554 break; 1555 1556 case MLX5E_RL_PARAMS_INDEX(tx_queue_size): 1557 /* network interface must be down */ 1558 if (was_opened) 1559 mlx5e_rl_close_workers(priv); 1560 1561 /* import TX queue size */ 1562 if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) 1563 value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); 1564 else if (value > priv->params_ethtool.tx_queue_size_max) 1565 value = priv->params_ethtool.tx_queue_size_max; 1566 1567 /* store actual TX queue size */ 1568 value = 1ULL << order_base_2(value); 1569 1570 /* store new value */ 1571 rl->param.arg[arg2] = value; 1572 1573 /* verify TX completion factor */ 1574 mlx5e_rl_sync_tx_completion_fact(rl); 1575 1576 /* restart network interface, if any */ 1577 if (was_opened) 1578 mlx5e_rl_open_workers(priv); 1579 break; 1580 1581 case MLX5E_RL_PARAMS_INDEX(tx_completion_fact): 1582 /* network interface must be down */ 1583 if (was_opened) 1584 mlx5e_rl_close_workers(priv); 1585 1586 /* store new value */ 1587 rl->param.arg[arg2] = value; 1588 1589 /* verify parameter */ 1590 mlx5e_rl_sync_tx_completion_fact(rl); 1591 1592 /* restart network interface, if any */ 1593 if (was_opened) 1594 mlx5e_rl_open_workers(priv); 1595 break; 1596 1597 case MLX5E_RL_PARAMS_INDEX(tx_limit_add): 1598 error = mlx5e_rl_tx_limit_add(rl, value); 1599 break; 1600 1601 case MLX5E_RL_PARAMS_INDEX(tx_limit_clr): 1602 error = mlx5e_rl_tx_limit_clr(rl, value); 1603 break; 1604 1605 case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation): 1606 /* range check */ 1607 if (value > rl->param.tx_allowed_deviation_max) 1608 value = rl->param.tx_allowed_deviation_max; 1609 else if (value < rl->param.tx_allowed_deviation_min) 1610 value = rl->param.tx_allowed_deviation_min; 1611 1612 MLX5E_RL_WLOCK(rl); 1613 rl->param.arg[arg2] = value; 1614 MLX5E_RL_WUNLOCK(rl); 1615 break; 1616 1617 case MLX5E_RL_PARAMS_INDEX(tx_burst_size): 1618 /* range check */ 1619 if (value > rl->param.tx_burst_size_max) 1620 value = rl->param.tx_burst_size_max; 1621 else if (value < rl->param.tx_burst_size_min) 1622 value = rl->param.tx_burst_size_min; 1623 1624 MLX5E_RL_WLOCK(rl); 1625 rl->param.arg[arg2] = value; 1626 MLX5E_RL_WUNLOCK(rl); 1627 break; 1628 1629 default: 1630 break; 1631 } 1632 done: 1633 PRIV_UNLOCK(priv); 1634 return (error); 1635 } 1636 1637 static void 1638 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, 1639 struct sysctl_oid *node, const char *name, const char *desc) 1640 { 1641 /* 1642 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will 1643 * take care of loading default sysctl value from the kernel 1644 * environment, if any: 1645 */ 1646 if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) { 1647 /* read-only SYSCTLs */ 1648 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 1649 name, CTLTYPE_U64 | CTLFLAG_RD | 1650 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); 1651 } else { 1652 if (strstr(name, "_def") != 0) { 1653 #ifdef RATELIMIT_DEBUG 1654 /* tunable read-only advanced SYSCTLs */ 1655 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 1656 name, CTLTYPE_U64 | CTLFLAG_RDTUN | 1657 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); 1658 #endif 1659 } else { 1660 /* read-write SYSCTLs */ 1661 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, 1662 name, CTLTYPE_U64 | CTLFLAG_RWTUN | 1663 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); 1664 } 1665 } 1666 } 1667 1668 static void 1669 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, 1670 struct sysctl_oid *node, const char *name, const char *desc) 1671 { 1672 /* read-only SYSCTLs */ 1673 SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, 1674 CTLFLAG_RD, &rl->stats.arg[x], 0, desc); 1675 } 1676 1677 #else 1678 1679 int 1680 mlx5e_rl_init(struct mlx5e_priv *priv) 1681 { 1682 1683 return (0); 1684 } 1685 1686 void 1687 mlx5e_rl_cleanup(struct mlx5e_priv *priv) 1688 { 1689 /* NOP */ 1690 } 1691 1692 #endif /* RATELIMIT */ 1693