/*-
 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_rss.h"
#include "opt_ratelimit.h"

#include <dev/mlx5/mlx5_en/en.h>

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
static if_snd_tag_free_t mlx5e_rl_snd_tag_free;

static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = {
	.snd_tag_modify = mlx5e_rl_snd_tag_modify,
	.snd_tag_query = mlx5e_rl_snd_tag_query,
	.snd_tag_free = mlx5e_rl_snd_tag_free,
	.type = IF_SND_TAG_TYPE_RATE_LIMIT
};

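/*
 * Fill in the send queue (SQ) creation parameters for a rate limit
 * channel: the work queue size is derived from the configured TX
 * queue size, the stride matches one send WQE basic block and the
 * queue is bound to the driver's protection domain.
 */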
static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.linear = 1;
}

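/*
 * Fill in the completion queue (CQ) creation parameters for a rate
 * limit channel. The CQ is sized to match the SQ and uses the
 * configured TX interrupt moderation values. The moderation mode
 * falls back to EQE based timing when the hardware cannot restart
 * the timer from the last CQE.
 */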
static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);

	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

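/*
 * Allocate the software state for a rate limit SQ: a DMA tag for
 * mapping mbuf chains, the cyclic work queue backing the hardware SQ
 * and the per-WQE software descriptor array (mlx5e_alloc_sq_db()).
 */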
static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}

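/*
 * Query the firmware SQ context and cache the hardware queue handle.
 * The handle is later used to remap the SQ onto a different packet
 * pacing rate via a QOS_REMAP work request.
 */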
static int
mlx5e_rl_query_sq(struct mlx5e_sq *sq)
{
	void *out;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(query_sq_out);
	out = mlx5_vzalloc(inlen);
	if (!out)
		return -ENOMEM;

	err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out);
	if (err)
		goto out;

	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);

out:
	kvfree(out);
	return err;
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) {
		err = mlx5e_rl_query_sq(sq);
		if (err) {
			mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for "
			    "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err);
			sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
		}
	} else
		sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;

	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst case max value is then given as below:
	 */
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
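	/*
	 * Worked example, assuming a 1024 entry TX queue and 16 WQE
	 * basic blocks per maximum-sized send WQE: max becomes
	 * 1024 / (2 * 16) = 32, i.e. at most one completion event
	 * per 32 maximum-sized packets.
	 */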

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}

static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}

/*
 * This function searches the configured rate limit table for the
 * best match, so that a single socket based application cannot
 * allocate all the available hardware rates. If the user selected
 * rate deviates too much from the closest rate available in the
 * rate limit table, the unlimited rate is selected instead.
 */
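/*
 * Example, assuming the default allowed deviation of 50 (5.0%): a
 * requested rate of 100 Mbit/s only matches table entries within
 * 5 Mbit/s of the request; if no entry is that close, 0 (unlimited)
 * is returned.
 */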
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if distance is smaller than previous rate */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

	/* fallback to unlimited, if rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}

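/*
 * Post a QOS_REMAP work request on the given internal queue (IQ).
 * The request re-associates the SQ identified by "sq_handle" with
 * "scq_handle", the handle belonging to the requested rate limit
 * index. A reference is held on the rate limit channel until the
 * work request completes.
 */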
static int
mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle,
    struct mlx5e_rl_channel *sq_channel)
{
	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe),
	    MLX5_SEND_WQE_DS);
	struct mlx5e_tx_qos_remap_wqe *wqe;
	int pi;

	mtx_lock(&iq->lock);
	pi = mlx5e_iq_get_producer_index(iq);
	if (pi < 0) {
		mtx_unlock(&iq->lock);
		return (-ENOMEM);
	}
	wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi);

	memset(wqe, 0, sizeof(*wqe));

	wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle);
	wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle);

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) |
	    MLX5_OPCODE_QOS_REMAP);
	wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt);
	wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8);
	wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;

	/* copy data for doorbell */
	memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32));

	iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	iq->data[pi].p_refcount = &sq_channel->refcount;
	atomic_add_int(iq->data[pi].p_refcount, 1);
	iq->pc += iq->data[pi].num_wqebbs;

	mlx5e_iq_notify_hw(iq);

	mtx_unlock(&iq->lock);

	return (0); /* success */
}

static int
mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index,
    struct mlx5e_rl_channel *sq_channel)
{
	struct mlx5e_channel *iq_channel;
	u32 scq_handle;
	u32 sq_handle;
	int error;

	/* Specific SQ remap operations should be handled by same IQ */
	iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels];

	sq_handle = sq->queue_handle;
	scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index);

	if (sq_handle == MLX5_INVALID_QUEUE_HANDLE ||
	    scq_handle == MLX5_INVALID_QUEUE_HANDLE)
		error = -1;
	else
		error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle,
		    sq_handle, sq_channel);

	return (error);
}

/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;
	bool use_sq_remap;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp));

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* paced <--> non-paced transitions must go via FW */
	use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) &&
	    channel->last_rate != 0 && rate != 0;

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) {
			while (atomic_load_int(&channel->refcount) != 0 &&
			    rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
			    pci_channel_offline(rlw->priv->mdev->pdev) == 0)
				pause("W", 1);
			error = mlx5e_rl_modify_sq(sq, index);
			if (error != 0)
				atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		}
	} else
		error = 0;

	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}

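/*
 * Rate limit worker thread. Each worker owns a set of rate limit
 * channels: it opens their SQs, then loops servicing MODIFY and
 * DESTROY requests queued on "process_head" until worker_done is
 * set, at which point all SQs are closed and their rate references
 * are released.
 */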
static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel by demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}

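/*
 * Create a transport interface send (TIS) object in the driver's
 * transport domain; all rate limit SQs are attached to this TIS.
 */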
static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0);
}

static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes inclusively before being fed to the
	 * firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;
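	/*
	 * Example, assuming a 1500 byte MTU: the default burst of
	 * 4 MTUs is roughly 6 kBytes on the wire, while the maximum
	 * of 255 MTUs exceeds 64K and gets clamped to 65535 bytes by
	 * mlx5e_rlw_channel_set_rate_locked().
	 */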

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}

static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

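/*
 * Initialize the rate limit subsystem for the given port: verify
 * packet pacing support, create the TIS, register the sysctl tree,
 * allocate the worker and rate table arrays and finally start the
 * worker threads.
 */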
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	/* TODO check if there is support for packet pacing */

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
    union if_snd_tag_query_params *params)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

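/*
 * Network stack entry point for allocating a rate limit send tag.
 * The worker is selected from the connection's flow ID, a free
 * channel is taken from that worker's free list and a MODIFY
 * request carrying the requested maximum rate is queued to the
 * worker thread.
 */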
int
mlx5e_rl_snd_tag_alloc(if_t ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = if_getsoftc(ifp);

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
	*ppmt = &channel->tag;
done:
	return (error);
}

static int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

static int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_query(channel->worker, channel, params));
}

static void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

void
mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			mtx_lock(&sq->lock);
			mlx5e_update_sq_inline(sq);
			mtx_unlock(&sq->lock);
		}
	}
}

static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate was found */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

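/*
 * Common sysctl handler for all 64-bit rate limit parameters. The
 * parameter is selected by "arg2", range checked and stored; some
 * parameters additionally require refreshing the channel parameters
 * or restarting the worker threads.
 */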
static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading default sysctl value from the kernel
	 * environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#else

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}

#endif /* RATELIMIT */