/*-
 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_rss.h"
#include "opt_ratelimit.h"

#include <dev/mlx5/mlx5_en/en.h>

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
static if_snd_tag_free_t mlx5e_rl_snd_tag_free;

static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = {
	.snd_tag_modify = mlx5e_rl_snd_tag_modify,
	.snd_tag_query = mlx5e_rl_snd_tag_query,
	.snd_tag_free = mlx5e_rl_snd_tag_free,
	.type = IF_SND_TAG_TYPE_RATE_LIMIT
};

static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.linear = 1;
}

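/*
 * Fill in the completion queue parameters for a rate limit channel.
 * The CQ is sized to match the SQ, so a completion entry is available
 * for every send WQE. The event moderation mode comes from the
 * "tx_coalesce_mode" parameter: zero selects EQE-based moderation,
 * while non-zero selects CQE-based moderation when the hardware
 * supports it, falling back to EQE-based moderation otherwise.
 */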
static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);

	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}

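/*
 * Query the firmware for the "queue_handle" belonging to the given
 * SQ. The handle is stored in the SQ itself and is later used by the
 * QOS_REMAP work request to move the SQ between rate limit indices.
 * Returns zero on success or an error code on failure.
 */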
static int
mlx5e_rl_query_sq(struct mlx5e_sq *sq)
{
	void *out;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(query_sq_out);
	out = mlx5_vzalloc(inlen);
	if (!out)
		return (-ENOMEM);

	err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out);
	if (err)
		goto out;

	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);

out:
	kvfree(out);
	return (err);
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) {
		err = mlx5e_rl_query_sq(sq);
		if (err) {
			mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for "
			    "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err);
			sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
		}
	} else
		sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;

	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

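/*
 * Recompute the allowed range for the TX completion factor and clamp
 * the current value into it. Worked example, assuming a TX queue size
 * of 1024 entries and MLX5_SEND_WQE_MAX_WQEBBS equal to 16: the
 * maximum completion factor becomes 1024 / (2 * 16) = 32, i.e. at
 * most every 32nd send WQE requests a completion event, while no more
 * than half of the queue can be outstanding without one.
 */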
static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst case max value is then given as below:
	 */
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}

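/*
 * Point an already running SQ at the given hardware rate limit index
 * by issuing a MODIFY_SQ firmware command. Index zero means no rate
 * limiting, i.e. unlimited.
 */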
static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}

/*
 * This function searches the configured rate limit table for the
 * best match, to prevent a single socket-based application from
 * allocating all the available hardware rates. If the user-selected
 * rate deviates too much from the closest rate available in the rate
 * limit table, the unlimited rate is selected.
 */
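/*
 * The deviation limit is expressed in units of 1/1000. For example,
 * with the default tx_allowed_deviation of 50 (5.0%), a requested
 * rate of 100 Mbit/s only matches a table entry within
 * howmany(100000000 * 50, 1000) = 5 Mbit/s; anything further away
 * falls back to unlimited.
 */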
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if distance is smaller than the previous distance */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

	/* fallback to unlimited, if rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}

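/*
 * Post a QOS_REMAP work request on the internal queue (IQ), asking
 * the hardware to re-attach the send queue identified by "sq_handle"
 * to the scheduling queue identified by "scq_handle". The channel
 * refcount is incremented here and released by the IQ completion
 * handler, which allows the caller to wait for the remap to finish.
 */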
static int
mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle,
    struct mlx5e_rl_channel *sq_channel)
{
	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe),
	    MLX5_SEND_WQE_DS);
	struct mlx5e_tx_qos_remap_wqe *wqe;
	int pi;

	mtx_lock(&iq->lock);
	pi = mlx5e_iq_get_producer_index(iq);
	if (pi < 0) {
		mtx_unlock(&iq->lock);
		return (-ENOMEM);
	}
	wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi);

	memset(wqe, 0, sizeof(*wqe));

	wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle);
	wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle);

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) |
	    MLX5_OPCODE_QOS_REMAP);
	wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt);
	wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8);
	wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;

	/* copy data for doorbell */
	memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32));

	iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	iq->data[pi].p_refcount = &sq_channel->refcount;
	atomic_add_int(iq->data[pi].p_refcount, 1);
	iq->pc += iq->data[pi].num_wqebbs;

	mlx5e_iq_notify_hw(iq);

	mtx_unlock(&iq->lock);

	return (0); /* success */
}

static int
mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index,
    struct mlx5e_rl_channel *sq_channel)
{
	struct mlx5e_channel *iq_channel;
	u32 scq_handle;
	u32 sq_handle;
	int error;

	/* Specific SQ remap operations should be handled by same IQ */
	iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels];

	sq_handle = sq->queue_handle;
	scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index);

	if (sq_handle == MLX5_INVALID_QUEUE_HANDLE ||
	    scq_handle == MLX5_INVALID_QUEUE_HANDLE)
		error = -1;
	else
		error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle,
		    sq_handle, sq_channel);

	return (error);
}

/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;
	bool use_sq_remap;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp));

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* paced <--> non-paced transitions must go via FW */
	use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) &&
	    channel->last_rate != 0 && rate != 0;

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) {
			while (atomic_load_int(&channel->refcount) != 0 &&
			    rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
			    pci_channel_offline(rlw->priv->mdev->pdev) == 0)
				pause("W", 1);
			error = mlx5e_rl_modify_sq(sq, index);
			if (error != 0)
				atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		}
	} else
		error = 0;

	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}

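/*
 * Rate limit worker thread. Each worker pre-opens the send queues of
 * its channels, then sleeps on its condition variable and services
 * MODIFY and DESTROY requests queued on "process_head" until
 * "worker_done" is set. On teardown all SQs are closed and their last
 * rates are saved as initial rates for a later restart.
 */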
static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel on demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}

static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0);
}

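/*
 * Compute the default rate limit parameters from the device
 * capabilities. For example, with the default tx_burst_size of 4 MTUs
 * and a 1500 byte MTU, the burst handed to the firmware is roughly
 * 4 * 1500 = 6000 bytes, clamped to 65535 bytes.
 */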
static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes inclusively before being fed to the
	 * firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}

static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

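/*
 * Initialize rate limit support for the given interface. This opens a
 * private TIS domain, creates the "rate_limit" sysctl tree, allocates
 * the worker and rate table arrays, preloads rates from the kernel
 * environment tunables named
 * "dev.mce.<unit>.rate_limit.tx_rate_add_<n>" and finally starts the
 * worker threads. Does nothing when packet pacing is not supported.
 */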
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
    union if_snd_tag_query_params *params)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

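/*
 * Allocate a rate limit send tag for an outbound connection. The
 * connection is mapped to a worker thread using the flow ID from the
 * allocation parameters, which distributes connections across the
 * available workers. The rate is requested asynchronously through
 * mlx5e_rl_modify() and is applied by the worker thread.
 */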
int
mlx5e_rl_snd_tag_alloc(if_t ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = if_getsoftc(ifp);

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
	*ppmt = &channel->tag;
done:
	return (error);
}

static int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

static int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_query(channel->worker, channel, params));
}

static void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

void
mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			mtx_lock(&sq->lock);
			mlx5e_update_sq_inline(sq);
			mtx_unlock(&sq->lock);
		}
	}
}

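/*
 * Add a rate, in bit/s, to the static rate limit table. The rate must
 * be at least 1000 bit/s and within the range supported by the
 * firmware. Returns zero on success, EEXIST if the rate is already
 * present or ENOMEM if the table is full.
 */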
static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* find the rate to be removed */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate was found */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading the default sysctl value from the
	 * kernel environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#else

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}

#endif		/* RATELIMIT */