xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision f7c32ed617858bcd22f8d1b03199099d50125721)
/*-
 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "en.h"

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
      struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
static if_snd_tag_free_t mlx5e_rl_snd_tag_free;

static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = {
	.snd_tag_modify = mlx5e_rl_snd_tag_modify,
	.snd_tag_query = mlx5e_rl_snd_tag_query,
	.snd_tag_free = mlx5e_rl_snd_tag_free,
	.type = IF_SND_TAG_TYPE_RATE_LIMIT
};

static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.linear = 1;
}

static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);

	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst-case maximum value is then:
	 */
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}
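
/*
 * Worked example for the limit above, using illustrative values
 * only: with tx_queue_size = 1024 and MLX5_SEND_WQE_MAX_WQEBBS
 * assumed to be 16, the upper bound becomes 1024 / (2 * 16) = 32,
 * i.e. in the worst case at least every 32nd queue entry must
 * generate a completion event.
 */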

static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}
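
/*
 * NOTE: the function above changes the rate of a running send queue
 * without a state transition: the SQ stays in the ready state and
 * only its packet pacing rate limit index is updated. The
 * modify_bitmask value of 1 is assumed to select the rate limit
 * index field of the SQ context for modification.
 */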

/*
 * This function searches the configured rate limit table for the
 * best match, so that a single socket based application cannot
 * allocate all of the available hardware rates. If the user
 * selected rate deviates too much from the closest rate available
 * in the rate limit table, the unlimited rate is selected.
 */
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if distance is smaller than previous rate */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

	/* fallback to unlimited, if rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}
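
/*
 * Worked example for the deviation fallback above, using
 * illustrative values only: assume the rate limit table contains
 * 100 Mbit/s and 1 Gbit/s, tx_allowed_deviation is 50 (5.0%) and
 * the user requests 950 Mbit/s. The closest rate is 1 Gbit/s at a
 * distance of 50 Mbit/s, but the allowed deviation is only
 * howmany(950000000 * 50, 1000) = 47500000 bit/s, so the unlimited
 * rate (0) is returned instead.
 */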

/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		error = mlx5e_rl_modify_sq(sq, index);
		if (error != 0)
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
	} else
		error = 0;
	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}
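
/*
 * Worked example for the burst computation above, using
 * illustrative values only: with tx_burst_size = 4 MTUs and an
 * interface MTU of 1500 bytes, the burst handed to the firmware is
 * roughly 4 * MLX5E_SW2HW_MTU(1500) bytes, where MLX5E_SW2HW_MTU()
 * is assumed to add the Ethernet header overhead, and the result
 * is clamped to at most 65535 bytes.
 */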

static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel by demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}
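
/*
 * Summary of the channel state machine driven by the worker above:
 * channels start on the free list in the MLX5E_RL_ST_FREE state,
 * move to MLX5E_RL_ST_USED when a send tag is allocated, are queued
 * to the worker in the MLX5E_RL_ST_MODIFY state whenever a new rate
 * is requested, and pass through MLX5E_RL_ST_DESTROY before being
 * drained and returned to the free list.
 */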

static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0);
}

static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes inclusively before being fed to the
	 * firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}
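
/*
 * Worked example for the defaults above, using illustrative values
 * only: tx_allowed_deviation is expressed in units of 0.1 percent,
 * so the default of 50 means 5.0%. If the firmware reports a
 * maximum rate of 100000000 in 1000 bit/s units, tx_limit_max
 * becomes 100 Gbit/s and tx_allowed_deviation_max becomes
 * -1ULL / tx_limit_max, which keeps the multiplication in
 * mlx5e_rl_find_best_rate_locked() from overflowing 64 bits.
 */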

static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
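		/*
		 * Example /boot/loader.conf entry, using a
		 * hypothetical unit number and rate:
		 * dev.mce.0.rate_limit.tx_rate_add_0="1000000000"
		 */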
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	/* TODO check if there is support for packet pacing */

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
    union if_snd_tag_query_params *params)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

int
mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = ifp->if_softc;

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute which worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);
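	/*
	 * Example, using illustrative values: with flowid = 0x1234
	 * (4660) and tx_worker_threads_def = 8, this connection maps
	 * to worker (4660 % 128) % 8 = 52 % 8 = 4.
	 */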

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
	*ppmt = &channel->tag;
done:
	return (error);
}

static int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

static int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_query(channel->worker, channel, params));
}

static void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

void
mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			mtx_lock(&sq->lock);
			mlx5e_update_sq_inline(sq);
			mtx_unlock(&sq->lock);
		}
	}
}

static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* search for the rate to be cleared */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate entry was found */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}
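
/*
 * Example usage of the two functions above through their sysctls,
 * using a hypothetical unit number:
 * "sysctl dev.mce.0.rate_limit.tx_limit_add=1000000000" adds a
 * 1 Gbit/s entry to the rate table and
 * "sysctl dev.mce.0.rate_limit.tx_limit_clr=1000000000" removes it
 * again. Rates are given in bit/s and must be at least 1000.
 */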

static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	uint64_t old;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		old = value;
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		old = 0;
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* apply the new value without restarting the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* apply the new value without restarting the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* round the TX queue size up to the next power of two */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading default sysctl value from the kernel
	 * environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#else

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}

#endif		/* RATELIMIT */