/*-
 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "en.h"

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
      struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);

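/*
 * The following builder functions translate the current ratelimit
 * configuration into firmware queue parameters. A sketch of the
 * layout, based on the code below: the send queue size is rounded up
 * to a power of two, the work queue stride is fixed to the WQE basic
 * block size, and the queue is bound to the private protection
 * domain. NUMA node zero and a linear mapping are currently
 * hardcoded.
 */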
static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.buf_numa_node = 0;
	param->wq.db_numa_node = 0;
	param->wq.linear = 1;
}

static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);

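	/*
	 * Select the completion moderation mode: mode zero times the
	 * moderation period from the last EQE, while any other value
	 * requests CQE based moderation when the hardware supports
	 * it, falling back to EQE based moderation otherwise.
	 */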
	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	/* use shared UAR */
	sq->uar_map = priv->bfreg.map;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst case max value is then given as below:
	 */
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
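	/*
	 * For example, assuming MLX5_SEND_WQE_MAX_WQEBBS is 16, a TX
	 * queue size of 1024 entries yields max = 1024 / (2 * 16) = 32
	 * packets between completion events, worst case.
	 */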

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}

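/*
 * This function programs a new rate limit index into an already
 * running SQ, keeping the SQ in the ready state. Judging from the
 * callers in this file, an index of zero restores unlimited
 * operation.
 */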
static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}

/*
 * This function searches the configured rate limit table for the
 * best match, to prevent a single socket based application from
 * allocating all the available hardware rates. If the user selected
 * rate deviates too much from the closest rate available in the rate
 * limit table, the unlimited rate will be selected.
 */
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if difference is smaller than the previous distance */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

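	/*
	 * The allowed deviation is stored in units of 0.1 percent,
	 * hence the division by 1000 below. For example, with the
	 * default tx_allowed_deviation of 50, which is 5.0 percent, a
	 * 1 Gbit/s request tolerates a best match within 50 Mbit/s
	 * before falling back to unlimited.
	 */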
	/* fall back to unlimited, if the rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}

/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

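		/*
		 * NOTE: The firmware rate table operates in units of
		 * 1000 bit/s, hence the howmany(rate, 1000)
		 * conversions when adding and removing rates below.
		 */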
		if (rate == 0) {
			/* rate doesn't exist, fall back to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fall back to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		error = mlx5e_rl_modify_sq(sq, index);
		if (error != 0)
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
	} else
		error = 0;
	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}

static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
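	/*
	 * Main loop: process queued channel state change requests
	 * until the worker is asked to tear down.
	 */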
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel on demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

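	/* signal to mlx5e_rl_close_workers() that teardown is complete */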
	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}

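/*
 * All ratelimit SQs are attached to a private TIS, transport
 * interface send, object which is created here within the transport
 * domain of the parent device.
 */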
static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
}

static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes inclusively before being fed into
	 * the firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}

static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* set up default values for the parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}
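		/*
		 * For example, a loader.conf line like
		 * "dev.mce.0.rate_limit.tx_rate_add_0=1000000000"
		 * would preload a 1 Gbit/s entry into the rate table
		 * of the first device.
		 */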

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* wait for worker to signal completion of teardown */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
    union if_snd_tag_query_params *params)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

int
mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = ifp->if_softc;

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

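	/*
	 * NOTE: The flow ID is reduced modulo 128 first, presumably
	 * to bound the spread of connections across workers to a
	 * fixed window.
	 */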
	/* compute worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
	*ppmt = &channel->tag;
done:
	return (error);
}

int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_query(channel->worker, channel, params));
}

void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

void
mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			mtx_lock(&sq->lock);
			mlx5e_update_sq_inline(sq);
			mtx_unlock(&sq->lock);
		}
	}
}

static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* find the rate to be cleared */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate was found and cleared */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

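/*
 * This handler serves all tunable ratelimit parameters. The
 * parameter is selected by arg2, which indexes the parameter array.
 * Some parameters require the worker threads to be restarted for a
 * new value to take effect.
 */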
static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	uint64_t old;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		old = value;
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		old = 0;
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* refresh parameters live to avoid taking the network interface down and up */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* refresh parameters live to avoid taking the network interface down and up */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

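/*
 * SYSCTL creation helper. By naming convention, "_max" and "_min"
 * parameters are read-only, "_def" parameters are tunable but only
 * exposed when RATELIMIT_DEBUG is defined, and all other parameters
 * are read-write tunables.
 */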
static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading the default sysctl value from the
	 * kernel environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#else

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}

#endif		/* RATELIMIT */
1577