/*-
 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "en.h"

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
      struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);

static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.buf_numa_node = 0;
	param->wq.db_numa_node = 0;
	param->wq.linear = 1;
}

static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);

	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	/* use shared UAR */
	sq->uar_map = priv->bfreg.map;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst case max value is then given as below:
	 */
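	/*
	 * Illustrative example, assuming MLX5_SEND_WQE_MAX_WQEBBS
	 * equals 16: a TX queue size of 1024 entries yields a
	 * maximum completion event distance of 1024 / (2 * 16) = 32.
	 */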
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}

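/*
 * Update the packet pacing rate limit index of an already created
 * send queue, keeping the queue state ready. The callers below pass
 * a rate limit index of zero to select the unlimited rate.
 */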
static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}

/*
 * This function searches the configured rate limit table for the
 * best match, to prevent a single socket based application from
 * allocating all the available hardware rates. If the user selected
 * rate deviates too much from the closest rate available in the
 * rate limit table, the unlimited rate is selected instead.
 */
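/*
 * Worked example, with illustrative values: given configured rates
 * of 100 Mbit/s and 1 Gbit/s, a user rate of 950 Mbit/s is closest
 * to 1 Gbit/s, at a distance of 50 Mbit/s. With the default
 * tx_allowed_deviation of 50 (5.0%), the allowed deviation is
 * howmany(950000000 * 50, 1000) = 47.5 Mbit/s, so the match is
 * rejected and the unlimited rate (0) is returned instead.
 */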
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if distance is smaller than previous rate */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

	/* fallback to unlimited, if rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}

/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		error = mlx5e_rl_modify_sq(sq, index);
		if (error != 0)
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
	} else
		error = 0;
	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}

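/*
 * Rate limit worker thread.
 *
 * Rate limit channels are moved between the following states by
 * this function and by the send tag operations further below:
 *
 *	FREE   -> USED		(send tag allocation)
 *	USED   -> MODIFY	(new rate queued to the worker)
 *	MODIFY -> USED		(worker applied the new rate)
 *	USED or MODIFY -> DESTROY -> FREE	(send tag free)
 */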
static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel by demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}

static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
}

static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes, inclusively, before being fed to
	 * the firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;
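	/*
	 * Example, assuming a 1500 byte software MTU: the default
	 * burst size of 4 MTUs programs a burst of
	 * 4 * MLX5E_SW2HW_MTU(1500) bytes, roughly 6 KB, well below
	 * the 65535 byte clamp applied when the rate is set in
	 * mlx5e_rlw_channel_set_rate_locked().
	 */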

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}

static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}
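		/*
		 * For example, a loader.conf line like the following
		 * would preconfigure a 100 Mbit/s rate table entry for
		 * the first mce(4) device (illustrative value, given
		 * in bits per second):
		 *
		 * dev.mce.0.rate_limit.tx_rate_add_0="100000000"
		 */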

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	/* TODO check if there is support for packet pacing */

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
    union if_snd_tag_query_params *params)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

int
mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = ifp->if_softc;

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);
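	/*
	 * For example, with 8 worker threads (an illustrative count),
	 * a flowid of 300 maps to worker (300 % 128) % 8 = 4.
	 */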

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
	*ppmt = &channel->tag;
done:
	return (error);
}

int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_query(channel->worker, channel, params));
}

void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

void
mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			mtx_lock(&sq->lock);
			mlx5e_update_sq_inline(sq);
			mtx_unlock(&sq->lock);
		}
	}
}

static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate was found and cleared */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	uint64_t old;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		old = value;
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		old = 0;
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size */
		value = 1ULL << order_base_2(value);
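		/*
		 * For example, a requested value of 1000 is rounded up
		 * to 1024, the next power of two, because the hardware
		 * work queue size is configured as a base-2 logarithm.
		 */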

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading default sysctl value from the kernel
	 * environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#else

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}

#endif		/* RATELIMIT */