/* xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision eb81f38a62c9ae246955feceedb8c043e78f871f) */
/*-
 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "en.h"

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);

static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.buf_numa_node = 0;
	param->wq.db_numa_node = 0;
	param->wq.linear = 1;
}

static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);

	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	/* use shared UAR */
	sq->uar = priv->rl.sq_uar;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
	/*
	 * The sq->bf_buf_size variable is intentionally left zero so
	 * that the doorbell writes will occur at the same memory
	 * location.
	 */

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;
	sq->max_inline = priv->params.tx_max_inline;
	sq->min_inline_mode = priv->params.tx_min_inline_mode;
	sq->vlan_inline_cap = MLX5_CAP_ETH(mdev, wqe_vlan_insert);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst case max value is then given as below:
	 */
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}

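/*
 * Illustrative sketch (not part of the driver): the bound computed by
 * mlx5e_rl_sync_tx_completion_fact() above, worked through with assumed
 * numbers. With tx_queue_size = 1024 entries and MLX5_SEND_WQE_MAX_WQEBBS
 * = 16 (both values are assumptions for this example), the maximum
 * completion event factor becomes 1024 / (2 * 16) = 32, which keeps
 * completion events within half of the queue even when every packet
 * consumes the worst-case number of queue entries.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned tx_queue_size = 1024;	/* assumed TX queue size */
	unsigned max_wqebbs = 16;	/* assumed MLX5_SEND_WQE_MAX_WQEBBS */
	unsigned max = tx_queue_size / (2 * max_wqebbs);

	/* same clamping as the driver function above */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	printf("tx_completion_fact_max = %u\n", max);	/* prints 32 */
	return (0);
}
#endif
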
static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}

/*
 * This function searches the configured rate limit table for the
 * closest match, so that a single socket based application cannot
 * allocate all of the available hardware rates. If the requested
 * rate deviates too much from the closest rate available in the
 * rate limit table, the unlimited rate, zero, is selected instead.
 */
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if this rate is a closer match than the previous one */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

	/* fallback to unlimited, if rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}

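/*
 * Illustrative sketch (not part of the driver): the deviation check in
 * mlx5e_rl_find_best_rate_locked() worked through with assumed numbers.
 * With tx_allowed_deviation = 50 (5.0%, in units of 1/1000) and a
 * requested rate of 95 Mbit/s whose closest table entry is 100 Mbit/s,
 * the allowed distance is ceil(95e6 * 50 / 1000) = 4.75 Mbit/s, while
 * the actual distance is 5 Mbit/s, so the unlimited rate is chosen.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* rounds up a/b, like the kernel's howmany() macro */
#define	HOWMANY(a, b)	(((a) + (b) - 1) / (b))

int
main(void)
{
	uint64_t user_rate = 95000000ULL;	/* requested, bit/s */
	uint64_t best_rate = 100000000ULL;	/* closest table entry */
	uint64_t deviation = 50;		/* 5.0% in units of 1/1000 */
	uint64_t distance = best_rate - user_rate;
	uint64_t allowed = HOWMANY(user_rate * deviation, 1000ULL);

	printf("%s\n", distance > allowed ?
	    "fallback to unlimited" : "use closest rate");
	return (0);
}
#endif
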
/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find-best-rate function above.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			error = EINVAL;
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate */
	sq = channel->sq;
	if (sq != NULL) {
		error = mlx5e_rl_modify_sq(sq, index);
		if (error != 0)
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
	} else
		error = 0;
	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}

static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			if_printf(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel on demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						if_printf(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					if_printf(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					if_printf(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}

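/*
 * Channel state machine overview (a summary derived from
 * mlx5e_rl_worker() above and the send tag functions further below;
 * not authoritative documentation):
 *
 *	FREE    --(snd_tag_alloc)--------> USED
 *	USED    --(snd_tag_modify)-------> MODIFY  (queued to worker)
 *	MODIFY  --(worker applies rate)--> USED
 *	USED    --(snd_tag_free)---------> DESTROY (queued to worker)
 *	MODIFY  --(snd_tag_free)---------> DESTROY (already queued)
 *	DESTROY --(worker drains SQ)-----> FREE
 *
 * Only the worker thread opens and closes channels and talks to the
 * firmware; the send tag callbacks merely update the state and wake
 * up the worker.
 */
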
static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
}

static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes inclusively before being passed to
	 * the firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}

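/*
 * Illustrative sketch (not part of the driver): how the default burst
 * size chosen above translates into the byte value handed to the
 * firmware by mlx5e_rlw_channel_set_rate_locked(). With tx_burst_size
 * = 4 MTUs and an assumed hardware MTU of 1518 bytes, the burst is
 * 4 * 1518 = 6072 bytes, well below the 65535 byte clamp.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned tx_burst_size = 4;	/* driver default, in MTUs */
	unsigned hw_mtu = 1518;		/* assumed MLX5E_SW2HW_MTU() result */
	unsigned burst = tx_burst_size * hw_mtu;

	if (burst > 65535)		/* firmware field is 16 bits wide */
		burst = 65535;
	printf("burst = %u bytes\n", burst);	/* prints 6072 */
	return (0);
}
#endif
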
static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* allocate shared UAR for SQs */
	error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
	if (error)
		goto done;

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto err_uar;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			channel->m_snd_tag.ifp = priv->ifp;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		if_printf(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

err_uar:
	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			if_printf(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread was never started; clear the flag */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* wait until the worker has acknowledged the teardown */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* free shared UAR for SQs */
	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t *prate)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		*prate = channel->last_rate;
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	if_printf(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

int
mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = ifp->if_softc;

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	*ppmt = &channel->m_snd_tag;
done:
	return (error);
}

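/*
 * Illustrative sketch (not part of the driver): how userspace typically
 * reaches mlx5e_rl_snd_tag_alloc() above, assuming a kernel built with
 * "options RATELIMIT" and traffic over an mlx5en(4) interface. Setting
 * SO_MAX_PACING_RATE on a connected TCP socket makes the TCP stack
 * allocate a rate limit send tag; the value is in bytes per second (the
 * worker converts it to bits per second before programming the rate).
 */
#if 0
#include <sys/socket.h>
#include <err.h>

static void
set_pacing_rate(int fd)
{
	unsigned int rate = 12500000;	/* 12.5 Mbyte/s == 100 Mbit/s */

	if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
	    &rate, sizeof(rate)) == -1)
		err(1, "setsockopt(SO_MAX_PACING_RATE)");
}
#endif
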
int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);

	return (mlx5e_rl_query(channel->worker, channel, &params->rate_limit.max_rate));
}

void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

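/*
 * Illustrative sketch (not part of the driver): example of what the
 * handler above renders, e.g. via "sysctl dev.mce.0.rate_limit.tx_rate_show"
 * (the unit number and the two configured rates are assumptions):
 *
 *	ENTRY	BURST	RATE [bit/s]
 *	--------------------------------------------
 *	  0	  4	100000000
 *	  1	  4	1000000000
 */
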
static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* find the rate in the table */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate entry was found */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

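/*
 * Illustrative sketch (not part of the driver): the rate table above is
 * managed through the sysctl tree created in mlx5e_rl_init(). A rate in
 * bit/s can be inserted from userspace as shown below, assuming unit 0
 * of the mce(4) device and assuming the OID is named "tx_limit_add"
 * after the parameter field (check "sysctl dev.mce.0.rate_limit" for
 * the exact name on a given system).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <err.h>

int
main(void)
{
	uint64_t rate = 100000000ULL;	/* 100 Mbit/s */

	/* roughly: sysctl dev.mce.0.rate_limit.tx_limit_add=100000000 */
	if (sysctlbyname("dev.mce.0.rate_limit.tx_limit_add",
	    NULL, NULL, &rate, sizeof(rate)) == -1)
		err(1, "sysctlbyname");
	return (0);
}
#endif
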
static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	uint64_t old;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		old = value;
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		old = 0;
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* apply new value without restarting the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* apply new value without restarting the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading default sysctl value from the kernel
	 * environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#endif