xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision 652a9748855320619e075c4e83aef2f5294412d2)
1 /*-
2  * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "en.h"
29 
30 #ifdef RATELIMIT
31 
32 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
33 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
34 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
35 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
36     struct sysctl_oid *, const char *name, const char *desc);
37 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
38       struct sysctl_oid *node, const char *name, const char *desc);
39 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
40 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
41 
42 static void
43 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
44     struct mlx5e_sq_param *param)
45 {
46 	void *sqc = param->sqc;
47 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
48 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
49 
50 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
51 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
52 	MLX5_SET(wq, wq, pd, rl->priv->pdn);
53 
54 	param->wq.buf_numa_node = 0;
55 	param->wq.db_numa_node = 0;
56 	param->wq.linear = 1;
57 }
58 
59 static void
60 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
61     struct mlx5e_cq_param *param)
62 {
63 	void *cqc = param->cqc;
64 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
65 
66 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
67 	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
68 	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
69 
70 	switch (rl->param.tx_coalesce_mode) {
71 	case 0:
72 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
73 		break;
74 	default:
75 		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
76 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
77 		else
78 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
79 		break;
80 	}
81 }
82 
83 static void
84 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
85     struct mlx5e_rl_channel_param *cparam)
86 {
87 	memset(cparam, 0, sizeof(*cparam));
88 
89 	mlx5e_rl_build_sq_param(rl, &cparam->sq);
90 	mlx5e_rl_build_cq_param(rl, &cparam->cq);
91 }
92 
93 static int
94 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
95     struct mlx5e_sq_param *param, int ix)
96 {
97 	struct mlx5_core_dev *mdev = priv->mdev;
98 	void *sqc = param->sqc;
99 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
100 	int err;
101 
102 	/* Create DMA descriptor TAG */
103 	if ((err = -bus_dma_tag_create(
104 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
105 	    1,				/* any alignment */
106 	    0,				/* no boundary */
107 	    BUS_SPACE_MAXADDR,		/* lowaddr */
108 	    BUS_SPACE_MAXADDR,		/* highaddr */
109 	    NULL, NULL,			/* filter, filterarg */
110 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
111 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
112 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
113 	    0,				/* flags */
114 	    NULL, NULL,			/* lockfunc, lockfuncarg */
115 	    &sq->dma_tag)))
116 		goto done;
117 
118 	/* use shared UAR */
119 	sq->uar = priv->rl.sq_uar;
120 
121 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
122 	    &sq->wq_ctrl);
123 	if (err)
124 		goto err_free_dma_tag;
125 
126 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
127 	/*
128 	 * The sq->bf_buf_size variable is intentionally left zero so
129 	 * that the doorbell writes will occur at the same memory
130 	 * location.
131 	 */
132 
133 	err = mlx5e_alloc_sq_db(sq);
134 	if (err)
135 		goto err_sq_wq_destroy;
136 
137 	sq->mkey_be = cpu_to_be32(priv->mr.key);
138 	sq->ifp = priv->ifp;
139 	sq->priv = priv;
140 
141 	mlx5e_update_sq_inline(sq);
142 
143 	return (0);
144 
145 err_sq_wq_destroy:
146 	mlx5_wq_destroy(&sq->wq_ctrl);
147 err_free_dma_tag:
148 	bus_dma_tag_destroy(sq->dma_tag);
149 done:
150 	return (err);
151 }
152 
153 static void
154 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
155 {
156 
157 	mlx5e_free_sq_db(sq);
158 	mlx5_wq_destroy(&sq->wq_ctrl);
159 	bus_dma_tag_destroy(sq->dma_tag);
160 }
161 
162 static int
163 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
164     struct mlx5e_sq_param *param, int ix)
165 {
166 	int err;
167 
168 	err = mlx5e_rl_create_sq(priv, sq, param, ix);
169 	if (err)
170 		return (err);
171 
172 	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
173 	if (err)
174 		goto err_destroy_sq;
175 
176 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
177 	if (err)
178 		goto err_disable_sq;
179 
180 	WRITE_ONCE(sq->running, 1);
181 
182 	return (0);
183 
184 err_disable_sq:
185 	mlx5e_disable_sq(sq);
186 err_destroy_sq:
187 	mlx5e_rl_destroy_sq(sq);
188 
189 	return (err);
190 }
191 
192 static void
193 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
194 {
195 	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
196 	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
197 
198 	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
199 
200 	sq->cev_factor = priv->rl.param.tx_completion_fact;
201 
202 	/* ensure the TX completion event factor is not zero */
203 	if (sq->cev_factor == 0)
204 		sq->cev_factor = 1;
205 }
206 
207 static int
208 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
209     struct mlx5e_rl_channel_param *cparam,
210     struct mlx5e_sq *volatile *ppsq)
211 {
212 	struct mlx5e_priv *priv = rlw->priv;
213 	struct mlx5e_sq *sq;
214 	int err;
215 
216 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
217 
218 	/* init mutexes */
219 	mlx5e_rl_chan_mtx_init(priv, sq);
220 
221 	/* open TX completion queue */
222 	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
223 	    &mlx5e_tx_cq_comp, eq_ix);
224 	if (err)
225 		goto err_free;
226 
227 	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
228 	if (err)
229 		goto err_close_tx_cq;
230 
231 	/* store TX channel pointer */
232 	*ppsq = sq;
233 
234 	/* poll TX queue initially */
235 	sq->cq.mcq.comp(&sq->cq.mcq);
236 
237 	return (0);
238 
239 err_close_tx_cq:
240 	mlx5e_close_cq(&sq->cq);
241 
242 err_free:
243 	/* destroy mutexes */
244 	mtx_destroy(&sq->lock);
245 	mtx_destroy(&sq->comp_lock);
246 	free(sq, M_MLX5EN);
247 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
248 	return (err);
249 }
250 
251 static void
252 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
253 {
254 	struct mlx5e_sq *sq = *ppsq;
255 
256 	/* check if channel is already closed */
257 	if (sq == NULL)
258 		return;
259 	/* ensure channel pointer is no longer used */
260 	*ppsq = NULL;
261 
262 	/* teardown and destroy SQ */
263 	mlx5e_drain_sq(sq);
264 	mlx5e_disable_sq(sq);
265 	mlx5e_rl_destroy_sq(sq);
266 
267 	/* close CQ */
268 	mlx5e_close_cq(&sq->cq);
269 
270 	/* destroy mutexes */
271 	mtx_destroy(&sq->lock);
272 	mtx_destroy(&sq->comp_lock);
273 
274 	free(sq, M_MLX5EN);
275 }
276 
277 static void
278 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
279 {
280 	/*
281 	 * Limit the maximum distance between completion events to
282 	 * half of the currently set TX queue size.
283 	 *
284 	 * The maximum number of queue entries a single IP packet can
285 	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
286 	 *
287 	 * The worst case max value is then given as below:
288 	 */
289 	uint64_t max = rl->param.tx_queue_size /
290 	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
291 
292 	/*
293 	 * Update the maximum completion factor value in case the
294 	 * tx_queue_size field changed. Ensure we don't overflow
295 	 * 16-bits.
296 	 */
297 	if (max < 1)
298 		max = 1;
299 	else if (max > 65535)
300 		max = 65535;
301 	rl->param.tx_completion_fact_max = max;
302 
303 	/*
304 	 * Verify that the current TX completion factor is within the
305 	 * given limits:
306 	 */
307 	if (rl->param.tx_completion_fact < 1)
308 		rl->param.tx_completion_fact = 1;
309 	else if (rl->param.tx_completion_fact > max)
310 		rl->param.tx_completion_fact = max;
311 }
312 
313 static int
314 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
315 {
316 	struct mlx5e_priv *priv = sq->priv;
317 	struct mlx5_core_dev *mdev = priv->mdev;
318 
319 	void *in;
320 	void *sqc;
321 	int inlen;
322 	int err;
323 
324 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
325 	in = mlx5_vzalloc(inlen);
326 	if (in == NULL)
327 		return (-ENOMEM);
328 
329 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
330 
331 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
332 	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
333 	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
334 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
335 	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
336 
337 	err = mlx5_core_modify_sq(mdev, in, inlen);
338 
339 	kvfree(in);
340 
341 	return (err);
342 }
343 
/*
 * This function searches the configured rate limit table for the best
 * match, so that a single socket based application cannot allocate
 * all the available hardware rates. If the user selected rate
 * deviates too much from the closest rate available in the rate
 * limit table, unlimited rate will be selected.
 */
351 static uint64_t
352 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
353 {
354 	uint64_t distance = -1ULL;
355 	uint64_t diff;
356 	uint64_t retval = 0;		/* unlimited */
357 	uint64_t x;
358 
359 	/* search for closest rate */
360 	for (x = 0; x != rl->param.tx_rates_def; x++) {
361 		uint64_t rate = rl->rate_limit_table[x];
362 		if (rate == 0)
363 			continue;
364 
365 		if (rate > user_rate)
366 			diff = rate - user_rate;
367 		else
368 			diff = user_rate - rate;
369 
370 		/* check if distance is smaller than previous rate */
371 		if (diff < distance) {
372 			distance = diff;
373 			retval = rate;
374 		}
375 	}
376 
377 	/* range check for multiplication below */
378 	if (user_rate > rl->param.tx_limit_max)
379 		user_rate = rl->param.tx_limit_max;
380 
381 	/* fallback to unlimited, if rate deviates too much */
382 	if (distance > howmany(user_rate *
383 	    rl->param.tx_allowed_deviation, 1000ULL))
384 		retval = 0;
385 
386 	return (retval);
387 }
388 
389 /*
390  * This function sets the requested rate for a rate limit channel, in
391  * bits per second. The requested rate will be filtered through the
392  * find best rate function above.
393  */
394 static int
395 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
396     struct mlx5e_rl_channel *channel, uint64_t rate)
397 {
398 	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
399 	struct mlx5e_sq *sq;
400 	uint64_t temp;
401 	uint16_t index;
402 	uint16_t burst;
403 	int error;
404 
405 	if (rate != 0) {
406 		MLX5E_RL_WORKER_UNLOCK(rlw);
407 
408 		MLX5E_RL_RLOCK(rl);
409 
410 		/* get current burst size in bytes */
411 		temp = rl->param.tx_burst_size *
412 		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
413 
414 		/* limit burst size to 64K currently */
415 		if (temp > 65535)
416 			temp = 65535;
417 		burst = temp;
418 
419 		/* find best rate */
420 		rate = mlx5e_rl_find_best_rate_locked(rl, rate);
421 
422 		MLX5E_RL_RUNLOCK(rl);
423 
424 		if (rate == 0) {
425 			/* rate doesn't exist, fallback to unlimited */
426 			index = 0;
427 			rate = 0;
428 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
429 		} else {
430 			/* get a reference on the new rate */
431 			error = -mlx5_rl_add_rate(rlw->priv->mdev,
432 			    howmany(rate, 1000), burst, &index);
433 
434 			if (error != 0) {
435 				/* adding rate failed, fallback to unlimited */
436 				index = 0;
437 				rate = 0;
438 				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
439 			}
440 		}
441 		MLX5E_RL_WORKER_LOCK(rlw);
442 	} else {
443 		index = 0;
444 		burst = 0;	/* default */
445 	}
446 
447 	/* atomically swap rates */
448 	temp = channel->last_rate;
449 	channel->last_rate = rate;
450 	rate = temp;
451 
452 	/* atomically swap burst size */
453 	temp = channel->last_burst;
454 	channel->last_burst = burst;
455 	burst = temp;
456 
457 	MLX5E_RL_WORKER_UNLOCK(rlw);
458 	/* put reference on the old rate, if any */
459 	if (rate != 0) {
460 		mlx5_rl_remove_rate(rlw->priv->mdev,
461 		    howmany(rate, 1000), burst);
462 	}
463 
464 	/* set new rate, if SQ is running */
465 	sq = channel->sq;
466 	if (sq != NULL && READ_ONCE(sq->running) != 0) {
467 		error = mlx5e_rl_modify_sq(sq, index);
468 		if (error != 0)
469 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
470 	} else
471 		error = 0;
472 	MLX5E_RL_WORKER_LOCK(rlw);
473 
474 	return (-error);
475 }
476 
477 static void
478 mlx5e_rl_worker(void *arg)
479 {
480 	struct thread *td;
481 	struct mlx5e_rl_worker *rlw = arg;
482 	struct mlx5e_rl_channel *channel;
483 	struct mlx5e_priv *priv;
484 	unsigned ix;
485 	uint64_t x;
486 	int error;
487 
488 	/* set thread priority */
489 	td = curthread;
490 
491 	thread_lock(td);
492 	sched_prio(td, PI_SWI(SWI_NET));
493 	thread_unlock(td);
494 
495 	priv = rlw->priv;
496 
497 	/* compute completion vector */
498 	ix = (rlw - priv->rl.workers) %
499 	    priv->mdev->priv.eq_table.num_comp_vectors;
500 
501 	/* TODO bind to CPU */
502 
503 	/* open all the SQs */
504 	MLX5E_RL_WORKER_LOCK(rlw);
505 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
506 		struct mlx5e_rl_channel *channel = rlw->channels + x;
507 
508 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
509 		if (channel->state == MLX5E_RL_ST_FREE)
510 			continue;
511 #endif
512 		MLX5E_RL_WORKER_UNLOCK(rlw);
513 
514 		MLX5E_RL_RLOCK(&priv->rl);
515 		error = mlx5e_rl_open_channel(rlw, ix,
516 		    &priv->rl.chan_param, &channel->sq);
517 		MLX5E_RL_RUNLOCK(&priv->rl);
518 
519 		MLX5E_RL_WORKER_LOCK(rlw);
520 		if (error != 0) {
521 			mlx5_en_err(priv->ifp,
522 			    "mlx5e_rl_open_channel failed: %d\n", error);
523 			break;
524 		}
525 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
526 	}
527 	while (1) {
528 		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
529 			/* check if we are tearing down */
530 			if (rlw->worker_done != 0)
531 				break;
532 			cv_wait(&rlw->cv, &rlw->mtx);
533 		}
534 		/* check if we are tearing down */
535 		if (rlw->worker_done != 0)
536 			break;
537 		channel = STAILQ_FIRST(&rlw->process_head);
538 		if (channel != NULL) {
539 			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
540 
541 			switch (channel->state) {
542 			case MLX5E_RL_ST_MODIFY:
543 				channel->state = MLX5E_RL_ST_USED;
544 				MLX5E_RL_WORKER_UNLOCK(rlw);
545 
546 				/* create channel by demand */
547 				if (channel->sq == NULL) {
548 					MLX5E_RL_RLOCK(&priv->rl);
549 					error = mlx5e_rl_open_channel(rlw, ix,
550 					    &priv->rl.chan_param, &channel->sq);
551 					MLX5E_RL_RUNLOCK(&priv->rl);
552 
553 					if (error != 0) {
554 						mlx5_en_err(priv->ifp,
555 						    "mlx5e_rl_open_channel failed: %d\n", error);
556 					} else {
557 						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
558 					}
559 				} else {
560 					mlx5e_resume_sq(channel->sq);
561 				}
562 
563 				MLX5E_RL_WORKER_LOCK(rlw);
564 				/* convert from bytes/s to bits/s and set new rate */
565 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
566 				    channel->new_rate * 8ULL);
567 				if (error != 0) {
568 					mlx5_en_err(priv->ifp,
569 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
570 					    error);
571 				}
572 				break;
573 
574 			case MLX5E_RL_ST_DESTROY:
575 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
576 				if (error != 0) {
577 					mlx5_en_err(priv->ifp,
578 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
579 					    error);
580 				}
581 				if (channel->sq != NULL) {
582 					/*
583 					 * Make sure all packets are
584 					 * transmitted before SQ is
585 					 * returned to free list:
586 					 */
587 					MLX5E_RL_WORKER_UNLOCK(rlw);
588 					mlx5e_drain_sq(channel->sq);
589 					MLX5E_RL_WORKER_LOCK(rlw);
590 				}
591 				/* put the channel back into the free list */
592 				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
593 				channel->state = MLX5E_RL_ST_FREE;
594 				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
595 				break;
596 			default:
597 				/* NOP */
598 				break;
599 			}
600 		}
601 	}
602 
603 	/* close all the SQs */
604 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
605 		struct mlx5e_rl_channel *channel = rlw->channels + x;
606 
607 		/* update the initial rate */
608 		channel->init_rate = channel->last_rate;
609 
610 		/* make sure we free up the rate resource */
611 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
612 
613 		if (channel->sq != NULL) {
614 			MLX5E_RL_WORKER_UNLOCK(rlw);
615 			mlx5e_rl_close_channel(&channel->sq);
616 			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
617 			MLX5E_RL_WORKER_LOCK(rlw);
618 		}
619 	}
620 
621 	rlw->worker_done = 0;
622 	cv_broadcast(&rlw->cv);
623 	MLX5E_RL_WORKER_UNLOCK(rlw);
624 
625 	kthread_exit();
626 }
627 
628 static int
629 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
630 {
631 	struct mlx5_core_dev *mdev = priv->mdev;
632 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
633 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
634 
635 	memset(in, 0, sizeof(in));
636 
637 	MLX5_SET(tisc, tisc, prio, 0);
638 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
639 
640 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
641 }
642 
643 static void
644 mlx5e_rl_close_tis(struct mlx5e_priv *priv)
645 {
646 	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
647 }
648 
649 static void
650 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
651     struct mlx5_core_dev *mdev)
652 {
653 	/* ratelimit workers */
654 	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
655 	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
656 
657 	/* range check */
658 	if (param->tx_worker_threads_def == 0 ||
659 	    param->tx_worker_threads_def > param->tx_worker_threads_max)
660 		param->tx_worker_threads_def = param->tx_worker_threads_max;
661 
662 	/* ratelimit channels */
663 	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
664 	    param->tx_worker_threads_def;
665 	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
666 
667 	/* range check */
668 	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
669 		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
670 
671 	/* set default burst size */
672 	param->tx_burst_size = 4;	/* MTUs */
673 
674 	/*
675 	 * Set maximum burst size
676 	 *
677 	 * The burst size is multiplied by the MTU and clamped to the
678 	 * range 0 ... 65535 bytes inclusivly before fed into the
679 	 * firmware.
680 	 *
681 	 * NOTE: If the burst size or MTU is changed only ratelimit
682 	 * connections made after the change will use the new burst
683 	 * size.
684 	 */
685 	param->tx_burst_size_max = 255;
686 
687 	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
688 	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
689 	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
690 
691 	/* ratelimit table size */
692 	param->tx_rates_max = mdev->priv.rl_table.max_size;
693 
694 	/* range check */
695 	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
696 		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
697 
698 	/* set default number of rates */
699 	param->tx_rates_def = param->tx_rates_max;
700 
701 	/* set maximum allowed rate deviation */
702 	if (param->tx_limit_max != 0) {
703 		/*
704 		 * Make sure the deviation multiplication doesn't
705 		 * overflow unsigned 64-bit:
706 		 */
707 		param->tx_allowed_deviation_max = -1ULL /
708 		    param->tx_limit_max;
709 	}
710 	/* set default rate deviation */
711 	param->tx_allowed_deviation = 50;	/* 5.0% */
712 
713 	/* channel parameters */
714 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
715 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
716 	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
717 	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
718 	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
719 }
720 
721 static const char *mlx5e_rl_params_desc[] = {
722 	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
723 };
724 
725 static const char *mlx5e_rl_table_params_desc[] = {
726 	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
727 };
728 
729 static const char *mlx5e_rl_stats_desc[] = {
730 	MLX5E_RL_STATS(MLX5E_STATS_DESC)
731 };
732 
733 int
734 mlx5e_rl_init(struct mlx5e_priv *priv)
735 {
736 	struct mlx5e_rl_priv_data *rl = &priv->rl;
737 	struct sysctl_oid *node;
738 	struct sysctl_oid *stats;
739 	char buf[64];
740 	uint64_t i;
741 	uint64_t j;
742 	int error;
743 
744 	/* check if there is support for packet pacing */
745 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
746 		return (0);
747 
748 	rl->priv = priv;
749 
750 	sysctl_ctx_init(&rl->ctx);
751 
752 	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
753 
754 	/* allocate shared UAR for SQs */
755 	error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
756 	if (error)
757 		goto done;
758 
759 	/* open own TIS domain for ratelimit SQs */
760 	error = mlx5e_rl_open_tis(priv);
761 	if (error)
762 		goto err_uar;
763 
764 	/* setup default value for parameters */
765 	mlx5e_rl_set_default_params(&rl->param, priv->mdev);
766 
767 	/* update the completion factor */
768 	mlx5e_rl_sync_tx_completion_fact(rl);
769 
770 	/* create root node */
771 	node = SYSCTL_ADD_NODE(&rl->ctx,
772 	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
773 	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");
774 
775 	if (node != NULL) {
776 		/* create SYSCTLs */
777 		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
778 			mlx5e_rl_sysctl_add_u64_oid(rl,
779 			    MLX5E_RL_PARAMS_INDEX(arg[i]),
780 			    node, mlx5e_rl_params_desc[2 * i],
781 			    mlx5e_rl_params_desc[2 * i + 1]);
782 		}
783 
784 		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
785 		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
786 		    "Rate limiting statistics");
787 		if (stats != NULL) {
788 			/* create SYSCTLs */
789 			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
790 				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
791 				    stats, mlx5e_rl_stats_desc[2 * i],
792 				    mlx5e_rl_stats_desc[2 * i + 1]);
793 			}
794 		}
795 	}
796 
797 	/* allocate workers array */
798 	rl->workers = malloc(sizeof(rl->workers[0]) *
799 	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
800 
801 	/* allocate rate limit array */
802 	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
803 	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
804 
805 	if (node != NULL) {
806 		/* create more SYSCTls */
807 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
808 		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
809 		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
810 		    "A", "Show table of all configured TX rates");
811 
812 		/* try to fetch rate table from kernel environment */
813 		for (i = 0; i != rl->param.tx_rates_def; i++) {
814 			/* compute path for tunable */
815 			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
816 			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
817 			if (TUNABLE_QUAD_FETCH(buf, &j))
818 				mlx5e_rl_tx_limit_add(rl, j);
819 		}
820 
821 		/* setup rate table sysctls */
822 		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
823 			mlx5e_rl_sysctl_add_u64_oid(rl,
824 			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
825 			    node, mlx5e_rl_table_params_desc[2 * i],
826 			    mlx5e_rl_table_params_desc[2 * i + 1]);
827 		}
828 	}
829 
830 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
831 		struct mlx5e_rl_worker *rlw = rl->workers + j;
832 
833 		rlw->priv = priv;
834 
835 		cv_init(&rlw->cv, "mlx5-worker-cv");
836 		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
837 		STAILQ_INIT(&rlw->index_list_head);
838 		STAILQ_INIT(&rlw->process_head);
839 
840 		rlw->channels = malloc(sizeof(rlw->channels[0]) *
841 		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
842 
843 		MLX5E_RL_WORKER_LOCK(rlw);
844 		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
845 			struct mlx5e_rl_channel *channel = rlw->channels + i;
846 			channel->worker = rlw;
847 			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
848 			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
849 		}
850 		MLX5E_RL_WORKER_UNLOCK(rlw);
851 	}
852 
853 	PRIV_LOCK(priv);
854 	error = mlx5e_rl_open_workers(priv);
855 	PRIV_UNLOCK(priv);
856 
857 	if (error != 0) {
858 		mlx5_en_err(priv->ifp,
859 		    "mlx5e_rl_open_workers failed: %d\n", error);
860 	}
861 
862 	return (0);
863 
864 err_uar:
865 	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
866 done:
867 	sysctl_ctx_free(&rl->ctx);
868 	sx_destroy(&rl->rl_sxlock);
869 	return (error);
870 }
871 
872 static int
873 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
874 {
875 	struct mlx5e_rl_priv_data *rl = &priv->rl;
876 	struct thread *rl_thread = NULL;
877 	struct proc *rl_proc = NULL;
878 	uint64_t j;
879 	int error;
880 
881 	if (priv->gone || rl->opened)
882 		return (-EINVAL);
883 
884 	MLX5E_RL_WLOCK(rl);
885 	/* compute channel parameters once */
886 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
887 	MLX5E_RL_WUNLOCK(rl);
888 
889 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
890 		struct mlx5e_rl_worker *rlw = rl->workers + j;
891 
892 		/* start worker thread */
893 		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
894 		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
895 		if (error != 0) {
896 			mlx5_en_err(rl->priv->ifp,
897 			    "kproc_kthread_add failed: %d\n", error);
898 			rlw->worker_done = 1;
899 		}
900 	}
901 
902 	rl->opened = 1;
903 
904 	return (0);
905 }
906 
907 static void
908 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
909 {
910 	struct mlx5e_rl_priv_data *rl = &priv->rl;
911 	uint64_t y;
912 
913 	if (rl->opened == 0)
914 		return;
915 
916 	/* tear down worker threads simultaneously */
917 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
918 		struct mlx5e_rl_worker *rlw = rl->workers + y;
919 
920 		/* tear down worker before freeing SQs */
921 		MLX5E_RL_WORKER_LOCK(rlw);
922 		if (rlw->worker_done == 0) {
923 			rlw->worker_done = 1;
924 			cv_broadcast(&rlw->cv);
925 		} else {
926 			/* XXX thread not started */
927 			rlw->worker_done = 0;
928 		}
929 		MLX5E_RL_WORKER_UNLOCK(rlw);
930 	}
931 
932 	/* wait for worker threads to exit */
933 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
934 		struct mlx5e_rl_worker *rlw = rl->workers + y;
935 
936 		/* tear down worker before freeing SQs */
937 		MLX5E_RL_WORKER_LOCK(rlw);
938 		while (rlw->worker_done != 0)
939 			cv_wait(&rlw->cv, &rlw->mtx);
940 		MLX5E_RL_WORKER_UNLOCK(rlw);
941 	}
942 
943 	rl->opened = 0;
944 }
945 
946 static void
947 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
948 {
949 	unsigned x;
950 
951 	MLX5E_RL_WLOCK(rl);
952 	for (x = 0; x != rl->param.tx_rates_def; x++)
953 		rl->rate_limit_table[x] = 0;
954 	MLX5E_RL_WUNLOCK(rl);
955 }
956 
957 void
958 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
959 {
960 	struct mlx5e_rl_priv_data *rl = &priv->rl;
961 	uint64_t y;
962 
963 	/* check if there is support for packet pacing */
964 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
965 		return;
966 
967 	/* TODO check if there is support for packet pacing */
968 
969 	sysctl_ctx_free(&rl->ctx);
970 
971 	PRIV_LOCK(priv);
972 	mlx5e_rl_close_workers(priv);
973 	PRIV_UNLOCK(priv);
974 
975 	mlx5e_rl_reset_rates(rl);
976 
977 	/* free shared UAR for SQs */
978 	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
979 
980 	/* close TIS domain */
981 	mlx5e_rl_close_tis(priv);
982 
983 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
984 		struct mlx5e_rl_worker *rlw = rl->workers + y;
985 
986 		cv_destroy(&rlw->cv);
987 		mtx_destroy(&rlw->mtx);
988 		free(rlw->channels, M_MLX5EN);
989 	}
990 	free(rl->rate_limit_table, M_MLX5EN);
991 	free(rl->workers, M_MLX5EN);
992 	sx_destroy(&rl->rl_sxlock);
993 }
994 
995 static void
996 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
997     struct mlx5e_rl_channel *channel)
998 {
999 	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
1000 	cv_broadcast(&rlw->cv);
1001 }
1002 
1003 static void
1004 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
1005 {
1006 	if (channel == NULL)
1007 		return;
1008 
1009 	MLX5E_RL_WORKER_LOCK(rlw);
1010 	switch (channel->state) {
1011 	case MLX5E_RL_ST_MODIFY:
1012 		channel->state = MLX5E_RL_ST_DESTROY;
1013 		break;
1014 	case MLX5E_RL_ST_USED:
1015 		channel->state = MLX5E_RL_ST_DESTROY;
1016 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1017 		break;
1018 	default:
1019 		break;
1020 	}
1021 	MLX5E_RL_WORKER_UNLOCK(rlw);
1022 }
1023 
1024 static int
1025 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
1026 {
1027 
1028 	MLX5E_RL_WORKER_LOCK(rlw);
1029 	channel->new_rate = rate;
1030 	switch (channel->state) {
1031 	case MLX5E_RL_ST_USED:
1032 		channel->state = MLX5E_RL_ST_MODIFY;
1033 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1034 		break;
1035 	default:
1036 		break;
1037 	}
1038 	MLX5E_RL_WORKER_UNLOCK(rlw);
1039 
1040 	return (0);
1041 }
1042 
1043 static int
1044 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
1045     union if_snd_tag_query_params *params)
1046 {
1047 	int retval;
1048 
1049 	MLX5E_RL_WORKER_LOCK(rlw);
1050 	switch (channel->state) {
1051 	case MLX5E_RL_ST_USED:
1052 		params->rate_limit.max_rate = channel->last_rate;
1053 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1054 		retval = 0;
1055 		break;
1056 	case MLX5E_RL_ST_MODIFY:
1057 		params->rate_limit.max_rate = channel->last_rate;
1058 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1059 		retval = EBUSY;
1060 		break;
1061 	default:
1062 		retval = EINVAL;
1063 		break;
1064 	}
1065 	MLX5E_RL_WORKER_UNLOCK(rlw);
1066 
1067 	return (retval);
1068 }
1069 
1070 static int
1071 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
1072     struct mlx5e_rl_channel **pchannel)
1073 {
1074 	struct mlx5e_rl_channel *channel;
1075 	int retval = ENOMEM;
1076 
1077 	MLX5E_RL_WORKER_LOCK(rlw);
1078 	/* Check for available channel in free list */
1079 	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
1080 		retval = 0;
1081 		/* Remove head index from available list */
1082 		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
1083 		channel->state = MLX5E_RL_ST_USED;
1084 		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
1085 	} else {
1086 		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
1087 	}
1088 	MLX5E_RL_WORKER_UNLOCK(rlw);
1089 
1090 	*pchannel = channel;
1091 #ifdef RATELIMIT_DEBUG
1092 	mlx5_en_info(rlw->priv->ifp,
1093 	    "Channel pointer for rate limit connection is %p\n", channel);
1094 #endif
1095 	return (retval);
1096 }
1097 
1098 int
1099 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
1100     union if_snd_tag_alloc_params *params,
1101     struct m_snd_tag **ppmt)
1102 {
1103 	struct mlx5e_rl_channel *channel;
1104 	struct mlx5e_rl_worker *rlw;
1105 	struct mlx5e_priv *priv;
1106 	int error;
1107 
1108 	priv = ifp->if_softc;
1109 
1110 	/* check if there is support for packet pacing or if device is going away */
1111 	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
1112 	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
1113 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
1114 		return (EOPNOTSUPP);
1115 
1116 	/* compute worker thread this TCP connection belongs to */
1117 	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
1118 	    priv->rl.param.tx_worker_threads_def);
1119 
1120 	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
1121 	if (error != 0)
1122 		goto done;
1123 
1124 	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
1125 	if (error != 0) {
1126 		mlx5e_rl_free(rlw, channel);
1127 		goto done;
1128 	}
1129 
1130 	/* store pointer to mbuf tag */
1131 	MPASS(channel->tag.m_snd_tag.refcount == 0);
1132 	m_snd_tag_init(&channel->tag.m_snd_tag, ifp);
1133 	*ppmt = &channel->tag.m_snd_tag;
1134 done:
1135 	return (error);
1136 }
1137 
1138 
1139 int
1140 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
1141 {
1142 	struct mlx5e_rl_channel *channel =
1143 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1144 
1145 	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
1146 }
1147 
1148 int
1149 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
1150 {
1151 	struct mlx5e_rl_channel *channel =
1152 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1153 
1154 	return (mlx5e_rl_query(channel->worker, channel, params));
1155 }
1156 
1157 void
1158 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
1159 {
1160 	struct mlx5e_rl_channel *channel =
1161 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1162 
1163 	mlx5e_rl_free(channel->worker, channel);
1164 }
1165 
1166 static int
1167 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
1168 {
1169 	struct mlx5e_rl_priv_data *rl = arg1;
1170 	struct mlx5e_priv *priv = rl->priv;
1171 	struct sbuf sbuf;
1172 	unsigned x;
1173 	int error;
1174 
1175 	error = sysctl_wire_old_buffer(req, 0);
1176 	if (error != 0)
1177 		return (error);
1178 
1179 	PRIV_LOCK(priv);
1180 
1181 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
1182 
1183 	sbuf_printf(&sbuf,
1184 	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
1185 	    "\t" "--------------------------------------------\n");
1186 
1187 	MLX5E_RL_RLOCK(rl);
1188 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1189 		if (rl->rate_limit_table[x] == 0)
1190 			continue;
1191 
1192 		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
1193 		    x, (unsigned)rl->param.tx_burst_size,
1194 		    (long long)rl->rate_limit_table[x]);
1195 	}
1196 	MLX5E_RL_RUNLOCK(rl);
1197 
1198 	error = sbuf_finish(&sbuf);
1199 	sbuf_delete(&sbuf);
1200 
1201 	PRIV_UNLOCK(priv);
1202 
1203 	return (error);
1204 }
1205 
1206 static int
1207 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
1208 {
1209 	uint64_t x;
1210 	uint64_t y;
1211 
1212 	MLX5E_RL_WLOCK(rl);
1213 	/* compute channel parameters once */
1214 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
1215 	MLX5E_RL_WUNLOCK(rl);
1216 
1217 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1218 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1219 
1220 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1221 			struct mlx5e_rl_channel *channel;
1222 			struct mlx5e_sq *sq;
1223 
1224 			channel = rlw->channels + x;
1225 			sq = channel->sq;
1226 
1227 			if (sq == NULL)
1228 				continue;
1229 
1230 			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
1231 				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
1232 				    rl->param.tx_coalesce_usecs,
1233 				    rl->param.tx_coalesce_pkts,
1234 				    rl->param.tx_coalesce_mode);
1235 			} else {
1236 				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
1237 				    rl->param.tx_coalesce_usecs,
1238 				    rl->param.tx_coalesce_pkts);
1239 			}
1240 		}
1241 	}
1242 	return (0);
1243 }
1244 
1245 void
1246 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
1247 {
1248 	uint64_t x;
1249 	uint64_t y;
1250 
1251 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1252 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1253 
1254 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1255 			struct mlx5e_rl_channel *channel;
1256 			struct mlx5e_sq *sq;
1257 
1258 			channel = rlw->channels + x;
1259 			sq = channel->sq;
1260 
1261 			if (sq == NULL)
1262 				continue;
1263 
1264 			mtx_lock(&sq->lock);
1265 			mlx5e_update_sq_inline(sq);
1266 			mtx_unlock(&sq->lock);
1267 		}
1268 	}
1269 }
1270 
1271 static int
1272 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
1273 {
1274 	unsigned x;
1275 	int error;
1276 
1277 	if (value < 1000 ||
1278 	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
1279 		return (EINVAL);
1280 
1281 	MLX5E_RL_WLOCK(rl);
1282 	error = ENOMEM;
1283 
1284 	/* check if rate already exists */
1285 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1286 		if (rl->rate_limit_table[x] != value)
1287 			continue;
1288 		error = EEXIST;
1289 		break;
1290 	}
1291 
1292 	/* check if there is a free rate entry */
1293 	if (x == rl->param.tx_rates_def) {
1294 		for (x = 0; x != rl->param.tx_rates_def; x++) {
1295 			if (rl->rate_limit_table[x] != 0)
1296 				continue;
1297 			rl->rate_limit_table[x] = value;
1298 			error = 0;
1299 			break;
1300 		}
1301 	}
1302 	MLX5E_RL_WUNLOCK(rl);
1303 
1304 	return (error);
1305 }
1306 
1307 static int
1308 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
1309 {
1310 	unsigned x;
1311 	int error;
1312 
1313 	if (value == 0)
1314 		return (EINVAL);
1315 
1316 	MLX5E_RL_WLOCK(rl);
1317 
1318 	/* check if rate already exists */
1319 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1320 		if (rl->rate_limit_table[x] != value)
1321 			continue;
1322 		/* free up rate */
1323 		rl->rate_limit_table[x] = 0;
1324 		break;
1325 	}
1326 
1327 	/* check if there is a free rate entry */
1328 	if (x == rl->param.tx_rates_def)
1329 		error = ENOENT;
1330 	else
1331 		error = 0;
1332 	MLX5E_RL_WUNLOCK(rl);
1333 
1334 	return (error);
1335 }
1336 
1337 static int
1338 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
1339 {
1340 	struct mlx5e_rl_priv_data *rl = arg1;
1341 	struct mlx5e_priv *priv = rl->priv;
1342 	unsigned mode_modify;
1343 	unsigned was_opened;
1344 	uint64_t value;
1345 	uint64_t old;
1346 	int error;
1347 
1348 	PRIV_LOCK(priv);
1349 
1350 	MLX5E_RL_RLOCK(rl);
1351 	value = rl->param.arg[arg2];
1352 	MLX5E_RL_RUNLOCK(rl);
1353 
1354 	if (req != NULL) {
1355 		old = value;
1356 		error = sysctl_handle_64(oidp, &value, 0, req);
1357 		if (error || req->newptr == NULL ||
1358 		    value == rl->param.arg[arg2])
1359 			goto done;
1360 	} else {
1361 		old = 0;
1362 		error = 0;
1363 	}
1364 
1365 	/* check if device is gone */
1366 	if (priv->gone) {
1367 		error = ENXIO;
1368 		goto done;
1369 	}
1370 	was_opened = rl->opened;
1371 	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
1372 
1373 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
1374 	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
1375 		if (value > rl->param.tx_worker_threads_max)
1376 			value = rl->param.tx_worker_threads_max;
1377 		else if (value < 1)
1378 			value = 1;
1379 
1380 		/* store new value */
1381 		rl->param.arg[arg2] = value;
1382 		break;
1383 
1384 	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
1385 		if (value > rl->param.tx_channels_per_worker_max)
1386 			value = rl->param.tx_channels_per_worker_max;
1387 		else if (value < 1)
1388 			value = 1;
1389 
1390 		/* store new value */
1391 		rl->param.arg[arg2] = value;
1392 		break;
1393 
1394 	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
1395 		if (value > rl->param.tx_rates_max)
1396 			value = rl->param.tx_rates_max;
1397 		else if (value < 1)
1398 			value = 1;
1399 
1400 		/* store new value */
1401 		rl->param.arg[arg2] = value;
1402 		break;
1403 
1404 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
1405 		/* range check */
1406 		if (value < 1)
1407 			value = 0;
1408 		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
1409 			value = MLX5E_FLD_MAX(cqc, cq_period);
1410 
1411 		/* store new value */
1412 		rl->param.arg[arg2] = value;
1413 
1414 		/* check to avoid down and up the network interface */
1415 		if (was_opened)
1416 			error = mlx5e_rl_refresh_channel_params(rl);
1417 		break;
1418 
1419 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
1420 		/* import TX coal pkts */
1421 		if (value < 1)
1422 			value = 0;
1423 		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
1424 			value = MLX5E_FLD_MAX(cqc, cq_max_count);
1425 
1426 		/* store new value */
1427 		rl->param.arg[arg2] = value;
1428 
1429 		/* check to avoid down and up the network interface */
1430 		if (was_opened)
1431 			error = mlx5e_rl_refresh_channel_params(rl);
1432 		break;
1433 
1434 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
1435 		/* network interface must be down */
1436 		if (was_opened != 0 && mode_modify == 0)
1437 			mlx5e_rl_close_workers(priv);
1438 
1439 		/* import TX coalesce mode */
1440 		if (value != 0)
1441 			value = 1;
1442 
1443 		/* store new value */
1444 		rl->param.arg[arg2] = value;
1445 
1446 		/* restart network interface, if any */
1447 		if (was_opened != 0) {
1448 			if (mode_modify == 0)
1449 				mlx5e_rl_open_workers(priv);
1450 			else
1451 				error = mlx5e_rl_refresh_channel_params(rl);
1452 		}
1453 		break;
1454 
1455 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
1456 		/* network interface must be down */
1457 		if (was_opened)
1458 			mlx5e_rl_close_workers(priv);
1459 
1460 		/* import TX queue size */
1461 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
1462 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
1463 		else if (value > priv->params_ethtool.tx_queue_size_max)
1464 			value = priv->params_ethtool.tx_queue_size_max;
1465 
1466 		/* store actual TX queue size */
1467 		value = 1ULL << order_base_2(value);
1468 
1469 		/* store new value */
1470 		rl->param.arg[arg2] = value;
1471 
1472 		/* verify TX completion factor */
1473 		mlx5e_rl_sync_tx_completion_fact(rl);
1474 
1475 		/* restart network interface, if any */
1476 		if (was_opened)
1477 			mlx5e_rl_open_workers(priv);
1478 		break;
1479 
1480 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
1481 		/* network interface must be down */
1482 		if (was_opened)
1483 			mlx5e_rl_close_workers(priv);
1484 
1485 		/* store new value */
1486 		rl->param.arg[arg2] = value;
1487 
1488 		/* verify parameter */
1489 		mlx5e_rl_sync_tx_completion_fact(rl);
1490 
1491 		/* restart network interface, if any */
1492 		if (was_opened)
1493 			mlx5e_rl_open_workers(priv);
1494 		break;
1495 
1496 	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
1497 		error = mlx5e_rl_tx_limit_add(rl, value);
1498 		break;
1499 
1500 	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
1501 		error = mlx5e_rl_tx_limit_clr(rl, value);
1502 		break;
1503 
1504 	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
1505 		/* range check */
1506 		if (value > rl->param.tx_allowed_deviation_max)
1507 			value = rl->param.tx_allowed_deviation_max;
1508 		else if (value < rl->param.tx_allowed_deviation_min)
1509 			value = rl->param.tx_allowed_deviation_min;
1510 
1511 		MLX5E_RL_WLOCK(rl);
1512 		rl->param.arg[arg2] = value;
1513 		MLX5E_RL_WUNLOCK(rl);
1514 		break;
1515 
1516 	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
1517 		/* range check */
1518 		if (value > rl->param.tx_burst_size_max)
1519 			value = rl->param.tx_burst_size_max;
1520 		else if (value < rl->param.tx_burst_size_min)
1521 			value = rl->param.tx_burst_size_min;
1522 
1523 		MLX5E_RL_WLOCK(rl);
1524 		rl->param.arg[arg2] = value;
1525 		MLX5E_RL_WUNLOCK(rl);
1526 		break;
1527 
1528 	default:
1529 		break;
1530 	}
1531 done:
1532 	PRIV_UNLOCK(priv);
1533 	return (error);
1534 }
1535 
1536 static void
1537 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1538     struct sysctl_oid *node, const char *name, const char *desc)
1539 {
1540 	/*
1541 	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
1542 	 * take care of loading default sysctl value from the kernel
1543 	 * environment, if any:
1544 	 */
1545 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
1546 		/* read-only SYSCTLs */
1547 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1548 		    name, CTLTYPE_U64 | CTLFLAG_RD |
1549 		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1550 	} else {
1551 		if (strstr(name, "_def") != 0) {
1552 #ifdef RATELIMIT_DEBUG
1553 			/* tunable read-only advanced SYSCTLs */
1554 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1555 			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
1556 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1557 #endif
1558 		} else {
1559 			/* read-write SYSCTLs */
1560 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1561 			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
1562 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1563 		}
1564 	}
1565 }
1566 
1567 static void
1568 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1569     struct sysctl_oid *node, const char *name, const char *desc)
1570 {
1571 	/* read-only SYSCTLs */
1572 	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
1573 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
1574 }
1575 
1576 #else
1577 
/*
 * Stub used when RATELIMIT is not defined: there is nothing to set up,
 * so initialization always succeeds and zero is returned.
 */
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	(void)priv;	/* not used by the stub */

	return (0);
}
1584 
/*
 * Stub used when RATELIMIT is not defined: there is nothing to tear
 * down.
 */
void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	(void)priv;	/* not used by the stub */
}
1590 
1591 #endif		/* RATELIMIT */
1592