xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision 547bc083d614f3639f5632d9e39d79e828519318)
1 /*-
2  * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "en.h"
29 
30 #ifdef RATELIMIT
31 
32 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
33 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
34 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
35 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
36     struct sysctl_oid *, const char *name, const char *desc);
37 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
38       struct sysctl_oid *node, const char *name, const char *desc);
39 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
40 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
41 
42 static void
43 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
44     struct mlx5e_sq_param *param)
45 {
46 	void *sqc = param->sqc;
47 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
48 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
49 
50 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
51 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
52 	MLX5_SET(wq, wq, pd, rl->priv->pdn);
53 
54 	param->wq.buf_numa_node = 0;
55 	param->wq.db_numa_node = 0;
56 	param->wq.linear = 1;
57 }
58 
59 static void
60 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
61     struct mlx5e_cq_param *param)
62 {
63 	void *cqc = param->cqc;
64 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
65 
66 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
67 	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
68 	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
69 
70 	switch (rl->param.tx_coalesce_mode) {
71 	case 0:
72 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
73 		break;
74 	default:
75 		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
76 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
77 		else
78 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
79 		break;
80 	}
81 }
82 
83 static void
84 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
85     struct mlx5e_rl_channel_param *cparam)
86 {
87 	memset(cparam, 0, sizeof(*cparam));
88 
89 	mlx5e_rl_build_sq_param(rl, &cparam->sq);
90 	mlx5e_rl_build_cq_param(rl, &cparam->cq);
91 }
92 
93 static int
94 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
95     struct mlx5e_sq_param *param, int ix)
96 {
97 	struct mlx5_core_dev *mdev = priv->mdev;
98 	void *sqc = param->sqc;
99 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
100 	int err;
101 
102 	/* Create DMA descriptor TAG */
103 	if ((err = -bus_dma_tag_create(
104 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
105 	    1,				/* any alignment */
106 	    0,				/* no boundary */
107 	    BUS_SPACE_MAXADDR,		/* lowaddr */
108 	    BUS_SPACE_MAXADDR,		/* highaddr */
109 	    NULL, NULL,			/* filter, filterarg */
110 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
111 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
112 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
113 	    0,				/* flags */
114 	    NULL, NULL,			/* lockfunc, lockfuncarg */
115 	    &sq->dma_tag)))
116 		goto done;
117 
118 	/* use shared UAR */
119 	sq->uar = priv->rl.sq_uar;
120 
121 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
122 	    &sq->wq_ctrl);
123 	if (err)
124 		goto err_free_dma_tag;
125 
126 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
127 	/*
128 	 * The sq->bf_buf_size variable is intentionally left zero so
129 	 * that the doorbell writes will occur at the same memory
130 	 * location.
131 	 */
132 
133 	err = mlx5e_alloc_sq_db(sq);
134 	if (err)
135 		goto err_sq_wq_destroy;
136 
137 	sq->mkey_be = cpu_to_be32(priv->mr.key);
138 	sq->ifp = priv->ifp;
139 	sq->priv = priv;
140 
141 	mlx5e_update_sq_inline(sq);
142 
143 	return (0);
144 
145 err_sq_wq_destroy:
146 	mlx5_wq_destroy(&sq->wq_ctrl);
147 err_free_dma_tag:
148 	bus_dma_tag_destroy(sq->dma_tag);
149 done:
150 	return (err);
151 }
152 
153 static void
154 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
155 {
156 
157 	mlx5e_free_sq_db(sq);
158 	mlx5_wq_destroy(&sq->wq_ctrl);
159 }
160 
161 static int
162 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
163     struct mlx5e_sq_param *param, int ix)
164 {
165 	int err;
166 
167 	err = mlx5e_rl_create_sq(priv, sq, param, ix);
168 	if (err)
169 		return (err);
170 
171 	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
172 	if (err)
173 		goto err_destroy_sq;
174 
175 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
176 	if (err)
177 		goto err_disable_sq;
178 
179 	return (0);
180 
181 err_disable_sq:
182 	mlx5e_disable_sq(sq);
183 err_destroy_sq:
184 	mlx5e_rl_destroy_sq(sq);
185 
186 	return (err);
187 }
188 
189 static void
190 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
191 {
192 	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
193 	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
194 
195 	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
196 
197 	sq->cev_factor = priv->rl.param.tx_completion_fact;
198 
199 	/* ensure the TX completion event factor is not zero */
200 	if (sq->cev_factor == 0)
201 		sq->cev_factor = 1;
202 }
203 
204 static int
205 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
206     struct mlx5e_rl_channel_param *cparam,
207     struct mlx5e_sq *volatile *ppsq)
208 {
209 	struct mlx5e_priv *priv = rlw->priv;
210 	struct mlx5e_sq *sq;
211 	int err;
212 
213 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
214 
215 	/* init mutexes */
216 	mlx5e_rl_chan_mtx_init(priv, sq);
217 
218 	/* open TX completion queue */
219 	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
220 	    &mlx5e_tx_cq_comp, eq_ix);
221 	if (err)
222 		goto err_free;
223 
224 	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
225 	if (err)
226 		goto err_close_tx_cq;
227 
228 	/* store TX channel pointer */
229 	*ppsq = sq;
230 
231 	/* poll TX queue initially */
232 	sq->cq.mcq.comp(&sq->cq.mcq);
233 
234 	return (0);
235 
236 err_close_tx_cq:
237 	mlx5e_close_cq(&sq->cq);
238 
239 err_free:
240 	/* destroy mutexes */
241 	mtx_destroy(&sq->lock);
242 	mtx_destroy(&sq->comp_lock);
243 	free(sq, M_MLX5EN);
244 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
245 	return (err);
246 }
247 
248 static void
249 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
250 {
251 	struct mlx5e_sq *sq = *ppsq;
252 
253 	/* check if channel is already closed */
254 	if (sq == NULL)
255 		return;
256 	/* ensure channel pointer is no longer used */
257 	*ppsq = NULL;
258 
259 	/* teardown and destroy SQ */
260 	mlx5e_drain_sq(sq);
261 	mlx5e_disable_sq(sq);
262 	mlx5e_rl_destroy_sq(sq);
263 
264 	/* close CQ */
265 	mlx5e_close_cq(&sq->cq);
266 
267 	/* destroy mutexes */
268 	mtx_destroy(&sq->lock);
269 	mtx_destroy(&sq->comp_lock);
270 
271 	free(sq, M_MLX5EN);
272 }
273 
274 static void
275 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
276 {
277 	/*
278 	 * Limit the maximum distance between completion events to
279 	 * half of the currently set TX queue size.
280 	 *
281 	 * The maximum number of queue entries a single IP packet can
282 	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
283 	 *
284 	 * The worst case max value is then given as below:
285 	 */
286 	uint64_t max = rl->param.tx_queue_size /
287 	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
288 
289 	/*
290 	 * Update the maximum completion factor value in case the
291 	 * tx_queue_size field changed. Ensure we don't overflow
292 	 * 16-bits.
293 	 */
294 	if (max < 1)
295 		max = 1;
296 	else if (max > 65535)
297 		max = 65535;
298 	rl->param.tx_completion_fact_max = max;
299 
300 	/*
301 	 * Verify that the current TX completion factor is within the
302 	 * given limits:
303 	 */
304 	if (rl->param.tx_completion_fact < 1)
305 		rl->param.tx_completion_fact = 1;
306 	else if (rl->param.tx_completion_fact > max)
307 		rl->param.tx_completion_fact = max;
308 }
309 
310 static int
311 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
312 {
313 	struct mlx5e_priv *priv = sq->priv;
314 	struct mlx5_core_dev *mdev = priv->mdev;
315 
316 	void *in;
317 	void *sqc;
318 	int inlen;
319 	int err;
320 
321 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
322 	in = mlx5_vzalloc(inlen);
323 	if (in == NULL)
324 		return (-ENOMEM);
325 
326 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
327 
328 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
329 	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
330 	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
331 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
332 	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
333 
334 	err = mlx5_core_modify_sq(mdev, in, inlen);
335 
336 	kvfree(in);
337 
338 	return (err);
339 }
340 
341 /*
342  * This function will search the configured rate limit table for the
343  * best match to avoid that a single socket based application can
344  * allocate all the available hardware rates. If the user selected
345  * rate deviates too much from the closes rate available in the rate
346  * limit table, unlimited rate will be selected.
347  */
348 static uint64_t
349 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
350 {
351 	uint64_t distance = -1ULL;
352 	uint64_t diff;
353 	uint64_t retval = 0;		/* unlimited */
354 	uint64_t x;
355 
356 	/* search for closest rate */
357 	for (x = 0; x != rl->param.tx_rates_def; x++) {
358 		uint64_t rate = rl->rate_limit_table[x];
359 		if (rate == 0)
360 			continue;
361 
362 		if (rate > user_rate)
363 			diff = rate - user_rate;
364 		else
365 			diff = user_rate - rate;
366 
367 		/* check if distance is smaller than previous rate */
368 		if (diff < distance) {
369 			distance = diff;
370 			retval = rate;
371 		}
372 	}
373 
374 	/* range check for multiplication below */
375 	if (user_rate > rl->param.tx_limit_max)
376 		user_rate = rl->param.tx_limit_max;
377 
378 	/* fallback to unlimited, if rate deviates too much */
379 	if (distance > howmany(user_rate *
380 	    rl->param.tx_allowed_deviation, 1000ULL))
381 		retval = 0;
382 
383 	return (retval);
384 }
385 
386 /*
387  * This function sets the requested rate for a rate limit channel, in
388  * bits per second. The requested rate will be filtered through the
389  * find best rate function above.
390  */
391 static int
392 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
393     struct mlx5e_rl_channel *channel, uint64_t rate)
394 {
395 	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
396 	struct mlx5e_sq *sq;
397 	uint64_t temp;
398 	uint16_t index;
399 	uint16_t burst;
400 	int error;
401 
402 	if (rate != 0) {
403 		MLX5E_RL_WORKER_UNLOCK(rlw);
404 
405 		MLX5E_RL_RLOCK(rl);
406 
407 		/* get current burst size in bytes */
408 		temp = rl->param.tx_burst_size *
409 		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
410 
411 		/* limit burst size to 64K currently */
412 		if (temp > 65535)
413 			temp = 65535;
414 		burst = temp;
415 
416 		/* find best rate */
417 		rate = mlx5e_rl_find_best_rate_locked(rl, rate);
418 
419 		MLX5E_RL_RUNLOCK(rl);
420 
421 		if (rate == 0) {
422 			/* rate doesn't exist, fallback to unlimited */
423 			index = 0;
424 			rate = 0;
425 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
426 		} else {
427 			/* get a reference on the new rate */
428 			error = -mlx5_rl_add_rate(rlw->priv->mdev,
429 			    howmany(rate, 1000), burst, &index);
430 
431 			if (error != 0) {
432 				/* adding rate failed, fallback to unlimited */
433 				index = 0;
434 				rate = 0;
435 				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
436 			}
437 		}
438 		MLX5E_RL_WORKER_LOCK(rlw);
439 	} else {
440 		index = 0;
441 		burst = 0;	/* default */
442 	}
443 
444 	/* atomically swap rates */
445 	temp = channel->last_rate;
446 	channel->last_rate = rate;
447 	rate = temp;
448 
449 	/* atomically swap burst size */
450 	temp = channel->last_burst;
451 	channel->last_burst = burst;
452 	burst = temp;
453 
454 	MLX5E_RL_WORKER_UNLOCK(rlw);
455 	/* put reference on the old rate, if any */
456 	if (rate != 0) {
457 		mlx5_rl_remove_rate(rlw->priv->mdev,
458 		    howmany(rate, 1000), burst);
459 	}
460 
461 	/* set new rate, if SQ is running */
462 	sq = channel->sq;
463 	if (sq != NULL && READ_ONCE(sq->running) != 0) {
464 		error = mlx5e_rl_modify_sq(sq, index);
465 		if (error != 0)
466 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
467 	} else
468 		error = 0;
469 	MLX5E_RL_WORKER_LOCK(rlw);
470 
471 	return (-error);
472 }
473 
474 static void
475 mlx5e_rl_worker(void *arg)
476 {
477 	struct thread *td;
478 	struct mlx5e_rl_worker *rlw = arg;
479 	struct mlx5e_rl_channel *channel;
480 	struct mlx5e_priv *priv;
481 	unsigned ix;
482 	uint64_t x;
483 	int error;
484 
485 	/* set thread priority */
486 	td = curthread;
487 
488 	thread_lock(td);
489 	sched_prio(td, PI_SWI(SWI_NET));
490 	thread_unlock(td);
491 
492 	priv = rlw->priv;
493 
494 	/* compute completion vector */
495 	ix = (rlw - priv->rl.workers) %
496 	    priv->mdev->priv.eq_table.num_comp_vectors;
497 
498 	/* TODO bind to CPU */
499 
500 	/* open all the SQs */
501 	MLX5E_RL_WORKER_LOCK(rlw);
502 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
503 		struct mlx5e_rl_channel *channel = rlw->channels + x;
504 
505 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
506 		if (channel->state == MLX5E_RL_ST_FREE)
507 			continue;
508 #endif
509 		MLX5E_RL_WORKER_UNLOCK(rlw);
510 
511 		MLX5E_RL_RLOCK(&priv->rl);
512 		error = mlx5e_rl_open_channel(rlw, ix,
513 		    &priv->rl.chan_param, &channel->sq);
514 		MLX5E_RL_RUNLOCK(&priv->rl);
515 
516 		MLX5E_RL_WORKER_LOCK(rlw);
517 		if (error != 0) {
518 			if_printf(priv->ifp,
519 			    "mlx5e_rl_open_channel failed: %d\n", error);
520 			break;
521 		}
522 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
523 	}
524 	while (1) {
525 		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
526 			/* check if we are tearing down */
527 			if (rlw->worker_done != 0)
528 				break;
529 			cv_wait(&rlw->cv, &rlw->mtx);
530 		}
531 		/* check if we are tearing down */
532 		if (rlw->worker_done != 0)
533 			break;
534 		channel = STAILQ_FIRST(&rlw->process_head);
535 		if (channel != NULL) {
536 			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
537 
538 			switch (channel->state) {
539 			case MLX5E_RL_ST_MODIFY:
540 				channel->state = MLX5E_RL_ST_USED;
541 				MLX5E_RL_WORKER_UNLOCK(rlw);
542 
543 				/* create channel by demand */
544 				if (channel->sq == NULL) {
545 					MLX5E_RL_RLOCK(&priv->rl);
546 					error = mlx5e_rl_open_channel(rlw, ix,
547 					    &priv->rl.chan_param, &channel->sq);
548 					MLX5E_RL_RUNLOCK(&priv->rl);
549 
550 					if (error != 0) {
551 						if_printf(priv->ifp,
552 						    "mlx5e_rl_open_channel failed: %d\n", error);
553 					} else {
554 						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
555 					}
556 				} else {
557 					mlx5e_resume_sq(channel->sq);
558 				}
559 
560 				MLX5E_RL_WORKER_LOCK(rlw);
561 				/* convert from bytes/s to bits/s and set new rate */
562 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
563 				    channel->new_rate * 8ULL);
564 				if (error != 0) {
565 					if_printf(priv->ifp,
566 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
567 					    error);
568 				}
569 				break;
570 
571 			case MLX5E_RL_ST_DESTROY:
572 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
573 				if (error != 0) {
574 					if_printf(priv->ifp,
575 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
576 					    error);
577 				}
578 				if (channel->sq != NULL) {
579 					/*
580 					 * Make sure all packets are
581 					 * transmitted before SQ is
582 					 * returned to free list:
583 					 */
584 					MLX5E_RL_WORKER_UNLOCK(rlw);
585 					mlx5e_drain_sq(channel->sq);
586 					MLX5E_RL_WORKER_LOCK(rlw);
587 				}
588 				/* put the channel back into the free list */
589 				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
590 				channel->state = MLX5E_RL_ST_FREE;
591 				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
592 				break;
593 			default:
594 				/* NOP */
595 				break;
596 			}
597 		}
598 	}
599 
600 	/* close all the SQs */
601 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
602 		struct mlx5e_rl_channel *channel = rlw->channels + x;
603 
604 		/* update the initial rate */
605 		channel->init_rate = channel->last_rate;
606 
607 		/* make sure we free up the rate resource */
608 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
609 
610 		if (channel->sq != NULL) {
611 			MLX5E_RL_WORKER_UNLOCK(rlw);
612 			mlx5e_rl_close_channel(&channel->sq);
613 			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
614 			MLX5E_RL_WORKER_LOCK(rlw);
615 		}
616 	}
617 
618 	rlw->worker_done = 0;
619 	cv_broadcast(&rlw->cv);
620 	MLX5E_RL_WORKER_UNLOCK(rlw);
621 
622 	kthread_exit();
623 }
624 
625 static int
626 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
627 {
628 	struct mlx5_core_dev *mdev = priv->mdev;
629 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
630 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
631 
632 	memset(in, 0, sizeof(in));
633 
634 	MLX5_SET(tisc, tisc, prio, 0);
635 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
636 
637 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
638 }
639 
640 static void
641 mlx5e_rl_close_tis(struct mlx5e_priv *priv)
642 {
643 	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
644 }
645 
646 static void
647 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
648     struct mlx5_core_dev *mdev)
649 {
650 	/* ratelimit workers */
651 	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
652 	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
653 
654 	/* range check */
655 	if (param->tx_worker_threads_def == 0 ||
656 	    param->tx_worker_threads_def > param->tx_worker_threads_max)
657 		param->tx_worker_threads_def = param->tx_worker_threads_max;
658 
659 	/* ratelimit channels */
660 	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
661 	    param->tx_worker_threads_def;
662 	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
663 
664 	/* range check */
665 	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
666 		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
667 
668 	/* set default burst size */
669 	param->tx_burst_size = 4;	/* MTUs */
670 
671 	/*
672 	 * Set maximum burst size
673 	 *
674 	 * The burst size is multiplied by the MTU and clamped to the
675 	 * range 0 ... 65535 bytes inclusivly before fed into the
676 	 * firmware.
677 	 *
678 	 * NOTE: If the burst size or MTU is changed only ratelimit
679 	 * connections made after the change will use the new burst
680 	 * size.
681 	 */
682 	param->tx_burst_size_max = 255;
683 
684 	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
685 	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
686 	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
687 
688 	/* ratelimit table size */
689 	param->tx_rates_max = mdev->priv.rl_table.max_size;
690 
691 	/* range check */
692 	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
693 		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
694 
695 	/* set default number of rates */
696 	param->tx_rates_def = param->tx_rates_max;
697 
698 	/* set maximum allowed rate deviation */
699 	if (param->tx_limit_max != 0) {
700 		/*
701 		 * Make sure the deviation multiplication doesn't
702 		 * overflow unsigned 64-bit:
703 		 */
704 		param->tx_allowed_deviation_max = -1ULL /
705 		    param->tx_limit_max;
706 	}
707 	/* set default rate deviation */
708 	param->tx_allowed_deviation = 50;	/* 5.0% */
709 
710 	/* channel parameters */
711 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
712 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
713 	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
714 	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
715 	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
716 }
717 
718 static const char *mlx5e_rl_params_desc[] = {
719 	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
720 };
721 
722 static const char *mlx5e_rl_table_params_desc[] = {
723 	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
724 };
725 
726 static const char *mlx5e_rl_stats_desc[] = {
727 	MLX5E_RL_STATS(MLX5E_STATS_DESC)
728 };
729 
730 int
731 mlx5e_rl_init(struct mlx5e_priv *priv)
732 {
733 	struct mlx5e_rl_priv_data *rl = &priv->rl;
734 	struct sysctl_oid *node;
735 	struct sysctl_oid *stats;
736 	char buf[64];
737 	uint64_t i;
738 	uint64_t j;
739 	int error;
740 
741 	/* check if there is support for packet pacing */
742 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
743 		return (0);
744 
745 	rl->priv = priv;
746 
747 	sysctl_ctx_init(&rl->ctx);
748 
749 	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
750 
751 	/* allocate shared UAR for SQs */
752 	error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
753 	if (error)
754 		goto done;
755 
756 	/* open own TIS domain for ratelimit SQs */
757 	error = mlx5e_rl_open_tis(priv);
758 	if (error)
759 		goto err_uar;
760 
761 	/* setup default value for parameters */
762 	mlx5e_rl_set_default_params(&rl->param, priv->mdev);
763 
764 	/* update the completion factor */
765 	mlx5e_rl_sync_tx_completion_fact(rl);
766 
767 	/* create root node */
768 	node = SYSCTL_ADD_NODE(&rl->ctx,
769 	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
770 	    "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support");
771 
772 	if (node != NULL) {
773 		/* create SYSCTLs */
774 		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
775 			mlx5e_rl_sysctl_add_u64_oid(rl,
776 			    MLX5E_RL_PARAMS_INDEX(arg[i]),
777 			    node, mlx5e_rl_params_desc[2 * i],
778 			    mlx5e_rl_params_desc[2 * i + 1]);
779 		}
780 
781 		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
782 		    OID_AUTO, "stats", CTLFLAG_RD, NULL,
783 		    "Rate limiting statistics");
784 		if (stats != NULL) {
785 			/* create SYSCTLs */
786 			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
787 				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
788 				    stats, mlx5e_rl_stats_desc[2 * i],
789 				    mlx5e_rl_stats_desc[2 * i + 1]);
790 			}
791 		}
792 	}
793 
794 	/* allocate workers array */
795 	rl->workers = malloc(sizeof(rl->workers[0]) *
796 	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
797 
798 	/* allocate rate limit array */
799 	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
800 	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
801 
802 	if (node != NULL) {
803 		/* create more SYSCTls */
804 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
805 		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
806 		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
807 		    "A", "Show table of all configured TX rates");
808 
809 		/* try to fetch rate table from kernel environment */
810 		for (i = 0; i != rl->param.tx_rates_def; i++) {
811 			/* compute path for tunable */
812 			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
813 			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
814 			if (TUNABLE_QUAD_FETCH(buf, &j))
815 				mlx5e_rl_tx_limit_add(rl, j);
816 		}
817 
818 		/* setup rate table sysctls */
819 		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
820 			mlx5e_rl_sysctl_add_u64_oid(rl,
821 			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
822 			    node, mlx5e_rl_table_params_desc[2 * i],
823 			    mlx5e_rl_table_params_desc[2 * i + 1]);
824 		}
825 	}
826 
827 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
828 		struct mlx5e_rl_worker *rlw = rl->workers + j;
829 
830 		rlw->priv = priv;
831 
832 		cv_init(&rlw->cv, "mlx5-worker-cv");
833 		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
834 		STAILQ_INIT(&rlw->index_list_head);
835 		STAILQ_INIT(&rlw->process_head);
836 
837 		rlw->channels = malloc(sizeof(rlw->channels[0]) *
838 		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
839 
840 		MLX5E_RL_WORKER_LOCK(rlw);
841 		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
842 			struct mlx5e_rl_channel *channel = rlw->channels + i;
843 			channel->worker = rlw;
844 			channel->tag.m_snd_tag.ifp = priv->ifp;
845 			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
846 			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
847 		}
848 		MLX5E_RL_WORKER_UNLOCK(rlw);
849 	}
850 
851 	PRIV_LOCK(priv);
852 	error = mlx5e_rl_open_workers(priv);
853 	PRIV_UNLOCK(priv);
854 
855 	if (error != 0) {
856 		if_printf(priv->ifp,
857 		    "mlx5e_rl_open_workers failed: %d\n", error);
858 	}
859 
860 	return (0);
861 
862 err_uar:
863 	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
864 done:
865 	sysctl_ctx_free(&rl->ctx);
866 	sx_destroy(&rl->rl_sxlock);
867 	return (error);
868 }
869 
870 static int
871 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
872 {
873 	struct mlx5e_rl_priv_data *rl = &priv->rl;
874 	struct thread *rl_thread = NULL;
875 	struct proc *rl_proc = NULL;
876 	uint64_t j;
877 	int error;
878 
879 	if (priv->gone || rl->opened)
880 		return (-EINVAL);
881 
882 	MLX5E_RL_WLOCK(rl);
883 	/* compute channel parameters once */
884 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
885 	MLX5E_RL_WUNLOCK(rl);
886 
887 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
888 		struct mlx5e_rl_worker *rlw = rl->workers + j;
889 
890 		/* start worker thread */
891 		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
892 		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
893 		if (error != 0) {
894 			if_printf(rl->priv->ifp,
895 			    "kproc_kthread_add failed: %d\n", error);
896 			rlw->worker_done = 1;
897 		}
898 	}
899 
900 	rl->opened = 1;
901 
902 	return (0);
903 }
904 
905 static void
906 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
907 {
908 	struct mlx5e_rl_priv_data *rl = &priv->rl;
909 	uint64_t y;
910 
911 	if (rl->opened == 0)
912 		return;
913 
914 	/* tear down worker threads simultaneously */
915 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
916 		struct mlx5e_rl_worker *rlw = rl->workers + y;
917 
918 		/* tear down worker before freeing SQs */
919 		MLX5E_RL_WORKER_LOCK(rlw);
920 		if (rlw->worker_done == 0) {
921 			rlw->worker_done = 1;
922 			cv_broadcast(&rlw->cv);
923 		} else {
924 			/* XXX thread not started */
925 			rlw->worker_done = 0;
926 		}
927 		MLX5E_RL_WORKER_UNLOCK(rlw);
928 	}
929 
930 	/* wait for worker threads to exit */
931 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
932 		struct mlx5e_rl_worker *rlw = rl->workers + y;
933 
934 		/* tear down worker before freeing SQs */
935 		MLX5E_RL_WORKER_LOCK(rlw);
936 		while (rlw->worker_done != 0)
937 			cv_wait(&rlw->cv, &rlw->mtx);
938 		MLX5E_RL_WORKER_UNLOCK(rlw);
939 	}
940 
941 	rl->opened = 0;
942 }
943 
944 static void
945 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
946 {
947 	unsigned x;
948 
949 	MLX5E_RL_WLOCK(rl);
950 	for (x = 0; x != rl->param.tx_rates_def; x++)
951 		rl->rate_limit_table[x] = 0;
952 	MLX5E_RL_WUNLOCK(rl);
953 }
954 
955 void
956 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
957 {
958 	struct mlx5e_rl_priv_data *rl = &priv->rl;
959 	uint64_t y;
960 
961 	/* check if there is support for packet pacing */
962 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
963 		return;
964 
965 	/* TODO check if there is support for packet pacing */
966 
967 	sysctl_ctx_free(&rl->ctx);
968 
969 	PRIV_LOCK(priv);
970 	mlx5e_rl_close_workers(priv);
971 	PRIV_UNLOCK(priv);
972 
973 	mlx5e_rl_reset_rates(rl);
974 
975 	/* free shared UAR for SQs */
976 	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
977 
978 	/* close TIS domain */
979 	mlx5e_rl_close_tis(priv);
980 
981 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
982 		struct mlx5e_rl_worker *rlw = rl->workers + y;
983 
984 		cv_destroy(&rlw->cv);
985 		mtx_destroy(&rlw->mtx);
986 		free(rlw->channels, M_MLX5EN);
987 	}
988 	free(rl->rate_limit_table, M_MLX5EN);
989 	free(rl->workers, M_MLX5EN);
990 	sx_destroy(&rl->rl_sxlock);
991 }
992 
993 static void
994 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
995     struct mlx5e_rl_channel *channel)
996 {
997 	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
998 	cv_broadcast(&rlw->cv);
999 }
1000 
1001 static void
1002 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
1003 {
1004 	if (channel == NULL)
1005 		return;
1006 
1007 	MLX5E_RL_WORKER_LOCK(rlw);
1008 	switch (channel->state) {
1009 	case MLX5E_RL_ST_MODIFY:
1010 		channel->state = MLX5E_RL_ST_DESTROY;
1011 		break;
1012 	case MLX5E_RL_ST_USED:
1013 		channel->state = MLX5E_RL_ST_DESTROY;
1014 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1015 		break;
1016 	default:
1017 		break;
1018 	}
1019 	MLX5E_RL_WORKER_UNLOCK(rlw);
1020 }
1021 
1022 static int
1023 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
1024 {
1025 
1026 	MLX5E_RL_WORKER_LOCK(rlw);
1027 	channel->new_rate = rate;
1028 	switch (channel->state) {
1029 	case MLX5E_RL_ST_USED:
1030 		channel->state = MLX5E_RL_ST_MODIFY;
1031 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1032 		break;
1033 	default:
1034 		break;
1035 	}
1036 	MLX5E_RL_WORKER_UNLOCK(rlw);
1037 
1038 	return (0);
1039 }
1040 
1041 static int
1042 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
1043     union if_snd_tag_query_params *params)
1044 {
1045 	int retval;
1046 
1047 	MLX5E_RL_WORKER_LOCK(rlw);
1048 	switch (channel->state) {
1049 	case MLX5E_RL_ST_USED:
1050 		params->rate_limit.max_rate = channel->last_rate;
1051 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1052 		retval = 0;
1053 		break;
1054 	case MLX5E_RL_ST_MODIFY:
1055 		params->rate_limit.max_rate = channel->last_rate;
1056 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1057 		retval = EBUSY;
1058 		break;
1059 	default:
1060 		retval = EINVAL;
1061 		break;
1062 	}
1063 	MLX5E_RL_WORKER_UNLOCK(rlw);
1064 
1065 	return (retval);
1066 }
1067 
1068 static int
1069 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
1070     struct mlx5e_rl_channel **pchannel)
1071 {
1072 	struct mlx5e_rl_channel *channel;
1073 	int retval = ENOMEM;
1074 
1075 	MLX5E_RL_WORKER_LOCK(rlw);
1076 	/* Check for available channel in free list */
1077 	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
1078 		retval = 0;
1079 		/* Remove head index from available list */
1080 		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
1081 		channel->state = MLX5E_RL_ST_USED;
1082 		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
1083 	} else {
1084 		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
1085 	}
1086 	MLX5E_RL_WORKER_UNLOCK(rlw);
1087 
1088 	*pchannel = channel;
1089 #ifdef RATELIMIT_DEBUG
1090 	if_printf(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel);
1091 #endif
1092 	return (retval);
1093 }
1094 
1095 int
1096 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
1097     union if_snd_tag_alloc_params *params,
1098     struct m_snd_tag **ppmt)
1099 {
1100 	struct mlx5e_rl_channel *channel;
1101 	struct mlx5e_rl_worker *rlw;
1102 	struct mlx5e_priv *priv;
1103 	int error;
1104 
1105 	priv = ifp->if_softc;
1106 
1107 	/* check if there is support for packet pacing or if device is going away */
1108 	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
1109 	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
1110 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
1111 		return (EOPNOTSUPP);
1112 
1113 	/* compute worker thread this TCP connection belongs to */
1114 	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
1115 	    priv->rl.param.tx_worker_threads_def);
1116 
1117 	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
1118 	if (error != 0)
1119 		goto done;
1120 
1121 	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
1122 	if (error != 0) {
1123 		mlx5e_rl_free(rlw, channel);
1124 		goto done;
1125 	}
1126 
1127 	/* store pointer to mbuf tag */
1128 	*ppmt = &channel->tag.m_snd_tag;
1129 done:
1130 	return (error);
1131 }
1132 
1133 
1134 int
1135 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
1136 {
1137 	struct mlx5e_rl_channel *channel =
1138 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1139 
1140 	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
1141 }
1142 
1143 int
1144 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
1145 {
1146 	struct mlx5e_rl_channel *channel =
1147 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1148 
1149 	return (mlx5e_rl_query(channel->worker, channel, params));
1150 }
1151 
1152 void
1153 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
1154 {
1155 	struct mlx5e_rl_channel *channel =
1156 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1157 
1158 	mlx5e_rl_free(channel->worker, channel);
1159 }
1160 
1161 static int
1162 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
1163 {
1164 	struct mlx5e_rl_priv_data *rl = arg1;
1165 	struct mlx5e_priv *priv = rl->priv;
1166 	struct sbuf sbuf;
1167 	unsigned x;
1168 	int error;
1169 
1170 	error = sysctl_wire_old_buffer(req, 0);
1171 	if (error != 0)
1172 		return (error);
1173 
1174 	PRIV_LOCK(priv);
1175 
1176 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
1177 
1178 	sbuf_printf(&sbuf,
1179 	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
1180 	    "\t" "--------------------------------------------\n");
1181 
1182 	MLX5E_RL_RLOCK(rl);
1183 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1184 		if (rl->rate_limit_table[x] == 0)
1185 			continue;
1186 
1187 		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
1188 		    x, (unsigned)rl->param.tx_burst_size,
1189 		    (long long)rl->rate_limit_table[x]);
1190 	}
1191 	MLX5E_RL_RUNLOCK(rl);
1192 
1193 	error = sbuf_finish(&sbuf);
1194 	sbuf_delete(&sbuf);
1195 
1196 	PRIV_UNLOCK(priv);
1197 
1198 	return (error);
1199 }
1200 
1201 static int
1202 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
1203 {
1204 	uint64_t x;
1205 	uint64_t y;
1206 
1207 	MLX5E_RL_WLOCK(rl);
1208 	/* compute channel parameters once */
1209 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
1210 	MLX5E_RL_WUNLOCK(rl);
1211 
1212 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1213 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1214 
1215 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1216 			struct mlx5e_rl_channel *channel;
1217 			struct mlx5e_sq *sq;
1218 
1219 			channel = rlw->channels + x;
1220 			sq = channel->sq;
1221 
1222 			if (sq == NULL)
1223 				continue;
1224 
1225 			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
1226 				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
1227 				    rl->param.tx_coalesce_usecs,
1228 				    rl->param.tx_coalesce_pkts,
1229 				    rl->param.tx_coalesce_mode);
1230 			} else {
1231 				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
1232 				    rl->param.tx_coalesce_usecs,
1233 				    rl->param.tx_coalesce_pkts);
1234 			}
1235 		}
1236 	}
1237 	return (0);
1238 }
1239 
1240 void
1241 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
1242 {
1243 	uint64_t x;
1244 	uint64_t y;
1245 
1246 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1247 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1248 
1249 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1250 			struct mlx5e_rl_channel *channel;
1251 			struct mlx5e_sq *sq;
1252 
1253 			channel = rlw->channels + x;
1254 			sq = channel->sq;
1255 
1256 			if (sq == NULL)
1257 				continue;
1258 
1259 			mtx_lock(&sq->lock);
1260 			mlx5e_update_sq_inline(sq);
1261 			mtx_unlock(&sq->lock);
1262 		}
1263 	}
1264 }
1265 
1266 static int
1267 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
1268 {
1269 	unsigned x;
1270 	int error;
1271 
1272 	if (value < 1000 ||
1273 	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
1274 		return (EINVAL);
1275 
1276 	MLX5E_RL_WLOCK(rl);
1277 	error = ENOMEM;
1278 
1279 	/* check if rate already exists */
1280 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1281 		if (rl->rate_limit_table[x] != value)
1282 			continue;
1283 		error = EEXIST;
1284 		break;
1285 	}
1286 
1287 	/* check if there is a free rate entry */
1288 	if (x == rl->param.tx_rates_def) {
1289 		for (x = 0; x != rl->param.tx_rates_def; x++) {
1290 			if (rl->rate_limit_table[x] != 0)
1291 				continue;
1292 			rl->rate_limit_table[x] = value;
1293 			error = 0;
1294 			break;
1295 		}
1296 	}
1297 	MLX5E_RL_WUNLOCK(rl);
1298 
1299 	return (error);
1300 }
1301 
1302 static int
1303 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
1304 {
1305 	unsigned x;
1306 	int error;
1307 
1308 	if (value == 0)
1309 		return (EINVAL);
1310 
1311 	MLX5E_RL_WLOCK(rl);
1312 
1313 	/* check if rate already exists */
1314 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1315 		if (rl->rate_limit_table[x] != value)
1316 			continue;
1317 		/* free up rate */
1318 		rl->rate_limit_table[x] = 0;
1319 		break;
1320 	}
1321 
1322 	/* check if there is a free rate entry */
1323 	if (x == rl->param.tx_rates_def)
1324 		error = ENOENT;
1325 	else
1326 		error = 0;
1327 	MLX5E_RL_WUNLOCK(rl);
1328 
1329 	return (error);
1330 }
1331 
1332 static int
1333 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
1334 {
1335 	struct mlx5e_rl_priv_data *rl = arg1;
1336 	struct mlx5e_priv *priv = rl->priv;
1337 	unsigned mode_modify;
1338 	unsigned was_opened;
1339 	uint64_t value;
1340 	uint64_t old;
1341 	int error;
1342 
1343 	PRIV_LOCK(priv);
1344 
1345 	MLX5E_RL_RLOCK(rl);
1346 	value = rl->param.arg[arg2];
1347 	MLX5E_RL_RUNLOCK(rl);
1348 
1349 	if (req != NULL) {
1350 		old = value;
1351 		error = sysctl_handle_64(oidp, &value, 0, req);
1352 		if (error || req->newptr == NULL ||
1353 		    value == rl->param.arg[arg2])
1354 			goto done;
1355 	} else {
1356 		old = 0;
1357 		error = 0;
1358 	}
1359 
1360 	/* check if device is gone */
1361 	if (priv->gone) {
1362 		error = ENXIO;
1363 		goto done;
1364 	}
1365 	was_opened = rl->opened;
1366 	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
1367 
1368 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
1369 	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
1370 		if (value > rl->param.tx_worker_threads_max)
1371 			value = rl->param.tx_worker_threads_max;
1372 		else if (value < 1)
1373 			value = 1;
1374 
1375 		/* store new value */
1376 		rl->param.arg[arg2] = value;
1377 		break;
1378 
1379 	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
1380 		if (value > rl->param.tx_channels_per_worker_max)
1381 			value = rl->param.tx_channels_per_worker_max;
1382 		else if (value < 1)
1383 			value = 1;
1384 
1385 		/* store new value */
1386 		rl->param.arg[arg2] = value;
1387 		break;
1388 
1389 	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
1390 		if (value > rl->param.tx_rates_max)
1391 			value = rl->param.tx_rates_max;
1392 		else if (value < 1)
1393 			value = 1;
1394 
1395 		/* store new value */
1396 		rl->param.arg[arg2] = value;
1397 		break;
1398 
1399 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
1400 		/* range check */
1401 		if (value < 1)
1402 			value = 0;
1403 		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
1404 			value = MLX5E_FLD_MAX(cqc, cq_period);
1405 
1406 		/* store new value */
1407 		rl->param.arg[arg2] = value;
1408 
1409 		/* check to avoid down and up the network interface */
1410 		if (was_opened)
1411 			error = mlx5e_rl_refresh_channel_params(rl);
1412 		break;
1413 
1414 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
1415 		/* import TX coal pkts */
1416 		if (value < 1)
1417 			value = 0;
1418 		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
1419 			value = MLX5E_FLD_MAX(cqc, cq_max_count);
1420 
1421 		/* store new value */
1422 		rl->param.arg[arg2] = value;
1423 
1424 		/* check to avoid down and up the network interface */
1425 		if (was_opened)
1426 			error = mlx5e_rl_refresh_channel_params(rl);
1427 		break;
1428 
1429 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
1430 		/* network interface must be down */
1431 		if (was_opened != 0 && mode_modify == 0)
1432 			mlx5e_rl_close_workers(priv);
1433 
1434 		/* import TX coalesce mode */
1435 		if (value != 0)
1436 			value = 1;
1437 
1438 		/* store new value */
1439 		rl->param.arg[arg2] = value;
1440 
1441 		/* restart network interface, if any */
1442 		if (was_opened != 0) {
1443 			if (mode_modify == 0)
1444 				mlx5e_rl_open_workers(priv);
1445 			else
1446 				error = mlx5e_rl_refresh_channel_params(rl);
1447 		}
1448 		break;
1449 
1450 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
1451 		/* network interface must be down */
1452 		if (was_opened)
1453 			mlx5e_rl_close_workers(priv);
1454 
1455 		/* import TX queue size */
1456 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
1457 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
1458 		else if (value > priv->params_ethtool.tx_queue_size_max)
1459 			value = priv->params_ethtool.tx_queue_size_max;
1460 
1461 		/* store actual TX queue size */
1462 		value = 1ULL << order_base_2(value);
1463 
1464 		/* store new value */
1465 		rl->param.arg[arg2] = value;
1466 
1467 		/* verify TX completion factor */
1468 		mlx5e_rl_sync_tx_completion_fact(rl);
1469 
1470 		/* restart network interface, if any */
1471 		if (was_opened)
1472 			mlx5e_rl_open_workers(priv);
1473 		break;
1474 
1475 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
1476 		/* network interface must be down */
1477 		if (was_opened)
1478 			mlx5e_rl_close_workers(priv);
1479 
1480 		/* store new value */
1481 		rl->param.arg[arg2] = value;
1482 
1483 		/* verify parameter */
1484 		mlx5e_rl_sync_tx_completion_fact(rl);
1485 
1486 		/* restart network interface, if any */
1487 		if (was_opened)
1488 			mlx5e_rl_open_workers(priv);
1489 		break;
1490 
1491 	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
1492 		error = mlx5e_rl_tx_limit_add(rl, value);
1493 		break;
1494 
1495 	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
1496 		error = mlx5e_rl_tx_limit_clr(rl, value);
1497 		break;
1498 
1499 	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
1500 		/* range check */
1501 		if (value > rl->param.tx_allowed_deviation_max)
1502 			value = rl->param.tx_allowed_deviation_max;
1503 		else if (value < rl->param.tx_allowed_deviation_min)
1504 			value = rl->param.tx_allowed_deviation_min;
1505 
1506 		MLX5E_RL_WLOCK(rl);
1507 		rl->param.arg[arg2] = value;
1508 		MLX5E_RL_WUNLOCK(rl);
1509 		break;
1510 
1511 	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
1512 		/* range check */
1513 		if (value > rl->param.tx_burst_size_max)
1514 			value = rl->param.tx_burst_size_max;
1515 		else if (value < rl->param.tx_burst_size_min)
1516 			value = rl->param.tx_burst_size_min;
1517 
1518 		MLX5E_RL_WLOCK(rl);
1519 		rl->param.arg[arg2] = value;
1520 		MLX5E_RL_WUNLOCK(rl);
1521 		break;
1522 
1523 	default:
1524 		break;
1525 	}
1526 done:
1527 	PRIV_UNLOCK(priv);
1528 	return (error);
1529 }
1530 
1531 static void
1532 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1533     struct sysctl_oid *node, const char *name, const char *desc)
1534 {
1535 	/*
1536 	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
1537 	 * take care of loading default sysctl value from the kernel
1538 	 * environment, if any:
1539 	 */
1540 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
1541 		/* read-only SYSCTLs */
1542 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1543 		    name, CTLTYPE_U64 | CTLFLAG_RD |
1544 		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1545 	} else {
1546 		if (strstr(name, "_def") != 0) {
1547 #ifdef RATELIMIT_DEBUG
1548 			/* tunable read-only advanced SYSCTLs */
1549 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1550 			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
1551 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1552 #endif
1553 		} else {
1554 			/* read-write SYSCTLs */
1555 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1556 			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
1557 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1558 		}
1559 	}
1560 }
1561 
1562 static void
1563 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1564     struct sysctl_oid *node, const char *name, const char *desc)
1565 {
1566 	/* read-only SYSCTLs */
1567 	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
1568 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
1569 }
1570 
1571 #endif
1572