xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision 9729f076e4d93c5a37e78d427bfe0f1ab99bbcc6)
1 /*-
2  * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "opt_rss.h"
29 #include "opt_ratelimit.h"
30 
31 #include <dev/mlx5/mlx5_en/en.h>
32 
33 #ifdef RATELIMIT
34 
/*
 * Forward declarations of file-local functions, followed by the send
 * tag callbacks that are plugged into the mlx5e_rl_snd_tag_sw method
 * table below.
 */
static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
      struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
static if_snd_tag_free_t mlx5e_rl_snd_tag_free;
47 
/*
 * Send tag method table for tags of type IF_SND_TAG_TYPE_RATE_LIMIT.
 * It routes the modify, query and free operations to the rate limit
 * callbacks declared above.
 */
static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = {
	.snd_tag_modify = mlx5e_rl_snd_tag_modify,
	.snd_tag_query = mlx5e_rl_snd_tag_query,
	.snd_tag_free = mlx5e_rl_snd_tag_free,
	.type = IF_SND_TAG_TYPE_RATE_LIMIT
};
54 
55 static void
56 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
57     struct mlx5e_sq_param *param)
58 {
59 	void *sqc = param->sqc;
60 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
61 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
62 
63 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
64 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
65 	MLX5_SET(wq, wq, pd, rl->priv->pdn);
66 
67 	param->wq.linear = 1;
68 }
69 
70 static void
71 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
72     struct mlx5e_cq_param *param)
73 {
74 	void *cqc = param->cqc;
75 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
76 
77 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
78 	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
79 	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
80 	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);
81 
82 	switch (rl->param.tx_coalesce_mode) {
83 	case 0:
84 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
85 		break;
86 	default:
87 		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
88 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
89 		else
90 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
91 		break;
92 	}
93 }
94 
95 static void
96 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
97     struct mlx5e_rl_channel_param *cparam)
98 {
99 	memset(cparam, 0, sizeof(*cparam));
100 
101 	mlx5e_rl_build_sq_param(rl, &cparam->sq);
102 	mlx5e_rl_build_cq_param(rl, &cparam->cq);
103 }
104 
105 static int
106 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
107     struct mlx5e_sq_param *param, int ix)
108 {
109 	struct mlx5_core_dev *mdev = priv->mdev;
110 	void *sqc = param->sqc;
111 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
112 	int err;
113 
114 	/* Create DMA descriptor TAG */
115 	if ((err = -bus_dma_tag_create(
116 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
117 	    1,				/* any alignment */
118 	    0,				/* no boundary */
119 	    BUS_SPACE_MAXADDR,		/* lowaddr */
120 	    BUS_SPACE_MAXADDR,		/* highaddr */
121 	    NULL, NULL,			/* filter, filterarg */
122 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
123 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
124 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
125 	    0,				/* flags */
126 	    NULL, NULL,			/* lockfunc, lockfuncarg */
127 	    &sq->dma_tag)))
128 		goto done;
129 
130 	sq->mkey_be = cpu_to_be32(priv->mr.key);
131 	sq->ifp = priv->ifp;
132 	sq->priv = priv;
133 
134 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
135 	    &sq->wq_ctrl);
136 	if (err)
137 		goto err_free_dma_tag;
138 
139 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
140 
141 	err = mlx5e_alloc_sq_db(sq);
142 	if (err)
143 		goto err_sq_wq_destroy;
144 
145 	mlx5e_update_sq_inline(sq);
146 
147 	return (0);
148 
149 err_sq_wq_destroy:
150 	mlx5_wq_destroy(&sq->wq_ctrl);
151 err_free_dma_tag:
152 	bus_dma_tag_destroy(sq->dma_tag);
153 done:
154 	return (err);
155 }
156 
/*
 * Release the software resources of a send queue in the reverse order
 * of their allocation in mlx5e_rl_create_sq(): the SQ database, the
 * work queue and finally the DMA tag.
 */
static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}
165 
166 static int
167 mlx5e_rl_query_sq(struct mlx5e_sq *sq)
168 {
169 	void *out;
170         int inlen;
171         int err;
172 
173         inlen = MLX5_ST_SZ_BYTES(query_sq_out);
174         out = mlx5_vzalloc(inlen);
175         if (!out)
176                 return -ENOMEM;
177 
178         err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out);
179         if (err)
180                 goto out;
181 
182         sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);
183 
184 out:
185         kvfree(out);
186         return err;
187 }
188 
189 static int
190 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
191     struct mlx5e_sq_param *param, int ix)
192 {
193 	int err;
194 
195 	err = mlx5e_rl_create_sq(priv, sq, param, ix);
196 	if (err)
197 		return (err);
198 
199 	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
200 	if (err)
201 		goto err_destroy_sq;
202 
203 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
204 	if (err)
205 		goto err_disable_sq;
206 
207 	if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) {
208 		err = mlx5e_rl_query_sq(sq);
209 		if (err) {
210 			mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for"
211 			    "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err);
212 			sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
213 		}
214 	} else
215 		sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
216 
217 	WRITE_ONCE(sq->running, 1);
218 
219 	return (0);
220 
221 err_disable_sq:
222 	mlx5e_disable_sq(sq);
223 err_destroy_sq:
224 	mlx5e_rl_destroy_sq(sq);
225 
226 	return (err);
227 }
228 
229 static void
230 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
231 {
232 	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
233 	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
234 
235 	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
236 
237 	sq->cev_factor = priv->rl.param.tx_completion_fact;
238 
239 	/* ensure the TX completion event factor is not zero */
240 	if (sq->cev_factor == 0)
241 		sq->cev_factor = 1;
242 }
243 
244 static int
245 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
246     struct mlx5e_rl_channel_param *cparam,
247     struct mlx5e_sq *volatile *ppsq)
248 {
249 	struct mlx5e_priv *priv = rlw->priv;
250 	struct mlx5e_sq *sq;
251 	int err;
252 
253 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
254 
255 	/* init mutexes */
256 	mlx5e_rl_chan_mtx_init(priv, sq);
257 
258 	/* open TX completion queue */
259 	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
260 	    &mlx5e_tx_cq_comp, eq_ix);
261 	if (err)
262 		goto err_free;
263 
264 	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
265 	if (err)
266 		goto err_close_tx_cq;
267 
268 	/* store TX channel pointer */
269 	*ppsq = sq;
270 
271 	/* poll TX queue initially */
272 	sq->cq.mcq.comp(&sq->cq.mcq, NULL);
273 
274 	return (0);
275 
276 err_close_tx_cq:
277 	mlx5e_close_cq(&sq->cq);
278 
279 err_free:
280 	/* destroy mutexes */
281 	mtx_destroy(&sq->lock);
282 	mtx_destroy(&sq->comp_lock);
283 	free(sq, M_MLX5EN);
284 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
285 	return (err);
286 }
287 
288 static void
289 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
290 {
291 	struct mlx5e_sq *sq = *ppsq;
292 
293 	/* check if channel is already closed */
294 	if (sq == NULL)
295 		return;
296 	/* ensure channel pointer is no longer used */
297 	*ppsq = NULL;
298 
299 	/* teardown and destroy SQ */
300 	mlx5e_drain_sq(sq);
301 	mlx5e_disable_sq(sq);
302 	mlx5e_rl_destroy_sq(sq);
303 
304 	/* close CQ */
305 	mlx5e_close_cq(&sq->cq);
306 
307 	/* destroy mutexes */
308 	mtx_destroy(&sq->lock);
309 	mtx_destroy(&sq->comp_lock);
310 
311 	free(sq, M_MLX5EN);
312 }
313 
314 static void
315 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
316 {
317 	/*
318 	 * Limit the maximum distance between completion events to
319 	 * half of the currently set TX queue size.
320 	 *
321 	 * The maximum number of queue entries a single IP packet can
322 	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
323 	 *
324 	 * The worst case max value is then given as below:
325 	 */
326 	uint64_t max = rl->param.tx_queue_size /
327 	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
328 
329 	/*
330 	 * Update the maximum completion factor value in case the
331 	 * tx_queue_size field changed. Ensure we don't overflow
332 	 * 16-bits.
333 	 */
334 	if (max < 1)
335 		max = 1;
336 	else if (max > 65535)
337 		max = 65535;
338 	rl->param.tx_completion_fact_max = max;
339 
340 	/*
341 	 * Verify that the current TX completion factor is within the
342 	 * given limits:
343 	 */
344 	if (rl->param.tx_completion_fact < 1)
345 		rl->param.tx_completion_fact = 1;
346 	else if (rl->param.tx_completion_fact > max)
347 		rl->param.tx_completion_fact = max;
348 }
349 
350 static int
351 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
352 {
353 	struct mlx5e_priv *priv = sq->priv;
354 	struct mlx5_core_dev *mdev = priv->mdev;
355 
356 	void *in;
357 	void *sqc;
358 	int inlen;
359 	int err;
360 
361 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
362 	in = mlx5_vzalloc(inlen);
363 	if (in == NULL)
364 		return (-ENOMEM);
365 
366 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
367 
368 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
369 	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
370 	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
371 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
372 	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
373 
374 	err = mlx5_core_modify_sq(mdev, in, inlen);
375 
376 	kvfree(in);
377 
378 	return (err);
379 }
380 
381 /*
382  * This function will search the configured rate limit table for the
383  * best match to avoid that a single socket based application can
384  * allocate all the available hardware rates. If the user selected
385  * rate deviates too much from the closes rate available in the rate
386  * limit table, unlimited rate will be selected.
387  */
388 static uint64_t
389 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
390 {
391 	uint64_t distance = -1ULL;
392 	uint64_t diff;
393 	uint64_t retval = 0;		/* unlimited */
394 	uint64_t x;
395 
396 	/* search for closest rate */
397 	for (x = 0; x != rl->param.tx_rates_def; x++) {
398 		uint64_t rate = rl->rate_limit_table[x];
399 		if (rate == 0)
400 			continue;
401 
402 		if (rate > user_rate)
403 			diff = rate - user_rate;
404 		else
405 			diff = user_rate - rate;
406 
407 		/* check if distance is smaller than previous rate */
408 		if (diff < distance) {
409 			distance = diff;
410 			retval = rate;
411 		}
412 	}
413 
414 	/* range check for multiplication below */
415 	if (user_rate > rl->param.tx_limit_max)
416 		user_rate = rl->param.tx_limit_max;
417 
418 	/* fallback to unlimited, if rate deviates too much */
419 	if (distance > howmany(user_rate *
420 	    rl->param.tx_allowed_deviation, 1000ULL))
421 		retval = 0;
422 
423 	return (retval);
424 }
425 
/*
 * Post a QOS remap work request on the given internal queue, asking
 * the hardware to attach the send queue identified by "sq_handle" to
 * the scheduling queue identified by "scq_handle".
 *
 * A reference is taken on sq_channel->refcount for the posted request
 * and a pointer to that counter is stored with the queue entry.
 * NOTE(review): the reference is presumably dropped by the IQ
 * completion handling; that code is not part of this function.
 *
 * Returns zero on success or -ENOMEM when no producer index is
 * available on the internal queue.
 */
static int
mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle,
    struct mlx5e_rl_channel *sq_channel)
{
	/* size of the remap WQE in data segment units, rounded up */
	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe),
	            MLX5_SEND_WQE_DS);
	struct mlx5e_tx_qos_remap_wqe *wqe;
	int pi;

	mtx_lock(&iq->lock);
	pi = mlx5e_iq_get_producer_index(iq);
	if (pi < 0) {
		/* no room in the internal queue */
		mtx_unlock(&iq->lock);
		return (-ENOMEM);
	}
	wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi);

	memset(wqe, 0, sizeof(*wqe));

	/* remap segment: scheduling queue and send queue handles */
	wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle);
	wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle);

	/* control segment; a completion is requested for this WQE */
	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) |
	    MLX5_OPCODE_QOS_REMAP);
	wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt);
	wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8);
	wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;

	/* copy data for doorbell */
	memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32));

	/* account for the entry and take a reference on the channel */
	iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	iq->data[pi].p_refcount = &sq_channel->refcount;
	atomic_add_int(iq->data[pi].p_refcount, 1);
	iq->pc += iq->data[pi].num_wqebbs;

	mlx5e_iq_notify_hw(iq);

	mtx_unlock(&iq->lock);

	return (0); /* success */
}
468 
469 static int
470 mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index,
471     struct mlx5e_rl_channel *sq_channel)
472 {
473 	struct mlx5e_channel *iq_channel;
474 	u32	scq_handle;
475 	u32	sq_handle;
476 	int 	error;
477 
478 	/* Specific SQ remap operations should be handled by same IQ */
479 	iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels];
480 
481 	sq_handle = sq->queue_handle;
482 	scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index);
483 
484 	if (sq_handle == MLX5_INVALID_QUEUE_HANDLE ||
485 	    scq_handle == MLX5_INVALID_QUEUE_HANDLE)
486 		error = -1;
487 	else
488 		error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle,
489 		    sq_handle, sq_channel);
490 
491 	return (error);
492 }
493 
/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above. A rate of zero selects unlimited.
 *
 * The worker lock is dropped and re-acquired while this function
 * runs; it is expected to be held on entry and is held again on
 * return.
 *
 * If no suitable rate exists in the rate limit table, or a hardware
 * rate cannot be added, the channel falls back to unlimited and the
 * corresponding failure statistic is incremented.
 *
 * Returns -error, where error is zero unless mlx5e_rl_modify_sq() was
 * attempted and failed.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;
	bool use_sq_remap;

	if (rate != 0) {
		/* the rate lookup is done without the worker lock */
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* paced <--> non-paced transitions must go via FW */
	use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) &&
	    channel->last_rate != 0 && rate != 0;

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	/* from here on "rate" and "burst" hold the previous values */
	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		/*
		 * Try the remap request first when it applies. If it is
		 * not used or fails, wait until the channel has no
		 * outstanding references, unless the device is in an
		 * error state or the PCI channel is offline, and then
		 * modify the SQ directly.
		 *
		 * When the remap succeeds "error" is not assigned here.
		 * It is still zero from the successful
		 * mlx5_rl_add_rate() call above, because remapping
		 * requires a non-zero new rate.
		 */
		if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) {
			while (atomic_load_int(&channel->refcount) != 0 &&
			    rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
		            pci_channel_offline(rlw->priv->mdev->pdev) == 0)
				pause("W", 1);
			error = mlx5e_rl_modify_sq(sq, index);
			if (error != 0)
				atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		}
	} else
		error = 0;

	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}
593 
/*
 * Main function of a rate limit worker thread. "arg" points to the
 * worker's struct mlx5e_rl_worker.
 *
 * The thread runs through three phases:
 *
 *  1. Open the send queues of the worker's channels and apply their
 *     initial rates. Unless HAVE_RL_PRE_ALLOCATE_CHANNELS is defined,
 *     channels in the FREE state are skipped.
 *  2. Process modify and destroy requests queued on
 *     rlw->process_head until rlw->worker_done becomes non-zero.
 *  3. Release all rates, close all send queues, clear
 *     rlw->worker_done, wake up anyone waiting on the worker's
 *     condition variable and exit.
 *
 * The worker lock is held while the request list and the channel
 * states are accessed. It is dropped around calls that open, resume,
 * drain or close send queues.
 */
static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		/* NOTE: shadows the function scope "channel" variable */
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			/* the remaining channels are not opened */
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	/* process queued requests until told to stop */
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel by demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				/* release the rate by switching to unlimited */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		/* NOTE: shadows the function scope "channel" variable */
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	/* tell mlx5e_rl_close_workers() that this thread is finished */
	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}
744 
745 static int
746 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
747 {
748 	struct mlx5_core_dev *mdev = priv->mdev;
749 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
750 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
751 
752 	memset(in, 0, sizeof(in));
753 
754 	MLX5_SET(tisc, tisc, prio, 0);
755 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
756 
757 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
758 }
759 
/*
 * Destroy the rate limit TIS that was created by mlx5e_rl_open_tis().
 */
static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0);
}
765 
766 static void
767 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
768     struct mlx5_core_dev *mdev)
769 {
770 	/* ratelimit workers */
771 	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
772 	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
773 
774 	/* range check */
775 	if (param->tx_worker_threads_def == 0 ||
776 	    param->tx_worker_threads_def > param->tx_worker_threads_max)
777 		param->tx_worker_threads_def = param->tx_worker_threads_max;
778 
779 	/* ratelimit channels */
780 	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
781 	    param->tx_worker_threads_def;
782 	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
783 
784 	/* range check */
785 	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
786 		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
787 
788 	/* set default burst size */
789 	param->tx_burst_size = 4;	/* MTUs */
790 
791 	/*
792 	 * Set maximum burst size
793 	 *
794 	 * The burst size is multiplied by the MTU and clamped to the
795 	 * range 0 ... 65535 bytes inclusivly before fed into the
796 	 * firmware.
797 	 *
798 	 * NOTE: If the burst size or MTU is changed only ratelimit
799 	 * connections made after the change will use the new burst
800 	 * size.
801 	 */
802 	param->tx_burst_size_max = 255;
803 
804 	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
805 	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
806 	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
807 
808 	/* ratelimit table size */
809 	param->tx_rates_max = mdev->priv.rl_table.max_size;
810 
811 	/* range check */
812 	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
813 		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
814 
815 	/* set default number of rates */
816 	param->tx_rates_def = param->tx_rates_max;
817 
818 	/* set maximum allowed rate deviation */
819 	if (param->tx_limit_max != 0) {
820 		/*
821 		 * Make sure the deviation multiplication doesn't
822 		 * overflow unsigned 64-bit:
823 		 */
824 		param->tx_allowed_deviation_max = -1ULL /
825 		    param->tx_limit_max;
826 	}
827 	/* set default rate deviation */
828 	param->tx_allowed_deviation = 50;	/* 5.0% */
829 
830 	/* channel parameters */
831 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
832 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
833 	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
834 	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
835 	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
836 }
837 
/*
 * String tables generated from the parameter and statistics lists.
 * mlx5e_rl_init() reads them in pairs: entry 2 * i is used as the
 * sysctl name and entry 2 * i + 1 as the description of item i.
 */

/* general rate limit parameters */
static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

/* rate table parameters */
static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

/* rate limit statistics */
static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};
849 
/*
 * Set up rate limit support for the given interface.
 *
 * This creates the rate limit TIS, sets the default parameters,
 * registers the "rate_limit" sysctl tree, allocates the worker,
 * channel and rate table arrays, loads rates from the kernel
 * environment and starts the worker threads.
 *
 * Returns zero when the device has no packet pacing support (nothing
 * is set up in that case) and on success. A failure to start the
 * worker threads is only logged. If the TIS cannot be created, the
 * sysctl context and the lock are released again and the error from
 * mlx5e_rl_open_tis() is returned.
 */
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	/* the sysctl nodes are optional; setup continues without them */
	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	/* initialize the per-worker state and the channel free lists */
	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* start the worker threads */
	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	/* not fatal, the error is only reported */
	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}
980 
981 static int
982 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
983 {
984 	struct mlx5e_rl_priv_data *rl = &priv->rl;
985 	struct thread *rl_thread = NULL;
986 	struct proc *rl_proc = NULL;
987 	uint64_t j;
988 	int error;
989 
990 	if (priv->gone || rl->opened)
991 		return (-EINVAL);
992 
993 	MLX5E_RL_WLOCK(rl);
994 	/* compute channel parameters once */
995 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
996 	MLX5E_RL_WUNLOCK(rl);
997 
998 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
999 		struct mlx5e_rl_worker *rlw = rl->workers + j;
1000 
1001 		/* start worker thread */
1002 		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
1003 		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
1004 		if (error != 0) {
1005 			mlx5_en_err(rl->priv->ifp,
1006 			    "kproc_kthread_add failed: %d\n", error);
1007 			rlw->worker_done = 1;
1008 		}
1009 	}
1010 
1011 	rl->opened = 1;
1012 
1013 	return (0);
1014 }
1015 
1016 static void
1017 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
1018 {
1019 	struct mlx5e_rl_priv_data *rl = &priv->rl;
1020 	uint64_t y;
1021 
1022 	if (rl->opened == 0)
1023 		return;
1024 
1025 	/* tear down worker threads simultaneously */
1026 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
1027 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1028 
1029 		/* tear down worker before freeing SQs */
1030 		MLX5E_RL_WORKER_LOCK(rlw);
1031 		if (rlw->worker_done == 0) {
1032 			rlw->worker_done = 1;
1033 			cv_broadcast(&rlw->cv);
1034 		} else {
1035 			/* XXX thread not started */
1036 			rlw->worker_done = 0;
1037 		}
1038 		MLX5E_RL_WORKER_UNLOCK(rlw);
1039 	}
1040 
1041 	/* wait for worker threads to exit */
1042 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
1043 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1044 
1045 		/* tear down worker before freeing SQs */
1046 		MLX5E_RL_WORKER_LOCK(rlw);
1047 		while (rlw->worker_done != 0)
1048 			cv_wait(&rlw->cv, &rlw->mtx);
1049 		MLX5E_RL_WORKER_UNLOCK(rlw);
1050 	}
1051 
1052 	rl->opened = 0;
1053 }
1054 
1055 static void
1056 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
1057 {
1058 	unsigned x;
1059 
1060 	MLX5E_RL_WLOCK(rl);
1061 	for (x = 0; x != rl->param.tx_rates_def; x++)
1062 		rl->rate_limit_table[x] = 0;
1063 	MLX5E_RL_WUNLOCK(rl);
1064 }
1065 
1066 void
1067 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
1068 {
1069 	struct mlx5e_rl_priv_data *rl = &priv->rl;
1070 	uint64_t y;
1071 
1072 	/* check if there is support for packet pacing */
1073 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
1074 		return;
1075 
1076 	/* TODO check if there is support for packet pacing */
1077 
1078 	sysctl_ctx_free(&rl->ctx);
1079 
1080 	PRIV_LOCK(priv);
1081 	mlx5e_rl_close_workers(priv);
1082 	PRIV_UNLOCK(priv);
1083 
1084 	mlx5e_rl_reset_rates(rl);
1085 
1086 	/* close TIS domain */
1087 	mlx5e_rl_close_tis(priv);
1088 
1089 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
1090 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1091 
1092 		cv_destroy(&rlw->cv);
1093 		mtx_destroy(&rlw->mtx);
1094 		free(rlw->channels, M_MLX5EN);
1095 	}
1096 	free(rl->rate_limit_table, M_MLX5EN);
1097 	free(rl->workers, M_MLX5EN);
1098 	sx_destroy(&rl->rl_sxlock);
1099 }
1100 
1101 static void
1102 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
1103     struct mlx5e_rl_channel *channel)
1104 {
1105 	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
1106 	cv_broadcast(&rlw->cv);
1107 }
1108 
1109 static void
1110 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
1111 {
1112 	if (channel == NULL)
1113 		return;
1114 
1115 	MLX5E_RL_WORKER_LOCK(rlw);
1116 	switch (channel->state) {
1117 	case MLX5E_RL_ST_MODIFY:
1118 		channel->state = MLX5E_RL_ST_DESTROY;
1119 		break;
1120 	case MLX5E_RL_ST_USED:
1121 		channel->state = MLX5E_RL_ST_DESTROY;
1122 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1123 		break;
1124 	default:
1125 		break;
1126 	}
1127 	MLX5E_RL_WORKER_UNLOCK(rlw);
1128 }
1129 
1130 static int
1131 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
1132 {
1133 
1134 	MLX5E_RL_WORKER_LOCK(rlw);
1135 	channel->new_rate = rate;
1136 	switch (channel->state) {
1137 	case MLX5E_RL_ST_USED:
1138 		channel->state = MLX5E_RL_ST_MODIFY;
1139 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1140 		break;
1141 	default:
1142 		break;
1143 	}
1144 	MLX5E_RL_WORKER_UNLOCK(rlw);
1145 
1146 	return (0);
1147 }
1148 
1149 static int
1150 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
1151     union if_snd_tag_query_params *params)
1152 {
1153 	int retval;
1154 
1155 	MLX5E_RL_WORKER_LOCK(rlw);
1156 	switch (channel->state) {
1157 	case MLX5E_RL_ST_USED:
1158 		params->rate_limit.max_rate = channel->last_rate;
1159 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1160 		retval = 0;
1161 		break;
1162 	case MLX5E_RL_ST_MODIFY:
1163 		params->rate_limit.max_rate = channel->last_rate;
1164 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1165 		retval = EBUSY;
1166 		break;
1167 	default:
1168 		retval = EINVAL;
1169 		break;
1170 	}
1171 	MLX5E_RL_WORKER_UNLOCK(rlw);
1172 
1173 	return (retval);
1174 }
1175 
1176 static int
1177 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
1178     struct mlx5e_rl_channel **pchannel)
1179 {
1180 	struct mlx5e_rl_channel *channel;
1181 	int retval = ENOMEM;
1182 
1183 	MLX5E_RL_WORKER_LOCK(rlw);
1184 	/* Check for available channel in free list */
1185 	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
1186 		retval = 0;
1187 		/* Remove head index from available list */
1188 		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
1189 		channel->state = MLX5E_RL_ST_USED;
1190 		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
1191 	} else {
1192 		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
1193 	}
1194 	MLX5E_RL_WORKER_UNLOCK(rlw);
1195 
1196 	*pchannel = channel;
1197 #ifdef RATELIMIT_DEBUG
1198 	mlx5_en_info(rlw->priv->ifp,
1199 	    "Channel pointer for rate limit connection is %p\n", channel);
1200 #endif
1201 	return (retval);
1202 }
1203 
1204 int
1205 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
1206     union if_snd_tag_alloc_params *params,
1207     struct m_snd_tag **ppmt)
1208 {
1209 	struct mlx5e_rl_channel *channel;
1210 	struct mlx5e_rl_worker *rlw;
1211 	struct mlx5e_priv *priv;
1212 	int error;
1213 
1214 	priv = ifp->if_softc;
1215 
1216 	/* check if there is support for packet pacing or if device is going away */
1217 	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
1218 	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
1219 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
1220 		return (EOPNOTSUPP);
1221 
1222 	/* compute worker thread this TCP connection belongs to */
1223 	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
1224 	    priv->rl.param.tx_worker_threads_def);
1225 
1226 	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
1227 	if (error != 0)
1228 		goto done;
1229 
1230 	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
1231 	if (error != 0) {
1232 		mlx5e_rl_free(rlw, channel);
1233 		goto done;
1234 	}
1235 
1236 	/* store pointer to mbuf tag */
1237 	MPASS(channel->tag.refcount == 0);
1238 	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
1239 	*ppmt = &channel->tag;
1240 done:
1241 	return (error);
1242 }
1243 
1244 
1245 static int
1246 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
1247 {
1248 	struct mlx5e_rl_channel *channel =
1249 	    container_of(pmt, struct mlx5e_rl_channel, tag);
1250 
1251 	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
1252 }
1253 
1254 static int
1255 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
1256 {
1257 	struct mlx5e_rl_channel *channel =
1258 	    container_of(pmt, struct mlx5e_rl_channel, tag);
1259 
1260 	return (mlx5e_rl_query(channel->worker, channel, params));
1261 }
1262 
1263 static void
1264 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
1265 {
1266 	struct mlx5e_rl_channel *channel =
1267 	    container_of(pmt, struct mlx5e_rl_channel, tag);
1268 
1269 	mlx5e_rl_free(channel->worker, channel);
1270 }
1271 
1272 static int
1273 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
1274 {
1275 	struct mlx5e_rl_priv_data *rl = arg1;
1276 	struct mlx5e_priv *priv = rl->priv;
1277 	struct sbuf sbuf;
1278 	unsigned x;
1279 	int error;
1280 
1281 	error = sysctl_wire_old_buffer(req, 0);
1282 	if (error != 0)
1283 		return (error);
1284 
1285 	PRIV_LOCK(priv);
1286 
1287 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
1288 
1289 	sbuf_printf(&sbuf,
1290 	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
1291 	    "\t" "--------------------------------------------\n");
1292 
1293 	MLX5E_RL_RLOCK(rl);
1294 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1295 		if (rl->rate_limit_table[x] == 0)
1296 			continue;
1297 
1298 		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
1299 		    x, (unsigned)rl->param.tx_burst_size,
1300 		    (long long)rl->rate_limit_table[x]);
1301 	}
1302 	MLX5E_RL_RUNLOCK(rl);
1303 
1304 	error = sbuf_finish(&sbuf);
1305 	sbuf_delete(&sbuf);
1306 
1307 	PRIV_UNLOCK(priv);
1308 
1309 	return (error);
1310 }
1311 
1312 static int
1313 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
1314 {
1315 	uint64_t x;
1316 	uint64_t y;
1317 
1318 	MLX5E_RL_WLOCK(rl);
1319 	/* compute channel parameters once */
1320 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
1321 	MLX5E_RL_WUNLOCK(rl);
1322 
1323 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1324 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1325 
1326 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1327 			struct mlx5e_rl_channel *channel;
1328 			struct mlx5e_sq *sq;
1329 
1330 			channel = rlw->channels + x;
1331 			sq = channel->sq;
1332 
1333 			if (sq == NULL)
1334 				continue;
1335 
1336 			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
1337 				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
1338 				    rl->param.tx_coalesce_usecs,
1339 				    rl->param.tx_coalesce_pkts,
1340 				    rl->param.tx_coalesce_mode);
1341 			} else {
1342 				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
1343 				    rl->param.tx_coalesce_usecs,
1344 				    rl->param.tx_coalesce_pkts);
1345 			}
1346 		}
1347 	}
1348 	return (0);
1349 }
1350 
1351 void
1352 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
1353 {
1354 	uint64_t x;
1355 	uint64_t y;
1356 
1357 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1358 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1359 
1360 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1361 			struct mlx5e_rl_channel *channel;
1362 			struct mlx5e_sq *sq;
1363 
1364 			channel = rlw->channels + x;
1365 			sq = channel->sq;
1366 
1367 			if (sq == NULL)
1368 				continue;
1369 
1370 			mtx_lock(&sq->lock);
1371 			mlx5e_update_sq_inline(sq);
1372 			mtx_unlock(&sq->lock);
1373 		}
1374 	}
1375 }
1376 
1377 static int
1378 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
1379 {
1380 	unsigned x;
1381 	int error;
1382 
1383 	if (value < 1000 ||
1384 	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
1385 		return (EINVAL);
1386 
1387 	MLX5E_RL_WLOCK(rl);
1388 	error = ENOMEM;
1389 
1390 	/* check if rate already exists */
1391 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1392 		if (rl->rate_limit_table[x] != value)
1393 			continue;
1394 		error = EEXIST;
1395 		break;
1396 	}
1397 
1398 	/* check if there is a free rate entry */
1399 	if (x == rl->param.tx_rates_def) {
1400 		for (x = 0; x != rl->param.tx_rates_def; x++) {
1401 			if (rl->rate_limit_table[x] != 0)
1402 				continue;
1403 			rl->rate_limit_table[x] = value;
1404 			error = 0;
1405 			break;
1406 		}
1407 	}
1408 	MLX5E_RL_WUNLOCK(rl);
1409 
1410 	return (error);
1411 }
1412 
1413 static int
1414 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
1415 {
1416 	unsigned x;
1417 	int error;
1418 
1419 	if (value == 0)
1420 		return (EINVAL);
1421 
1422 	MLX5E_RL_WLOCK(rl);
1423 
1424 	/* check if rate already exists */
1425 	for (x = 0; x != rl->param.tx_rates_def; x++) {
1426 		if (rl->rate_limit_table[x] != value)
1427 			continue;
1428 		/* free up rate */
1429 		rl->rate_limit_table[x] = 0;
1430 		break;
1431 	}
1432 
1433 	/* check if there is a free rate entry */
1434 	if (x == rl->param.tx_rates_def)
1435 		error = ENOENT;
1436 	else
1437 		error = 0;
1438 	MLX5E_RL_WUNLOCK(rl);
1439 
1440 	return (error);
1441 }
1442 
1443 static int
1444 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
1445 {
1446 	struct mlx5e_rl_priv_data *rl = arg1;
1447 	struct mlx5e_priv *priv = rl->priv;
1448 	unsigned mode_modify;
1449 	unsigned was_opened;
1450 	uint64_t value;
1451 	int error;
1452 
1453 	PRIV_LOCK(priv);
1454 
1455 	MLX5E_RL_RLOCK(rl);
1456 	value = rl->param.arg[arg2];
1457 	MLX5E_RL_RUNLOCK(rl);
1458 
1459 	if (req != NULL) {
1460 		error = sysctl_handle_64(oidp, &value, 0, req);
1461 		if (error || req->newptr == NULL ||
1462 		    value == rl->param.arg[arg2])
1463 			goto done;
1464 	} else {
1465 		error = 0;
1466 	}
1467 
1468 	/* check if device is gone */
1469 	if (priv->gone) {
1470 		error = ENXIO;
1471 		goto done;
1472 	}
1473 	was_opened = rl->opened;
1474 	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
1475 
1476 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
1477 	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
1478 		if (value > rl->param.tx_worker_threads_max)
1479 			value = rl->param.tx_worker_threads_max;
1480 		else if (value < 1)
1481 			value = 1;
1482 
1483 		/* store new value */
1484 		rl->param.arg[arg2] = value;
1485 		break;
1486 
1487 	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
1488 		if (value > rl->param.tx_channels_per_worker_max)
1489 			value = rl->param.tx_channels_per_worker_max;
1490 		else if (value < 1)
1491 			value = 1;
1492 
1493 		/* store new value */
1494 		rl->param.arg[arg2] = value;
1495 		break;
1496 
1497 	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
1498 		if (value > rl->param.tx_rates_max)
1499 			value = rl->param.tx_rates_max;
1500 		else if (value < 1)
1501 			value = 1;
1502 
1503 		/* store new value */
1504 		rl->param.arg[arg2] = value;
1505 		break;
1506 
1507 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
1508 		/* range check */
1509 		if (value < 1)
1510 			value = 0;
1511 		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
1512 			value = MLX5E_FLD_MAX(cqc, cq_period);
1513 
1514 		/* store new value */
1515 		rl->param.arg[arg2] = value;
1516 
1517 		/* check to avoid down and up the network interface */
1518 		if (was_opened)
1519 			error = mlx5e_rl_refresh_channel_params(rl);
1520 		break;
1521 
1522 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
1523 		/* import TX coal pkts */
1524 		if (value < 1)
1525 			value = 0;
1526 		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
1527 			value = MLX5E_FLD_MAX(cqc, cq_max_count);
1528 
1529 		/* store new value */
1530 		rl->param.arg[arg2] = value;
1531 
1532 		/* check to avoid down and up the network interface */
1533 		if (was_opened)
1534 			error = mlx5e_rl_refresh_channel_params(rl);
1535 		break;
1536 
1537 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
1538 		/* network interface must be down */
1539 		if (was_opened != 0 && mode_modify == 0)
1540 			mlx5e_rl_close_workers(priv);
1541 
1542 		/* import TX coalesce mode */
1543 		if (value != 0)
1544 			value = 1;
1545 
1546 		/* store new value */
1547 		rl->param.arg[arg2] = value;
1548 
1549 		/* restart network interface, if any */
1550 		if (was_opened != 0) {
1551 			if (mode_modify == 0)
1552 				mlx5e_rl_open_workers(priv);
1553 			else
1554 				error = mlx5e_rl_refresh_channel_params(rl);
1555 		}
1556 		break;
1557 
1558 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
1559 		/* network interface must be down */
1560 		if (was_opened)
1561 			mlx5e_rl_close_workers(priv);
1562 
1563 		/* import TX queue size */
1564 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
1565 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
1566 		else if (value > priv->params_ethtool.tx_queue_size_max)
1567 			value = priv->params_ethtool.tx_queue_size_max;
1568 
1569 		/* store actual TX queue size */
1570 		value = 1ULL << order_base_2(value);
1571 
1572 		/* store new value */
1573 		rl->param.arg[arg2] = value;
1574 
1575 		/* verify TX completion factor */
1576 		mlx5e_rl_sync_tx_completion_fact(rl);
1577 
1578 		/* restart network interface, if any */
1579 		if (was_opened)
1580 			mlx5e_rl_open_workers(priv);
1581 		break;
1582 
1583 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
1584 		/* network interface must be down */
1585 		if (was_opened)
1586 			mlx5e_rl_close_workers(priv);
1587 
1588 		/* store new value */
1589 		rl->param.arg[arg2] = value;
1590 
1591 		/* verify parameter */
1592 		mlx5e_rl_sync_tx_completion_fact(rl);
1593 
1594 		/* restart network interface, if any */
1595 		if (was_opened)
1596 			mlx5e_rl_open_workers(priv);
1597 		break;
1598 
1599 	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
1600 		error = mlx5e_rl_tx_limit_add(rl, value);
1601 		break;
1602 
1603 	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
1604 		error = mlx5e_rl_tx_limit_clr(rl, value);
1605 		break;
1606 
1607 	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
1608 		/* range check */
1609 		if (value > rl->param.tx_allowed_deviation_max)
1610 			value = rl->param.tx_allowed_deviation_max;
1611 		else if (value < rl->param.tx_allowed_deviation_min)
1612 			value = rl->param.tx_allowed_deviation_min;
1613 
1614 		MLX5E_RL_WLOCK(rl);
1615 		rl->param.arg[arg2] = value;
1616 		MLX5E_RL_WUNLOCK(rl);
1617 		break;
1618 
1619 	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
1620 		/* range check */
1621 		if (value > rl->param.tx_burst_size_max)
1622 			value = rl->param.tx_burst_size_max;
1623 		else if (value < rl->param.tx_burst_size_min)
1624 			value = rl->param.tx_burst_size_min;
1625 
1626 		MLX5E_RL_WLOCK(rl);
1627 		rl->param.arg[arg2] = value;
1628 		MLX5E_RL_WUNLOCK(rl);
1629 		break;
1630 
1631 	default:
1632 		break;
1633 	}
1634 done:
1635 	PRIV_UNLOCK(priv);
1636 	return (error);
1637 }
1638 
1639 static void
1640 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1641     struct sysctl_oid *node, const char *name, const char *desc)
1642 {
1643 	/*
1644 	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
1645 	 * take care of loading default sysctl value from the kernel
1646 	 * environment, if any:
1647 	 */
1648 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
1649 		/* read-only SYSCTLs */
1650 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1651 		    name, CTLTYPE_U64 | CTLFLAG_RD |
1652 		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1653 	} else {
1654 		if (strstr(name, "_def") != 0) {
1655 #ifdef RATELIMIT_DEBUG
1656 			/* tunable read-only advanced SYSCTLs */
1657 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1658 			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
1659 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1660 #endif
1661 		} else {
1662 			/* read-write SYSCTLs */
1663 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1664 			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
1665 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1666 		}
1667 	}
1668 }
1669 
1670 static void
1671 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1672     struct sysctl_oid *node, const char *name, const char *desc)
1673 {
1674 	/* read-only SYSCTLs */
1675 	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
1676 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
1677 }
1678 
1679 #else
1680 
/*
 * Stub used when the kernel is built without RATELIMIT support.
 * There is nothing to set up; always returns zero.
 */
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	(void)priv;

	return (0);
}
1687 
/*
 * Stub used when the kernel is built without RATELIMIT support.
 * There is nothing to tear down.
 */
void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	(void)priv;
}
1693 
1694 #endif		/* RATELIMIT */
1695