xref: /freebsd/sys/dev/cxgbe/t4_sched.c (revision 25fb30bd9abc492359ad1f66901a06cb8cd08370)
1 /*-
2  * Copyright (c) 2017 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_ratelimit.h"
34 
35 #include <sys/types.h>
36 #include <sys/malloc.h>
37 #include <sys/queue.h>
38 #include <sys/sbuf.h>
39 #include <sys/taskqueue.h>
40 #include <sys/sysctl.h>
41 
42 #include "common/common.h"
43 #include "common/t4_regs.h"
44 #include "common/t4_regs_values.h"
45 #include "common/t4_msg.h"
46 
47 
/*
 * Range check used by the parameter validators in this file.
 *
 * Any negative val is reported as in range; callers that require a value
 * test for "< 0" separately.  A non-negative val is in range when
 * lo <= val <= hi.  Returns non-zero when in range, 0 otherwise.
 */
static int
in_range(int val, int lo, int hi)
{

	if (val < 0)
		return (1);

	return (lo <= val && val <= hi);
}
54 
55 static int
56 set_sched_class_config(struct adapter *sc, int minmax)
57 {
58 	int rc;
59 
60 	if (minmax < 0)
61 		return (EINVAL);
62 
63 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
64 	if (rc)
65 		return (rc);
66 	rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1);
67 	end_synchronized_op(sc, 0);
68 
69 	return (rc);
70 }
71 
72 static int
73 set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
74     int sleep_ok)
75 {
76 	int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
77 	struct port_info *pi;
78 	struct tx_cl_rl_params *tc, old;
79 	bool check_pktsize = false;
80 
81 	if (p->level == SCHED_CLASS_LEVEL_CL_RL)
82 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
83 	else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
84 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
85 	else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
86 		fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
87 	else
88 		return (EINVAL);
89 
90 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
91 		if (p->mode == SCHED_CLASS_MODE_CLASS)
92 			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
93 		else if (p->mode == SCHED_CLASS_MODE_FLOW) {
94 			check_pktsize = true;
95 			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
96 		} else
97 			return (EINVAL);
98 	} else
99 		fw_mode = 0;
100 
101 	/* Valid channel must always be provided. */
102 	if (p->channel < 0)
103 		return (EINVAL);
104 	if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
105 		return (ERANGE);
106 
107 	pi = sc->port[sc->chan_map[p->channel]];
108 	if (pi == NULL)
109 		return (ENXIO);
110 	MPASS(pi->tx_chan == p->channel);
111 	top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */
112 
113 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
114 	    p->level == SCHED_CLASS_LEVEL_CH_RL) {
115 		/*
116 		 * Valid rate (mode, unit and values) must be provided.
117 		 */
118 
119 		if (p->minrate < 0)
120 			p->minrate = 0;
121 		if (p->maxrate < 0)
122 			return (EINVAL);
123 
124 		if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
125 			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
126 			/* ratemode could be relative (%) or absolute. */
127 			if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
128 				fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
129 				/* maxrate is % of port bandwidth. */
130 				if (!in_range(p->minrate, 0, 100) ||
131 				    !in_range(p->maxrate, 0, 100)) {
132 					return (ERANGE);
133 				}
134 			} else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
135 				fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
136 				/* maxrate is absolute value in kbps. */
137 				if (!in_range(p->minrate, 0, top_speed) ||
138 				    !in_range(p->maxrate, 0, top_speed)) {
139 					return (ERANGE);
140 				}
141 			} else
142 				return (EINVAL);
143 		} else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
144 			/* maxrate is the absolute value in pps. */
145 			check_pktsize = true;
146 			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
147 		} else
148 			return (EINVAL);
149 	} else {
150 		MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);
151 
152 		/*
153 		 * Valid weight must be provided.
154 		 */
155 		if (p->weight < 0)
156 		       return (EINVAL);
157 		if (!in_range(p->weight, 1, 99))
158 			return (ERANGE);
159 
160 		fw_rateunit = 0;
161 		fw_ratemode = 0;
162 	}
163 
164 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
165 	    p->level == SCHED_CLASS_LEVEL_CL_WRR) {
166 		/*
167 		 * Valid scheduling class must be provided.
168 		 */
169 		if (p->cl < 0)
170 			return (EINVAL);
171 		if (!in_range(p->cl, 0, sc->chip_params->nsched_cls - 1))
172 			return (ERANGE);
173 	}
174 
175 	if (check_pktsize) {
176 		if (p->pktsize < 0)
177 			return (EINVAL);
178 		if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu))
179 			return (ERANGE);
180 	}
181 
182 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
183 		tc = &pi->sched_params->cl_rl[p->cl];
184 		mtx_lock(&sc->tc_lock);
185 		if (tc->refcount > 0 || tc->flags & (CLRL_SYNC | CLRL_ASYNC))
186 			rc = EBUSY;
187 		else {
188 			tc->flags |= CLRL_SYNC | CLRL_USER;
189 			tc->ratemode = fw_ratemode;
190 			tc->rateunit = fw_rateunit;
191 			tc->mode = fw_mode;
192 			tc->maxrate = p->maxrate;
193 			tc->pktsize = p->pktsize;
194 			rc = 0;
195 			old= *tc;
196 		}
197 		mtx_unlock(&sc->tc_lock);
198 		if (rc != 0)
199 			return (rc);
200 	}
201 
202 	rc = begin_synchronized_op(sc, NULL,
203 	    sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
204 	if (rc != 0) {
205 		if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
206 			mtx_lock(&sc->tc_lock);
207 			*tc = old;
208 			mtx_unlock(&sc->tc_lock);
209 		}
210 		return (rc);
211 	}
212 	rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level, fw_mode,
213 	    fw_rateunit, fw_ratemode, p->channel, p->cl, p->minrate, p->maxrate,
214 	    p->weight, p->pktsize, 0, sleep_ok);
215 	end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);
216 
217 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
218 		mtx_lock(&sc->tc_lock);
219 		MPASS(tc->flags & CLRL_SYNC);
220 		MPASS(tc->flags & CLRL_USER);
221 		MPASS(tc->refcount == 0);
222 
223 		tc->flags &= ~CLRL_SYNC;
224 		if (rc == 0)
225 			tc->flags &= ~CLRL_ERR;
226 		else
227 			tc->flags |= CLRL_ERR;
228 		mtx_unlock(&sc->tc_lock);
229 	}
230 
231 	return (rc);
232 }
233 
234 static void
235 update_tx_sched(void *context, int pending)
236 {
237 	int i, j, rc;
238 	struct port_info *pi;
239 	struct tx_cl_rl_params *tc;
240 	struct adapter *sc = context;
241 	const int n = sc->chip_params->nsched_cls;
242 
243 	mtx_lock(&sc->tc_lock);
244 	for_each_port(sc, i) {
245 		pi = sc->port[i];
246 		tc = &pi->sched_params->cl_rl[0];
247 		for (j = 0; j < n; j++, tc++) {
248 			MPASS(mtx_owned(&sc->tc_lock));
249 			if ((tc->flags & CLRL_ASYNC) == 0)
250 				continue;
251 			mtx_unlock(&sc->tc_lock);
252 
253 			if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
254 			    "t4utxs") != 0) {
255 				mtx_lock(&sc->tc_lock);
256 				continue;
257 			}
258 			rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
259 			    FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
260 			    tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0,
261 			    tc->pktsize, tc->burstsize, 1);
262 			end_synchronized_op(sc, 0);
263 
264 			mtx_lock(&sc->tc_lock);
265 			MPASS(tc->flags & CLRL_ASYNC);
266 			tc->flags &= ~CLRL_ASYNC;
267 			if (rc == 0)
268 				tc->flags &= ~CLRL_ERR;
269 			else
270 				tc->flags |= CLRL_ERR;
271 		}
272 	}
273 	mtx_unlock(&sc->tc_lock);
274 }
275 
276 int
277 t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
278 {
279 
280 	if (p->type != SCHED_CLASS_TYPE_PACKET)
281 		return (EINVAL);
282 
283 	if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG)
284 		return (set_sched_class_config(sc, p->u.config.minmax));
285 
286 	if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS)
287 		return (set_sched_class_params(sc, &p->u.params, 1));
288 
289 	return (EINVAL);
290 }
291 
292 static int
293 bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
294 {
295 	struct tx_cl_rl_params *tc0, *tc;
296 	int rc, old_idx;
297 	uint32_t fw_mnem, fw_class;
298 
299 	if (!(txq->eq.flags & EQ_ALLOCATED))
300 		return (EAGAIN);
301 
302 	mtx_lock(&sc->tc_lock);
303 	if (txq->tc_idx == -2) {
304 		rc = EBUSY;	/* Another bind/unbind in progress already. */
305 		goto done;
306 	}
307 	if (idx == txq->tc_idx) {
308 		rc = 0;		/* No change, nothing to do. */
309 		goto done;
310 	}
311 
312 	tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0];
313 	if (idx != -1) {
314 		/*
315 		 * Bind to a different class at index idx.
316 		 */
317 		tc = &tc0[idx];
318 		if (tc->flags & CLRL_ERR) {
319 			rc = ENXIO;
320 			goto done;
321 		} else {
322 			/*
323 			 * Ok to proceed.  Place a reference on the new class
324 			 * while still holding on to the reference on the
325 			 * previous class, if any.
326 			 */
327 			tc->refcount++;
328 		}
329 	}
330 	/* Mark as busy before letting go of the lock. */
331 	old_idx = txq->tc_idx;
332 	txq->tc_idx = -2;
333 	mtx_unlock(&sc->tc_lock);
334 
335 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
336 	if (rc != 0)
337 		return (rc);
338 	fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
339 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
340 	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
341 	fw_class = idx < 0 ? 0xffffffff : idx;
342 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem, &fw_class);
343 	end_synchronized_op(sc, 0);
344 
345 	mtx_lock(&sc->tc_lock);
346 	MPASS(txq->tc_idx == -2);
347 	if (rc == 0) {
348 		/*
349 		 * Unbind, bind, or bind to a different class succeeded.  Remove
350 		 * the reference on the old traffic class, if any.
351 		 */
352 		if (old_idx != -1) {
353 			tc = &tc0[old_idx];
354 			MPASS(tc->refcount > 0);
355 			tc->refcount--;
356 		}
357 		txq->tc_idx = idx;
358 	} else {
359 		/*
360 		 * Unbind, bind, or bind to a different class failed.  Remove
361 		 * the anticipatory reference on the new traffic class, if any.
362 		 */
363 		if (idx != -1) {
364 			tc = &tc0[idx];
365 			MPASS(tc->refcount > 0);
366 			tc->refcount--;
367 		}
368 		txq->tc_idx = old_idx;
369 	}
370 done:
371 	MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->chip_params->nsched_cls);
372 	mtx_unlock(&sc->tc_lock);
373 	return (rc);
374 }
375 
376 int
377 t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
378 {
379 	struct port_info *pi = NULL;
380 	struct vi_info *vi;
381 	struct sge_txq *txq;
382 	int i, rc;
383 
384 	if (p->port >= sc->params.nports)
385 		return (EINVAL);
386 
387 	/*
388 	 * XXX: cxgbetool allows the user to specify the physical port only.  So
389 	 * we always operate on the main VI.
390 	 */
391 	pi = sc->port[p->port];
392 	vi = &pi->vi[0];
393 
394 	/* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
395 	if (!(vi->flags & VI_INIT_DONE))
396 		return (EAGAIN);
397 	MPASS(vi->ntxq > 0);
398 
399 	if (!in_range(p->queue, 0, vi->ntxq - 1) ||
400 	    !in_range(p->cl, 0, sc->chip_params->nsched_cls - 1))
401 		return (EINVAL);
402 
403 	if (p->queue < 0) {
404 		/*
405 		 * Change the scheduling on all the TX queues for the
406 		 * interface.
407 		 */
408 		for_each_txq(vi, i, txq) {
409 			rc = bind_txq_to_traffic_class(sc, txq, p->cl);
410 			if (rc != 0)
411 				break;
412 		}
413 	} else {
414 		/*
415 		 * If op.queue is non-negative, then we're only changing the
416 		 * scheduling on a single specified TX queue.
417 		 */
418 		txq = &sc->sge.txq[vi->first_txq + p->queue];
419 		rc = bind_txq_to_traffic_class(sc, txq, p->cl);
420 	}
421 
422 	return (rc);
423 }
424 
425 int
426 t4_init_tx_sched(struct adapter *sc)
427 {
428 	int i, j;
429 	const int n = sc->chip_params->nsched_cls;
430 	struct port_info *pi;
431 	struct tx_cl_rl_params *tc;
432 
433 	mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
434 	TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
435 	for_each_port(sc, i) {
436 		pi = sc->port[i];
437 		pi->sched_params = malloc(sizeof(*pi->sched_params) +
438 		    n * sizeof(*tc), M_CXGBE, M_ZERO | M_WAITOK);
439 		tc = &pi->sched_params->cl_rl[0];
440 		for (j = 0; j < n; j++, tc++) {
441 			tc->refcount = 0;
442 			tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
443 			tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
444 			tc->mode = FW_SCHED_PARAMS_MODE_CLASS;
445 			tc->maxrate = 1000 * 1000;	/* 1 Gbps.  Arbitrary */
446 
447 			if (t4_sched_params_cl_rl_kbps(sc, pi->tx_chan, j,
448 			    tc->mode, tc->maxrate, tc->pktsize, 1) != 0)
449 				tc->flags = CLRL_ERR;
450 		}
451 	}
452 
453 	return (0);
454 }
455 
456 int
457 t4_free_tx_sched(struct adapter *sc)
458 {
459 	int i;
460 
461 	taskqueue_drain(taskqueue_thread, &sc->tc_task);
462 
463 	for_each_port(sc, i) {
464 		if (sc->port[i] != NULL)
465 			free(sc->port[i]->sched_params, M_CXGBE);
466 	}
467 
468 	if (mtx_initialized(&sc->tc_lock))
469 		mtx_destroy(&sc->tc_lock);
470 
471 	return (0);
472 }
473 
474 void
475 t4_update_tx_sched(struct adapter *sc)
476 {
477 
478 	taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
479 }
480 
481 int
482 t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
483     int *tc_idx)
484 {
485 	int rc = 0, fa = -1, i, pktsize, burstsize;
486 	bool update;
487 	struct tx_cl_rl_params *tc;
488 	struct port_info *pi;
489 
490 	MPASS(port_id >= 0 && port_id < sc->params.nports);
491 
492 	pi = sc->port[port_id];
493 	if (pi->sched_params->pktsize > 0)
494 		pktsize = pi->sched_params->pktsize;
495 	else
496 		pktsize = pi->vi[0].ifp->if_mtu;
497 	if (pi->sched_params->burstsize > 0)
498 		burstsize = pi->sched_params->burstsize;
499 	else
500 		burstsize = pktsize * 4;
501 	tc = &pi->sched_params->cl_rl[0];
502 
503 	update = false;
504 	mtx_lock(&sc->tc_lock);
505 	for (i = 0; i < sc->chip_params->nsched_cls; i++, tc++) {
506 		if (fa < 0 && tc->refcount == 0 && !(tc->flags & CLRL_USER))
507 			fa = i;		/* first available */
508 
509 		if (tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
510 		    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
511 		    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
512 		    tc->maxrate == maxrate && tc->pktsize == pktsize &&
513 		    tc->burstsize == burstsize) {
514 			tc->refcount++;
515 			*tc_idx = i;
516 			if ((tc->flags & (CLRL_ERR | CLRL_ASYNC | CLRL_SYNC)) ==
517 			    CLRL_ERR) {
518 				update = true;
519 			}
520 			goto done;
521 		}
522 	}
523 	/* Not found */
524 	MPASS(i == sc->chip_params->nsched_cls);
525 	if (fa != -1) {
526 		tc = &pi->sched_params->cl_rl[fa];
527 		tc->refcount = 1;
528 		tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
529 		tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
530 		tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
531 		tc->maxrate = maxrate;
532 		tc->pktsize = pktsize;
533 		tc->burstsize = burstsize;
534 		*tc_idx = fa;
535 		update = true;
536 	} else {
537 		*tc_idx = -1;
538 		rc = ENOSPC;
539 	}
540 done:
541 	mtx_unlock(&sc->tc_lock);
542 	if (update) {
543 		tc->flags |= CLRL_ASYNC;
544 		t4_update_tx_sched(sc);
545 	}
546 	return (rc);
547 }
548 
549 void
550 t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
551 {
552 	struct tx_cl_rl_params *tc;
553 
554 	MPASS(port_id >= 0 && port_id < sc->params.nports);
555 	MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
556 
557 	mtx_lock(&sc->tc_lock);
558 	tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx];
559 	MPASS(tc->refcount > 0);
560 	tc->refcount--;
561 	mtx_unlock(&sc->tc_lock);
562 }
563 
564 int
565 sysctl_tc(SYSCTL_HANDLER_ARGS)
566 {
567 	struct vi_info *vi = arg1;
568 	struct port_info *pi;
569 	struct adapter *sc;
570 	struct sge_txq *txq;
571 	int qidx = arg2, rc, tc_idx;
572 
573 	MPASS(qidx >= 0 && qidx < vi->ntxq);
574 	pi = vi->pi;
575 	sc = pi->adapter;
576 	txq = &sc->sge.txq[vi->first_txq + qidx];
577 
578 	tc_idx = txq->tc_idx;
579 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
580 	if (rc != 0 || req->newptr == NULL)
581 		return (rc);
582 
583 	if (sc->flags & IS_VF)
584 		return (EPERM);
585 	if (!in_range(tc_idx, 0, sc->chip_params->nsched_cls - 1))
586 		return (EINVAL);
587 
588 	return (bind_txq_to_traffic_class(sc, txq, tc_idx));
589 }
590 
591 int
592 sysctl_tc_params(SYSCTL_HANDLER_ARGS)
593 {
594 	struct adapter *sc = arg1;
595 	struct tx_cl_rl_params tc;
596 	struct sbuf *sb;
597 	int i, rc, port_id, mbps, gbps;
598 
599 	rc = sysctl_wire_old_buffer(req, 0);
600 	if (rc != 0)
601 		return (rc);
602 
603 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
604 	if (sb == NULL)
605 		return (ENOMEM);
606 
607 	port_id = arg2 >> 16;
608 	MPASS(port_id < sc->params.nports);
609 	MPASS(sc->port[port_id] != NULL);
610 	i = arg2 & 0xffff;
611 	MPASS(i < sc->chip_params->nsched_cls);
612 
613 	mtx_lock(&sc->tc_lock);
614 	tc = sc->port[port_id]->sched_params->cl_rl[i];
615 	mtx_unlock(&sc->tc_lock);
616 
617 	switch (tc.rateunit) {
618 	case SCHED_CLASS_RATEUNIT_BITS:
619 		switch (tc.ratemode) {
620 		case SCHED_CLASS_RATEMODE_REL:
621 			/* XXX: top speed or actual link speed? */
622 			gbps = port_top_speed(sc->port[port_id]);
623 			sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
624 			break;
625 		case SCHED_CLASS_RATEMODE_ABS:
626 			mbps = tc.maxrate / 1000;
627 			gbps = tc.maxrate / 1000000;
628 			if (tc.maxrate == gbps * 1000000)
629 				sbuf_printf(sb, "%uGbps", gbps);
630 			else if (tc.maxrate == mbps * 1000)
631 				sbuf_printf(sb, "%uMbps", mbps);
632 			else
633 				sbuf_printf(sb, "%uKbps", tc.maxrate);
634 			break;
635 		default:
636 			rc = ENXIO;
637 			goto done;
638 		}
639 		break;
640 	case SCHED_CLASS_RATEUNIT_PKTS:
641 		sbuf_printf(sb, "%upps", tc.maxrate);
642 		break;
643 	default:
644 		rc = ENXIO;
645 		goto done;
646 	}
647 
648 	switch (tc.mode) {
649 	case SCHED_CLASS_MODE_CLASS:
650 		sbuf_printf(sb, " aggregate");
651 		break;
652 	case SCHED_CLASS_MODE_FLOW:
653 		sbuf_printf(sb, " per-flow");
654 		if (tc.pktsize > 0)
655 			sbuf_printf(sb, " pkt-size %u", tc.pktsize);
656 		if (tc.burstsize > 0)
657 			sbuf_printf(sb, " burst-size %u", tc.burstsize);
658 		break;
659 	default:
660 		rc = ENXIO;
661 		goto done;
662 	}
663 
664 done:
665 	if (rc == 0)
666 		rc = sbuf_finish(sb);
667 	sbuf_delete(sb);
668 
669 	return (rc);
670 }
671 
672 #ifdef RATELIMIT
673 void
674 t4_init_etid_table(struct adapter *sc)
675 {
676 	int i;
677 	struct tid_info *t;
678 
679 	if (!is_ethoffload(sc))
680 		return;
681 
682 	t = &sc->tids;
683 	MPASS(t->netids > 0);
684 
685 	mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
686 	t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
687 			M_ZERO | M_WAITOK);
688 	t->efree = t->etid_tab;
689 	t->etids_in_use = 0;
690 	for (i = 1; i < t->netids; i++)
691 		t->etid_tab[i - 1].next = &t->etid_tab[i];
692 	t->etid_tab[t->netids - 1].next = NULL;
693 }
694 
695 void
696 t4_free_etid_table(struct adapter *sc)
697 {
698 	struct tid_info *t;
699 
700 	if (!is_ethoffload(sc))
701 		return;
702 
703 	t = &sc->tids;
704 	MPASS(t->netids > 0);
705 
706 	free(t->etid_tab, M_CXGBE);
707 	t->etid_tab = NULL;
708 
709 	if (mtx_initialized(&t->etid_lock))
710 		mtx_destroy(&t->etid_lock);
711 }
712 
713 /* etid services */
714 static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *);
715 static void free_etid(struct adapter *, int);
716 
717 static int
718 alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst)
719 {
720 	struct tid_info *t = &sc->tids;
721 	int etid = -1;
722 
723 	mtx_lock(&t->etid_lock);
724 	if (t->efree) {
725 		union etid_entry *p = t->efree;
726 
727 		etid = p - t->etid_tab + t->etid_base;
728 		t->efree = p->next;
729 		p->cst = cst;
730 		t->etids_in_use++;
731 	}
732 	mtx_unlock(&t->etid_lock);
733 	return (etid);
734 }
735 
736 struct cxgbe_rate_tag *
737 lookup_etid(struct adapter *sc, int etid)
738 {
739 	struct tid_info *t = &sc->tids;
740 
741 	return (t->etid_tab[etid - t->etid_base].cst);
742 }
743 
744 static void
745 free_etid(struct adapter *sc, int etid)
746 {
747 	struct tid_info *t = &sc->tids;
748 	union etid_entry *p = &t->etid_tab[etid - t->etid_base];
749 
750 	mtx_lock(&t->etid_lock);
751 	p->next = t->efree;
752 	t->efree = p;
753 	t->etids_in_use--;
754 	mtx_unlock(&t->etid_lock);
755 }
756 
757 int
758 cxgbe_rate_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
759     struct m_snd_tag **pt)
760 {
761 	int rc, schedcl;
762 	struct vi_info *vi = ifp->if_softc;
763 	struct port_info *pi = vi->pi;
764 	struct adapter *sc = pi->adapter;
765 	struct cxgbe_rate_tag *cst;
766 
767 	MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT);
768 
769 	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
770 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
771 	if (rc != 0)
772 		return (rc);
773 	MPASS(schedcl >= 0 && schedcl < sc->chip_params->nsched_cls);
774 
775 	cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
776 	if (cst == NULL) {
777 failed:
778 		t4_release_cl_rl(sc, pi->port_id, schedcl);
779 		return (ENOMEM);
780 	}
781 
782 	cst->etid = alloc_etid(sc, cst);
783 	if (cst->etid < 0) {
784 		free(cst, M_CXGBE);
785 		goto failed;
786 	}
787 
788 	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
789 	mbufq_init(&cst->pending_tx, INT_MAX);
790 	mbufq_init(&cst->pending_fwack, INT_MAX);
791 	m_snd_tag_init(&cst->com, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
792 	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
793 	cst->adapter = sc;
794 	cst->port_id = pi->port_id;
795 	cst->schedcl = schedcl;
796 	cst->max_rate = params->rate_limit.max_rate;
797 	cst->tx_credits = sc->params.eo_wr_cred;
798 	cst->tx_total = cst->tx_credits;
799 	cst->plen = 0;
800 	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
801 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
802 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
803 
804 	/*
805 	 * Queues will be selected later when the connection flowid is available.
806 	 */
807 
808 	*pt = &cst->com;
809 	return (0);
810 }
811 
812 /*
813  * Change in parameters, no change in ifp.
814  */
815 int
816 cxgbe_rate_tag_modify(struct m_snd_tag *mst,
817     union if_snd_tag_modify_params *params)
818 {
819 	int rc, schedcl;
820 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
821 	struct adapter *sc = cst->adapter;
822 
823 	/* XXX: is schedcl -1 ok here? */
824 	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->chip_params->nsched_cls);
825 
826 	mtx_lock(&cst->lock);
827 	MPASS(cst->flags & EO_SND_TAG_REF);
828 	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
829 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
830 	if (rc != 0)
831 		return (rc);
832 	MPASS(schedcl >= 0 && schedcl < sc->chip_params->nsched_cls);
833 	t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
834 	cst->schedcl = schedcl;
835 	cst->max_rate = params->rate_limit.max_rate;
836 	mtx_unlock(&cst->lock);
837 
838 	return (0);
839 }
840 
841 int
842 cxgbe_rate_tag_query(struct m_snd_tag *mst,
843     union if_snd_tag_query_params *params)
844 {
845 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
846 
847 	params->rate_limit.max_rate = cst->max_rate;
848 
849 #define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
850 	params->rate_limit.queue_level =
851 		(cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;
852 
853 	return (0);
854 }
855 
856 /*
857  * Unlocks cst and frees it.
858  */
859 void
860 cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst)
861 {
862 	struct adapter *sc = cst->adapter;
863 
864 	mtx_assert(&cst->lock, MA_OWNED);
865 	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
866 	MPASS(cst->tx_credits == cst->tx_total);
867 	MPASS(cst->plen == 0);
868 	MPASS(mbufq_first(&cst->pending_tx) == NULL);
869 	MPASS(mbufq_first(&cst->pending_fwack) == NULL);
870 
871 	if (cst->etid >= 0)
872 		free_etid(sc, cst->etid);
873 	if (cst->schedcl != -1)
874 		t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
875 	mtx_unlock(&cst->lock);
876 	mtx_destroy(&cst->lock);
877 	free(cst, M_CXGBE);
878 }
879 
880 void
881 cxgbe_rate_tag_free(struct m_snd_tag *mst)
882 {
883 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
884 
885 	mtx_lock(&cst->lock);
886 
887 	/* The kernel is done with the snd_tag.  Remove its reference. */
888 	MPASS(cst->flags & EO_SND_TAG_REF);
889 	cst->flags &= ~EO_SND_TAG_REF;
890 
891 	if (cst->ncompl == 0) {
892 		/*
893 		 * No fw4_ack in flight.  Free the tag right away if there are
894 		 * no outstanding credits.  Request the firmware to return all
895 		 * credits for the etid otherwise.
896 		 */
897 		if (cst->tx_credits == cst->tx_total) {
898 			cxgbe_rate_tag_free_locked(cst);
899 			return;	/* cst is gone. */
900 		}
901 		send_etid_flush_wr(cst);
902 	}
903 	mtx_unlock(&cst->lock);
904 }
905 
906 void
907 cxgbe_ratelimit_query(struct ifnet *ifp, struct if_ratelimit_query_results *q)
908 {
909 	struct vi_info *vi = ifp->if_softc;
910 	struct adapter *sc = vi->adapter;
911 
912 	q->rate_table = NULL;
913 	q->flags = RT_IS_SELECTABLE;
914 	/*
915 	 * Absolute max limits from the firmware configuration.  Practical
916 	 * limits depend on the burstsize, pktsize (ifp->if_mtu ultimately) and
917 	 * the card's cclk.
918 	 */
919 	q->max_flows = sc->tids.netids;
920 	q->number_of_rates = sc->chip_params->nsched_cls;
921 	q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */
922 
923 #if 1
924 	if (chip_id(sc) < CHELSIO_T6) {
925 		/* Based on testing by rrs@ with a T580 at burstsize = 4. */
926 		MPASS(q->min_segment_burst == 4);
927 		q->max_flows = min(4000, q->max_flows);
928 	} else {
929 		/* XXX: TBD, carried forward from T5 for now. */
930 		q->max_flows = min(4000, q->max_flows);
931 	}
932 
933 	/*
934 	 * XXX: tcp_ratelimit.c grabs all available rates on link-up before it
935 	 * even knows whether hw pacing will be used or not.  This prevents
936 	 * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or
937 	 * the private ioctls from using any of traffic classes.
938 	 *
939 	 * Underreport the number of rates to tcp_ratelimit so that it doesn't
940 	 * hog all of them.  This can be removed if/when tcp_ratelimit switches
941 	 * to making its allocations on first-use rather than link-up.  There is
942 	 * nothing wrong with one particular consumer reserving all the classes
943 	 * but it should do so only if it'll actually use hw rate limiting.
944 	 */
945 	q->number_of_rates /= 4;
946 #endif
947 }
948 #endif
949