/*-
 * Copyright (c) 2017 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/sysctl.h>

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_msg.h"

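/*
 * Range check helper.  A negative value means "not specified" and is treated
 * as in range here; callers that require a value check for < 0 themselves.
 */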
static int
in_range(int val, int lo, int hi)
{

	return (val < 0 || (val <= hi && val >= lo));
}

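/*
 * Send the packet scheduler's global configuration (the "minmax" setting of
 * the sched-class config command) to the firmware.
 */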
static int
set_sched_class_config(struct adapter *sc, int minmax)
{
	int rc;

	if (minmax < 0)
		return (EINVAL);

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
	if (rc)
		return (rc);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else
		rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1);
	end_synchronized_op(sc, 0);

	return (rc);
}

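/*
 * Validate the requested scheduling class parameters, translate them to their
 * firmware encodings, and program the class.  For a class-level rate limiter
 * (CL_RL) the driver's software state for the class is updated as well.
 */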
static int
set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
    int sleep_ok)
{
	int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
	struct port_info *pi;
	struct tx_cl_rl_params *tc, old;
	bool check_pktsize = false;

	if (p->level == SCHED_CLASS_LEVEL_CL_RL)
		fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
	else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
		fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
	else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
		fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
	else
		return (EINVAL);

	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
		if (p->mode == SCHED_CLASS_MODE_CLASS)
			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
		else if (p->mode == SCHED_CLASS_MODE_FLOW) {
			check_pktsize = true;
			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
		} else
			return (EINVAL);
	} else
		fw_mode = 0;

	/* Valid channel must always be provided. */
	if (p->channel < 0)
		return (EINVAL);
	if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
		return (ERANGE);

	pi = sc->port[sc->chan_map[p->channel]];
	if (pi == NULL)
		return (ENXIO);
	MPASS(pi->tx_chan == p->channel);
	top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */

	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
	    p->level == SCHED_CLASS_LEVEL_CH_RL) {
		/*
		 * Valid rate (mode, unit and values) must be provided.
		 */

		if (p->minrate < 0)
			p->minrate = 0;
		if (p->maxrate < 0)
			return (EINVAL);

		if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
			/* ratemode could be relative (%) or absolute. */
			if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
				fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
				/* maxrate is % of port bandwidth. */
				if (!in_range(p->minrate, 0, 100) ||
				    !in_range(p->maxrate, 0, 100)) {
					return (ERANGE);
				}
			} else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
				fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
				/* maxrate is absolute value in kbps. */
				if (!in_range(p->minrate, 0, top_speed) ||
				    !in_range(p->maxrate, 0, top_speed)) {
					return (ERANGE);
				}
			} else
				return (EINVAL);
		} else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
			/* maxrate is the absolute value in pps. */
			check_pktsize = true;
			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
			/* pps is absolute; also keeps fw_ratemode initialized. */
			fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
		} else
			return (EINVAL);
	} else {
		MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);

		/*
		 * Valid weight must be provided.
		 */
		if (p->weight < 0)
			return (EINVAL);
		if (!in_range(p->weight, 1, 99))
			return (ERANGE);

		fw_rateunit = 0;
		fw_ratemode = 0;
	}

	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
	    p->level == SCHED_CLASS_LEVEL_CL_WRR) {
		/*
		 * Valid scheduling class must be provided.
		 */
		if (p->cl < 0)
			return (EINVAL);
		if (!in_range(p->cl, 0, sc->params.nsched_cls - 1))
			return (ERANGE);
	}

	if (check_pktsize) {
		if (p->pktsize < 0)
			return (EINVAL);
		if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu))
			return (ERANGE);
	}

	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
		tc = &pi->sched_params->cl_rl[p->cl];
		mtx_lock(&sc->tc_lock);
		if (tc->refcount > 0 || tc->state == CS_HW_UPDATE_IN_PROGRESS)
			rc = EBUSY;
		else {
			old = *tc;

			tc->flags |= CF_USER;
			tc->state = CS_HW_UPDATE_IN_PROGRESS;
			tc->ratemode = fw_ratemode;
			tc->rateunit = fw_rateunit;
			tc->mode = fw_mode;
			tc->maxrate = p->maxrate;
			tc->pktsize = p->pktsize;
			rc = 0;
		}
		mtx_unlock(&sc->tc_lock);
		if (rc != 0)
			return (rc);
	}

	rc = begin_synchronized_op(sc, NULL,
	    sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
	if (rc != 0) {
		if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
			mtx_lock(&sc->tc_lock);
			MPASS(tc->refcount == 0);
			MPASS(tc->flags & CF_USER);
			MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);
			*tc = old;
			mtx_unlock(&sc->tc_lock);
		}
		return (rc);
	}
	if (!hw_off_limits(sc)) {
		rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level,
		    fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl,
		    p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok);
	}
	end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);

	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
		mtx_lock(&sc->tc_lock);
		MPASS(tc->refcount == 0);
		MPASS(tc->flags & CF_USER);
		MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);

		if (rc == 0)
			tc->state = CS_HW_CONFIGURED;
		else {
			/* parameters failed so we don't park at params_set */
			tc->state = CS_UNINITIALIZED;
			tc->flags &= ~CF_USER;
			CH_ERR(pi, "failed to configure traffic class %d: %d.  "
			    "params: mode %d, rateunit %d, ratemode %d, "
			    "channel %d, minrate %d, maxrate %d, pktsize %d, "
			    "burstsize %d\n", p->cl, rc, fw_mode, fw_rateunit,
			    fw_ratemode, p->channel, p->minrate, p->maxrate,
			    p->pktsize, 0);
		}
		mtx_unlock(&sc->tc_lock);
	}

	return (rc);
}

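/*
 * Taskqueue handler that writes out the parameters of every traffic class
 * with a pending hardware update (state CS_HW_UPDATE_REQUESTED).
 */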
static void
update_tx_sched(void *context, int pending)
{
	int i, j, rc;
	struct port_info *pi;
	struct tx_cl_rl_params *tc;
	struct adapter *sc = context;
	const int n = sc->params.nsched_cls;

	mtx_lock(&sc->tc_lock);
	for_each_port(sc, i) {
		pi = sc->port[i];
		tc = &pi->sched_params->cl_rl[0];
		for (j = 0; j < n; j++, tc++) {
			MPASS(mtx_owned(&sc->tc_lock));
			if (tc->state != CS_HW_UPDATE_REQUESTED)
				continue;
			mtx_unlock(&sc->tc_lock);

			if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
			    "t4utxs") != 0) {
				mtx_lock(&sc->tc_lock);
				continue;
			}
			rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
			    FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
			    tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0,
			    tc->pktsize, tc->burstsize, 1);
			end_synchronized_op(sc, 0);

			mtx_lock(&sc->tc_lock);
			MPASS(tc->state == CS_HW_UPDATE_REQUESTED);
			if (rc == 0) {
				tc->state = CS_HW_CONFIGURED;
				continue;
			}
			/* parameters failed so we try to avoid params_set */
			if (tc->refcount > 0)
				tc->state = CS_PARAMS_SET;
			else
				tc->state = CS_UNINITIALIZED;
			CH_ERR(pi, "failed to configure traffic class %d: %d.  "
			    "params: mode %d, rateunit %d, ratemode %d, "
			    "channel %d, minrate %d, maxrate %d, pktsize %d, "
			    "burstsize %d\n", j, rc, tc->mode, tc->rateunit,
			    tc->ratemode, pi->tx_chan, 0, tc->maxrate,
			    tc->pktsize, tc->burstsize);
		}
	}
	mtx_unlock(&sc->tc_lock);
}

int
t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
{

	if (p->type != SCHED_CLASS_TYPE_PACKET)
		return (EINVAL);

	if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG)
		return (set_sched_class_config(sc, p->u.config.minmax));

	if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS)
		return (set_sched_class_params(sc, &p->u.params, 1));

	return (EINVAL);
}

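/*
 * Bind the tx queue to the traffic class at index idx, or unbind it if idx is
 * -1.  txq->tc_idx is set to -2 while a bind/unbind is in progress.
 */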
static int
bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
{
	struct tx_cl_rl_params *tc0, *tc;
	int rc, old_idx;
	uint32_t fw_mnem, fw_class;

	if (!(txq->eq.flags & EQ_HW_ALLOCATED))
		return (ENXIO);

	/* -1 means unbind; reject any other out-of-range index up front. */
	if (idx < -1 || idx >= sc->params.nsched_cls)
		return (EINVAL);

	mtx_lock(&sc->tc_lock);
	if (txq->tc_idx == -2) {
		rc = EBUSY;	/* Another bind/unbind in progress already. */
		goto done;
	}
	if (idx == txq->tc_idx) {
		rc = 0;		/* No change, nothing to do. */
		goto done;
	}

	tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0];
	if (idx != -1) {
		/*
		 * Bind to a different class at index idx.
		 */
		tc = &tc0[idx];
		if (tc->state != CS_HW_CONFIGURED) {
			rc = ENXIO;
			goto done;
		} else {
			/*
			 * Ok to proceed.  Place a reference on the new class
			 * while still holding on to the reference on the
			 * previous class, if any.
			 */
			tc->refcount++;
		}
	}
	/* Mark as busy before letting go of the lock. */
	old_idx = txq->tc_idx;
	txq->tc_idx = -2;
	mtx_unlock(&sc->tc_lock);

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
	if (rc == 0) {
		fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
		    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
		fw_class = idx < 0 ? 0xffffffff : idx;
		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem,
		    &fw_class);
		end_synchronized_op(sc, 0);
	}

	mtx_lock(&sc->tc_lock);
	MPASS(txq->tc_idx == -2);
	if (rc == 0) {
		/*
		 * Unbind, bind, or bind to a different class succeeded.  Remove
		 * the reference on the old traffic class, if any.
		 */
		if (old_idx != -1) {
			tc = &tc0[old_idx];
			MPASS(tc->refcount > 0);
			tc->refcount--;
		}
		txq->tc_idx = idx;
	} else {
		/*
		 * Unbind, bind, or bind to a different class failed.  Remove
		 * the anticipatory reference on the new traffic class, if any.
		 */
		if (idx != -1) {
			tc = &tc0[idx];
			MPASS(tc->refcount > 0);
			tc->refcount--;
		}
		txq->tc_idx = old_idx;
	}
done:
	MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->params.nsched_cls);
	mtx_unlock(&sc->tc_lock);
	return (rc);
}

int
t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
{
	struct port_info *pi = NULL;
	struct vi_info *vi;
	struct sge_txq *txq;
	int i, rc;

	if (p->port >= sc->params.nports)
		return (EINVAL);

	/*
	 * XXX: cxgbetool allows the user to specify the physical port only.  So
	 * we always operate on the main VI.
	 */
	pi = sc->port[p->port];
	vi = &pi->vi[0];

	/* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
	if (!(vi->flags & VI_INIT_DONE))
		return (EAGAIN);
	MPASS(vi->ntxq > 0);

	if (!in_range(p->queue, 0, vi->ntxq - 1) ||
	    !in_range(p->cl, 0, sc->params.nsched_cls - 1))
		return (EINVAL);

	if (p->queue < 0) {
		/*
		 * Change the scheduling on all the TX queues for the
		 * interface.
		 */
		for_each_txq(vi, i, txq) {
			rc = bind_txq_to_traffic_class(sc, txq, p->cl);
			if (rc != 0)
				break;
		}
	} else {
		/*
		 * p->queue is non-negative, so we're only changing the
		 * scheduling on a single specified TX queue.
		 */
		txq = &sc->sge.txq[vi->first_txq + p->queue];
		rc = bind_txq_to_traffic_class(sc, txq, p->cl);
	}

	return (rc);
}

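/* Allocate the per-port traffic class tables and set up the update task. */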
int
t4_init_tx_sched(struct adapter *sc)
{
	int i;
	const int n = sc->params.nsched_cls;
	struct port_info *pi;

	mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
	TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
	for_each_port(sc, i) {
		pi = sc->port[i];
		pi->sched_params = malloc(sizeof(*pi->sched_params) +
		    n * sizeof(struct tx_cl_rl_params), M_CXGBE,
		    M_ZERO | M_WAITOK);
	}

	return (0);
}

int
t4_free_tx_sched(struct adapter *sc)
{
	int i;

	taskqueue_drain(taskqueue_thread, &sc->tc_task);

	for_each_port(sc, i) {
		if (sc->port[i] != NULL)
			free(sc->port[i]->sched_params, M_CXGBE);
	}

	if (mtx_initialized(&sc->tc_lock))
		mtx_destroy(&sc->tc_lock);

	return (0);
}

void
t4_update_tx_sched(struct adapter *sc)
{

	taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
}

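/*
 * Find a flow-mode class that already matches the requested rate limit (in
 * kbps) and take a reference on it, or claim a free class and set it up with
 * the requested parameters.  A hardware update is scheduled if the class
 * isn't configured in the hardware already.
 */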
int
t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
    int *tc_idx)
{
	int rc = 0, fa, fa2, i, pktsize, burstsize;
	bool update;
	struct tx_cl_rl_params *tc;
	struct port_info *pi;

	MPASS(port_id >= 0 && port_id < sc->params.nports);

	pi = sc->port[port_id];
	if (pi->sched_params->pktsize > 0)
		pktsize = pi->sched_params->pktsize;
	else
		pktsize = pi->vi[0].ifp->if_mtu;
	if (pi->sched_params->burstsize > 0)
		burstsize = pi->sched_params->burstsize;
	else
		burstsize = pktsize * 4;
	tc = &pi->sched_params->cl_rl[0];

	update = false;
	fa = fa2 = -1;
	mtx_lock(&sc->tc_lock);
	for (i = 0; i < sc->params.nsched_cls; i++, tc++) {
		if (tc->state >= CS_PARAMS_SET &&
		    tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
		    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
		    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
		    tc->maxrate == maxrate && tc->pktsize == pktsize &&
		    tc->burstsize == burstsize) {
			tc->refcount++;
			*tc_idx = i;
			if (tc->state == CS_PARAMS_SET) {
				tc->state = CS_HW_UPDATE_REQUESTED;
				update = true;
			}
			goto done;
		}

		if (fa < 0 && tc->state == CS_UNINITIALIZED) {
			MPASS(tc->refcount == 0);
			fa = i;		/* first available, never used. */
		}
		if (fa2 < 0 && tc->refcount == 0 && !(tc->flags & CF_USER)) {
			fa2 = i;	/* first available, used previously. */
		}
	}
	/* Not found */
	MPASS(i == sc->params.nsched_cls);
	if (fa == -1)
		fa = fa2;
	if (fa == -1) {
		*tc_idx = -1;
		rc = ENOSPC;
	} else {
		MPASS(fa >= 0 && fa < sc->params.nsched_cls);
		tc = &pi->sched_params->cl_rl[fa];
		MPASS(!(tc->flags & CF_USER));
		MPASS(tc->refcount == 0);

		tc->refcount = 1;
		tc->state = CS_HW_UPDATE_REQUESTED;
		tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
		tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
		tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
		tc->maxrate = maxrate;
		tc->pktsize = pktsize;
		tc->burstsize = burstsize;
		*tc_idx = fa;
		update = true;
	}
done:
	mtx_unlock(&sc->tc_lock);
	if (update)
		t4_update_tx_sched(sc);
	return (rc);
}

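/* Release a reference on a traffic class. */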
void
t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
{
	struct tx_cl_rl_params *tc;

	MPASS(port_id >= 0 && port_id < sc->params.nports);
	MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);

	mtx_lock(&sc->tc_lock);
	tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx];
	MPASS(tc->refcount > 0);
	tc->refcount--;
	mtx_unlock(&sc->tc_lock);
}

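/*
 * Sysctl handler for a tx queue's traffic class.  A read returns the current
 * binding; a write binds the queue to the requested class (-1 unbinds).
 */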
int
sysctl_tc(SYSCTL_HANDLER_ARGS)
{
	struct vi_info *vi = arg1;
	struct adapter *sc = vi->adapter;
	struct sge_txq *txq;
	int qidx = arg2, rc, tc_idx;

	MPASS(qidx >= vi->first_txq && qidx < vi->first_txq + vi->ntxq);

	txq = &sc->sge.txq[qidx];
	tc_idx = txq->tc_idx;
	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
	if (rc != 0 || req->newptr == NULL)
		return (rc);

	if (sc->flags & IS_VF)
		return (EPERM);
	if (!in_range(tc_idx, 0, sc->params.nsched_cls - 1))
		return (EINVAL);

	return (bind_txq_to_traffic_class(sc, txq, tc_idx));
}

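/*
 * Sysctl handler that describes a traffic class: its rate limit, rate unit,
 * and mode, as currently configured.
 */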
int
sysctl_tc_params(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct tx_cl_rl_params tc;
	struct sbuf *sb;
	int i, rc, port_id, mbps, gbps;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
	if (sb == NULL)
		return (ENOMEM);

	port_id = arg2 >> 16;
	MPASS(port_id < sc->params.nports);
	MPASS(sc->port[port_id] != NULL);
	i = arg2 & 0xffff;
	MPASS(i < sc->params.nsched_cls);

	mtx_lock(&sc->tc_lock);
	tc = sc->port[port_id]->sched_params->cl_rl[i];
	mtx_unlock(&sc->tc_lock);

	if (tc.state < CS_PARAMS_SET) {
		sbuf_printf(sb, "uninitialized");
		goto done;
	}

	switch (tc.rateunit) {
	case SCHED_CLASS_RATEUNIT_BITS:
		switch (tc.ratemode) {
		case SCHED_CLASS_RATEMODE_REL:
			/* XXX: top speed or actual link speed? */
			gbps = port_top_speed(sc->port[port_id]);
			sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
			break;
		case SCHED_CLASS_RATEMODE_ABS:
			mbps = tc.maxrate / 1000;
			gbps = tc.maxrate / 1000000;
			if (tc.maxrate == gbps * 1000000)
				sbuf_printf(sb, "%uGbps", gbps);
			else if (tc.maxrate == mbps * 1000)
				sbuf_printf(sb, "%uMbps", mbps);
			else
				sbuf_printf(sb, "%uKbps", tc.maxrate);
			break;
		default:
			rc = ENXIO;
			goto done;
		}
		break;
	case SCHED_CLASS_RATEUNIT_PKTS:
		sbuf_printf(sb, "%upps", tc.maxrate);
		break;
	default:
		rc = ENXIO;
		goto done;
	}

	switch (tc.mode) {
	case SCHED_CLASS_MODE_CLASS:
		/* Note that pktsize and burstsize are not used in this mode. */
		sbuf_printf(sb, " aggregate");
		break;
	case SCHED_CLASS_MODE_FLOW:
		sbuf_printf(sb, " per-flow");
		if (tc.pktsize > 0)
			sbuf_printf(sb, " pkt-size %u", tc.pktsize);
		if (tc.burstsize > 0)
			sbuf_printf(sb, " burst-size %u", tc.burstsize);
		break;
	default:
		rc = ENXIO;
		goto done;
	}

done:
	if (rc == 0)
		rc = sbuf_finish(sb);
	sbuf_delete(sb);

	return (rc);
}

#ifdef RATELIMIT
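/*
 * Set up the etid table with its unused entries threaded onto a free list.
 */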
void
t4_init_etid_table(struct adapter *sc)
{
	int i;
	struct tid_info *t;

	if (!is_ethoffload(sc))
		return;

	t = &sc->tids;
	MPASS(t->netids > 0);

	mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
	t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
	    M_ZERO | M_WAITOK);
	t->efree = t->etid_tab;
	t->etids_in_use = 0;
	for (i = 1; i < t->netids; i++)
		t->etid_tab[i - 1].next = &t->etid_tab[i];
	t->etid_tab[t->netids - 1].next = NULL;
}

void
t4_free_etid_table(struct adapter *sc)
{
	struct tid_info *t;

	if (!is_ethoffload(sc))
		return;

	t = &sc->tids;
	MPASS(t->netids > 0);

	free(t->etid_tab, M_CXGBE);
	t->etid_tab = NULL;

	if (mtx_initialized(&t->etid_lock))
		mtx_destroy(&t->etid_lock);
}

/* etid services */
static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *);
static void free_etid(struct adapter *, int);

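/* Take an etid off the free list and associate it with the rate tag. */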
static int
alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst)
{
	struct tid_info *t = &sc->tids;
	int etid = -1;

	mtx_lock(&t->etid_lock);
	if (t->efree) {
		union etid_entry *p = t->efree;

		etid = p - t->etid_tab + t->etid_base;
		t->efree = p->next;
		p->cst = cst;
		t->etids_in_use++;
	}
	mtx_unlock(&t->etid_lock);
	return (etid);
}

struct cxgbe_rate_tag *
lookup_etid(struct adapter *sc, int etid)
{
	struct tid_info *t = &sc->tids;

	return (t->etid_tab[etid - t->etid_base].cst);
}

static void
free_etid(struct adapter *sc, int etid)
{
	struct tid_info *t = &sc->tids;
	union etid_entry *p = &t->etid_tab[etid - t->etid_base];

	mtx_lock(&t->etid_lock);
	p->next = t->efree;
	t->efree = p;
	t->etids_in_use--;
	mtx_unlock(&t->etid_lock);
}

static int cxgbe_rate_tag_modify(struct m_snd_tag *,
    union if_snd_tag_modify_params *);
static int cxgbe_rate_tag_query(struct m_snd_tag *,
    union if_snd_tag_query_params *);
static void cxgbe_rate_tag_free(struct m_snd_tag *);

static const struct if_snd_tag_sw cxgbe_rate_tag_sw = {
	.snd_tag_modify = cxgbe_rate_tag_modify,
	.snd_tag_query = cxgbe_rate_tag_query,
	.snd_tag_free = cxgbe_rate_tag_free,
	.type = IF_SND_TAG_TYPE_RATE_LIMIT
};

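/*
 * Allocate a rate-limit snd_tag: reserve a traffic class for the requested
 * max rate (converted from bytes/s to kbps), allocate an etid, and initialize
 * the tag.
 */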
int
cxgbe_rate_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
    struct m_snd_tag **pt)
{
	int rc, schedcl;
	struct vi_info *vi = ifp->if_softc;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct cxgbe_rate_tag *cst;

	MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT);

	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
	if (rc != 0)
		return (rc);
	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);

	cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
	if (cst == NULL) {
failed:
		t4_release_cl_rl(sc, pi->port_id, schedcl);
		return (ENOMEM);
	}

	cst->etid = alloc_etid(sc, cst);
	if (cst->etid < 0) {
		free(cst, M_CXGBE);
		goto failed;
	}

	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
	mbufq_init(&cst->pending_tx, INT_MAX);
	mbufq_init(&cst->pending_fwack, INT_MAX);
	m_snd_tag_init(&cst->com, ifp, &cxgbe_rate_tag_sw);
	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
	cst->adapter = sc;
	cst->port_id = pi->port_id;
	cst->schedcl = schedcl;
	cst->max_rate = params->rate_limit.max_rate;
	cst->tx_credits = sc->params.eo_wr_cred;
	cst->tx_total = cst->tx_credits;
	cst->plen = 0;
	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));

	/*
	 * Queues will be selected later when the connection flowid is
	 * available.
	 */

	*pt = &cst->com;
	return (0);
}

/*
 * Change in parameters, no change in ifp.
 */
static int
cxgbe_rate_tag_modify(struct m_snd_tag *mst,
    union if_snd_tag_modify_params *params)
{
	int rc, schedcl;
	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
	struct adapter *sc = cst->adapter;

	/* XXX: is schedcl -1 ok here? */
	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->params.nsched_cls);

	mtx_lock(&cst->lock);
	MPASS(cst->flags & EO_SND_TAG_REF);
	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
	if (rc != 0) {
		mtx_unlock(&cst->lock);	/* don't leak cst->lock on failure */
		return (rc);
	}
	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
	t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
	cst->schedcl = schedcl;
	cst->max_rate = params->rate_limit.max_rate;
	mtx_unlock(&cst->lock);

	return (0);
}

static int
cxgbe_rate_tag_query(struct m_snd_tag *mst,
    union if_snd_tag_query_params *params)
{
	struct cxgbe_rate_tag *cst = mst_to_crt(mst);

	params->rate_limit.max_rate = cst->max_rate;

#define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
	params->rate_limit.queue_level =
		(cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;

	return (0);
}

/*
 * Unlocks cst and frees it.
 */
void
cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst)
{
	struct adapter *sc = cst->adapter;

	mtx_assert(&cst->lock, MA_OWNED);
	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
	MPASS(cst->tx_credits == cst->tx_total);
	MPASS(cst->plen == 0);
	MPASS(mbufq_first(&cst->pending_tx) == NULL);
	MPASS(mbufq_first(&cst->pending_fwack) == NULL);

	if (cst->etid >= 0)
		free_etid(sc, cst->etid);
	if (cst->schedcl != -1)
		t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
	mtx_unlock(&cst->lock);
	mtx_destroy(&cst->lock);
	free(cst, M_CXGBE);
}

static void
cxgbe_rate_tag_free(struct m_snd_tag *mst)
{
	struct cxgbe_rate_tag *cst = mst_to_crt(mst);

	mtx_lock(&cst->lock);

	/* The kernel is done with the snd_tag.  Remove its reference. */
	MPASS(cst->flags & EO_SND_TAG_REF);
	cst->flags &= ~EO_SND_TAG_REF;

	if (cst->ncompl == 0) {
		/*
		 * No fw4_ack in flight.  Free the tag right away if there are
		 * no outstanding credits.  Request the firmware to return all
		 * credits for the etid otherwise.
		 */
		if (cst->tx_credits == cst->tx_total) {
			cxgbe_rate_tag_free_locked(cst);
			return;	/* cst is gone. */
		}
		send_etid_flush_wr(cst);
	}
	mtx_unlock(&cst->lock);
}

void
cxgbe_ratelimit_query(struct ifnet *ifp, struct if_ratelimit_query_results *q)
{
	struct vi_info *vi = ifp->if_softc;
	struct adapter *sc = vi->adapter;

	q->rate_table = NULL;
	q->flags = RT_IS_SELECTABLE;
	/*
	 * Absolute max limits from the firmware configuration.  Practical
	 * limits depend on the burstsize, pktsize (ifp->if_mtu ultimately) and
	 * the card's cclk.
	 */
	q->max_flows = sc->tids.netids;
	q->number_of_rates = sc->params.nsched_cls;
	q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */

#if 1
	if (chip_id(sc) < CHELSIO_T6) {
		/* Based on testing by rrs@ with a T580 at burstsize = 4. */
		MPASS(q->min_segment_burst == 4);
		q->max_flows = min(4000, q->max_flows);
	} else {
		/* XXX: TBD, carried forward from T5 for now. */
		q->max_flows = min(4000, q->max_flows);
	}

	/*
	 * XXX: tcp_ratelimit.c grabs all available rates on link-up before it
	 * even knows whether hw pacing will be used or not.  This prevents
	 * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or
	 * the private ioctls from using any of the traffic classes.
	 *
	 * Underreport the number of rates to tcp_ratelimit so that it doesn't
	 * hog all of them.  This can be removed if/when tcp_ratelimit switches
	 * to making its allocations on first-use rather than link-up.  There is
	 * nothing wrong with one particular consumer reserving all the classes
	 * but it should do so only if it'll actually use hw rate limiting.
	 */
	q->number_of_rates /= 4;
#endif
}
#endif