xref: /freebsd/sys/dev/cxgbe/t4_sched.c (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1 /*-
2  * Copyright (c) 2017 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_ratelimit.h"
32 
33 #include <sys/types.h>
34 #include <sys/malloc.h>
35 #include <sys/queue.h>
36 #include <sys/sbuf.h>
37 #include <sys/taskqueue.h>
38 #include <sys/sysctl.h>
39 
40 #include "common/common.h"
41 #include "common/t4_regs.h"
42 #include "common/t4_regs_values.h"
43 #include "common/t4_msg.h"
44 
/*
 * Range check for user-supplied scheduler parameters.
 *
 * Returns non-zero if val is negative or lies within [lo, hi] (inclusive),
 * and 0 otherwise.  Negative values always pass here; callers in this file
 * give them their own meaning (either rejecting them up front or treating
 * them as "unbind" / "all queues").
 */
static int
in_range(int val, int lo, int hi)
{

	if (val < 0)
		return (1);
	if (val < lo || val > hi)
		return (0);
	return (1);
}
51 
52 static int
53 set_sched_class_config(struct adapter *sc, int minmax)
54 {
55 	int rc;
56 
57 	if (minmax < 0)
58 		return (EINVAL);
59 
60 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
61 	if (rc)
62 		return (rc);
63 	if (hw_off_limits(sc))
64 		rc = ENXIO;
65 	else
66 		rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1);
67 	end_synchronized_op(sc, 0);
68 
69 	return (rc);
70 }
71 
72 static int
73 set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
74     int sleep_ok)
75 {
76 	int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
77 	struct port_info *pi;
78 	struct tx_cl_rl_params *tc, old;
79 	bool check_pktsize = false;
80 
81 	if (p->level == SCHED_CLASS_LEVEL_CL_RL)
82 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
83 	else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
84 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
85 	else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
86 		fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
87 	else
88 		return (EINVAL);
89 
90 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
91 		if (p->mode == SCHED_CLASS_MODE_CLASS)
92 			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
93 		else if (p->mode == SCHED_CLASS_MODE_FLOW) {
94 			check_pktsize = true;
95 			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
96 		} else
97 			return (EINVAL);
98 	} else
99 		fw_mode = 0;
100 
101 	/* Valid channel must always be provided. */
102 	if (p->channel < 0)
103 		return (EINVAL);
104 	if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
105 		return (ERANGE);
106 
107 	pi = sc->port[sc->chan_map[p->channel]];
108 	if (pi == NULL)
109 		return (ENXIO);
110 	MPASS(pi->tx_chan == p->channel);
111 	top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */
112 
113 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
114 	    p->level == SCHED_CLASS_LEVEL_CH_RL) {
115 		/*
116 		 * Valid rate (mode, unit and values) must be provided.
117 		 */
118 
119 		if (p->minrate < 0)
120 			p->minrate = 0;
121 		if (p->maxrate < 0)
122 			return (EINVAL);
123 
124 		if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
125 			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
126 			/* ratemode could be relative (%) or absolute. */
127 			if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
128 				fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
129 				/* maxrate is % of port bandwidth. */
130 				if (!in_range(p->minrate, 0, 100) ||
131 				    !in_range(p->maxrate, 0, 100)) {
132 					return (ERANGE);
133 				}
134 			} else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
135 				fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
136 				/* maxrate is absolute value in kbps. */
137 				if (!in_range(p->minrate, 0, top_speed) ||
138 				    !in_range(p->maxrate, 0, top_speed)) {
139 					return (ERANGE);
140 				}
141 			} else
142 				return (EINVAL);
143 		} else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
144 			/* maxrate is the absolute value in pps. */
145 			check_pktsize = true;
146 			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
147 		} else
148 			return (EINVAL);
149 	} else {
150 		MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);
151 
152 		/*
153 		 * Valid weight must be provided.
154 		 */
155 		if (p->weight < 0)
156 		       return (EINVAL);
157 		if (!in_range(p->weight, 1, 99))
158 			return (ERANGE);
159 
160 		fw_rateunit = 0;
161 		fw_ratemode = 0;
162 	}
163 
164 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
165 	    p->level == SCHED_CLASS_LEVEL_CL_WRR) {
166 		/*
167 		 * Valid scheduling class must be provided.
168 		 */
169 		if (p->cl < 0)
170 			return (EINVAL);
171 		if (!in_range(p->cl, 0, sc->params.nsched_cls - 1))
172 			return (ERANGE);
173 	}
174 
175 	if (check_pktsize) {
176 		if (p->pktsize < 0)
177 			return (EINVAL);
178 		if (!in_range(p->pktsize, 64, if_getmtu(pi->vi[0].ifp)))
179 			return (ERANGE);
180 	}
181 
182 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
183 		tc = &pi->sched_params->cl_rl[p->cl];
184 		mtx_lock(&sc->tc_lock);
185 		if (tc->refcount > 0 || tc->state == CS_HW_UPDATE_IN_PROGRESS)
186 			rc = EBUSY;
187 		else {
188 			old = *tc;
189 
190 			tc->flags |= CF_USER;
191 			tc->state = CS_HW_UPDATE_IN_PROGRESS;
192 			tc->ratemode = fw_ratemode;
193 			tc->rateunit = fw_rateunit;
194 			tc->mode = fw_mode;
195 			tc->maxrate = p->maxrate;
196 			tc->pktsize = p->pktsize;
197 			rc = 0;
198 		}
199 		mtx_unlock(&sc->tc_lock);
200 		if (rc != 0)
201 			return (rc);
202 	}
203 
204 	rc = begin_synchronized_op(sc, NULL,
205 	    sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
206 	if (rc != 0) {
207 		if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
208 			mtx_lock(&sc->tc_lock);
209 			MPASS(tc->refcount == 0);
210 			MPASS(tc->flags & CF_USER);
211 			MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);
212 			*tc = old;
213 			mtx_unlock(&sc->tc_lock);
214 		}
215 		return (rc);
216 	}
217 	if (!hw_off_limits(sc)) {
218 		rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level,
219 		    fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl,
220 		    p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok);
221 	}
222 	end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);
223 
224 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
225 		mtx_lock(&sc->tc_lock);
226 		MPASS(tc->refcount == 0);
227 		MPASS(tc->flags & CF_USER);
228 		MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);
229 
230 		if (rc == 0)
231 			tc->state = CS_HW_CONFIGURED;
232 		else {
233 			/* parameters failed so we don't park at params_set */
234 			tc->state = CS_UNINITIALIZED;
235 			tc->flags &= ~CF_USER;
236 			CH_ERR(pi, "failed to configure traffic class %d: %d.  "
237 			    "params: mode %d, rateunit %d, ratemode %d, "
238 			    "channel %d, minrate %d, maxrate %d, pktsize %d, "
239 			    "burstsize %d\n", p->cl, rc, fw_mode, fw_rateunit,
240 			    fw_ratemode, p->channel, p->minrate, p->maxrate,
241 			    p->pktsize, 0);
242 		}
243 		mtx_unlock(&sc->tc_lock);
244 	}
245 
246 	return (rc);
247 }
248 
249 static void
250 update_tx_sched(void *context, int pending)
251 {
252 	int i, j, rc;
253 	struct port_info *pi;
254 	struct tx_cl_rl_params *tc;
255 	struct adapter *sc = context;
256 	const int n = sc->params.nsched_cls;
257 
258 	mtx_lock(&sc->tc_lock);
259 	for_each_port(sc, i) {
260 		pi = sc->port[i];
261 		tc = &pi->sched_params->cl_rl[0];
262 		for (j = 0; j < n; j++, tc++) {
263 			MPASS(mtx_owned(&sc->tc_lock));
264 			if (tc->state != CS_HW_UPDATE_REQUESTED)
265 				continue;
266 			mtx_unlock(&sc->tc_lock);
267 
268 			if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
269 			    "t4utxs") != 0) {
270 				mtx_lock(&sc->tc_lock);
271 				continue;
272 			}
273 			rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
274 			    FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
275 			    tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0,
276 			    tc->pktsize, tc->burstsize, 1);
277 			end_synchronized_op(sc, 0);
278 
279 			mtx_lock(&sc->tc_lock);
280 			MPASS(tc->state == CS_HW_UPDATE_REQUESTED);
281 			if (rc == 0) {
282 				tc->state = CS_HW_CONFIGURED;
283 				continue;
284 			}
285 			/* parameters failed so we try to avoid params_set */
286 			if (tc->refcount > 0)
287 				tc->state = CS_PARAMS_SET;
288 			else
289 				tc->state = CS_UNINITIALIZED;
290 			CH_ERR(pi, "failed to configure traffic class %d: %d.  "
291 			    "params: mode %d, rateunit %d, ratemode %d, "
292 			    "channel %d, minrate %d, maxrate %d, pktsize %d, "
293 			    "burstsize %d\n", j, rc, tc->mode, tc->rateunit,
294 			    tc->ratemode, pi->tx_chan, 0, tc->maxrate,
295 			    tc->pktsize, tc->burstsize);
296 		}
297 	}
298 	mtx_unlock(&sc->tc_lock);
299 }
300 
301 int
302 t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
303 {
304 
305 	if (p->type != SCHED_CLASS_TYPE_PACKET)
306 		return (EINVAL);
307 
308 	if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG)
309 		return (set_sched_class_config(sc, p->u.config.minmax));
310 
311 	if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS)
312 		return (set_sched_class_params(sc, &p->u.params, 1));
313 
314 	return (EINVAL);
315 }
316 
317 static int
318 bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
319 {
320 	struct tx_cl_rl_params *tc0, *tc;
321 	int rc, old_idx;
322 	uint32_t fw_mnem, fw_class;
323 
324 	if (!(txq->eq.flags & EQ_HW_ALLOCATED))
325 		return (ENXIO);
326 
327 	mtx_lock(&sc->tc_lock);
328 	if (txq->tc_idx == -2) {
329 		rc = EBUSY;	/* Another bind/unbind in progress already. */
330 		goto done;
331 	}
332 	if (idx == txq->tc_idx) {
333 		rc = 0;		/* No change, nothing to do. */
334 		goto done;
335 	}
336 
337 	tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0];
338 	if (idx != -1) {
339 		/*
340 		 * Bind to a different class at index idx.
341 		 */
342 		tc = &tc0[idx];
343 		if (tc->state != CS_HW_CONFIGURED) {
344 			rc = ENXIO;
345 			goto done;
346 		} else {
347 			/*
348 			 * Ok to proceed.  Place a reference on the new class
349 			 * while still holding on to the reference on the
350 			 * previous class, if any.
351 			 */
352 			tc->refcount++;
353 		}
354 	}
355 	/* Mark as busy before letting go of the lock. */
356 	old_idx = txq->tc_idx;
357 	txq->tc_idx = -2;
358 	mtx_unlock(&sc->tc_lock);
359 
360 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
361 	if (rc == 0) {
362 		fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
363 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
364 		    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
365 		fw_class = idx < 0 ? 0xffffffff : idx;
366 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem,
367 		    &fw_class);
368 		end_synchronized_op(sc, 0);
369 	}
370 
371 	mtx_lock(&sc->tc_lock);
372 	MPASS(txq->tc_idx == -2);
373 	if (rc == 0) {
374 		/*
375 		 * Unbind, bind, or bind to a different class succeeded.  Remove
376 		 * the reference on the old traffic class, if any.
377 		 */
378 		if (old_idx != -1) {
379 			tc = &tc0[old_idx];
380 			MPASS(tc->refcount > 0);
381 			tc->refcount--;
382 		}
383 		txq->tc_idx = idx;
384 	} else {
385 		/*
386 		 * Unbind, bind, or bind to a different class failed.  Remove
387 		 * the anticipatory reference on the new traffic class, if any.
388 		 */
389 		if (idx != -1) {
390 			tc = &tc0[idx];
391 			MPASS(tc->refcount > 0);
392 			tc->refcount--;
393 		}
394 		txq->tc_idx = old_idx;
395 	}
396 done:
397 	MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->params.nsched_cls);
398 	mtx_unlock(&sc->tc_lock);
399 	return (rc);
400 }
401 
402 int
403 t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
404 {
405 	struct port_info *pi = NULL;
406 	struct vi_info *vi;
407 	struct sge_txq *txq;
408 	int i, rc;
409 
410 	if (p->port >= sc->params.nports)
411 		return (EINVAL);
412 
413 	/*
414 	 * XXX: cxgbetool allows the user to specify the physical port only.  So
415 	 * we always operate on the main VI.
416 	 */
417 	pi = sc->port[p->port];
418 	vi = &pi->vi[0];
419 
420 	/* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
421 	if (!(vi->flags & VI_INIT_DONE))
422 		return (EAGAIN);
423 	MPASS(vi->ntxq > 0);
424 
425 	if (!in_range(p->queue, 0, vi->ntxq - 1) ||
426 	    !in_range(p->cl, 0, sc->params.nsched_cls - 1))
427 		return (EINVAL);
428 
429 	if (p->queue < 0) {
430 		/*
431 		 * Change the scheduling on all the TX queues for the
432 		 * interface.
433 		 */
434 		for_each_txq(vi, i, txq) {
435 			rc = bind_txq_to_traffic_class(sc, txq, p->cl);
436 			if (rc != 0)
437 				break;
438 		}
439 	} else {
440 		/*
441 		 * If op.queue is non-negative, then we're only changing the
442 		 * scheduling on a single specified TX queue.
443 		 */
444 		txq = &sc->sge.txq[vi->first_txq + p->queue];
445 		rc = bind_txq_to_traffic_class(sc, txq, p->cl);
446 	}
447 
448 	return (rc);
449 }
450 
451 int
452 t4_init_tx_sched(struct adapter *sc)
453 {
454 	int i;
455 	const int n = sc->params.nsched_cls;
456 	struct port_info *pi;
457 
458 	mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
459 	TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
460 	for_each_port(sc, i) {
461 		pi = sc->port[i];
462 		pi->sched_params = malloc(sizeof(*pi->sched_params) +
463 		    n * sizeof(struct tx_cl_rl_params), M_CXGBE, M_ZERO | M_WAITOK);
464 	}
465 
466 	return (0);
467 }
468 
469 int
470 t4_free_tx_sched(struct adapter *sc)
471 {
472 	int i;
473 
474 	taskqueue_drain(taskqueue_thread, &sc->tc_task);
475 
476 	for_each_port(sc, i) {
477 		if (sc->port[i] != NULL)
478 			free(sc->port[i]->sched_params, M_CXGBE);
479 	}
480 
481 	if (mtx_initialized(&sc->tc_lock))
482 		mtx_destroy(&sc->tc_lock);
483 
484 	return (0);
485 }
486 
487 void
488 t4_update_tx_sched(struct adapter *sc)
489 {
490 
491 	taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
492 }
493 
494 int
495 t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
496     int *tc_idx)
497 {
498 	int rc = 0, fa, fa2, i, pktsize, burstsize;
499 	bool update;
500 	struct tx_cl_rl_params *tc;
501 	struct port_info *pi;
502 
503 	MPASS(port_id >= 0 && port_id < sc->params.nports);
504 
505 	pi = sc->port[port_id];
506 	if (pi->sched_params->pktsize > 0)
507 		pktsize = pi->sched_params->pktsize;
508 	else
509 		pktsize = if_getmtu(pi->vi[0].ifp);
510 	if (pi->sched_params->burstsize > 0)
511 		burstsize = pi->sched_params->burstsize;
512 	else
513 		burstsize = pktsize * 4;
514 	tc = &pi->sched_params->cl_rl[0];
515 
516 	update = false;
517 	fa = fa2 = -1;
518 	mtx_lock(&sc->tc_lock);
519 	for (i = 0; i < sc->params.nsched_cls; i++, tc++) {
520 		if (tc->state >= CS_PARAMS_SET &&
521 		    tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
522 		    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
523 		    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
524 		    tc->maxrate == maxrate && tc->pktsize == pktsize &&
525 		    tc->burstsize == burstsize) {
526 			tc->refcount++;
527 			*tc_idx = i;
528 			if (tc->state == CS_PARAMS_SET) {
529 				tc->state = CS_HW_UPDATE_REQUESTED;
530 				update = true;
531 			}
532 			goto done;
533 		}
534 
535 		if (fa < 0 && tc->state == CS_UNINITIALIZED) {
536 			MPASS(tc->refcount == 0);
537 			fa = i;		/* first available, never used. */
538 		}
539 		if (fa2 < 0 && tc->refcount == 0 && !(tc->flags & CF_USER)) {
540 			fa2 = i;	/* first available, used previously.  */
541 		}
542 	}
543 	/* Not found */
544 	MPASS(i == sc->params.nsched_cls);
545 	if (fa == -1)
546 		fa = fa2;
547 	if (fa == -1) {
548 		*tc_idx = -1;
549 		rc = ENOSPC;
550 	} else {
551 		MPASS(fa >= 0 && fa < sc->params.nsched_cls);
552 		tc = &pi->sched_params->cl_rl[fa];
553 		MPASS(!(tc->flags & CF_USER));
554 		MPASS(tc->refcount == 0);
555 
556 		tc->refcount = 1;
557 		tc->state = CS_HW_UPDATE_REQUESTED;
558 		tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
559 		tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
560 		tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
561 		tc->maxrate = maxrate;
562 		tc->pktsize = pktsize;
563 		tc->burstsize = burstsize;
564 		*tc_idx = fa;
565 		update = true;
566 	}
567 done:
568 	mtx_unlock(&sc->tc_lock);
569 	if (update)
570 		t4_update_tx_sched(sc);
571 	return (rc);
572 }
573 
574 void
575 t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
576 {
577 	struct tx_cl_rl_params *tc;
578 
579 	MPASS(port_id >= 0 && port_id < sc->params.nports);
580 	MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
581 
582 	mtx_lock(&sc->tc_lock);
583 	tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx];
584 	MPASS(tc->refcount > 0);
585 	tc->refcount--;
586 	mtx_unlock(&sc->tc_lock);
587 }
588 
589 int
590 sysctl_tc(SYSCTL_HANDLER_ARGS)
591 {
592 	struct vi_info *vi = arg1;
593 	struct adapter *sc = vi->adapter;
594 	struct sge_txq *txq;
595 	int qidx = arg2, rc, tc_idx;
596 
597 	MPASS(qidx >= vi->first_txq && qidx < vi->first_txq + vi->ntxq);
598 
599 	txq = &sc->sge.txq[qidx];
600 	tc_idx = txq->tc_idx;
601 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
602 	if (rc != 0 || req->newptr == NULL)
603 		return (rc);
604 
605 	if (sc->flags & IS_VF)
606 		return (EPERM);
607 	if (!in_range(tc_idx, 0, sc->params.nsched_cls - 1))
608 		return (EINVAL);
609 
610 	return (bind_txq_to_traffic_class(sc, txq, tc_idx));
611 }
612 
613 int
614 sysctl_tc_params(SYSCTL_HANDLER_ARGS)
615 {
616 	struct adapter *sc = arg1;
617 	struct tx_cl_rl_params tc;
618 	struct sbuf *sb;
619 	int i, rc, port_id, mbps, gbps;
620 
621 	rc = sysctl_wire_old_buffer(req, 0);
622 	if (rc != 0)
623 		return (rc);
624 
625 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
626 	if (sb == NULL)
627 		return (ENOMEM);
628 
629 	port_id = arg2 >> 16;
630 	MPASS(port_id < sc->params.nports);
631 	MPASS(sc->port[port_id] != NULL);
632 	i = arg2 & 0xffff;
633 	MPASS(i < sc->params.nsched_cls);
634 
635 	mtx_lock(&sc->tc_lock);
636 	tc = sc->port[port_id]->sched_params->cl_rl[i];
637 	mtx_unlock(&sc->tc_lock);
638 
639 	if (tc.state < CS_PARAMS_SET) {
640 		sbuf_printf(sb, "uninitialized");
641 		goto done;
642 	}
643 
644 	switch (tc.rateunit) {
645 	case SCHED_CLASS_RATEUNIT_BITS:
646 		switch (tc.ratemode) {
647 		case SCHED_CLASS_RATEMODE_REL:
648 			/* XXX: top speed or actual link speed? */
649 			gbps = port_top_speed(sc->port[port_id]);
650 			sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
651 			break;
652 		case SCHED_CLASS_RATEMODE_ABS:
653 			mbps = tc.maxrate / 1000;
654 			gbps = tc.maxrate / 1000000;
655 			if (tc.maxrate == gbps * 1000000)
656 				sbuf_printf(sb, "%uGbps", gbps);
657 			else if (tc.maxrate == mbps * 1000)
658 				sbuf_printf(sb, "%uMbps", mbps);
659 			else
660 				sbuf_printf(sb, "%uKbps", tc.maxrate);
661 			break;
662 		default:
663 			rc = ENXIO;
664 			goto done;
665 		}
666 		break;
667 	case SCHED_CLASS_RATEUNIT_PKTS:
668 		sbuf_printf(sb, "%upps", tc.maxrate);
669 		break;
670 	default:
671 		rc = ENXIO;
672 		goto done;
673 	}
674 
675 	switch (tc.mode) {
676 	case SCHED_CLASS_MODE_CLASS:
677 		/* Note that pktsize and burstsize are not used in this mode. */
678 		sbuf_printf(sb, " aggregate");
679 		break;
680 	case SCHED_CLASS_MODE_FLOW:
681 		sbuf_printf(sb, " per-flow");
682 		if (tc.pktsize > 0)
683 			sbuf_printf(sb, " pkt-size %u", tc.pktsize);
684 		if (tc.burstsize > 0)
685 			sbuf_printf(sb, " burst-size %u", tc.burstsize);
686 		break;
687 	default:
688 		rc = ENXIO;
689 		goto done;
690 	}
691 
692 done:
693 	if (rc == 0)
694 		rc = sbuf_finish(sb);
695 	sbuf_delete(sb);
696 
697 	return (rc);
698 }
699 
700 #ifdef RATELIMIT
701 void
702 t4_init_etid_table(struct adapter *sc)
703 {
704 	int i;
705 	struct tid_info *t;
706 
707 	if (!is_ethoffload(sc))
708 		return;
709 
710 	t = &sc->tids;
711 	MPASS(t->netids > 0);
712 
713 	mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
714 	t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
715 			M_ZERO | M_WAITOK);
716 	t->efree = t->etid_tab;
717 	t->etids_in_use = 0;
718 	for (i = 1; i < t->netids; i++)
719 		t->etid_tab[i - 1].next = &t->etid_tab[i];
720 	t->etid_tab[t->netids - 1].next = NULL;
721 }
722 
723 void
724 t4_free_etid_table(struct adapter *sc)
725 {
726 	struct tid_info *t;
727 
728 	if (!is_ethoffload(sc))
729 		return;
730 
731 	t = &sc->tids;
732 	MPASS(t->netids > 0);
733 
734 	free(t->etid_tab, M_CXGBE);
735 	t->etid_tab = NULL;
736 
737 	if (mtx_initialized(&t->etid_lock))
738 		mtx_destroy(&t->etid_lock);
739 }
740 
741 /* etid services */
742 static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *);
743 static void free_etid(struct adapter *, int);
744 
745 static int
746 alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst)
747 {
748 	struct tid_info *t = &sc->tids;
749 	int etid = -1;
750 
751 	mtx_lock(&t->etid_lock);
752 	if (t->efree) {
753 		union etid_entry *p = t->efree;
754 
755 		etid = p - t->etid_tab + t->etid_base;
756 		t->efree = p->next;
757 		p->cst = cst;
758 		t->etids_in_use++;
759 	}
760 	mtx_unlock(&t->etid_lock);
761 	return (etid);
762 }
763 
764 struct cxgbe_rate_tag *
765 lookup_etid(struct adapter *sc, int etid)
766 {
767 	struct tid_info *t = &sc->tids;
768 
769 	return (t->etid_tab[etid - t->etid_base].cst);
770 }
771 
772 static void
773 free_etid(struct adapter *sc, int etid)
774 {
775 	struct tid_info *t = &sc->tids;
776 	union etid_entry *p = &t->etid_tab[etid - t->etid_base];
777 
778 	mtx_lock(&t->etid_lock);
779 	p->next = t->efree;
780 	t->efree = p;
781 	t->etids_in_use--;
782 	mtx_unlock(&t->etid_lock);
783 }
784 
785 static int cxgbe_rate_tag_modify(struct m_snd_tag *,
786     union if_snd_tag_modify_params *);
787 static int cxgbe_rate_tag_query(struct m_snd_tag *,
788     union if_snd_tag_query_params *);
789 static void cxgbe_rate_tag_free(struct m_snd_tag *);
790 
791 static const struct if_snd_tag_sw cxgbe_rate_tag_sw = {
792 	.snd_tag_modify = cxgbe_rate_tag_modify,
793 	.snd_tag_query = cxgbe_rate_tag_query,
794 	.snd_tag_free = cxgbe_rate_tag_free,
795 	.type = IF_SND_TAG_TYPE_RATE_LIMIT
796 };
797 
798 int
799 cxgbe_rate_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params,
800     struct m_snd_tag **pt)
801 {
802 	int rc, schedcl;
803 	struct vi_info *vi = if_getsoftc(ifp);
804 	struct port_info *pi = vi->pi;
805 	struct adapter *sc = pi->adapter;
806 	struct cxgbe_rate_tag *cst;
807 
808 	MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT);
809 
810 	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
811 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
812 	if (rc != 0)
813 		return (rc);
814 	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
815 
816 	cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
817 	if (cst == NULL) {
818 failed:
819 		t4_release_cl_rl(sc, pi->port_id, schedcl);
820 		return (ENOMEM);
821 	}
822 
823 	cst->etid = alloc_etid(sc, cst);
824 	if (cst->etid < 0) {
825 		free(cst, M_CXGBE);
826 		goto failed;
827 	}
828 
829 	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
830 	mbufq_init(&cst->pending_tx, INT_MAX);
831 	mbufq_init(&cst->pending_fwack, INT_MAX);
832 	m_snd_tag_init(&cst->com, ifp, &cxgbe_rate_tag_sw);
833 	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
834 	cst->adapter = sc;
835 	cst->port_id = pi->port_id;
836 	cst->schedcl = schedcl;
837 	cst->max_rate = params->rate_limit.max_rate;
838 	cst->tx_credits = sc->params.eo_wr_cred;
839 	cst->tx_total = cst->tx_credits;
840 	cst->plen = 0;
841 	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
842 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
843 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
844 
845 	/*
846 	 * Queues will be selected later when the connection flowid is available.
847 	 */
848 
849 	*pt = &cst->com;
850 	return (0);
851 }
852 
853 /*
854  * Change in parameters, no change in ifp.
855  */
856 static int
857 cxgbe_rate_tag_modify(struct m_snd_tag *mst,
858     union if_snd_tag_modify_params *params)
859 {
860 	int rc, schedcl;
861 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
862 	struct adapter *sc = cst->adapter;
863 
864 	/* XXX: is schedcl -1 ok here? */
865 	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->params.nsched_cls);
866 
867 	mtx_lock(&cst->lock);
868 	MPASS(cst->flags & EO_SND_TAG_REF);
869 	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
870 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
871 	if (rc != 0)
872 		return (rc);
873 	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
874 	t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
875 	cst->schedcl = schedcl;
876 	cst->max_rate = params->rate_limit.max_rate;
877 	mtx_unlock(&cst->lock);
878 
879 	return (0);
880 }
881 
882 static int
883 cxgbe_rate_tag_query(struct m_snd_tag *mst,
884     union if_snd_tag_query_params *params)
885 {
886 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
887 
888 	params->rate_limit.max_rate = cst->max_rate;
889 
890 #define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
891 	params->rate_limit.queue_level =
892 		(cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;
893 
894 	return (0);
895 }
896 
897 /*
898  * Unlocks cst and frees it.
899  */
900 void
901 cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst)
902 {
903 	struct adapter *sc = cst->adapter;
904 
905 	mtx_assert(&cst->lock, MA_OWNED);
906 	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
907 	MPASS(cst->tx_credits == cst->tx_total);
908 	MPASS(cst->plen == 0);
909 	MPASS(mbufq_first(&cst->pending_tx) == NULL);
910 	MPASS(mbufq_first(&cst->pending_fwack) == NULL);
911 
912 	if (cst->etid >= 0)
913 		free_etid(sc, cst->etid);
914 	if (cst->schedcl != -1)
915 		t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
916 	mtx_unlock(&cst->lock);
917 	mtx_destroy(&cst->lock);
918 	free(cst, M_CXGBE);
919 }
920 
921 static void
922 cxgbe_rate_tag_free(struct m_snd_tag *mst)
923 {
924 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
925 
926 	mtx_lock(&cst->lock);
927 
928 	/* The kernel is done with the snd_tag.  Remove its reference. */
929 	MPASS(cst->flags & EO_SND_TAG_REF);
930 	cst->flags &= ~EO_SND_TAG_REF;
931 
932 	if (cst->ncompl == 0) {
933 		/*
934 		 * No fw4_ack in flight.  Free the tag right away if there are
935 		 * no outstanding credits.  Request the firmware to return all
936 		 * credits for the etid otherwise.
937 		 */
938 		if (cst->tx_credits == cst->tx_total) {
939 			cxgbe_rate_tag_free_locked(cst);
940 			return;	/* cst is gone. */
941 		}
942 		send_etid_flush_wr(cst);
943 	}
944 	mtx_unlock(&cst->lock);
945 }
946 
947 void
948 cxgbe_ratelimit_query(if_t ifp, struct if_ratelimit_query_results *q)
949 {
950 	struct vi_info *vi = if_getsoftc(ifp);
951 	struct adapter *sc = vi->adapter;
952 
953 	q->rate_table = NULL;
954 	q->flags = RT_IS_SELECTABLE;
955 	/*
956 	 * Absolute max limits from the firmware configuration.  Practical
957 	 * limits depend on the burstsize, pktsize (if_getmtu(ifp) ultimately) and
958 	 * the card's cclk.
959 	 */
960 	q->max_flows = sc->tids.netids;
961 	q->number_of_rates = sc->params.nsched_cls;
962 	q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */
963 
964 #if 1
965 	if (chip_id(sc) < CHELSIO_T6) {
966 		/* Based on testing by rrs@ with a T580 at burstsize = 4. */
967 		MPASS(q->min_segment_burst == 4);
968 		q->max_flows = min(4000, q->max_flows);
969 	} else {
970 		/* XXX: TBD, carried forward from T5 for now. */
971 		q->max_flows = min(4000, q->max_flows);
972 	}
973 
974 	/*
975 	 * XXX: tcp_ratelimit.c grabs all available rates on link-up before it
976 	 * even knows whether hw pacing will be used or not.  This prevents
977 	 * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or
978 	 * the private ioctls from using any of traffic classes.
979 	 *
980 	 * Underreport the number of rates to tcp_ratelimit so that it doesn't
981 	 * hog all of them.  This can be removed if/when tcp_ratelimit switches
982 	 * to making its allocations on first-use rather than link-up.  There is
983 	 * nothing wrong with one particular consumer reserving all the classes
984 	 * but it should do so only if it'll actually use hw rate limiting.
985 	 */
986 	q->number_of_rates /= 4;
987 #endif
988 }
989 #endif
990