/*
 * net/sched/sch_mqprio.c
 *
 * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/module.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>

struct mqprio_sched {
	struct Qdisc **qdiscs;
	u16 mode;
	u16 shaper;
	int hw_offload;
	u32 flags;
	u64 min_rate[TC_QOPT_MAX_QUEUE];
	u64 max_rate[TC_QOPT_MAX_QUEUE];
};

static void mqprio_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	unsigned int ntx;

	if (priv->qdiscs) {
		for (ntx = 0;
		     ntx < dev->num_tx_queues && priv->qdiscs[ntx];
		     ntx++)
			qdisc_destroy(priv->qdiscs[ntx]);
		kfree(priv->qdiscs);
	}

	if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
		struct tc_mqprio_qopt_offload mqprio = { { 0 } };

		switch (priv->mode) {
		case TC_MQPRIO_MODE_DCB:
		case TC_MQPRIO_MODE_CHANNEL:
			dev->netdev_ops->ndo_setup_tc(dev,
						      TC_SETUP_QDISC_MQPRIO,
						      &mqprio);
			break;
		default:
			return;
		}
	} else {
		netdev_set_num_tc(dev, 0);
	}
}

static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
{
	int i, j;

	/* Verify num_tc is not out of max range */
	if (qopt->num_tc > TC_MAX_QUEUE)
		return -EINVAL;

	/* Verify priority mapping uses valid tcs */
	for (i = 0; i < TC_BITMASK + 1; i++) {
		if (qopt->prio_tc_map[i] >= qopt->num_tc)
			return -EINVAL;
	}

	/* Limit qopt->hw to the maximum supported offload value.  Drivers
	 * have the option of overriding this later if they don't support
	 * a given offload type.
	 */
	if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX)
		qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX;

	/* If hardware offload is requested we will leave it to the device
	 * to either populate the queue counts itself or to validate the
	 * provided queue counts.  If ndo_setup_tc is not present then
	 * hardware doesn't support offload and we should return an error.
	 */
	if (qopt->hw)
		return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL;

	for (i = 0; i < qopt->num_tc; i++) {
		unsigned int last = qopt->offset[i] + qopt->count[i];

		/* Verify the queue count is in tx range; being equal to
		 * real_num_tx_queues indicates the last queue is in use.
		 */
		if (qopt->offset[i] >= dev->real_num_tx_queues ||
		    !qopt->count[i] ||
		    last > dev->real_num_tx_queues)
			return -EINVAL;

		/* Verify that the offset and counts do not overlap */
		for (j = i + 1; j < qopt->num_tc; j++) {
			if (last > qopt->offset[j])
				return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
	[TCA_MQPRIO_MODE]	= { .len = sizeof(u16) },
	[TCA_MQPRIO_SHAPER]	= { .len = sizeof(u16) },
	[TCA_MQPRIO_MIN_RATE64]	= { .type = NLA_NESTED },
	[TCA_MQPRIO_MAX_RATE64]	= { .type = NLA_NESTED },
};

/* Parse the netlink attributes that follow the legacy struct tc_mqprio_qopt
 * at the start of the TCA_OPTIONS payload.
 */
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	struct netdev_queue *dev_queue;
	struct Qdisc *qdisc;
	int i, err = -EOPNOTSUPP;
	struct tc_mqprio_qopt *qopt = NULL;
	struct nlattr *tb[TCA_MQPRIO_MAX + 1];
	struct nlattr *attr;
	int rem;
	int len;

	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);

	if (sch->parent != TC_H_ROOT)
		return -EOPNOTSUPP;

	if (!netif_is_multiqueue(dev))
		return -EOPNOTSUPP;

	/* make certain we can allocate enough classids to handle queues */
	if (dev->num_tx_queues >= TC_H_MIN_PRIORITY)
		return -ENOMEM;

	if (!opt || nla_len(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = nla_data(opt);
	if (mqprio_parse_opt(dev, qopt))
		return -EINVAL;

	len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
	if (len > 0) {
		err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
				 sizeof(*qopt));
		if (err < 0)
			return err;

		if (!qopt->hw)
			return -EINVAL;

		if (tb[TCA_MQPRIO_MODE]) {
			priv->flags |= TC_MQPRIO_F_MODE;
			priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
		}

		if (tb[TCA_MQPRIO_SHAPER]) {
			priv->flags |= TC_MQPRIO_F_SHAPER;
			priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
		}

		if (tb[TCA_MQPRIO_MIN_RATE64]) {
			if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
				return -EINVAL;
			i = 0;
			nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
					    rem) {
				if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
					return -EINVAL;
				if (i >= qopt->num_tc)
					break;
				priv->min_rate[i] = *(u64 *)nla_data(attr);
				i++;
			}
			priv->flags |= TC_MQPRIO_F_MIN_RATE;
		}

		if (tb[TCA_MQPRIO_MAX_RATE64]) {
			if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
				return -EINVAL;
			i = 0;
			nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
					    rem) {
				if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
					return -EINVAL;
				if (i >= qopt->num_tc)
					break;
				priv->max_rate[i] = *(u64 *)nla_data(attr);
				i++;
			}
			priv->flags |= TC_MQPRIO_F_MAX_RATE;
		}
	}

	/* pre-allocate qdisc, attachment can't fail */
	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
			       GFP_KERNEL);
	if (!priv->qdiscs)
		return -ENOMEM;

	for (i = 0; i < dev->num_tx_queues; i++) {
		dev_queue = netdev_get_tx_queue(dev, i);
		qdisc = qdisc_create_dflt(dev_queue,
					  get_default_qdisc_ops(dev, i),
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(i + 1)));
		if (!qdisc)
			return -ENOMEM;

		priv->qdiscs[i] = qdisc;
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	}

	/* If the mqprio options indicate that hardware should own
	 * the queue mapping then run ndo_setup_tc otherwise use the
	 * supplied and verified mapping
	 */
	if (qopt->hw) {
		struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};

		switch (priv->mode) {
		case TC_MQPRIO_MODE_DCB:
			if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
				return -EINVAL;
			break;
		case TC_MQPRIO_MODE_CHANNEL:
			mqprio.flags = priv->flags;
			if (priv->flags & TC_MQPRIO_F_MODE)
				mqprio.mode = priv->mode;
			if (priv->flags & TC_MQPRIO_F_SHAPER)
				mqprio.shaper = priv->shaper;
			if (priv->flags & TC_MQPRIO_F_MIN_RATE)
				for (i = 0; i < mqprio.qopt.num_tc; i++)
					mqprio.min_rate[i] = priv->min_rate[i];
			if (priv->flags & TC_MQPRIO_F_MAX_RATE)
				for (i = 0; i < mqprio.qopt.num_tc; i++)
					mqprio.max_rate[i] = priv->max_rate[i];
			break;
		default:
			return -EINVAL;
		}
		err = dev->netdev_ops->ndo_setup_tc(dev,
						    TC_SETUP_QDISC_MQPRIO,
						    &mqprio);
		if (err)
			return err;

		priv->hw_offload = mqprio.qopt.hw;
	} else {
		netdev_set_num_tc(dev, qopt->num_tc);
		for (i = 0; i < qopt->num_tc; i++)
			netdev_set_tc_queue(dev, i,
					    qopt->count[i], qopt->offset[i]);
	}

	/* Always use supplied priority mappings */
	for (i = 0; i < TC_BITMASK + 1; i++)
		netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);

	sch->flags |= TCQ_F_MQROOT;
	return 0;
}

static void mqprio_attach(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	struct Qdisc *qdisc, *old;
	unsigned int ntx;

	/* Attach underlying qdisc */
	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		qdisc = priv->qdiscs[ntx];
		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
		if (old)
			qdisc_destroy(old);
		if (ntx < dev->real_num_tx_queues)
			qdisc_hash_add(qdisc, false);
	}
	kfree(priv->qdiscs);
	priv->qdiscs = NULL;
}

static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
					     unsigned long cl)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx = cl - 1;

	if (ntx >= dev->num_tx_queues)
		return NULL;
	return netdev_get_tx_queue(dev, ntx);
}

static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
			struct Qdisc **old)
{
	struct net_device *dev = qdisc_dev(sch);
	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

	if (!dev_queue)
		return -EINVAL;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	*old = dev_graft_qdisc(dev_queue, new);

	if (new)
		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return 0;
}

static int dump_rates(struct mqprio_sched *priv,
		      struct tc_mqprio_qopt *opt, struct sk_buff *skb)
{
	struct nlattr *nest;
	int i;

	if (priv->flags & TC_MQPRIO_F_MIN_RATE) {
		nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64);
		if (!nest)
			goto nla_put_failure;

		for (i = 0; i < opt->num_tc; i++) {
			if (nla_put(skb, TCA_MQPRIO_MIN_RATE64,
				    sizeof(priv->min_rate[i]),
				    &priv->min_rate[i]))
				goto nla_put_failure;
		}
		nla_nest_end(skb, nest);
	}

	if (priv->flags & TC_MQPRIO_F_MAX_RATE) {
		nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64);
		if (!nest)
			goto nla_put_failure;

		for (i = 0; i < opt->num_tc; i++) {
			if (nla_put(skb, TCA_MQPRIO_MAX_RATE64,
				    sizeof(priv->max_rate[i]),
				    &priv->max_rate[i]))
				goto nla_put_failure;
		}
		nla_nest_end(skb, nest);
	}
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
	struct tc_mqprio_qopt opt = { 0 };
	struct Qdisc *qdisc;
	unsigned int i;

	sch->q.qlen = 0;
	memset(&sch->bstats, 0, sizeof(sch->bstats));
	memset(&sch->qstats, 0, sizeof(sch->qstats));

	for (i = 0; i < dev->num_tx_queues; i++) {
		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
		spin_lock_bh(qdisc_lock(qdisc));
		sch->q.qlen += qdisc->q.qlen;
		sch->bstats.bytes += qdisc->bstats.bytes;
		sch->bstats.packets += qdisc->bstats.packets;
		sch->qstats.backlog += qdisc->qstats.backlog;
		sch->qstats.drops += qdisc->qstats.drops;
		sch->qstats.requeues += qdisc->qstats.requeues;
		sch->qstats.overlimits += qdisc->qstats.overlimits;
		spin_unlock_bh(qdisc_lock(qdisc));
	}

	opt.num_tc = netdev_get_num_tc(dev);
	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
	opt.hw = priv->hw_offload;

	for (i = 0; i < netdev_get_num_tc(dev); i++) {
		opt.count[i] = dev->tc_to_txq[i].count;
		opt.offset[i] = dev->tc_to_txq[i].offset;
	}

	if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
		goto nla_put_failure;

	if ((priv->flags & TC_MQPRIO_F_MODE) &&
	    nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode))
		goto nla_put_failure;

	if ((priv->flags & TC_MQPRIO_F_SHAPER) &&
	    nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper))
		goto nla_put_failure;

	if ((priv->flags & TC_MQPRIO_F_MIN_RATE ||
	     priv->flags & TC_MQPRIO_F_MAX_RATE) &&
	    (dump_rates(priv, &opt, skb) != 0))
		goto nla_put_failure;

	return nla_nest_end(skb, nla);
nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
{
	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

	if (!dev_queue)
		return NULL;

	return dev_queue->qdisc_sleeping;
}

static unsigned long mqprio_find(struct Qdisc *sch, u32 classid)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned int ntx = TC_H_MIN(classid);

	/* There are essentially two regions here that have valid classid
	 * values. The first region will have a classid value of 1 through
	 * num_tx_queues. All of these are backed by actual Qdiscs.
	 */
	if (ntx < TC_H_MIN_PRIORITY)
		return (ntx <= dev->num_tx_queues) ? ntx : 0;

	/* The second region represents the hardware traffic classes. These
	 * are represented by classid values of TC_H_MIN_PRIORITY through
	 * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1
	 */
	return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ?
	       ntx : 0;
}

static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	if (cl < TC_H_MIN_PRIORITY) {
		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
		struct net_device *dev = qdisc_dev(sch);
		int tc = netdev_txq_to_tc(dev, cl - 1);

		tcm->tcm_parent = (tc < 0) ? 0 :
			TC_H_MAKE(TC_H_MAJ(sch->handle),
				  TC_H_MIN(tc + TC_H_MIN_PRIORITY));
		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
	} else {
		tcm->tcm_parent = TC_H_ROOT;
		tcm->tcm_info = 0;
	}
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
	__releases(d->lock)
	__acquires(d->lock)
{
	if (cl >= TC_H_MIN_PRIORITY) {
		int i;
		__u32 qlen = 0;
		struct Qdisc *qdisc;
		struct gnet_stats_queue qstats = {0};
		struct gnet_stats_basic_packed bstats = {0};
		struct net_device *dev = qdisc_dev(sch);
		struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];

		/* Drop the lock here; it will be reclaimed before touching
		 * statistics.  This is required because the d->lock we hold
		 * here is the lock on dev_queue->qdisc_sleeping, which is
		 * also acquired below.
		 */
		if (d->lock)
			spin_unlock_bh(d->lock);

		for (i = tc.offset; i < tc.offset + tc.count; i++) {
			struct netdev_queue *q = netdev_get_tx_queue(dev, i);

			qdisc = rtnl_dereference(q->qdisc);
			spin_lock_bh(qdisc_lock(qdisc));
			qlen += qdisc->q.qlen;
			bstats.bytes += qdisc->bstats.bytes;
			bstats.packets += qdisc->bstats.packets;
			qstats.backlog += qdisc->qstats.backlog;
			qstats.drops += qdisc->qstats.drops;
			qstats.requeues += qdisc->qstats.requeues;
			qstats.overlimits += qdisc->qstats.overlimits;
			spin_unlock_bh(qdisc_lock(qdisc));
		}
		/* Reclaim root sleeping lock before completing stats */
		if (d->lock)
			spin_lock_bh(d->lock);
		if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
		    gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
			return -1;
	} else {
		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

		sch = dev_queue->qdisc_sleeping;
		if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
					  d, NULL, &sch->bstats) < 0 ||
		    gnet_stats_copy_queue(d, NULL,
					  &sch->qstats, sch->q.qlen) < 0)
			return -1;
	}
	return 0;
}

static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx;

	if (arg->stop)
		return;

	/* Walk hierarchy with a virtual class per tc */
	arg->count = arg->skip;
	for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) {
		if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}

	/* Pad the values and skip over unused traffic classes */
	if (ntx < TC_MAX_QUEUE) {
		arg->count = TC_MAX_QUEUE;
		ntx = TC_MAX_QUEUE;
	}

	/* Reset offset, sort out remaining per-queue qdiscs */
	for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) {
		if (arg->fn(sch, ntx + 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch,
						struct tcmsg *tcm)
{
	return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
}

static const struct Qdisc_class_ops mqprio_class_ops = {
	.graft		= mqprio_graft,
	.leaf		= mqprio_leaf,
	.find		= mqprio_find,
	.walk		= mqprio_walk,
	.dump		= mqprio_dump_class,
	.dump_stats	= mqprio_dump_class_stats,
	.select_queue	= mqprio_select_queue,
};

static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
	.cl_ops		= &mqprio_class_ops,
	.id		= "mqprio",
	.priv_size	= sizeof(struct mqprio_sched),
	.init		= mqprio_init,
	.destroy	= mqprio_destroy,
	.attach		= mqprio_attach,
	.dump		= mqprio_dump,
	.owner		= THIS_MODULE,
};

static int __init mqprio_module_init(void)
{
	return register_qdisc(&mqprio_qdisc_ops);
}

static void __exit mqprio_module_exit(void)
{
	unregister_qdisc(&mqprio_qdisc_ops);
}

module_init(mqprio_module_init);
module_exit(mqprio_module_exit);

MODULE_LICENSE("GPL");