// SPDX-License-Identifier: GPL-2.0

/* net/sched/sch_taprio.c	Time Aware Priority Scheduler
 *
 * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
 *
 */

#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/time.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/tcp.h>

#include "sch_mqprio_lib.h"

static LIST_HEAD(taprio_list);
static struct static_key_false taprio_have_broken_mqprio;
static struct static_key_false taprio_have_working_mqprio;

#define TAPRIO_ALL_GATES_OPEN -1

#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
#define TAPRIO_FLAGS_INVALID U32_MAX

struct sched_entry {
	/* Durations between this GCL entry and the GCL entry where the
	 * respective traffic class gate closes
	 */
	u64 gate_duration[TC_MAX_QUEUE];
	atomic_t budget[TC_MAX_QUEUE];
	/* The qdisc makes some effort so that no packet leaves
	 * after this time
	 */
	ktime_t gate_close_time[TC_MAX_QUEUE];
	struct list_head list;
	/* Used to calculate when to advance the schedule */
	ktime_t end_time;
	ktime_t next_txtime;
	int index;
	u32 gate_mask;
	u32 interval;
	u8 command;
};
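/* Illustrative note (not part of the original sources): each sched_entry
 * above corresponds to one row of the gate control list as configured from
 * user space, roughly like the usual iproute2 invocation:
 *
 *	tc qdisc replace dev eth0 parent root handle 100 taprio \
 *		num_tc 3 map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
 *		queues 1@0 1@1 2@2 \
 *		base-time 1000000000 \
 *		sched-entry S 01 300000 \
 *		sched-entry S 02 300000 \
 *		sched-entry S 04 400000 \
 *		clockid CLOCK_TAI
 *
 * Here "sched-entry S 01 300000" becomes one sched_entry with a set-gates
 * command, gate_mask 0x1 and interval 300000 ns. Exact command-line syntax
 * depends on the iproute2 version.
 */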
struct sched_gate_list {
	/* Longest non-zero contiguous gate durations per traffic class,
	 * or 0 if a traffic class gate never opens during the schedule.
	 */
	u64 max_open_gate_duration[TC_MAX_QUEUE];
	u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
	u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
	struct rcu_head rcu;
	struct list_head entries;
	size_t num_entries;
	ktime_t cycle_end_time;
	s64 cycle_time;
	s64 cycle_time_extension;
	s64 base_time;
};

struct taprio_sched {
	struct Qdisc **qdiscs;
	struct Qdisc *root;
	u32 flags;
	enum tk_offsets tk_offset;
	int clockid;
	bool offloaded;
	bool detected_mqprio;
	bool broken_mqprio;
	atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
				    * speeds it's sub-nanoseconds per byte
				    */

	/* Protects the update side of the RCU protected current_entry */
	spinlock_t current_entry_lock;
	struct sched_entry __rcu *current_entry;
	struct sched_gate_list __rcu *oper_sched;
	struct sched_gate_list __rcu *admin_sched;
	struct hrtimer advance_timer;
	struct list_head taprio_list;
	int cur_txq[TC_MAX_QUEUE];
	u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
	u32 txtime_delay;
};

struct __tc_taprio_qopt_offload {
	refcount_t users;
	struct tc_taprio_qopt_offload offload;
};
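/* Worked example for the gate duration calculation below (illustrative
 * numbers): with num_tc = 2 and the three-entry schedule
 *
 *	entry 0: gate_mask 0x3, interval 300000 ns
 *	entry 1: gate_mask 0x1, interval 300000 ns
 *	entry 2: gate_mask 0x2, interval 400000 ns
 *
 * entry 0 ends up with gate_duration[0] = 600000 ns (TC 0 stays open through
 * entries 0 and 1) and gate_duration[1] = 300000 ns (TC 1 closes when entry 1
 * starts). Since neither gate stays open for a full cycle, both
 * max_open_gate_duration[] values remain below cycle_time.
 */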
static void taprio_calculate_gate_durations(struct taprio_sched *q,
					    struct sched_gate_list *sched)
{
	struct net_device *dev = qdisc_dev(q->root);
	int num_tc = netdev_get_num_tc(dev);
	struct sched_entry *entry, *cur;
	int tc;

	list_for_each_entry(entry, &sched->entries, list) {
		u32 gates_still_open = entry->gate_mask;

		/* For each traffic class, calculate each open gate duration,
		 * starting at this schedule entry and ending at the schedule
		 * entry containing a gate close event for that TC.
		 */
		cur = entry;

		do {
			if (!gates_still_open)
				break;

			for (tc = 0; tc < num_tc; tc++) {
				if (!(gates_still_open & BIT(tc)))
					continue;

				if (cur->gate_mask & BIT(tc))
					entry->gate_duration[tc] += cur->interval;
				else
					gates_still_open &= ~BIT(tc);
			}

			cur = list_next_entry_circular(cur, &sched->entries, list);
		} while (cur != entry);

		/* Keep track of the maximum gate duration for each traffic
		 * class, taking care to not confuse a traffic class which is
		 * temporarily closed with one that is always closed.
		 */
		for (tc = 0; tc < num_tc; tc++)
			if (entry->gate_duration[tc] &&
			    sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
				sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
	}
}

static bool taprio_entry_allows_tx(ktime_t skb_end_time,
				   struct sched_entry *entry, int tc)
{
	return ktime_before(skb_end_time, entry->gate_close_time[tc]);
}

static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
	if (!sched)
		return KTIME_MAX;

	return ns_to_ktime(sched->base_time);
}

static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono)
{
	/* This pairs with WRITE_ONCE() in taprio_parse_clockid() */
	enum tk_offsets tk_offset = READ_ONCE(q->tk_offset);

	switch (tk_offset) {
	case TK_OFFS_MAX:
		return mono;
	default:
		return ktime_mono_to_any(mono, tk_offset);
	}
}

static ktime_t taprio_get_time(const struct taprio_sched *q)
{
	return taprio_mono_to_any(q, ktime_get());
}

static void taprio_free_sched_cb(struct rcu_head *head)
{
	struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
	struct sched_entry *entry, *n;

	list_for_each_entry_safe(entry, n, &sched->entries, list) {
		list_del(&entry->list);
		kfree(entry);
	}

	kfree(sched);
}

static void switch_schedules(struct taprio_sched *q,
			     struct sched_gate_list **admin,
			     struct sched_gate_list **oper)
{
	rcu_assign_pointer(q->oper_sched, *admin);
	rcu_assign_pointer(q->admin_sched, NULL);

	if (*oper)
		call_rcu(&(*oper)->rcu, taprio_free_sched_cb);

	*oper = *admin;
	*admin = NULL;
}

/* Get how much time has already elapsed in the current cycle. */
static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
{
	ktime_t time_since_sched_start;
	s32 time_elapsed;

	time_since_sched_start = ktime_sub(time, sched->base_time);
	div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);

	return time_elapsed;
}

static ktime_t get_interval_end_time(struct sched_gate_list *sched,
				     struct sched_gate_list *admin,
				     struct sched_entry *entry,
				     ktime_t intv_start)
{
	s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
	ktime_t intv_end, cycle_ext_end, cycle_end;

	cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
	intv_end = ktime_add_ns(intv_start, entry->interval);
	cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);

	if (ktime_before(intv_end, cycle_end))
		return intv_end;
	else if (admin && admin != sched &&
		 ktime_after(admin->base_time, cycle_end) &&
		 ktime_before(admin->base_time, cycle_ext_end))
		return admin->base_time;
	else
		return cycle_end;
}

static int length_to_duration(struct taprio_sched *q, int len)
{
	return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}

static int duration_to_length(struct taprio_sched *q, u64 duration)
{
	return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
}
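/* Unit sanity check (illustrative numbers): taprio_set_picos_per_byte()
 * further below stores (USEC_PER_SEC * 8) / speed, i.e. 8000 ps/byte for a
 * 1000 Mbit/s link. For a 1500 byte frame, length_to_duration() then returns
 * 1500 * 8000 / PSEC_PER_NSEC = 12000 ns, and duration_to_length() is the
 * inverse mapping from an open-gate duration back to a frame length.
 */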
/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the
 * q->max_sdu[] requested by the user and the max_sdu dynamically determined by
 * the maximum open gate durations at the given link speed.
 */
static void taprio_update_queue_max_sdu(struct taprio_sched *q,
					struct sched_gate_list *sched,
					struct qdisc_size_table *stab)
{
	struct net_device *dev = qdisc_dev(q->root);
	int num_tc = netdev_get_num_tc(dev);
	u32 max_sdu_from_user;
	u32 max_sdu_dynamic;
	u32 max_sdu;
	int tc;

	for (tc = 0; tc < num_tc; tc++) {
		max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX;

		/* TC gate never closes => keep the queueMaxSDU
		 * selected by the user
		 */
		if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
			max_sdu_dynamic = U32_MAX;
		} else {
			u32 max_frm_len;

			max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
			if (stab)
				max_frm_len -= stab->szopts.overhead;
			max_sdu_dynamic = max_frm_len - dev->hard_header_len;
		}

		max_sdu = min(max_sdu_dynamic, max_sdu_from_user);

		if (max_sdu != U32_MAX) {
			sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
			sched->max_sdu[tc] = max_sdu;
		} else {
			sched->max_frm_len[tc] = U32_MAX; /* never oversized */
			sched->max_sdu[tc] = 0;
		}
	}
}

/* Returns the entry corresponding to the next available interval. If
 * validate_interval is set, it only validates whether the timestamp occurs
 * when the gate corresponding to the skb's traffic class is open.
 */
static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
						  struct Qdisc *sch,
						  struct sched_gate_list *sched,
						  struct sched_gate_list *admin,
						  ktime_t time,
						  ktime_t *interval_start,
						  ktime_t *interval_end,
						  bool validate_interval)
{
	ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
	ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
	struct sched_entry *entry = NULL, *entry_found = NULL;
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	bool entry_available = false;
	s32 cycle_elapsed;
	int tc, n;

	tc = netdev_get_prio_tc_map(dev, skb->priority);
	packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));

	*interval_start = 0;
	*interval_end = 0;

	if (!sched)
		return NULL;

	cycle = sched->cycle_time;
	cycle_elapsed = get_cycle_time_elapsed(sched, time);
	curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
	cycle_end = ktime_add_ns(curr_intv_end, cycle);

	list_for_each_entry(entry, &sched->entries, list) {
		curr_intv_start = curr_intv_end;
		curr_intv_end = get_interval_end_time(sched, admin, entry,
						      curr_intv_start);

		if (ktime_after(curr_intv_start, cycle_end))
			break;

		if (!(entry->gate_mask & BIT(tc)) ||
		    packet_transmit_time > entry->interval)
			continue;

		txtime = entry->next_txtime;

		if (ktime_before(txtime, time) || validate_interval) {
			transmit_end_time = ktime_add_ns(time, packet_transmit_time);
			if ((ktime_before(curr_intv_start, time) &&
			     ktime_before(transmit_end_time, curr_intv_end)) ||
			    (ktime_after(curr_intv_start, time) && !validate_interval)) {
				entry_found = entry;
				*interval_start = curr_intv_start;
				*interval_end = curr_intv_end;
				break;
			} else if (!entry_available && !validate_interval) {
				/* Here, we are just trying to find the
				 * first available interval in the next cycle.
				 */
				entry_available = true;
				entry_found = entry;
				*interval_start = ktime_add_ns(curr_intv_start, cycle);
				*interval_end = ktime_add_ns(curr_intv_end, cycle);
			}
		} else if (ktime_before(txtime, earliest_txtime) &&
			   !entry_available) {
			earliest_txtime = txtime;
			entry_found = entry;
			n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
			*interval_start = ktime_add(curr_intv_start, n * cycle);
			*interval_end = ktime_add(curr_intv_end, n * cycle);
		}
	}

	return entry_found;
}

static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct sched_gate_list *sched, *admin;
	ktime_t interval_start, interval_end;
	struct sched_entry *entry;

	rcu_read_lock();
	sched = rcu_dereference(q->oper_sched);
	admin = rcu_dereference(q->admin_sched);

	entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
				       &interval_start, &interval_end, true);
	rcu_read_unlock();

	return entry;
}

static bool taprio_flags_valid(u32 flags)
{
	/* Make sure no other flag bits are set. */
	if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST |
		      TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
		return false;
	/* txtime-assist and full offload are mutually exclusive */
	if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
	    (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
		return false;
	return true;
}

/* This returns the tstamp value set by TCP in terms of the set clock. */
static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{
	unsigned int offset = skb_network_offset(skb);
	const struct ipv6hdr *ipv6h;
	const struct iphdr *iph;
	struct ipv6hdr _ipv6h;

	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
	if (!ipv6h)
		return 0;

	if (ipv6h->version == 4) {
		iph = (struct iphdr *)ipv6h;
		offset += iph->ihl * 4;

		/* special-case 6in4 tunnelling, as that is a common way to get
		 * v6 connectivity in the home
		 */
		if (iph->protocol == IPPROTO_IPV6) {
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);

			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
				return 0;
		} else if (iph->protocol != IPPROTO_TCP) {
			return 0;
		}
	} else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
		return 0;
	}

	return taprio_mono_to_any(q, skb->skb_mstamp_ns);
}

/* There are a few scenarios where we will have to modify the txtime from
 * what is read from next_txtime in sched_entry. They are:
 * 1. If txtime is in the past,
 *    a. The gate for the traffic class is currently open and the packet can be
 *       transmitted before it closes, schedule the packet right away.
 *    b. If the gate corresponding to the traffic class is going to open later
 *       in the cycle, set the txtime of the packet to the interval start.
 * 2. If txtime is in the future, there are packets corresponding to the
 *    current traffic class waiting to be transmitted. So, the following
 *    possibilities exist:
 *    a. We can transmit the packet before the window containing the txtime
 *       closes.
 *    b. The window might close before the transmission can be completed
 *       successfully. So, schedule the packet in the next open window.
 */
static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
{
	ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
	struct taprio_sched *q = qdisc_priv(sch);
	struct sched_gate_list *sched, *admin;
	ktime_t minimum_time, now, txtime;
	int len, packet_transmit_time;
	struct sched_entry *entry;
	bool sched_changed;

	now = taprio_get_time(q);
	minimum_time = ktime_add_ns(now, q->txtime_delay);

	tcp_tstamp = get_tcp_tstamp(q, skb);
	minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);

	rcu_read_lock();
	admin = rcu_dereference(q->admin_sched);
	sched = rcu_dereference(q->oper_sched);
	if (admin && ktime_after(minimum_time, admin->base_time))
		switch_schedules(q, &admin, &sched);

	/* Until the schedule starts, all the queues are open */
	if (!sched || ktime_before(minimum_time, sched->base_time)) {
		txtime = minimum_time;
		goto done;
	}

	len = qdisc_pkt_len(skb);
	packet_transmit_time = length_to_duration(q, len);

	do {
		sched_changed = false;

		entry = find_entry_to_transmit(skb, sch, sched, admin,
					       minimum_time,
					       &interval_start, &interval_end,
					       false);
		if (!entry) {
			txtime = 0;
			goto done;
		}

		txtime = entry->next_txtime;
		txtime = max_t(ktime_t, txtime, minimum_time);
		txtime = max_t(ktime_t, txtime, interval_start);

		if (admin && admin != sched &&
		    ktime_after(txtime, admin->base_time)) {
			sched = admin;
			sched_changed = true;
			continue;
		}

		transmit_end_time = ktime_add(txtime, packet_transmit_time);
		minimum_time = transmit_end_time;

		/* Update the txtime of the current entry to the next time its
		 * interval starts.
		 */
		if (ktime_after(transmit_end_time, interval_end))
			entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
	} while (sched_changed || ktime_after(transmit_end_time, interval_end));

	entry->next_txtime = transmit_end_time;

done:
	rcu_read_unlock();
	return txtime;
}

/* Devices with full offload are expected to honor this in hardware */
static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
					     struct sk_buff *skb)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sched_gate_list *sched;
	int prio = skb->priority;
	bool exceeds = false;
	u8 tc;

	tc = netdev_get_prio_tc_map(dev, prio);

	rcu_read_lock();
	sched = rcu_dereference(q->oper_sched);
	if (sched && skb->len > sched->max_frm_len[tc])
		exceeds = true;
	rcu_read_unlock();

	return exceeds;
}
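/* Informational sketch (user space side, not part of this file): the
 * SOCK_TXTIME branch in taprio_enqueue_one() below is taken for packets from
 * sockets that opted into explicit launch times, roughly:
 *
 *	struct sock_txtime cfg = {
 *		.clockid = CLOCK_TAI,	// should match the qdisc's clockid
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
 *	// per packet: attach the desired TX time (u64 nanoseconds) via an
 *	// SCM_TXTIME control message on sendmsg()
 *
 * Such packets are only accepted if is_valid_interval() confirms that their
 * traffic class gate is open at the requested time; otherwise, with
 * txtime-assist enabled, taprio computes skb->tstamp itself via
 * get_packet_txtime().
 */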
static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
			      struct Qdisc *child, struct sk_buff **to_free)
{
	struct taprio_sched *q = qdisc_priv(sch);

	/* sk_flags are only safe to use on full sockets. */
	if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
		if (!is_valid_interval(skb, sch))
			return qdisc_drop(skb, sch, to_free);
	} else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
		skb->tstamp = get_packet_txtime(skb, sch);
		if (!skb->tstamp)
			return qdisc_drop(skb, sch, to_free);
	}

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;

	return qdisc_enqueue(skb, child, to_free);
}

static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
				    struct Qdisc *child,
				    struct sk_buff **to_free)
{
	unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
	netdev_features_t features = netif_skb_features(skb);
	struct sk_buff *segs, *nskb;
	int ret;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		slen += segs->len;

		/* FIXME: we should be segmenting to a smaller size
		 * rather than dropping these
		 */
		if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
			ret = qdisc_drop(segs, sch, to_free);
		else
			ret = taprio_enqueue_one(segs, sch, child, to_free);

		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			numsegs++;
		}
	}

	if (numsegs > 1)
		qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
	consume_skb(skb);

	return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

/* Will not be called in the full offload case, since the TX queues are
 * attached to the Qdisc created using qdisc_create_dflt()
 */
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			  struct sk_buff **to_free)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct Qdisc *child;
	int queue;

	queue = skb_get_queue_mapping(skb);

	child = q->qdiscs[queue];
	if (unlikely(!child))
		return qdisc_drop(skb, sch, to_free);

	if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
		/* Large packets might not be transmitted when the transmission
		 * duration exceeds any configured interval. Therefore, segment
		 * the skb into smaller chunks. Drivers with full offload are
		 * expected to handle this in hardware.
		 */
		if (skb_is_gso(skb))
			return taprio_enqueue_segmented(skb, sch, child,
							to_free);

		return qdisc_drop(skb, sch, to_free);
	}

	return taprio_enqueue_one(skb, sch, child, to_free);
}

static struct sk_buff *taprio_peek(struct Qdisc *sch)
{
	WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
	return NULL;
}
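/* Worked example for the budget computed below (illustrative numbers): with
 * picos_per_byte = 8000 (1000 Mbit/s) and gate_duration[tc] = 12000 ns, the
 * budget is 12000 * 1000 / 8000 = 1500 bytes, i.e. the number of bytes that
 * fit into that traffic class's open-gate window within the current cycle.
 */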
static void taprio_set_budgets(struct taprio_sched *q,
			       struct sched_gate_list *sched,
			       struct sched_entry *entry)
{
	struct net_device *dev = qdisc_dev(q->root);
	int num_tc = netdev_get_num_tc(dev);
	int tc, budget;

	for (tc = 0; tc < num_tc; tc++) {
		/* Traffic classes which never close have infinite budget */
		if (entry->gate_duration[tc] == sched->cycle_time)
			budget = INT_MAX;
		else
			budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
					   atomic64_read(&q->picos_per_byte));

		atomic_set(&entry->budget[tc], budget);
	}
}

/* When an skb is sent, it consumes from the budget of all traffic classes */
static int taprio_update_budgets(struct sched_entry *entry, size_t len,
				 int tc_consumed, int num_tc)
{
	int tc, budget, new_budget = 0;

	for (tc = 0; tc < num_tc; tc++) {
		budget = atomic_read(&entry->budget[tc]);
		/* Don't consume from infinite budget */
		if (budget == INT_MAX) {
			if (tc == tc_consumed)
				new_budget = budget;
			continue;
		}

		if (tc == tc_consumed)
			new_budget = atomic_sub_return(len, &entry->budget[tc]);
		else
			atomic_sub(len, &entry->budget[tc]);
	}

	return new_budget;
}

static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
					       struct sched_entry *entry,
					       u32 gate_mask)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct Qdisc *child = q->qdiscs[txq];
	int num_tc = netdev_get_num_tc(dev);
	struct sk_buff *skb;
	ktime_t guard;
	int prio;
	int len;
	u8 tc;

	if (unlikely(!child))
		return NULL;

	if (TXTIME_ASSIST_IS_ENABLED(q->flags))
		goto skip_peek_checks;

	skb = child->ops->peek(child);
	if (!skb)
		return NULL;

	prio = skb->priority;
	tc = netdev_get_prio_tc_map(dev, prio);

	if (!(gate_mask & BIT(tc)))
		return NULL;

	len = qdisc_pkt_len(skb);
	guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));

	/* In the case that there's no gate entry, there's no
	 * guard band ...
	 */
	if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
	    !taprio_entry_allows_tx(guard, entry, tc))
		return NULL;

	/* ... and no budget. */
	if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
	    taprio_update_budgets(entry, len, tc, num_tc) < 0)
		return NULL;

skip_peek_checks:
	skb = child->ops->dequeue(child);
	if (unlikely(!skb))
		return NULL;

	qdisc_bstats_update(sch, skb);
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;

	return skb;
}

static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
{
	int offset = dev->tc_to_txq[tc].offset;
	int count = dev->tc_to_txq[tc].count;

	(*txq)++;
	if (*txq == offset + count)
		*txq = offset;
}

/* Prioritize higher traffic classes, and select among TXQs belonging to the
 * same TC using round robin
 */
static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
						  struct sched_entry *entry,
						  u32 gate_mask)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	int num_tc = netdev_get_num_tc(dev);
	struct sk_buff *skb;
	int tc;

	for (tc = num_tc - 1; tc >= 0; tc--) {
		int first_txq = q->cur_txq[tc];

		if (!(gate_mask & BIT(tc)))
			continue;

		do {
			skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
						      entry, gate_mask);

			taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);

			if (skb)
				return skb;
		} while (q->cur_txq[tc] != first_txq);
	}

	return NULL;
}

/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
 * class other than to determine whether the gate is open or not
 */
static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
						   struct sched_entry *entry,
						   u32 gate_mask)
{
	struct net_device *dev = qdisc_dev(sch);
	struct sk_buff *skb;
	int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
		if (skb)
			return skb;
	}

	return NULL;
}

/* Will not be called in the full offload case, since the TX queues are
 * attached to the Qdisc created using qdisc_create_dflt()
 */
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct sk_buff *skb = NULL;
	struct sched_entry *entry;
	u32 gate_mask;

	rcu_read_lock();
	entry = rcu_dereference(q->current_entry);
	/* If there's no entry, it means that the schedule didn't
	 * start yet, so force all gates to be open; this is in
	 * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5
	 * "AdminGateStates"
	 */
	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
	if (!gate_mask)
		goto done;

	if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
	    !static_branch_likely(&taprio_have_working_mqprio)) {
		/* Single NIC kind which is broken */
		skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
	} else if (static_branch_likely(&taprio_have_working_mqprio) &&
		   !static_branch_unlikely(&taprio_have_broken_mqprio)) {
		/* Single NIC kind which prioritizes properly */
		skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
	} else {
		/* Mixed NIC kinds present in system, need dynamic testing */
		if (q->broken_mqprio)
			skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
		else
			skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
	}

done:
	rcu_read_unlock();

	return skb;
}

static bool should_restart_cycle(const struct sched_gate_list *oper,
				 const struct sched_entry *entry)
{
	if (list_is_last(&entry->list, &oper->entries))
		return true;

	if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0)
		return true;

	return false;
}

static bool should_change_schedules(const struct sched_gate_list *admin,
				    const struct sched_gate_list *oper,
				    ktime_t end_time)
{
	ktime_t next_base_time, extension_time;

	if (!admin)
		return false;

	next_base_time = sched_base_time(admin);

	/* This is the simple case, the end_time would fall after
	 * the next schedule base_time.
	 */
	if (ktime_compare(next_base_time, end_time) <= 0)
		return true;

	/* This is the cycle_time_extension case, if the end_time
	 * plus the amount that can be extended would fall after the
	 * next schedule base_time, we can extend the current schedule
	 * for that amount.
	 */
	extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);

	/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
	 * how precisely the extension should be made. So after
	 * conformance testing, this logic may change.
	 */
	if (ktime_compare(next_base_time, extension_time) <= 0)
		return true;

	return false;
}

static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
	struct taprio_sched *q = container_of(timer, struct taprio_sched,
					      advance_timer);
	struct net_device *dev = qdisc_dev(q->root);
	struct sched_gate_list *oper, *admin;
	int num_tc = netdev_get_num_tc(dev);
	struct sched_entry *entry, *next;
	struct Qdisc *sch = q->root;
	ktime_t end_time;
	int tc;

	spin_lock(&q->current_entry_lock);
	entry = rcu_dereference_protected(q->current_entry,
					  lockdep_is_held(&q->current_entry_lock));
	oper = rcu_dereference_protected(q->oper_sched,
					 lockdep_is_held(&q->current_entry_lock));
	admin = rcu_dereference_protected(q->admin_sched,
					  lockdep_is_held(&q->current_entry_lock));

	if (!oper)
		switch_schedules(q, &admin, &oper);

	/* This can happen in two cases: 1. this is the very first run
	 * of this function (i.e. we weren't running any schedule
	 * previously); 2. The previous schedule just ended. The first
	 * entry of each schedule is pre-calculated during schedule
	 * initialization.
	 */
	if (unlikely(!entry || entry->end_time == oper->base_time)) {
		next = list_first_entry(&oper->entries, struct sched_entry,
					list);
		end_time = next->end_time;
		goto first_run;
	}

	if (should_restart_cycle(oper, entry)) {
		next = list_first_entry(&oper->entries, struct sched_entry,
					list);
		oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
						    oper->cycle_time);
	} else {
		next = list_next_entry(entry, list);
	}

	end_time = ktime_add_ns(entry->end_time, next->interval);
	end_time = min_t(ktime_t, end_time, oper->cycle_end_time);

	for (tc = 0; tc < num_tc; tc++) {
		if (next->gate_duration[tc] == oper->cycle_time)
			next->gate_close_time[tc] = KTIME_MAX;
		else
			next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
								 next->gate_duration[tc]);
	}

	if (should_change_schedules(admin, oper, end_time)) {
		/* Set things so the next time this runs, the new
		 * schedule runs.
		 */
		end_time = sched_base_time(admin);
		switch_schedules(q, &admin, &oper);
	}

	next->end_time = end_time;
	taprio_set_budgets(q, oper, next);

first_run:
	rcu_assign_pointer(q->current_entry, next);
	spin_unlock(&q->current_entry_lock);

	hrtimer_set_expires(&q->advance_timer, end_time);

	rcu_read_lock();
	__netif_schedule(sch);
	rcu_read_unlock();

	return HRTIMER_RESTART;
}

static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
	[TCA_TAPRIO_SCHED_ENTRY_INDEX]	   = { .type = NLA_U32 },
	[TCA_TAPRIO_SCHED_ENTRY_CMD]	   = { .type = NLA_U8 },
	[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
};

static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
	[TCA_TAPRIO_TC_ENTRY_INDEX]   = { .type = NLA_U32 },
	[TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 },
};

static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
	[TCA_TAPRIO_ATTR_PRIOMAP] = {
		.len = sizeof(struct tc_mqprio_qopt)
	},
	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]           = { .type = NLA_NESTED },
	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]            = { .type = NLA_S64 },
	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]         = { .type = NLA_NESTED },
	[TCA_TAPRIO_ATTR_SCHED_CLOCKID]              = { .type = NLA_S32 },
	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]           = { .type = NLA_S64 },
	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
	[TCA_TAPRIO_ATTR_FLAGS]                      = { .type = NLA_U32 },
	[TCA_TAPRIO_ATTR_TXTIME_DELAY]               = { .type = NLA_U32 },
	[TCA_TAPRIO_ATTR_TC_ENTRY]                   = { .type = NLA_NESTED },
};

static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
			    struct sched_entry *entry,
			    struct netlink_ext_ack *extack)
{
	int min_duration = length_to_duration(q, ETH_ZLEN);
	u32 interval = 0;

	if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
		entry->command = nla_get_u8(
			tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);

	if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
		entry->gate_mask = nla_get_u32(
			tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);

	if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
		interval = nla_get_u32(
			tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);

	/* The interval should allow at least the minimum ethernet
	 * frame to go out.
	 */
	if (interval < min_duration) {
		NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
		return -EINVAL;
	}

	entry->interval = interval;

	return 0;
}

static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n,
			     struct sched_entry *entry, int index,
			     struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
					  entry_policy, NULL);
	if (err < 0) {
		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
		return -EINVAL;
	}

	entry->index = index;

	return fill_sched_entry(q, tb, entry, extack);
}

static int parse_sched_list(struct taprio_sched *q, struct nlattr *list,
			    struct sched_gate_list *sched,
			    struct netlink_ext_ack *extack)
{
	struct nlattr *n;
	int err, rem;
	int i = 0;

	if (!list)
		return -EINVAL;

	nla_for_each_nested(n, list, rem) {
		struct sched_entry *entry;

		if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
			NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
			continue;
		}

		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry) {
			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
			return -ENOMEM;
		}

		err = parse_sched_entry(q, n, entry, i, extack);
		if (err < 0) {
			kfree(entry);
			return err;
		}

		list_add_tail(&entry->list, &sched->entries);
		i++;
	}

	sched->num_entries = i;

	return i;
}

static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
				 struct sched_gate_list *new,
				 struct netlink_ext_ack *extack)
{
	int err = 0;

	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
		NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
		return -ENOTSUPP;
	}

	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
		new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);

	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
		new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);

	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
		new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);

	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
		err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST],
				       new, extack);
	if (err < 0)
		return err;

	if (!new->cycle_time) {
		struct sched_entry *entry;
		ktime_t cycle = 0;

		list_for_each_entry(entry, &new->entries, list)
			cycle = ktime_add_ns(cycle, entry->interval);

		if (!cycle) {
			NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0");
			return -EINVAL;
		}

		new->cycle_time = cycle;
	}

	taprio_calculate_gate_durations(q, new);

	return 0;
}

static int taprio_parse_mqprio_opt(struct net_device *dev,
				   struct tc_mqprio_qopt *qopt,
				   struct netlink_ext_ack *extack,
				   u32 taprio_flags)
{
	bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags);

	if (!qopt && !dev->num_tc) {
		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
		return -EINVAL;
	}

	/* If num_tc is already set, it means that the user already
	 * configured the mqprio part
	 */
	if (dev->num_tc)
		return 0;
	/* taprio imposes that traffic classes map 1:n to tx queues */
	if (qopt->num_tc > dev->num_tx_queues) {
		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
		return -EINVAL;
	}

	/* For some reason, in txtime-assist mode, we allow TXQ ranges for
	 * different TCs to overlap, and just validate the TXQ ranges.
	 */
	return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
				    extack);
}

static int taprio_get_start_time(struct Qdisc *sch,
				 struct sched_gate_list *sched,
				 ktime_t *start)
{
	struct taprio_sched *q = qdisc_priv(sch);
	ktime_t now, base, cycle;
	s64 n;

	base = sched_base_time(sched);
	now = taprio_get_time(q);

	if (ktime_after(base, now)) {
		*start = base;
		return 0;
	}

	cycle = sched->cycle_time;

	/* The qdisc is expected to have at least one sched_entry. Moreover,
	 * any entry must have 'interval' > 0. Thus if the cycle time is zero,
	 * something went really wrong. In that case, we should warn about this
	 * inconsistent state and return error.
	 */
	if (WARN_ON(!cycle))
		return -EFAULT;

	/* Schedule the start time for the beginning of the next
	 * cycle.
	 */
	n = div64_s64(ktime_sub_ns(now, base), cycle);
	*start = ktime_add_ns(base, (n + 1) * cycle);
	return 0;
}

static void setup_first_end_time(struct taprio_sched *q,
				 struct sched_gate_list *sched, ktime_t base)
{
	struct net_device *dev = qdisc_dev(q->root);
	int num_tc = netdev_get_num_tc(dev);
	struct sched_entry *first;
	ktime_t cycle;
	int tc;

	first = list_first_entry(&sched->entries,
				 struct sched_entry, list);

	cycle = sched->cycle_time;

	/* FIXME: find a better place to do this */
	sched->cycle_end_time = ktime_add_ns(base, cycle);

	first->end_time = ktime_add_ns(base, first->interval);
	taprio_set_budgets(q, sched, first);

	for (tc = 0; tc < num_tc; tc++) {
		if (first->gate_duration[tc] == sched->cycle_time)
			first->gate_close_time[tc] = KTIME_MAX;
		else
			first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
	}

	rcu_assign_pointer(q->current_entry, NULL);
}
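/* Worked example for taprio_get_start_time() above (illustrative numbers):
 * with base_time = 0, cycle_time = 1000000 ns and now = 2500000 ns, we get
 * n = 2 and *start = 3000000 ns, i.e. the schedule is (re)started at the
 * beginning of the next full cycle rather than mid-cycle.
 */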
static void taprio_start_sched(struct Qdisc *sch,
			       ktime_t start, struct sched_gate_list *new)
{
	struct taprio_sched *q = qdisc_priv(sch);
	ktime_t expires;

	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
		return;

	expires = hrtimer_get_expires(&q->advance_timer);
	if (expires == 0)
		expires = KTIME_MAX;

	/* If the new schedule starts before the next expiration, we
	 * reprogram it to the earliest one, so we change the admin
	 * schedule to the operational one at the right time.
	 */
	start = min_t(ktime_t, start, expires);

	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
}

static void taprio_set_picos_per_byte(struct net_device *dev,
				      struct taprio_sched *q)
{
	struct ethtool_link_ksettings ecmd;
	int speed = SPEED_10;
	int picos_per_byte;
	int err;

	err = __ethtool_get_link_ksettings(dev, &ecmd);
	if (err < 0)
		goto skip;

	if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
		speed = ecmd.base.speed;

skip:
	picos_per_byte = (USEC_PER_SEC * 8) / speed;

	atomic64_set(&q->picos_per_byte, picos_per_byte);
	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
		   dev->name, (long long)atomic64_read(&q->picos_per_byte),
		   ecmd.base.speed);
}

static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
			       void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct sched_gate_list *oper, *admin;
	struct qdisc_size_table *stab;
	struct taprio_sched *q;

	ASSERT_RTNL();

	if (event != NETDEV_UP && event != NETDEV_CHANGE)
		return NOTIFY_DONE;

	list_for_each_entry(q, &taprio_list, taprio_list) {
		if (dev != qdisc_dev(q->root))
			continue;

		taprio_set_picos_per_byte(dev, q);

		stab = rtnl_dereference(q->root->stab);

		oper = rtnl_dereference(q->oper_sched);
		if (oper)
			taprio_update_queue_max_sdu(q, oper, stab);

		admin = rtnl_dereference(q->admin_sched);
		if (admin)
			taprio_update_queue_max_sdu(q, admin, stab);

		break;
	}

	return NOTIFY_DONE;
}

static void setup_txtime(struct taprio_sched *q,
			 struct sched_gate_list *sched, ktime_t base)
{
	struct sched_entry *entry;
	u32 interval = 0;

	list_for_each_entry(entry, &sched->entries, list) {
		entry->next_txtime = ktime_add_ns(base, interval);
		interval += entry->interval;
	}
}

static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
{
	struct __tc_taprio_qopt_offload *__offload;

	__offload = kzalloc(struct_size(__offload, offload.entries, num_entries),
			    GFP_KERNEL);
	if (!__offload)
		return NULL;

	refcount_set(&__offload->users, 1);

	return &__offload->offload;
}

struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
						  *offload)
{
	struct __tc_taprio_qopt_offload *__offload;

	__offload = container_of(offload, struct __tc_taprio_qopt_offload,
				 offload);

	refcount_inc(&__offload->users);

	return offload;
}
EXPORT_SYMBOL_GPL(taprio_offload_get);

void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
{
	struct __tc_taprio_qopt_offload *__offload;

	__offload = container_of(offload, struct __tc_taprio_qopt_offload,
				 offload);

	if (!refcount_dec_and_test(&__offload->users))
		return;

	kfree(__offload);
}
EXPORT_SYMBOL_GPL(taprio_offload_free);

/* The function will only serve to keep the pointers to the "oper" and "admin"
 * schedules valid in relation to their base times, so when calling dump() the
 * user looks at the right schedules.
 * When using full offload, the admin configuration is promoted to oper at the
 * base_time in the PHC time domain.
 * But because the system time is not necessarily in sync with that, we can't
 * just trigger a hrtimer to call switch_schedules at the right hardware time.
 * At the moment we call this by hand right away from taprio, but in the future
 * it will be useful to create a mechanism for drivers to notify taprio of the
 * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
 * This is left as TODO.
 */
static void taprio_offload_config_changed(struct taprio_sched *q)
{
	struct sched_gate_list *oper, *admin;

	oper = rtnl_dereference(q->oper_sched);
	admin = rtnl_dereference(q->admin_sched);

	switch_schedules(q, &admin, &oper);
}

static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
{
	u32 i, queue_mask = 0;

	for (i = 0; i < dev->num_tc; i++) {
		u32 offset, count;

		if (!(tc_mask & BIT(i)))
			continue;

		offset = dev->tc_to_txq[i].offset;
		count = dev->tc_to_txq[i].count;

		queue_mask |= GENMASK(offset + count - 1, offset);
	}

	return queue_mask;
}

static void taprio_sched_to_offload(struct net_device *dev,
				    struct sched_gate_list *sched,
				    struct tc_taprio_qopt_offload *offload,
				    const struct tc_taprio_caps *caps)
{
	struct sched_entry *entry;
	int i = 0;

	offload->base_time = sched->base_time;
	offload->cycle_time = sched->cycle_time;
	offload->cycle_time_extension = sched->cycle_time_extension;

	list_for_each_entry(entry, &sched->entries, list) {
		struct tc_taprio_sched_entry *e = &offload->entries[i];

		e->command = entry->command;
		e->interval = entry->interval;
		if (caps->gate_mask_per_txq)
			e->gate_mask = tc_map_to_queue_mask(dev,
							    entry->gate_mask);
		else
			e->gate_mask = entry->gate_mask;

		i++;
	}

	offload->num_entries = i;
}

static void taprio_detect_broken_mqprio(struct taprio_sched *q)
{
	struct net_device *dev = qdisc_dev(q->root);
	struct tc_taprio_caps caps;

	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
				 &caps, sizeof(caps));

	q->broken_mqprio = caps.broken_mqprio;
	if (q->broken_mqprio)
		static_branch_inc(&taprio_have_broken_mqprio);
	else
		static_branch_inc(&taprio_have_working_mqprio);

	q->detected_mqprio = true;
}

static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
{
	if (!q->detected_mqprio)
		return;

	if (q->broken_mqprio)
		static_branch_dec(&taprio_have_broken_mqprio);
	else
		static_branch_dec(&taprio_have_working_mqprio);
}

static int taprio_enable_offload(struct net_device *dev,
				 struct taprio_sched *q,
				 struct sched_gate_list *sched,
				 struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_taprio_qopt_offload *offload;
	struct tc_taprio_caps caps;
	int tc, err = 0;

	if (!ops->ndo_setup_tc) {
		NL_SET_ERR_MSG(extack,
			       "Device does not support taprio offload");
		return -EOPNOTSUPP;
	}

	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
				 &caps, sizeof(caps));

	if (!caps.supports_queue_max_sdu) {
		for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
			if (q->max_sdu[tc]) {
				NL_SET_ERR_MSG_MOD(extack,
						   "Device does not handle queueMaxSDU");
				return -EOPNOTSUPP;
			}
		}
	}

	offload = taprio_offload_alloc(sched->num_entries);
	if (!offload) {
		NL_SET_ERR_MSG(extack,
			       "Not enough memory for enabling offload mode");
		return -ENOMEM;
	}
	offload->enable = 1;
	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
	taprio_sched_to_offload(dev, sched, offload, &caps);

	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
		offload->max_sdu[tc] = q->max_sdu[tc];

	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
	if (err < 0) {
		NL_SET_ERR_MSG(extack,
			       "Device failed to setup taprio offload");
		goto done;
	}

	q->offloaded = true;

done:
	taprio_offload_free(offload);

	return err;
}

static int taprio_disable_offload(struct net_device *dev,
				  struct taprio_sched *q,
				  struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_taprio_qopt_offload *offload;
	int err;

	if (!q->offloaded)
		return 0;

	offload = taprio_offload_alloc(0);
	if (!offload) {
		NL_SET_ERR_MSG(extack,
			       "Not enough memory to disable offload mode");
		return -ENOMEM;
	}
	offload->enable = 0;

	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
	if (err < 0) {
		NL_SET_ERR_MSG(extack,
			       "Device failed to disable offload");
		goto out;
	}

	q->offloaded = false;

out:
	taprio_offload_free(offload);

	return err;
}

/* If full offload is enabled, the only possible clockid is the net device's
 * PHC. For that reason, specifying a clockid through netlink is incorrect.
 * For txtime-assist, it is implicitly assumed that the device's PHC is kept
 * in sync with the specified clockid via a user space daemon such as phc2sys.
 * For both software taprio and txtime-assist, the clockid is used for the
 * hrtimer that advances the schedule and is hence mandatory.
 */
static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
				struct netlink_ext_ack *extack)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	int err = -EINVAL;

	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
		const struct ethtool_ops *ops = dev->ethtool_ops;
		struct ethtool_ts_info info = {
			.cmd = ETHTOOL_GET_TS_INFO,
			.phc_index = -1,
		};

		if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
			NL_SET_ERR_MSG(extack,
				       "The 'clockid' cannot be specified for full offload");
			goto out;
		}

		if (ops && ops->get_ts_info)
			err = ops->get_ts_info(dev, &info);

		if (err || info.phc_index < 0) {
			NL_SET_ERR_MSG(extack,
				       "Device does not have a PTP clock");
			err = -ENOTSUPP;
			goto out;
		}
	} else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
		int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
		enum tk_offsets tk_offset;

		/* We only support static clockids and we don't allow
		 * them to be modified after the first init.
		 */
		if (clockid < 0 ||
		    (q->clockid != -1 && q->clockid != clockid)) {
			NL_SET_ERR_MSG(extack,
				       "Changing the 'clockid' of a running schedule is not supported");
			err = -ENOTSUPP;
			goto out;
		}

		switch (clockid) {
		case CLOCK_REALTIME:
			tk_offset = TK_OFFS_REAL;
			break;
		case CLOCK_MONOTONIC:
			tk_offset = TK_OFFS_MAX;
			break;
		case CLOCK_BOOTTIME:
			tk_offset = TK_OFFS_BOOT;
			break;
		case CLOCK_TAI:
			tk_offset = TK_OFFS_TAI;
			break;
		default:
			NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
			err = -EINVAL;
			goto out;
		}
		/* This pairs with READ_ONCE() in taprio_mono_to_any */
		WRITE_ONCE(q->tk_offset, tk_offset);

		q->clockid = clockid;
	} else {
		NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
		goto out;
	}

	/* Everything went ok, return success. */
	err = 0;

out:
	return err;
}

static int taprio_parse_tc_entry(struct Qdisc *sch,
				 struct nlattr *opt,
				 u32 max_sdu[TC_QOPT_MAX_QUEUE],
				 unsigned long *seen_tcs,
				 struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
	struct net_device *dev = qdisc_dev(sch);
	u32 val = 0;
	int err, tc;

	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
			       taprio_tc_policy, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
		NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
		return -EINVAL;
	}

	tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
	if (tc >= TC_QOPT_MAX_QUEUE) {
		NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
		return -ERANGE;
	}

	if (*seen_tcs & BIT(tc)) {
		NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
		return -EINVAL;
	}

	*seen_tcs |= BIT(tc);

	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
		val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);

	if (val > dev->max_mtu) {
		NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
		return -ERANGE;
	}

	max_sdu[tc] = val;

	return 0;
}

static int taprio_parse_tc_entries(struct Qdisc *sch,
				   struct nlattr *opt,
				   struct netlink_ext_ack *extack)
{
	struct taprio_sched *q = qdisc_priv(sch);
	u32 max_sdu[TC_QOPT_MAX_QUEUE];
	unsigned long seen_tcs = 0;
	struct nlattr *n;
	int tc, rem;
	int err = 0;

	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
		max_sdu[tc] = q->max_sdu[tc];

	nla_for_each_nested(n, opt, rem) {
		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
			continue;

		err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
					    extack);
		if (err)
			goto out;
	}

	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
		q->max_sdu[tc] = max_sdu[tc];

out:
	return err;
}

static int taprio_mqprio_cmp(const struct net_device *dev,
			     const struct tc_mqprio_qopt *mqprio)
{
	int i;

	if (!mqprio || mqprio->num_tc != dev->num_tc)
		return -1;

	for (i = 0; i < mqprio->num_tc; i++)
		if (dev->tc_to_txq[i].count != mqprio->count[i] ||
		    dev->tc_to_txq[i].offset != mqprio->offset[i])
			return -1;

	for (i = 0; i <= TC_BITMASK; i++)
		if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i])
			return -1;

	return 0;
}

/* The semantics of the 'flags' argument in relation to
 * 'change()' requests are interpreted following two rules (which are
 * applied in this order): (1) an omitted 'flags' argument is interpreted
 * as zero; (2) the 'flags' of a "running" taprio instance cannot be
 * changed.
 */
static int taprio_new_flags(const struct nlattr *attr, u32 old,
			    struct netlink_ext_ack *extack)
{
	u32 new = 0;

	if (attr)
		new = nla_get_u32(attr);

	if (old != TAPRIO_FLAGS_INVALID && old != new) {
		NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
		return -EOPNOTSUPP;
	}

	if (!taprio_flags_valid(new)) {
		NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
		return -EINVAL;
	}

	return new;
}

static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
	struct sched_gate_list *oper, *admin, *new_admin;
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct tc_mqprio_qopt *mqprio = NULL;
	unsigned long flags;
	ktime_t start;
	int i, err;

	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
					  taprio_policy, extack);
	if (err < 0)
		return err;

	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);

	err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS],
			       q->flags, extack);
	if (err < 0)
		return err;

	q->flags = err;

	err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
	if (err < 0)
		return err;

	err = taprio_parse_tc_entries(sch, opt, extack);
	if (err)
		return err;

	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
	if (!new_admin) {
		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
		return -ENOMEM;
	}
	INIT_LIST_HEAD(&new_admin->entries);

	oper = rtnl_dereference(q->oper_sched);
	admin = rtnl_dereference(q->admin_sched);

	/* no changes - no new mqprio settings */
	if (!taprio_mqprio_cmp(dev, mqprio))
		mqprio = NULL;

	if (mqprio && (oper || admin)) {
		NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
		err = -ENOTSUPP;
		goto free_sched;
	}

	err = parse_taprio_schedule(q, tb, new_admin, extack);
	if (err < 0)
		goto free_sched;

	if (new_admin->num_entries == 0) {
		NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
		err = -EINVAL;
		goto free_sched;
	}

	err = taprio_parse_clockid(sch, tb, extack);
	if (err < 0)
		goto free_sched;

	taprio_set_picos_per_byte(dev, q);
	taprio_update_queue_max_sdu(q, new_admin, stab);

	if (mqprio) {
		err = netdev_set_num_tc(dev, mqprio->num_tc);
		if (err)
			goto free_sched;
		for (i = 0; i < mqprio->num_tc; i++) {
			netdev_set_tc_queue(dev, i,
					    mqprio->count[i],
					    mqprio->offset[i]);
			q->cur_txq[i] = mqprio->offset[i];
		}

		/* Always use supplied priority mappings */
		for (i = 0; i <= TC_BITMASK; i++)
			netdev_set_prio_tc_map(dev, i,
					       mqprio->prio_tc_map[i]);
	}

	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
		err = taprio_enable_offload(dev, q, new_admin, extack);
	else
		err = taprio_disable_offload(dev, q, extack);
	if (err)
		goto free_sched;

	/* Protects against enqueue()/dequeue() */
	spin_lock_bh(qdisc_lock(sch));

	if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
		if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
			NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
			err = -EINVAL;
			goto unlock;
		}

		q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
	}

	if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
	    !FULL_OFFLOAD_IS_ENABLED(q->flags) &&
	    !hrtimer_active(&q->advance_timer)) {
		hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
		q->advance_timer.function = advance_sched;
	}

	err = taprio_get_start_time(sch, new_admin, &start);
	if (err < 0) {
		NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
		goto unlock;
	}

	setup_txtime(q, new_admin, start);

	if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
		if (!oper) {
			rcu_assign_pointer(q->oper_sched, new_admin);
			err = 0;
			new_admin = NULL;
			goto unlock;
		}

		rcu_assign_pointer(q->admin_sched, new_admin);
		if (admin)
			call_rcu(&admin->rcu, taprio_free_sched_cb);
	} else {
		setup_first_end_time(q, new_admin, start);

		/* Protects against advance_sched() */
		spin_lock_irqsave(&q->current_entry_lock, flags);

		taprio_start_sched(sch, start, new_admin);

		rcu_assign_pointer(q->admin_sched, new_admin);
		if (admin)
			call_rcu(&admin->rcu, taprio_free_sched_cb);

		spin_unlock_irqrestore(&q->current_entry_lock, flags);

		if (FULL_OFFLOAD_IS_ENABLED(q->flags))
			taprio_offload_config_changed(q);
	}

	new_admin = NULL;
	err = 0;

	if (!stab)
		NL_SET_ERR_MSG_MOD(extack,
				   "Size table not specified, frame length estimations may be inaccurate");

unlock:
	spin_unlock_bh(qdisc_lock(sch));

free_sched:
	if (new_admin)
		call_rcu(&new_admin->rcu, taprio_free_sched_cb);

	return err;
}

static void taprio_reset(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	int i;

	hrtimer_cancel(&q->advance_timer);

	if (q->qdiscs) {
		for (i = 0; i < dev->num_tx_queues; i++)
			if (q->qdiscs[i])
				qdisc_reset(q->qdiscs[i]);
	}
}

static void taprio_destroy(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sched_gate_list *oper, *admin;
	unsigned int i;

	list_del(&q->taprio_list);

	/* Note that taprio_reset() might not be called if an error
	 * happens in qdisc_create(), after taprio_init() has been called.

static void taprio_reset(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	int i;

	hrtimer_cancel(&q->advance_timer);

	if (q->qdiscs) {
		for (i = 0; i < dev->num_tx_queues; i++)
			if (q->qdiscs[i])
				qdisc_reset(q->qdiscs[i]);
	}
}

static void taprio_destroy(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sched_gate_list *oper, *admin;
	unsigned int i;

	list_del(&q->taprio_list);

	/* Note that taprio_reset() might not be called if an error
	 * happens in qdisc_create(), after taprio_init() has been called.
	 */
	hrtimer_cancel(&q->advance_timer);
	qdisc_synchronize(sch);

	taprio_disable_offload(dev, q, NULL);

	if (q->qdiscs) {
		for (i = 0; i < dev->num_tx_queues; i++)
			qdisc_put(q->qdiscs[i]);

		kfree(q->qdiscs);
	}
	q->qdiscs = NULL;

	netdev_reset_tc(dev);

	oper = rtnl_dereference(q->oper_sched);
	admin = rtnl_dereference(q->admin_sched);

	if (oper)
		call_rcu(&oper->rcu, taprio_free_sched_cb);

	if (admin)
		call_rcu(&admin->rcu, taprio_free_sched_cb);

	taprio_cleanup_broken_mqprio(q);
}

static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	int i;

	spin_lock_init(&q->current_entry_lock);

	hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
	q->advance_timer.function = advance_sched;

	q->root = sch;

	/* We only support static clockids. Use an invalid value as default
	 * and get the valid one on taprio_change().
	 */
	q->clockid = -1;
	q->flags = TAPRIO_FLAGS_INVALID;

	list_add(&q->taprio_list, &taprio_list);

	if (sch->parent != TC_H_ROOT) {
		NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc");
		return -EOPNOTSUPP;
	}

	if (!netif_is_multiqueue(dev)) {
		NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required");
		return -EOPNOTSUPP;
	}

	/* pre-allocate qdisc, attachment can't fail */
	q->qdiscs = kcalloc(dev->num_tx_queues,
			    sizeof(q->qdiscs[0]),
			    GFP_KERNEL);

	if (!q->qdiscs)
		return -ENOMEM;

	if (!opt)
		return -EINVAL;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		struct Qdisc *qdisc;

		dev_queue = netdev_get_tx_queue(dev, i);
		qdisc = qdisc_create_dflt(dev_queue,
					  &pfifo_qdisc_ops,
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(i + 1)),
					  extack);
		if (!qdisc)
			return -ENOMEM;

		if (i < dev->real_num_tx_queues)
			qdisc_hash_add(qdisc, false);

		q->qdiscs[i] = qdisc;
	}

	taprio_detect_broken_mqprio(q);

	return taprio_change(sch, opt, extack);
}
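
/* Not in the original file: a short note on the constraints enforced by
 * taprio_init().  taprio must be the root qdisc of a multi-queue device,
 * and one pfifo child is pre-allocated per TX queue with class minors
 * starting at 1; e.g. on a hypothetical "eth0" with handle 100:, queue 0
 * is reachable as parent 100:1, queue 1 as 100:2, and so on.  Attaching
 * taprio below another qdisc, or to a single-queue device, fails with
 * -EOPNOTSUPP before any schedule is parsed.
 */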

static void taprio_attach(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	unsigned int ntx;

	/* Attach underlying qdisc */
	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		struct Qdisc *qdisc = q->qdiscs[ntx];
		struct Qdisc *old;

		if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
			qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
			old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
		} else {
			old = dev_graft_qdisc(qdisc->dev_queue, sch);
			qdisc_refcount_inc(sch);
		}
		if (old)
			qdisc_put(old);
	}

	/* access to the child qdiscs is not needed in offload mode */
	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
		kfree(q->qdiscs);
		q->qdiscs = NULL;
	}
}

static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
					     unsigned long cl)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx = cl - 1;

	if (ntx >= dev->num_tx_queues)
		return NULL;

	return netdev_get_tx_queue(dev, ntx);
}

static int taprio_graft(struct Qdisc *sch, unsigned long cl,
			struct Qdisc *new, struct Qdisc **old,
			struct netlink_ext_ack *extack)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	if (!dev_queue)
		return -EINVAL;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
		*old = dev_graft_qdisc(dev_queue, new);
	} else {
		*old = q->qdiscs[cl - 1];
		q->qdiscs[cl - 1] = new;
	}

	if (new)
		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return 0;
}

static int dump_entry(struct sk_buff *msg,
		      const struct sched_entry *entry)
{
	struct nlattr *item;

	item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
	if (!item)
		return -ENOSPC;

	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
		goto nla_put_failure;

	if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
		goto nla_put_failure;

	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
			entry->gate_mask))
		goto nla_put_failure;

	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
			entry->interval))
		goto nla_put_failure;

	return nla_nest_end(msg, item);

nla_put_failure:
	nla_nest_cancel(msg, item);
	return -1;
}

static int dump_schedule(struct sk_buff *msg,
			 const struct sched_gate_list *root)
{
	struct nlattr *entry_list;
	struct sched_entry *entry;

	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
			root->base_time, TCA_TAPRIO_PAD))
		return -1;

	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
			root->cycle_time, TCA_TAPRIO_PAD))
		return -1;

	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
			root->cycle_time_extension, TCA_TAPRIO_PAD))
		return -1;

	entry_list = nla_nest_start_noflag(msg,
					   TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
	if (!entry_list)
		goto error_nest;

	list_for_each_entry(entry, &root->entries, list) {
		if (dump_entry(msg, entry) < 0)
			goto error_nest;
	}

	nla_nest_end(msg, entry_list);
	return 0;

error_nest:
	nla_nest_cancel(msg, entry_list);
	return -1;
}
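
/* Sketch (added for clarity, not part of the original sources) of the
 * attribute nesting that dump_schedule()/dump_entry() emit for one
 * schedule:
 *
 *   TCA_TAPRIO_ATTR_SCHED_BASE_TIME (s64)
 *   TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME (s64)
 *   TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION (s64)
 *   TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST (nest)
 *     TCA_TAPRIO_SCHED_ENTRY (nest, one per entry)
 *       TCA_TAPRIO_SCHED_ENTRY_INDEX (u32)
 *       TCA_TAPRIO_SCHED_ENTRY_CMD (u8)
 *       TCA_TAPRIO_SCHED_ENTRY_GATE_MASK (u32)
 *       TCA_TAPRIO_SCHED_ENTRY_INTERVAL (u32)
 */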

static int taprio_dump_tc_entries(struct sk_buff *skb,
				  struct sched_gate_list *sched)
{
	struct nlattr *n;
	int tc;

	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
		n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
		if (!n)
			return -EMSGSIZE;

		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
			goto nla_put_failure;

		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
				sched->max_sdu[tc]))
			goto nla_put_failure;

		nla_nest_end(skb, n);
	}

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, n);
	return -EMSGSIZE;
}

static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sched_gate_list *oper, *admin;
	struct tc_mqprio_qopt opt = { 0 };
	struct nlattr *nest, *sched_nest;

	oper = rtnl_dereference(q->oper_sched);
	admin = rtnl_dereference(q->admin_sched);

	mqprio_qopt_reconstruct(dev, &opt);

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (!nest)
		goto start_error;

	if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
		goto options_error;

	if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
	    nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
		goto options_error;

	if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
		goto options_error;

	if (q->txtime_delay &&
	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
		goto options_error;

	if (oper && taprio_dump_tc_entries(skb, oper))
		goto options_error;

	if (oper && dump_schedule(skb, oper))
		goto options_error;

	if (!admin)
		goto done;

	sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
	if (!sched_nest)
		goto options_error;

	if (dump_schedule(skb, admin))
		goto admin_error;

	nla_nest_end(skb, sched_nest);

done:
	return nla_nest_end(skb, nest);

admin_error:
	nla_nest_cancel(skb, sched_nest);

options_error:
	nla_nest_cancel(skb, nest);

start_error:
	return -ENOSPC;
}

static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
{
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	if (!dev_queue)
		return NULL;

	return dev_queue->qdisc_sleeping;
}

static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
{
	unsigned int ntx = TC_H_MIN(classid);

	if (!taprio_queue_get(sch, ntx))
		return 0;
	return ntx;
}

static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle |= TC_H_MIN(cl);
	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;

	return 0;
}

static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
	__releases(d->lock)
	__acquires(d->lock)
{
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	sch = dev_queue->qdisc_sleeping;
	if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 ||
	    qdisc_qstats_copy(d, sch) < 0)
		return -1;
	return 0;
}

static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx;

	if (arg->stop)
		return;

	arg->count = arg->skip;
	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
		if (!tc_qdisc_stats_dump(sch, ntx + 1, arg))
			break;
	}
}

static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
						struct tcmsg *tcm)
{
	return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
}
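
/* Added illustration (hypothetical device and handle): the class ops
 * below expose one class per TX queue, with class minor = queue index + 1
 * (see taprio_queue_get()/taprio_find()).  In software mode,
 * taprio_graft() stores the new child in q->qdiscs[]; in full offload
 * mode the child is grafted directly onto the TX queue.  For example,
 * replacing the default pfifo on queue 0 with a byte-limited fifo:
 *
 *   tc qdisc replace dev eth0 parent 100:1 bfifo limit 64kb
 */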

static const struct Qdisc_class_ops taprio_class_ops = {
	.graft		= taprio_graft,
	.leaf		= taprio_leaf,
	.find		= taprio_find,
	.walk		= taprio_walk,
	.dump		= taprio_dump_class,
	.dump_stats	= taprio_dump_class_stats,
	.select_queue	= taprio_select_queue,
};

static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
	.cl_ops		= &taprio_class_ops,
	.id		= "taprio",
	.priv_size	= sizeof(struct taprio_sched),
	.init		= taprio_init,
	.change		= taprio_change,
	.destroy	= taprio_destroy,
	.reset		= taprio_reset,
	.attach		= taprio_attach,
	.peek		= taprio_peek,
	.dequeue	= taprio_dequeue,
	.enqueue	= taprio_enqueue,
	.dump		= taprio_dump,
	.owner		= THIS_MODULE,
};

static struct notifier_block taprio_device_notifier = {
	.notifier_call = taprio_dev_notifier,
};

static int __init taprio_module_init(void)
{
	int err = register_netdevice_notifier(&taprio_device_notifier);

	if (err)
		return err;

	return register_qdisc(&taprio_qdisc_ops);
}

static void __exit taprio_module_exit(void)
{
	unregister_qdisc(&taprio_qdisc_ops);
	unregister_netdevice_notifier(&taprio_device_notifier);
}

module_init(taprio_module_init);
module_exit(taprio_module_exit);
MODULE_LICENSE("GPL");
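
/* Usage note (not part of the original sources): once the module is
 * loaded, register_qdisc() above makes the scheduler available under the
 * "taprio" id, and an installed configuration can be inspected with,
 * e.g., "tc qdisc show dev eth0", which ends up in taprio_dump() and
 * reports both the operational and, if present, the admin schedule.
 */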