// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple CPU accounting cgroup controller
 */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's IRQ time and can
 * race with irq/vtime_account on this CPU. We would either get the old
 * or the new value with a side effect of accounting a slice of IRQ time
 * to the wrong task when IRQ is in progress while we read rq->clock. That
 * is a worthy compromise in place of having locks on each IRQ in
 * account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

void enable_sched_clock_irqtime(void)
{
	static_branch_enable(&sched_clock_irqtime);
}

void disable_sched_clock_irqtime(void)
{
	static_branch_disable(&sched_clock_irqtime);
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
				  enum cpu_usage_stat idx)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	u64_stats_update_begin(&irqtime->sync);
	cpustat[idx] += delta;
	irqtime->total += delta;
	irqtime->tick_delta += delta;
	u64_stats_update_end(&irqtime->sync);
}

/*
 * Called after incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	unsigned int pc;
	s64 delta;
	int cpu;

	if (!irqtime_enabled())
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
	irqtime->irq_start_time += delta;
	pc = irq_count() - offset;

	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special
	 * task that does not consume any time, but still wants to run.
	 */
	if (pc & HARDIRQ_MASK)
		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
	else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}

static u64 irqtime_tick_accounted(u64 maxtime)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 delta;

	delta = min(irqtime->tick_delta, maxtime);
	irqtime->tick_delta -= delta;

	return delta;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

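/*
 * Charge @tmp to the system-wide cpustat field @index on this CPU and to
 * the cgroup hierarchy that @p belongs to.
 */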
static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cgroup_account_cputime_field(p, index, tmp);
}

/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		task_group_account_field(p, CPUTIME_NICE, cputime);
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		task_group_account_field(p, CPUTIME_USER, cputime);
		cpustat[CPUTIME_GUEST] += cputime;
	}
}

/*
 * Account system CPU time to a process and desired cpustat field.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += cputime;
	else
		cpustat[CPUTIME_IDLE] += cputime;
}


#ifdef CONFIG_SCHED_CORE
/*
 * Account for forceidle time due to core scheduling.
 *
 * REQUIRES: schedstat is enabled.
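 *
 * The time is charged both to the task's core_forceidle_sum schedstat
 * and, via task_group_account_field(), to the CPUTIME_FORCEIDLE cpustat
 * field of its cgroup.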
 */
void __account_forceidle_time(struct task_struct *p, u64 delta)
{
	__schedstat_add(p->stats.core_forceidle_sum, delta);

	task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
#endif

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif
	return 0;
}

/*
 * Account how much elapsed time was spent in steal, IRQ, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
	u64 accounted;

	lockdep_assert_irqs_disabled();

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_tick_accounted(max - accounted);

	return accounted;
}

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
	return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
	u64 ns;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(t, &rf);
	ns = t->se.sum_exec_runtime;
	task_rq_unlock(rq, t, &rf);

	return ns;
}
#endif

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	u64 utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	/*
	 * Update current task runtime to account pending time since last
	 * scheduler action or thread_group_cputime() call. This thread group
	 * might have other running tasks on different CPUs, but updating
	 * their runtime can affect syscall performance, so we skip accounting
	 * those pending times and rely only on values updated on tick or
	 * other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
		/*
		 * If lockless access failed, take the lock.
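		 * Setting nextseq to 1 makes read_seqbegin_or_lock_irqsave()
		 * acquire sig->stats_lock on the next pass, so at most one
		 * retry is needed.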
		 */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @ticks: the number of ticks being accounted
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on IRQ or
 * softirq, as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, IRQ, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == this_rq()->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
}

static void irqtime_account_idle_ticks(int ticks)
{
	irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
	unsigned int pc = irq_count() - offset;

	if (pc & HARDIRQ_OFFSET) {
		vtime_account_hardirq(tsk);
	} else if (pc & SOFTIRQ_OFFSET) {
		vtime_account_softirq(tsk);
	} else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
		   is_idle_task(tsk)) {
		vtime_account_idle(tsk);
	} else {
		vtime_account_kernel(tsk);
	}
}

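/*
 * With native vtime accounting the utime/stime values passed in are
 * already precise, so no scaling against the scheduler runtime is
 * needed: just hand the raw values back.
 */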
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	*ut = curr->utime;
	*st = curr->stime;
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */

/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	u64 cputime, steal;

	if (vtime_accounting_enabled_this_cpu())
		return;

	if (irqtime_enabled()) {
		irqtime_account_process_tick(p, user_tick, 1);
		return;
	}

	cputime = TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (irqtime_enabled()) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = ticks * TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on the random scheduling timeslices
 * of a task being interrupted or not by the timer. Depending on these
 * circumstances, the number of these interrupts may be over- or
 * under-estimated, matching the real user and system cputime with a variable
 * precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = curr->sum_exec_runtime;

	/*
	 * This is possible under two circumstances:
	 *   - rtime isn't monotonic after all (a bug);
	 *   - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	/*
	 * If either stime or utime are 0, assume all runtime is userspace.
	 * Once a task gets some ticks, the monotonicity code at 'update:'
	 * will ensure things converge to the observed ratio.
	 */
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
	/*
	 * Because mul_u64_u64_div_u64() can approximate on some
	 * architectures; enforce the constraint that: a*b/(b+c) <= a.
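	 *
	 * For example (values in ns): with stime = 2, utime = 6 and
	 * rtime = 12, the scaled stime becomes 2 * 12 / (2 + 6) = 3 and
	 * utime is set to rtime - stime = 9 below, preserving the observed
	 * 1:3 ratio while keeping stime + utime == rtime.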
586 */ 587 if (unlikely(stime > rtime)) 588 stime = rtime; 589 590 update: 591 /* 592 * Make sure stime doesn't go backwards; this preserves monotonicity 593 * for utime because rtime is monotonic. 594 * 595 * utime_i+1 = rtime_i+1 - stime_i 596 * = rtime_i+1 - (rtime_i - utime_i) 597 * = (rtime_i+1 - rtime_i) + utime_i 598 * >= utime_i 599 */ 600 if (stime < prev->stime) 601 stime = prev->stime; 602 utime = rtime - stime; 603 604 /* 605 * Make sure utime doesn't go backwards; this still preserves 606 * monotonicity for stime, analogous argument to above. 607 */ 608 if (utime < prev->utime) { 609 utime = prev->utime; 610 stime = rtime - utime; 611 } 612 613 prev->stime = stime; 614 prev->utime = utime; 615 out: 616 *ut = prev->utime; 617 *st = prev->stime; 618 raw_spin_unlock_irqrestore(&prev->lock, flags); 619 } 620 621 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 622 { 623 struct task_cputime cputime = { 624 .sum_exec_runtime = p->se.sum_exec_runtime, 625 }; 626 627 if (task_cputime(p, &cputime.utime, &cputime.stime)) 628 cputime.sum_exec_runtime = task_sched_runtime(p); 629 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 630 } 631 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 632 633 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 634 { 635 struct task_cputime cputime; 636 637 thread_group_cputime(p, &cputime); 638 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 639 } 640 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 641 642 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 643 static u64 vtime_delta(struct vtime *vtime) 644 { 645 unsigned long long clock; 646 647 clock = sched_clock(); 648 if (clock < vtime->starttime) 649 return 0; 650 651 return clock - vtime->starttime; 652 } 653 654 static u64 get_vtime_delta(struct vtime *vtime) 655 { 656 u64 delta = vtime_delta(vtime); 657 u64 other; 658 659 /* 660 * Unlike tick based timing, vtime based timing never has lost 661 * ticks, and no need for steal time accounting to make up for 662 * lost ticks. Vtime accounts a rounded version of actual 663 * elapsed time. Limit account_other_time to prevent rounding 664 * errors from causing elapsed vtime to go negative. 
665 */ 666 other = account_other_time(delta); 667 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); 668 vtime->starttime += delta; 669 670 return delta - other; 671 } 672 673 static void vtime_account_system(struct task_struct *tsk, 674 struct vtime *vtime) 675 { 676 vtime->stime += get_vtime_delta(vtime); 677 if (vtime->stime >= TICK_NSEC) { 678 account_system_time(tsk, irq_count(), vtime->stime); 679 vtime->stime = 0; 680 } 681 } 682 683 static void vtime_account_guest(struct task_struct *tsk, 684 struct vtime *vtime) 685 { 686 vtime->gtime += get_vtime_delta(vtime); 687 if (vtime->gtime >= TICK_NSEC) { 688 account_guest_time(tsk, vtime->gtime); 689 vtime->gtime = 0; 690 } 691 } 692 693 static void __vtime_account_kernel(struct task_struct *tsk, 694 struct vtime *vtime) 695 { 696 /* We might have scheduled out from guest path */ 697 if (vtime->state == VTIME_GUEST) 698 vtime_account_guest(tsk, vtime); 699 else 700 vtime_account_system(tsk, vtime); 701 } 702 703 void vtime_account_kernel(struct task_struct *tsk) 704 { 705 struct vtime *vtime = &tsk->vtime; 706 707 if (!vtime_delta(vtime)) 708 return; 709 710 write_seqcount_begin(&vtime->seqcount); 711 __vtime_account_kernel(tsk, vtime); 712 write_seqcount_end(&vtime->seqcount); 713 } 714 715 void vtime_user_enter(struct task_struct *tsk) 716 { 717 struct vtime *vtime = &tsk->vtime; 718 719 write_seqcount_begin(&vtime->seqcount); 720 vtime_account_system(tsk, vtime); 721 vtime->state = VTIME_USER; 722 write_seqcount_end(&vtime->seqcount); 723 } 724 725 void vtime_user_exit(struct task_struct *tsk) 726 { 727 struct vtime *vtime = &tsk->vtime; 728 729 write_seqcount_begin(&vtime->seqcount); 730 vtime->utime += get_vtime_delta(vtime); 731 if (vtime->utime >= TICK_NSEC) { 732 account_user_time(tsk, vtime->utime); 733 vtime->utime = 0; 734 } 735 vtime->state = VTIME_SYS; 736 write_seqcount_end(&vtime->seqcount); 737 } 738 739 void vtime_guest_enter(struct task_struct *tsk) 740 { 741 struct vtime *vtime = &tsk->vtime; 742 /* 743 * The flags must be updated under the lock with 744 * the vtime_starttime flush and update. 745 * That enforces a right ordering and update sequence 746 * synchronization against the reader (task_gtime()) 747 * that can thus safely catch up with a tickless delta. 
748 */ 749 write_seqcount_begin(&vtime->seqcount); 750 vtime_account_system(tsk, vtime); 751 tsk->flags |= PF_VCPU; 752 vtime->state = VTIME_GUEST; 753 write_seqcount_end(&vtime->seqcount); 754 } 755 EXPORT_SYMBOL_GPL(vtime_guest_enter); 756 757 void vtime_guest_exit(struct task_struct *tsk) 758 { 759 struct vtime *vtime = &tsk->vtime; 760 761 write_seqcount_begin(&vtime->seqcount); 762 vtime_account_guest(tsk, vtime); 763 tsk->flags &= ~PF_VCPU; 764 vtime->state = VTIME_SYS; 765 write_seqcount_end(&vtime->seqcount); 766 } 767 EXPORT_SYMBOL_GPL(vtime_guest_exit); 768 769 void vtime_account_idle(struct task_struct *tsk) 770 { 771 account_idle_time(get_vtime_delta(&tsk->vtime)); 772 } 773 774 void vtime_task_switch_generic(struct task_struct *prev) 775 { 776 struct vtime *vtime = &prev->vtime; 777 778 write_seqcount_begin(&vtime->seqcount); 779 if (vtime->state == VTIME_IDLE) 780 vtime_account_idle(prev); 781 else 782 __vtime_account_kernel(prev, vtime); 783 vtime->state = VTIME_INACTIVE; 784 vtime->cpu = -1; 785 write_seqcount_end(&vtime->seqcount); 786 787 vtime = ¤t->vtime; 788 789 write_seqcount_begin(&vtime->seqcount); 790 if (is_idle_task(current)) 791 vtime->state = VTIME_IDLE; 792 else if (current->flags & PF_VCPU) 793 vtime->state = VTIME_GUEST; 794 else 795 vtime->state = VTIME_SYS; 796 vtime->starttime = sched_clock(); 797 vtime->cpu = smp_processor_id(); 798 write_seqcount_end(&vtime->seqcount); 799 } 800 801 void vtime_init_idle(struct task_struct *t, int cpu) 802 { 803 struct vtime *vtime = &t->vtime; 804 unsigned long flags; 805 806 local_irq_save(flags); 807 write_seqcount_begin(&vtime->seqcount); 808 vtime->state = VTIME_IDLE; 809 vtime->starttime = sched_clock(); 810 vtime->cpu = cpu; 811 write_seqcount_end(&vtime->seqcount); 812 local_irq_restore(flags); 813 } 814 815 u64 task_gtime(struct task_struct *t) 816 { 817 struct vtime *vtime = &t->vtime; 818 unsigned int seq; 819 u64 gtime; 820 821 if (!vtime_accounting_enabled()) 822 return t->gtime; 823 824 do { 825 seq = read_seqcount_begin(&vtime->seqcount); 826 827 gtime = t->gtime; 828 if (vtime->state == VTIME_GUEST) 829 gtime += vtime->gtime + vtime_delta(vtime); 830 831 } while (read_seqcount_retry(&vtime->seqcount, seq)); 832 833 return gtime; 834 } 835 836 /* 837 * Fetch cputime raw values from fields of task_struct and 838 * add up the pending nohz execution time since the last 839 * cputime snapshot. 840 */ 841 bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 842 { 843 struct vtime *vtime = &t->vtime; 844 unsigned int seq; 845 u64 delta; 846 int ret; 847 848 if (!vtime_accounting_enabled()) { 849 *utime = t->utime; 850 *stime = t->stime; 851 return false; 852 } 853 854 do { 855 ret = false; 856 seq = read_seqcount_begin(&vtime->seqcount); 857 858 *utime = t->utime; 859 *stime = t->stime; 860 861 /* Task is sleeping or idle, nothing to add */ 862 if (vtime->state < VTIME_SYS) 863 continue; 864 865 ret = true; 866 delta = vtime_delta(vtime); 867 868 /* 869 * Task runs either in user (including guest) or kernel space, 870 * add pending nohz time to the right place. 871 */ 872 if (vtime->state == VTIME_SYS) 873 *stime += vtime->stime + delta; 874 else 875 *utime += vtime->utime + delta; 876 } while (read_seqcount_retry(&vtime->seqcount, seq)); 877 878 return ret; 879 } 880 881 static int vtime_state_fetch(struct vtime *vtime, int cpu) 882 { 883 int state = READ_ONCE(vtime->state); 884 885 /* 886 * We raced against a context switch, fetch the 887 * kcpustat task again. 
888 */ 889 if (vtime->cpu != cpu && vtime->cpu != -1) 890 return -EAGAIN; 891 892 /* 893 * Two possible things here: 894 * 1) We are seeing the scheduling out task (prev) or any past one. 895 * 2) We are seeing the scheduling in task (next) but it hasn't 896 * passed though vtime_task_switch() yet so the pending 897 * cputime of the prev task may not be flushed yet. 898 * 899 * Case 1) is ok but 2) is not. So wait for a safe VTIME state. 900 */ 901 if (state == VTIME_INACTIVE) 902 return -EAGAIN; 903 904 return state; 905 } 906 907 static u64 kcpustat_user_vtime(struct vtime *vtime) 908 { 909 if (vtime->state == VTIME_USER) 910 return vtime->utime + vtime_delta(vtime); 911 else if (vtime->state == VTIME_GUEST) 912 return vtime->gtime + vtime_delta(vtime); 913 return 0; 914 } 915 916 static int kcpustat_field_vtime(u64 *cpustat, 917 struct task_struct *tsk, 918 enum cpu_usage_stat usage, 919 int cpu, u64 *val) 920 { 921 struct vtime *vtime = &tsk->vtime; 922 unsigned int seq; 923 924 do { 925 int state; 926 927 seq = read_seqcount_begin(&vtime->seqcount); 928 929 state = vtime_state_fetch(vtime, cpu); 930 if (state < 0) 931 return state; 932 933 *val = cpustat[usage]; 934 935 /* 936 * Nice VS unnice cputime accounting may be inaccurate if 937 * the nice value has changed since the last vtime update. 938 * But proper fix would involve interrupting target on nice 939 * updates which is a no go on nohz_full (although the scheduler 940 * may still interrupt the target if rescheduling is needed...) 941 */ 942 switch (usage) { 943 case CPUTIME_SYSTEM: 944 if (state == VTIME_SYS) 945 *val += vtime->stime + vtime_delta(vtime); 946 break; 947 case CPUTIME_USER: 948 if (task_nice(tsk) <= 0) 949 *val += kcpustat_user_vtime(vtime); 950 break; 951 case CPUTIME_NICE: 952 if (task_nice(tsk) > 0) 953 *val += kcpustat_user_vtime(vtime); 954 break; 955 case CPUTIME_GUEST: 956 if (state == VTIME_GUEST && task_nice(tsk) <= 0) 957 *val += vtime->gtime + vtime_delta(vtime); 958 break; 959 case CPUTIME_GUEST_NICE: 960 if (state == VTIME_GUEST && task_nice(tsk) > 0) 961 *val += vtime->gtime + vtime_delta(vtime); 962 break; 963 default: 964 break; 965 } 966 } while (read_seqcount_retry(&vtime->seqcount, seq)); 967 968 return 0; 969 } 970 971 u64 kcpustat_field(struct kernel_cpustat *kcpustat, 972 enum cpu_usage_stat usage, int cpu) 973 { 974 u64 *cpustat = kcpustat->cpustat; 975 u64 val = cpustat[usage]; 976 struct rq *rq; 977 int err; 978 979 if (!vtime_accounting_enabled_cpu(cpu)) 980 return val; 981 982 rq = cpu_rq(cpu); 983 984 for (;;) { 985 struct task_struct *curr; 986 987 rcu_read_lock(); 988 curr = rcu_dereference(rq->curr); 989 if (WARN_ON_ONCE(!curr)) { 990 rcu_read_unlock(); 991 return cpustat[usage]; 992 } 993 994 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); 995 rcu_read_unlock(); 996 997 if (!err) 998 return val; 999 1000 cpu_relax(); 1001 } 1002 } 1003 EXPORT_SYMBOL_GPL(kcpustat_field); 1004 1005 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, 1006 const struct kernel_cpustat *src, 1007 struct task_struct *tsk, int cpu) 1008 { 1009 struct vtime *vtime = &tsk->vtime; 1010 unsigned int seq; 1011 1012 do { 1013 u64 *cpustat; 1014 u64 delta; 1015 int state; 1016 1017 seq = read_seqcount_begin(&vtime->seqcount); 1018 1019 state = vtime_state_fetch(vtime, cpu); 1020 if (state < 0) 1021 return state; 1022 1023 *dst = *src; 1024 cpustat = dst->cpustat; 1025 1026 /* Task is sleeping, dead or idle, nothing to add */ 1027 if (state < VTIME_SYS) 1028 continue; 1029 1030 delta = 
		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (state == VTIME_SYS) {
			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
		} else if (state == VTIME_USER) {
			if (task_nice(tsk) > 0)
				cpustat[CPUTIME_NICE] += vtime->utime + delta;
			else
				cpustat[CPUTIME_USER] += vtime->utime + delta;
		} else {
			WARN_ON_ONCE(state != VTIME_GUEST);
			if (task_nice(tsk) > 0) {
				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
			} else {
				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
				cpustat[CPUTIME_USER] += vtime->gtime + delta;
			}
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu)) {
		*dst = *src;
		return;
	}

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			*dst = *src;
			return;
		}

		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
		rcu_read_unlock();

		if (!err)
			return;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);

#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */