// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple CPU accounting cgroup controller
 */
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include "sched.h"

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's IRQ time and can
 * race with irq/vtime_account on this CPU. We would either get old
 * or new value with a side effect of accounting a slice of IRQ time to wrong
 * task when IRQ is in progress while we read rq->clock. That is a worthy
 * compromise in place of having locks on each IRQ in account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

void enable_sched_clock_irqtime(void)
{
        static_branch_enable(&sched_clock_irqtime);
}

void disable_sched_clock_irqtime(void)
{
        if (irqtime_enabled())
                static_branch_disable(&sched_clock_irqtime);
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
                                  enum cpu_usage_stat idx)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        u64_stats_update_begin(&irqtime->sync);
        cpustat[idx] += delta;
        irqtime->total += delta;
        irqtime->tick_delta += delta;
        u64_stats_update_end(&irqtime->sync);
}

/*
 * Called after incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        unsigned int pc;
        s64 delta;
        int cpu;

        if (!irqtime_enabled())
                return;

        cpu = smp_processor_id();
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;
        pc = irq_count() - offset;

        /*
         * We do not account for softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to ksoftirqd thread
         * in that case, so as not to confuse the scheduler with a special
         * task that does not consume any time, but still wants to run.
         */
        if (pc & HARDIRQ_MASK)
                irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
        else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
                irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}

static u64 irqtime_tick_accounted(u64 maxtime)
{
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        u64 delta;

        delta = min(irqtime->tick_delta, maxtime);
        irqtime->tick_delta -= delta;

        return delta;
}

#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */

static u64 irqtime_tick_accounted(u64 dummy)
{
        return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
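/*
 * Illustrative reader-side sketch (not part of the original file): the percpu
 * irqtime fields above are written under irqtime->sync, so a cross-CPU reader
 * would pair u64_stats_fetch_begin()/u64_stats_fetch_retry() to get a
 * torn-free snapshot on 32-bit, roughly:
 *
 *	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
 *	unsigned int seq;
 *	u64 total;
 *
 *	do {
 *		seq = u64_stats_fetch_begin(&irqtime->sync);
 *		total = irqtime->total;
 *	} while (u64_stats_fetch_retry(&irqtime->sync, seq));
 *
 * This is the pattern the irq_time_read() helper in sched.h relies on; the
 * helper name is mentioned only for orientation.
 */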
static inline void task_group_account_field(struct task_struct *p, int index,
                                            u64 tmp)
{
        /*
         * Since all updates are sure to touch the root cgroup, we
         * get ourselves ahead and touch it first. If the root cgroup
         * is the only cgroup, then nothing else should be necessary.
         */
        __this_cpu_add(kernel_cpustat.cpustat[index], tmp);

        cgroup_account_cputime_field(p, index, tmp);
}

/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
        int index;

        /* Add user time to process. */
        p->utime += cputime;
        account_group_user_time(p, cputime);

        index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

        /* Add user time to cpustat. */
        task_group_account_field(p, index, cputime);

        /* Account for user time used */
        acct_account_cputime(p);
}

/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        /* Add guest time to process. */
        p->utime += cputime;
        account_group_user_time(p, cputime);
        p->gtime += cputime;

        /* Add guest time to cpustat. */
        if (task_nice(p) > 0) {
                task_group_account_field(p, CPUTIME_NICE, cputime);
                cpustat[CPUTIME_GUEST_NICE] += cputime;
        } else {
                task_group_account_field(p, CPUTIME_USER, cputime);
                cpustat[CPUTIME_GUEST] += cputime;
        }
}

/*
 * Account system CPU time to a process and desired cpustat field.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: index of the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
                               u64 cputime, enum cpu_usage_stat index)
{
        /* Add system time to process. */
        p->stime += cputime;
        account_group_system_time(p, cputime);

        /* Add system time to cpustat. */
        task_group_account_field(p, index, cputime);

        /* Account for system time used */
        acct_account_cputime(p);
}

/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
        int index;

        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                account_guest_time(p, cputime);
                return;
        }

        if (hardirq_count() - hardirq_offset)
                index = CPUTIME_IRQ;
        else if (in_serving_softirq())
                index = CPUTIME_SOFTIRQ;
        else
                index = CPUTIME_SYSTEM;

        account_system_index_time(p, cputime, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        struct rq *rq = this_rq();

        if (atomic_read(&rq->nr_iowait) > 0)
                cpustat[CPUTIME_IOWAIT] += cputime;
        else
                cpustat[CPUTIME_IDLE] += cputime;
}
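/*
 * Illustrative example (not part of the original file): the iowait/idle split
 * above depends only on rq->nr_iowait at the moment the idle tick is
 * accounted, e.g. on a single runqueue:
 *
 *	io_schedule();			// sleeper bumps rq->nr_iowait
 *	...				// CPU goes idle
 *	account_idle_time(TICK_NSEC);	// charged to CPUTIME_IOWAIT
 *	...				// sleeper woken, nr_iowait back to 0
 *	account_idle_time(TICK_NSEC);	// charged to CPUTIME_IDLE
 *
 * This is what the "iowait" vs. "idle" columns of /proc/stat reflect.
 */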
#ifdef CONFIG_SCHED_CORE
/*
 * Account for forceidle time due to core scheduling.
 *
 * REQUIRES: schedstat is enabled.
 */
void __account_forceidle_time(struct task_struct *p, u64 delta)
{
        __schedstat_add(p->stats.core_forceidle_sum, delta);

        task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
#endif /* CONFIG_SCHED_CORE */

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
#ifdef CONFIG_PARAVIRT
struct static_key paravirt_steal_enabled;

#ifdef CONFIG_HAVE_PV_STEAL_CLOCK_GEN
static u64 native_steal_clock(int cpu)
{
        return 0;
}

DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
#endif
#endif

static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal;

                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
                steal = min(steal, maxtime);
                account_steal_time(steal);
                this_rq()->prev_steal_time += steal;

                return steal;
        }
#endif /* CONFIG_PARAVIRT */
        return 0;
}

/*
 * Account how much elapsed time was spent in steal, IRQ, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
        u64 accounted;

        lockdep_assert_irqs_disabled();

        accounted = steal_account_process_time(max);

        if (accounted < max)
                accounted += irqtime_tick_accounted(max - accounted);

        return accounted;
}
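/*
 * Worked example (illustrative, not part of the original file): with
 * max == TICK_NSEC, 0.7 ticks of fresh steal time and 0.5 ticks of pending
 * irqtime, steal_account_process_time() consumes 0.7 ticks and
 * irqtime_tick_accounted() is then capped at the remaining 0.3 ticks, so
 * account_other_time() never returns more than the one tick the caller is
 * about to distribute:
 *
 *	accounted  = 0.7 * TICK_NSEC;		// steal, already <= max
 *	accounted += min(0.5 * TICK_NSEC,	// pending irqtime ...
 *			 max - accounted);	// ... capped at 0.3 ticks
 *
 * The leftover 0.2 ticks of irqtime stay in irqtime->tick_delta for the
 * next tick.
 */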
#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
        return t->se.sum_exec_runtime;
}
#else /* !CONFIG_64BIT: */
static u64 read_sum_exec_runtime(struct task_struct *t)
{
        u64 ns;
        struct rq_flags rf;
        struct rq *rq;

        rq = task_rq_lock(t, &rf);
        ns = t->se.sum_exec_runtime;
        task_rq_unlock(rq, t, &rf);

        return ns;
}
#endif /* !CONFIG_64BIT */

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
        struct signal_struct *sig = tsk->signal;
        struct task_struct *t;
        u64 utime, stime;

        /*
         * Update current task runtime to account pending time since last
         * scheduler action or thread_group_cputime() call. This thread group
         * might have other running tasks on different CPUs, but updating
         * their runtime can affect syscall performance, so we skip accounting
         * those pending times and rely only on values updated on tick or
         * other scheduler action.
         */
        if (same_thread_group(current, tsk))
                (void) task_sched_runtime(current);

        guard(rcu)();
        scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
                times->utime = sig->utime;
                times->stime = sig->stime;
                times->sum_exec_runtime = sig->sum_sched_runtime;

                __for_each_thread(sig, t) {
                        task_cputime(t, &utime, &stime);
                        times->utime += utime;
                        times->stime += stime;
                        times->sum_exec_runtime += read_sum_exec_runtime(t);
                }
        }
}
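/*
 * Illustrative usage sketch (not part of the original file):
 *
 *	struct task_cputime tc;
 *
 *	thread_group_cputime(tsk, &tc);
 *	// tc.utime/tc.stime/tc.sum_exec_runtime now cover the cputime already
 *	// folded into sig->* by dead threads plus a snapshot of every live
 *	// thread in the group.
 *
 * This is the raw input that thread_group_cputime_adjusted() further below
 * hands to cputime_adjust().
 */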
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @ticks: number of ticks to account
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time, as there is
 * no timer going off while we are in hardirq context and hence we may never
 * get an opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on IRQ/softirq
 * time, as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                         int ticks)
{
        u64 other, cputime = TICK_NSEC * ticks;

        /*
         * When returning from idle, many ticks can get accounted at
         * once, including some ticks of steal, IRQ, and softirq time.
         * Subtract those ticks from the amount of time accounted to
         * idle, or potentially user or system time. Due to rounding,
         * other time can exceed ticks occasionally.
         */
        other = account_other_time(ULONG_MAX);
        if (other >= cputime)
                return;

        cputime -= other;

        if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time does not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
                account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
                account_user_time(p, cputime);
        } else if (p == this_rq()->idle) {
                account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
                account_guest_time(p, cputime);
        } else {
                account_system_index_time(p, cputime, CPUTIME_SYSTEM);
        }
}

static void irqtime_account_idle_ticks(int ticks)
{
        irqtime_account_process_tick(current, 0, ticks);
}
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                                int nr_ticks) { }
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
        unsigned int pc = irq_count() - offset;

        if (pc & HARDIRQ_OFFSET) {
                vtime_account_hardirq(tsk);
        } else if (pc & SOFTIRQ_OFFSET) {
                vtime_account_softirq(tsk);
        } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
                   is_idle_task(tsk)) {
                vtime_account_idle(tsk);
        } else {
                vtime_account_kernel(tsk);
        }
}

void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
                    u64 *ut, u64 *st)
{
        *ut = curr->utime;
        *st = curr->stime;
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        *ut = p->utime;
        *st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        struct task_cputime cputime;

        thread_group_cputime(p, &cputime);

        *ut = cputime.utime;
        *st = cputime.stime;
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */

/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
        u64 cputime, steal;

        if (vtime_accounting_enabled_this_cpu())
                return;

        if (irqtime_enabled()) {
                irqtime_account_process_tick(p, user_tick, 1);
                return;
        }

        cputime = TICK_NSEC;
        steal = steal_account_process_time(ULONG_MAX);

        if (steal >= cputime)
                return;

        cputime -= steal;

        if (user_tick)
                account_user_time(p, cputime);
        else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
                account_system_time(p, HARDIRQ_OFFSET, cputime);
        else
                account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of ticks to account as idle time
 */
void account_idle_ticks(unsigned long ticks)
{
        u64 cputime, steal;

        if (irqtime_enabled()) {
                irqtime_account_idle_ticks(ticks);
                return;
        }

        cputime = ticks * TICK_NSEC;
        steal = steal_account_process_time(ULONG_MAX);

        if (steal >= cputime)
                return;

        cputime -= steal;
        account_idle_time(cputime);
}
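/*
 * Worked example (illustrative, not part of the original file): on a guest
 * whose hypervisor stole 0.25 ticks since the last update, a user tick in
 * account_process_tick() is split as
 *
 *	cputime = TICK_NSEC;
 *	steal   = steal_account_process_time(ULONG_MAX);  // 0.25 * TICK_NSEC
 *	cputime -= steal;                                  // 0.75 * TICK_NSEC
 *	account_user_time(p, cputime);
 *
 * so CPUTIME_STEAL and CPUTIME_USER together still add up to one tick. If the
 * whole tick (or more) was stolen, the tick is dropped here because it has
 * already been accounted as steal time.
 */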
/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on whether a task's random scheduling
 * timeslices happen to be interrupted by the timer. Depending on these
 * circumstances, the number of observed ticks may over- or under-estimate the
 * real user and system cputime, with a variable precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
                    u64 *ut, u64 *st)
{
        u64 rtime, stime, utime;
        unsigned long flags;

        /* Serialize concurrent callers such that we can honour our guarantees */
        raw_spin_lock_irqsave(&prev->lock, flags);
        rtime = curr->sum_exec_runtime;

        /*
         * This is possible under two circumstances:
         *  - rtime isn't monotonic after all (a bug);
         *  - we got reordered by the lock.
         *
         * In both cases this acts as a filter such that the rest of the code
         * can assume it is monotonic regardless of anything else.
         */
        if (prev->stime + prev->utime >= rtime)
                goto out;

        stime = curr->stime;
        utime = curr->utime;

        /*
         * If either stime or utime are 0, assume all runtime is userspace.
         * Once a task gets some ticks, the monotonicity code at 'update:'
         * will ensure things converge to the observed ratio.
         */
        if (stime == 0) {
                utime = rtime;
                goto update;
        }

        if (utime == 0) {
                stime = rtime;
                goto update;
        }

        stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
        /*
         * Because mul_u64_u64_div_u64() can approximate on some
         * architectures, enforce the constraint that a*b/(b+c) <= a.
         */
        if (unlikely(stime > rtime))
                stime = rtime;
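        /*
         * Worked example (illustrative, not part of the original file): with
         * tick samples stime = 2 ticks, utime = 3 ticks and an actual runtime
         * of rtime = 100ms, the proportional split above yields
         *
         *	stime = 100ms * 2 / (2 + 3) = 40ms
         *	utime = rtime - stime       = 60ms   (computed after 'update:')
         *
         * i.e. the observed 2:3 tick ratio is preserved while the total
         * matches what the scheduler actually measured.
         */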
update:
        /*
         * Make sure stime doesn't go backwards; this preserves monotonicity
         * for utime because rtime is monotonic.
         *
         *  utime_i+1 = rtime_i+1 - stime_i
         *            = rtime_i+1 - (rtime_i - utime_i)
         *            = (rtime_i+1 - rtime_i) + utime_i
         *            >= utime_i
         */
        if (stime < prev->stime)
                stime = prev->stime;
        utime = rtime - stime;

        /*
         * Make sure utime doesn't go backwards; this still preserves
         * monotonicity for stime, analogous argument to above.
         */
        if (utime < prev->utime) {
                utime = prev->utime;
                stime = rtime - utime;
        }

        prev->stime = stime;
        prev->utime = utime;
out:
        *ut = prev->utime;
        *st = prev->stime;
        raw_spin_unlock_irqrestore(&prev->lock, flags);
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        struct task_cputime cputime = {
                .sum_exec_runtime = p->se.sum_exec_runtime,
        };

        if (task_cputime(p, &cputime.utime, &cputime.stime))
                cputime.sum_exec_runtime = task_sched_runtime(p);
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
        struct task_cputime cputime;

        thread_group_cputime(p, &cputime);
        cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct vtime *vtime)
{
        unsigned long long clock;

        clock = sched_clock();
        if (clock < vtime->starttime)
                return 0;

        return clock - vtime->starttime;
}

static u64 get_vtime_delta(struct vtime *vtime)
{
        u64 delta = vtime_delta(vtime);
        u64 other;

        /*
         * Unlike tick based timing, vtime based timing never has lost
         * ticks, so there is no need for steal time accounting to make up
         * for them. Vtime accounts a rounded version of actual elapsed time.
         * Limit account_other_time to prevent rounding errors from causing
         * elapsed vtime to go negative.
         */
        other = account_other_time(delta);
        WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
        vtime->starttime += delta;

        return delta - other;
}

static void vtime_account_system(struct task_struct *tsk,
                                 struct vtime *vtime)
{
        vtime->stime += get_vtime_delta(vtime);
        if (vtime->stime >= TICK_NSEC) {
                account_system_time(tsk, irq_count(), vtime->stime);
                vtime->stime = 0;
        }
}

static void vtime_account_guest(struct task_struct *tsk,
                                struct vtime *vtime)
{
        vtime->gtime += get_vtime_delta(vtime);
        if (vtime->gtime >= TICK_NSEC) {
                account_guest_time(tsk, vtime->gtime);
                vtime->gtime = 0;
        }
}

static void __vtime_account_kernel(struct task_struct *tsk,
                                   struct vtime *vtime)
{
        /* We might have scheduled out from guest path */
        if (vtime->state == VTIME_GUEST)
                vtime_account_guest(tsk, vtime);
        else
                vtime_account_system(tsk, vtime);
}

void vtime_account_kernel(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        if (!vtime_delta(vtime))
                return;

        write_seqcount_begin(&vtime->seqcount);
        __vtime_account_kernel(tsk, vtime);
        write_seqcount_end(&vtime->seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime_account_system(tsk, vtime);
        vtime->state = VTIME_USER;
        write_seqcount_end(&vtime->seqcount);
}

void vtime_user_exit(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime->utime += get_vtime_delta(vtime);
        if (vtime->utime >= TICK_NSEC) {
                account_user_time(tsk, vtime->utime);
                vtime->utime = 0;
        }
        vtime->state = VTIME_SYS;
        write_seqcount_end(&vtime->seqcount);
}
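/*
 * Illustrative overview (not part of the original file): the per-task vtime
 * state machine driven by the hooks in this block, assuming
 * CONFIG_VIRT_CPU_ACCOUNTING_GEN:
 *
 *	VTIME_SYS   --vtime_user_enter()--->  VTIME_USER
 *	VTIME_USER  --vtime_user_exit()---->  VTIME_SYS
 *	VTIME_SYS   --vtime_guest_enter()-->  VTIME_GUEST  (sets PF_VCPU)
 *	VTIME_GUEST --vtime_guest_exit()--->  VTIME_SYS    (clears PF_VCPU)
 *
 * Context switches additionally pass through VTIME_INACTIVE (and VTIME_IDLE
 * for the idle task) in vtime_task_switch_generic() below. Every transition
 * flushes the pending delta under vtime->seqcount so readers such as
 * task_gtime() and task_cputime() can safely add the in-flight delta.
 */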
void vtime_guest_enter(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;
        /*
         * The flags must be updated under the lock with
         * the vtime_starttime flush and update.
         * That enforces the right ordering and update sequence
         * synchronization against the reader (task_gtime())
         * that can thus safely catch up with a tickless delta.
         */
        write_seqcount_begin(&vtime->seqcount);
        vtime_account_system(tsk, vtime);
        tsk->flags |= PF_VCPU;
        vtime->state = VTIME_GUEST;
        write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
        struct vtime *vtime = &tsk->vtime;

        write_seqcount_begin(&vtime->seqcount);
        vtime_account_guest(tsk, vtime);
        tsk->flags &= ~PF_VCPU;
        vtime->state = VTIME_SYS;
        write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
        account_idle_time(get_vtime_delta(&tsk->vtime));
}

void vtime_task_switch_generic(struct task_struct *prev)
{
        struct vtime *vtime = &prev->vtime;

        write_seqcount_begin(&vtime->seqcount);
        if (vtime->state == VTIME_IDLE)
                vtime_account_idle(prev);
        else
                __vtime_account_kernel(prev, vtime);
        vtime->state = VTIME_INACTIVE;
        vtime->cpu = -1;
        write_seqcount_end(&vtime->seqcount);

        vtime = &current->vtime;

        write_seqcount_begin(&vtime->seqcount);
        if (is_idle_task(current))
                vtime->state = VTIME_IDLE;
        else if (current->flags & PF_VCPU)
                vtime->state = VTIME_GUEST;
        else
                vtime->state = VTIME_SYS;
        vtime->starttime = sched_clock();
        vtime->cpu = smp_processor_id();
        write_seqcount_end(&vtime->seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
        struct vtime *vtime = &t->vtime;
        unsigned long flags;

        local_irq_save(flags);
        write_seqcount_begin(&vtime->seqcount);
        vtime->state = VTIME_IDLE;
        vtime->starttime = sched_clock();
        vtime->cpu = cpu;
        write_seqcount_end(&vtime->seqcount);
        local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
        struct vtime *vtime = &t->vtime;
        unsigned int seq;
        u64 gtime;

        if (!vtime_accounting_enabled())
                return t->gtime;

        do {
                seq = read_seqcount_begin(&vtime->seqcount);

                gtime = t->gtime;
                if (vtime->state == VTIME_GUEST)
                        gtime += vtime->gtime + vtime_delta(vtime);

        } while (read_seqcount_retry(&vtime->seqcount, seq));

        return gtime;
}
/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
        struct vtime *vtime = &t->vtime;
        unsigned int seq;
        u64 delta;
        int ret;

        if (!vtime_accounting_enabled()) {
                *utime = t->utime;
                *stime = t->stime;
                return false;
        }

        do {
                ret = false;
                seq = read_seqcount_begin(&vtime->seqcount);

                *utime = t->utime;
                *stime = t->stime;

                /* Task is sleeping or idle, nothing to add */
                if (vtime->state < VTIME_SYS)
                        continue;

                ret = true;
                delta = vtime_delta(vtime);

                /*
                 * Task runs either in user (including guest) or kernel space,
                 * add pending nohz time to the right place.
                 */
                if (vtime->state == VTIME_SYS)
                        *stime += vtime->stime + delta;
                else
                        *utime += vtime->utime + delta;
        } while (read_seqcount_retry(&vtime->seqcount, seq));

        return ret;
}

static int vtime_state_fetch(struct vtime *vtime, int cpu)
{
        int state = READ_ONCE(vtime->state);

        /*
         * We raced against a context switch, fetch the
         * kcpustat task again.
         */
        if (vtime->cpu != cpu && vtime->cpu != -1)
                return -EAGAIN;

        /*
         * Two possible things here:
         * 1) We are seeing the scheduling out task (prev) or any past one.
         * 2) We are seeing the scheduling in task (next) but it hasn't
         *    passed through vtime_task_switch() yet so the pending
         *    cputime of the prev task may not be flushed yet.
         *
         * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
         */
        if (state == VTIME_INACTIVE)
                return -EAGAIN;

        return state;
}

static u64 kcpustat_user_vtime(struct vtime *vtime)
{
        if (vtime->state == VTIME_USER)
                return vtime->utime + vtime_delta(vtime);
        else if (vtime->state == VTIME_GUEST)
                return vtime->gtime + vtime_delta(vtime);
        return 0;
}
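/*
 * Descriptive note (not part of the original file): the two snapshot helpers
 * below combine the seqcount retry with the -EAGAIN contract of
 * vtime_state_fetch(). A plain seqcount retry just re-reads the snapshot,
 * while -EAGAIN propagates to the outer loops in kcpustat_field() and
 * kcpustat_cpu_fetch(), which re-fetch rq->curr and spin with cpu_relax()
 * until the target CPU has completed vtime_task_switch().
 */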
static int kcpustat_field_vtime(u64 *cpustat,
                                struct task_struct *tsk,
                                enum cpu_usage_stat usage,
                                int cpu, u64 *val)
{
        struct vtime *vtime = &tsk->vtime;
        unsigned int seq;

        do {
                int state;

                seq = read_seqcount_begin(&vtime->seqcount);

                state = vtime_state_fetch(vtime, cpu);
                if (state < 0)
                        return state;

                *val = cpustat[usage];

                /*
                 * Nice vs. unnice cputime accounting may be inaccurate if
                 * the nice value has changed since the last vtime update.
                 * But a proper fix would involve interrupting the target on
                 * nice updates, which is a no-go on nohz_full (although the
                 * scheduler may still interrupt the target if rescheduling
                 * is needed...).
                 */
                switch (usage) {
                case CPUTIME_SYSTEM:
                        if (state == VTIME_SYS)
                                *val += vtime->stime + vtime_delta(vtime);
                        break;
                case CPUTIME_USER:
                        if (task_nice(tsk) <= 0)
                                *val += kcpustat_user_vtime(vtime);
                        break;
                case CPUTIME_NICE:
                        if (task_nice(tsk) > 0)
                                *val += kcpustat_user_vtime(vtime);
                        break;
                case CPUTIME_GUEST:
                        if (state == VTIME_GUEST && task_nice(tsk) <= 0)
                                *val += vtime->gtime + vtime_delta(vtime);
                        break;
                case CPUTIME_GUEST_NICE:
                        if (state == VTIME_GUEST && task_nice(tsk) > 0)
                                *val += vtime->gtime + vtime_delta(vtime);
                        break;
                default:
                        break;
                }
        } while (read_seqcount_retry(&vtime->seqcount, seq));

        return 0;
}

u64 kcpustat_field(struct kernel_cpustat *kcpustat,
                   enum cpu_usage_stat usage, int cpu)
{
        u64 *cpustat = kcpustat->cpustat;
        u64 val = cpustat[usage];
        struct rq *rq;
        int err;

        if (!vtime_accounting_enabled_cpu(cpu))
                return val;

        rq = cpu_rq(cpu);

        for (;;) {
                struct task_struct *curr;

                rcu_read_lock();
                curr = rcu_dereference(rq->curr);
                if (WARN_ON_ONCE(!curr)) {
                        rcu_read_unlock();
                        return cpustat[usage];
                }

                err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
                rcu_read_unlock();

                if (!err)
                        return val;

                cpu_relax();
        }
}
EXPORT_SYMBOL_GPL(kcpustat_field);
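/*
 * Illustrative usage sketch (not part of the original file): a consumer that
 * wants a single cpustat field for a possibly nohz_full CPU would go through
 * kcpustat_field() rather than reading the percpu array directly, so that the
 * currently running task's unflushed vtime is folded in:
 *
 *	u64 sys;
 *
 *	sys = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_SYSTEM, cpu);
 *
 * For a whole-CPU snapshot, kcpustat_cpu_fetch() below plays the same role.
 */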
static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
                                    const struct kernel_cpustat *src,
                                    struct task_struct *tsk, int cpu)
{
        struct vtime *vtime = &tsk->vtime;
        unsigned int seq;

        do {
                u64 *cpustat;
                u64 delta;
                int state;

                seq = read_seqcount_begin(&vtime->seqcount);

                state = vtime_state_fetch(vtime, cpu);
                if (state < 0)
                        return state;

                *dst = *src;
                cpustat = dst->cpustat;

                /* Task is sleeping, dead or idle, nothing to add */
                if (state < VTIME_SYS)
                        continue;

                delta = vtime_delta(vtime);

                /*
                 * Task runs either in user (including guest) or kernel space,
                 * add pending nohz time to the right place.
                 */
                if (state == VTIME_SYS) {
                        cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
                } else if (state == VTIME_USER) {
                        if (task_nice(tsk) > 0)
                                cpustat[CPUTIME_NICE] += vtime->utime + delta;
                        else
                                cpustat[CPUTIME_USER] += vtime->utime + delta;
                } else {
                        WARN_ON_ONCE(state != VTIME_GUEST);
                        if (task_nice(tsk) > 0) {
                                cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
                                cpustat[CPUTIME_NICE] += vtime->gtime + delta;
                        } else {
                                cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
                                cpustat[CPUTIME_USER] += vtime->gtime + delta;
                        }
                }
        } while (read_seqcount_retry(&vtime->seqcount, seq));

        return 0;
}

void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
        const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
        struct rq *rq;
        int err;

        if (!vtime_accounting_enabled_cpu(cpu)) {
                *dst = *src;
                return;
        }

        rq = cpu_rq(cpu);

        for (;;) {
                struct task_struct *curr;

                rcu_read_lock();
                curr = rcu_dereference(rq->curr);
                if (WARN_ON_ONCE(!curr)) {
                        rcu_read_unlock();
                        *dst = *src;
                        return;
                }

                err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
                rcu_read_unlock();

                if (!err)
                        return;

                cpu_relax();
        }
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);

#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */