1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Simple CPU accounting cgroup controller 4 */ 5 #include <linux/sched/cputime.h> 6 #include <linux/tsacct_kern.h> 7 #include "sched.h" 8 9 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 10 #include <asm/cputime.h> 11 #endif 12 13 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 14 15 DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); 16 17 /* 18 * There are no locks covering percpu hardirq/softirq time. 19 * They are only modified in vtime_account, on corresponding CPU 20 * with interrupts disabled. So, writes are safe. 21 * They are read and saved off onto struct rq in update_rq_clock(). 22 * This may result in other CPU reading this CPU's IRQ time and can 23 * race with irq/vtime_account on this CPU. We would either get old 24 * or new value with a side effect of accounting a slice of IRQ time to wrong 25 * task when IRQ is in progress while we read rq->clock. That is a worthy 26 * compromise in place of having locks on each IRQ in account_system_time. 27 */ 28 DEFINE_PER_CPU(struct irqtime, cpu_irqtime); 29 30 void enable_sched_clock_irqtime(void) 31 { 32 static_branch_enable(&sched_clock_irqtime); 33 } 34 35 void disable_sched_clock_irqtime(void) 36 { 37 if (irqtime_enabled()) 38 static_branch_disable(&sched_clock_irqtime); 39 } 40 41 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, 42 enum cpu_usage_stat idx) 43 { 44 u64 *cpustat = kcpustat_this_cpu->cpustat; 45 46 u64_stats_update_begin(&irqtime->sync); 47 cpustat[idx] += delta; 48 irqtime->total += delta; 49 irqtime->tick_delta += delta; 50 u64_stats_update_end(&irqtime->sync); 51 } 52 53 /* 54 * Called after incrementing preempt_count on {soft,}irq_enter 55 * and before decrementing preempt_count on {soft,}irq_exit. 56 */ 57 void irqtime_account_irq(struct task_struct *curr, unsigned int offset) 58 { 59 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 60 unsigned int pc; 61 s64 delta; 62 int cpu; 63 64 if (!irqtime_enabled()) 65 return; 66 67 cpu = smp_processor_id(); 68 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; 69 irqtime->irq_start_time += delta; 70 pc = irq_count() - offset; 71 72 /* 73 * We do not account for softirq time from ksoftirqd here. 74 * We want to continue accounting softirq time to ksoftirqd thread 75 * in that case, so as not to confuse scheduler with a special task 76 * that do not consume any time, but still wants to run. 77 */ 78 if (pc & HARDIRQ_MASK) 79 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); 80 else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) 81 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); 82 } 83 84 static u64 irqtime_tick_accounted(u64 maxtime) 85 { 86 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 87 u64 delta; 88 89 delta = min(irqtime->tick_delta, maxtime); 90 irqtime->tick_delta -= delta; 91 92 return delta; 93 } 94 95 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 96 97 static u64 irqtime_tick_accounted(u64 dummy) 98 { 99 return 0; 100 } 101 102 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 103 104 static inline void task_group_account_field(struct task_struct *p, int index, 105 u64 tmp) 106 { 107 /* 108 * Since all updates are sure to touch the root cgroup, we 109 * get ourselves ahead and touch it first. If the root cgroup 110 * is the only cgroup, then nothing else should be necessary. 111 * 112 */ 113 __this_cpu_add(kernel_cpustat.cpustat[index], tmp); 114 115 cgroup_account_cputime_field(p, index, tmp); 116 } 117 118 /* 119 * Account user CPU time to a process. 120 * @p: the process that the CPU time gets accounted to 121 * @cputime: the CPU time spent in user space since the last update 122 */ 123 void account_user_time(struct task_struct *p, u64 cputime) 124 { 125 int index; 126 127 /* Add user time to process. */ 128 p->utime += cputime; 129 account_group_user_time(p, cputime); 130 131 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 132 133 /* Add user time to cpustat. */ 134 task_group_account_field(p, index, cputime); 135 136 /* Account for user time used */ 137 acct_account_cputime(p); 138 } 139 140 /* 141 * Account guest CPU time to a process. 142 * @p: the process that the CPU time gets accounted to 143 * @cputime: the CPU time spent in virtual machine since the last update 144 */ 145 void account_guest_time(struct task_struct *p, u64 cputime) 146 { 147 u64 *cpustat = kcpustat_this_cpu->cpustat; 148 149 /* Add guest time to process. */ 150 p->utime += cputime; 151 account_group_user_time(p, cputime); 152 p->gtime += cputime; 153 154 /* Add guest time to cpustat. */ 155 if (task_nice(p) > 0) { 156 task_group_account_field(p, CPUTIME_NICE, cputime); 157 cpustat[CPUTIME_GUEST_NICE] += cputime; 158 } else { 159 task_group_account_field(p, CPUTIME_USER, cputime); 160 cpustat[CPUTIME_GUEST] += cputime; 161 } 162 } 163 164 /* 165 * Account system CPU time to a process and desired cpustat field 166 * @p: the process that the CPU time gets accounted to 167 * @cputime: the CPU time spent in kernel space since the last update 168 * @index: pointer to cpustat field that has to be updated 169 */ 170 void account_system_index_time(struct task_struct *p, 171 u64 cputime, enum cpu_usage_stat index) 172 { 173 /* Add system time to process. */ 174 p->stime += cputime; 175 account_group_system_time(p, cputime); 176 177 /* Add system time to cpustat. */ 178 task_group_account_field(p, index, cputime); 179 180 /* Account for system time used */ 181 acct_account_cputime(p); 182 } 183 184 /* 185 * Account system CPU time to a process. 186 * @p: the process that the CPU time gets accounted to 187 * @hardirq_offset: the offset to subtract from hardirq_count() 188 * @cputime: the CPU time spent in kernel space since the last update 189 */ 190 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 191 { 192 int index; 193 194 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 195 account_guest_time(p, cputime); 196 return; 197 } 198 199 if (hardirq_count() - hardirq_offset) 200 index = CPUTIME_IRQ; 201 else if (in_serving_softirq()) 202 index = CPUTIME_SOFTIRQ; 203 else 204 index = CPUTIME_SYSTEM; 205 206 account_system_index_time(p, cputime, index); 207 } 208 209 /* 210 * Account for involuntary wait time. 211 * @cputime: the CPU time spent in involuntary wait 212 */ 213 void account_steal_time(u64 cputime) 214 { 215 u64 *cpustat = kcpustat_this_cpu->cpustat; 216 217 cpustat[CPUTIME_STEAL] += cputime; 218 } 219 220 /* 221 * Account for idle time. 222 * @cputime: the CPU time spent in idle wait 223 */ 224 void account_idle_time(u64 cputime) 225 { 226 u64 *cpustat = kcpustat_this_cpu->cpustat; 227 struct rq *rq = this_rq(); 228 229 if (atomic_read(&rq->nr_iowait) > 0) 230 cpustat[CPUTIME_IOWAIT] += cputime; 231 else 232 cpustat[CPUTIME_IDLE] += cputime; 233 } 234 235 236 #ifdef CONFIG_SCHED_CORE 237 /* 238 * Account for forceidle time due to core scheduling. 239 * 240 * REQUIRES: schedstat is enabled. 241 */ 242 void __account_forceidle_time(struct task_struct *p, u64 delta) 243 { 244 __schedstat_add(p->stats.core_forceidle_sum, delta); 245 246 task_group_account_field(p, CPUTIME_FORCEIDLE, delta); 247 } 248 #endif /* CONFIG_SCHED_CORE */ 249 250 /* 251 * When a guest is interrupted for a longer amount of time, missed clock 252 * ticks are not redelivered later. Due to that, this function may on 253 * occasion account more time than the calling functions think elapsed. 254 */ 255 static __always_inline u64 steal_account_process_time(u64 maxtime) 256 { 257 #ifdef CONFIG_PARAVIRT 258 if (static_key_false(¶virt_steal_enabled)) { 259 u64 steal; 260 261 steal = paravirt_steal_clock(smp_processor_id()); 262 steal -= this_rq()->prev_steal_time; 263 steal = min(steal, maxtime); 264 account_steal_time(steal); 265 this_rq()->prev_steal_time += steal; 266 267 return steal; 268 } 269 #endif /* CONFIG_PARAVIRT */ 270 return 0; 271 } 272 273 /* 274 * Account how much elapsed time was spent in steal, IRQ, or softirq time. 275 */ 276 static inline u64 account_other_time(u64 max) 277 { 278 u64 accounted; 279 280 lockdep_assert_irqs_disabled(); 281 282 accounted = steal_account_process_time(max); 283 284 if (accounted < max) 285 accounted += irqtime_tick_accounted(max - accounted); 286 287 return accounted; 288 } 289 290 #ifdef CONFIG_64BIT 291 static inline u64 read_sum_exec_runtime(struct task_struct *t) 292 { 293 return t->se.sum_exec_runtime; 294 } 295 #else /* !CONFIG_64BIT: */ 296 static u64 read_sum_exec_runtime(struct task_struct *t) 297 { 298 u64 ns; 299 struct rq_flags rf; 300 struct rq *rq; 301 302 rq = task_rq_lock(t, &rf); 303 ns = t->se.sum_exec_runtime; 304 task_rq_unlock(rq, t, &rf); 305 306 return ns; 307 } 308 #endif /* !CONFIG_64BIT */ 309 310 /* 311 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 312 * tasks (sum on group iteration) belonging to @tsk's group. 313 */ 314 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 315 { 316 struct signal_struct *sig = tsk->signal; 317 struct task_struct *t; 318 u64 utime, stime; 319 320 /* 321 * Update current task runtime to account pending time since last 322 * scheduler action or thread_group_cputime() call. This thread group 323 * might have other running tasks on different CPUs, but updating 324 * their runtime can affect syscall performance, so we skip account 325 * those pending times and rely only on values updated on tick or 326 * other scheduler action. 327 */ 328 if (same_thread_group(current, tsk)) 329 (void) task_sched_runtime(current); 330 331 guard(rcu)(); 332 scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { 333 times->utime = sig->utime; 334 times->stime = sig->stime; 335 times->sum_exec_runtime = sig->sum_sched_runtime; 336 337 __for_each_thread(sig, t) { 338 task_cputime(t, &utime, &stime); 339 times->utime += utime; 340 times->stime += stime; 341 times->sum_exec_runtime += read_sum_exec_runtime(t); 342 } 343 } 344 } 345 346 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 347 /* 348 * Account a tick to a process and cpustat 349 * @p: the process that the CPU time gets accounted to 350 * @user_tick: is the tick from userspace 351 * @rq: the pointer to rq 352 * 353 * Tick demultiplexing follows the order 354 * - pending hardirq update 355 * - pending softirq update 356 * - user_time 357 * - idle_time 358 * - system time 359 * - check for guest_time 360 * - else account as system_time 361 * 362 * Check for hardirq is done both for system and user time as there is 363 * no timer going off while we are on hardirq and hence we may never get an 364 * opportunity to update it solely in system time. 365 * p->stime and friends are only updated on system time and not on IRQ 366 * softirq as those do not count in task exec_runtime any more. 367 */ 368 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 369 int ticks) 370 { 371 u64 other, cputime = TICK_NSEC * ticks; 372 373 /* 374 * When returning from idle, many ticks can get accounted at 375 * once, including some ticks of steal, IRQ, and softirq time. 376 * Subtract those ticks from the amount of time accounted to 377 * idle, or potentially user or system time. Due to rounding, 378 * other time can exceed ticks occasionally. 379 */ 380 other = account_other_time(ULONG_MAX); 381 if (other >= cputime) 382 return; 383 384 cputime -= other; 385 386 if (this_cpu_ksoftirqd() == p) { 387 /* 388 * ksoftirqd time do not get accounted in cpu_softirq_time. 389 * So, we have to handle it separately here. 390 * Also, p->stime needs to be updated for ksoftirqd. 391 */ 392 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); 393 } else if (user_tick) { 394 account_user_time(p, cputime); 395 } else if (p == this_rq()->idle) { 396 account_idle_time(cputime); 397 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 398 account_guest_time(p, cputime); 399 } else { 400 account_system_index_time(p, cputime, CPUTIME_SYSTEM); 401 } 402 } 403 404 static void irqtime_account_idle_ticks(int ticks) 405 { 406 irqtime_account_process_tick(current, 0, ticks); 407 } 408 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 409 static inline void irqtime_account_idle_ticks(int ticks) { } 410 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 411 int nr_ticks) { } 412 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 413 414 /* 415 * Use precise platform statistics if available: 416 */ 417 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 418 419 void vtime_account_irq(struct task_struct *tsk, unsigned int offset) 420 { 421 unsigned int pc = irq_count() - offset; 422 423 if (pc & HARDIRQ_OFFSET) { 424 vtime_account_hardirq(tsk); 425 } else if (pc & SOFTIRQ_OFFSET) { 426 vtime_account_softirq(tsk); 427 } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && 428 is_idle_task(tsk)) { 429 vtime_account_idle(tsk); 430 } else { 431 vtime_account_kernel(tsk); 432 } 433 } 434 435 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 436 u64 *ut, u64 *st) 437 { 438 *ut = curr->utime; 439 *st = curr->stime; 440 } 441 442 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 443 { 444 *ut = p->utime; 445 *st = p->stime; 446 } 447 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 448 449 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 450 { 451 struct task_cputime cputime; 452 453 thread_group_cputime(p, &cputime); 454 455 *ut = cputime.utime; 456 *st = cputime.stime; 457 } 458 459 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ 460 461 /* 462 * Account a single tick of CPU time. 463 * @p: the process that the CPU time gets accounted to 464 * @user_tick: indicates if the tick is a user or a system tick 465 */ 466 void account_process_tick(struct task_struct *p, int user_tick) 467 { 468 u64 cputime, steal; 469 470 if (vtime_accounting_enabled_this_cpu()) 471 return; 472 473 if (irqtime_enabled()) { 474 irqtime_account_process_tick(p, user_tick, 1); 475 return; 476 } 477 478 cputime = TICK_NSEC; 479 steal = steal_account_process_time(ULONG_MAX); 480 481 if (steal >= cputime) 482 return; 483 484 cputime -= steal; 485 486 if (user_tick) 487 account_user_time(p, cputime); 488 else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) 489 account_system_time(p, HARDIRQ_OFFSET, cputime); 490 else 491 account_idle_time(cputime); 492 } 493 494 /* 495 * Account multiple ticks of idle time. 496 * @ticks: number of stolen ticks 497 */ 498 void account_idle_ticks(unsigned long ticks) 499 { 500 u64 cputime, steal; 501 502 if (irqtime_enabled()) { 503 irqtime_account_idle_ticks(ticks); 504 return; 505 } 506 507 cputime = ticks * TICK_NSEC; 508 steal = steal_account_process_time(ULONG_MAX); 509 510 if (steal >= cputime) 511 return; 512 513 cputime -= steal; 514 account_idle_time(cputime); 515 } 516 517 /* 518 * Adjust tick based cputime random precision against scheduler runtime 519 * accounting. 520 * 521 * Tick based cputime accounting depend on random scheduling timeslices of a 522 * task to be interrupted or not by the timer. Depending on these 523 * circumstances, the number of these interrupts may be over or 524 * under-optimistic, matching the real user and system cputime with a variable 525 * precision. 526 * 527 * Fix this by scaling these tick based values against the total runtime 528 * accounted by the CFS scheduler. 529 * 530 * This code provides the following guarantees: 531 * 532 * stime + utime == rtime 533 * stime_i+1 >= stime_i, utime_i+1 >= utime_i 534 * 535 * Assuming that rtime_i+1 >= rtime_i. 536 */ 537 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 538 u64 *ut, u64 *st) 539 { 540 u64 rtime, stime, utime; 541 unsigned long flags; 542 543 /* Serialize concurrent callers such that we can honour our guarantees */ 544 raw_spin_lock_irqsave(&prev->lock, flags); 545 rtime = curr->sum_exec_runtime; 546 547 /* 548 * This is possible under two circumstances: 549 * - rtime isn't monotonic after all (a bug); 550 * - we got reordered by the lock. 551 * 552 * In both cases this acts as a filter such that the rest of the code 553 * can assume it is monotonic regardless of anything else. 554 */ 555 if (prev->stime + prev->utime >= rtime) 556 goto out; 557 558 stime = curr->stime; 559 utime = curr->utime; 560 561 /* 562 * If either stime or utime are 0, assume all runtime is userspace. 563 * Once a task gets some ticks, the monotonicity code at 'update:' 564 * will ensure things converge to the observed ratio. 565 */ 566 if (stime == 0) { 567 utime = rtime; 568 goto update; 569 } 570 571 if (utime == 0) { 572 stime = rtime; 573 goto update; 574 } 575 576 stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); 577 /* 578 * Because mul_u64_u64_div_u64() can approximate on some 579 * achitectures; enforce the constraint that: a*b/(b+c) <= a. 580 */ 581 if (unlikely(stime > rtime)) 582 stime = rtime; 583 584 update: 585 /* 586 * Make sure stime doesn't go backwards; this preserves monotonicity 587 * for utime because rtime is monotonic. 588 * 589 * utime_i+1 = rtime_i+1 - stime_i 590 * = rtime_i+1 - (rtime_i - utime_i) 591 * = (rtime_i+1 - rtime_i) + utime_i 592 * >= utime_i 593 */ 594 if (stime < prev->stime) 595 stime = prev->stime; 596 utime = rtime - stime; 597 598 /* 599 * Make sure utime doesn't go backwards; this still preserves 600 * monotonicity for stime, analogous argument to above. 601 */ 602 if (utime < prev->utime) { 603 utime = prev->utime; 604 stime = rtime - utime; 605 } 606 607 prev->stime = stime; 608 prev->utime = utime; 609 out: 610 *ut = prev->utime; 611 *st = prev->stime; 612 raw_spin_unlock_irqrestore(&prev->lock, flags); 613 } 614 615 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 616 { 617 struct task_cputime cputime = { 618 .sum_exec_runtime = p->se.sum_exec_runtime, 619 }; 620 621 if (task_cputime(p, &cputime.utime, &cputime.stime)) 622 cputime.sum_exec_runtime = task_sched_runtime(p); 623 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 624 } 625 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 626 627 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 628 { 629 struct task_cputime cputime; 630 631 thread_group_cputime(p, &cputime); 632 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 633 } 634 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 635 636 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 637 static u64 vtime_delta(struct vtime *vtime) 638 { 639 unsigned long long clock; 640 641 clock = sched_clock(); 642 if (clock < vtime->starttime) 643 return 0; 644 645 return clock - vtime->starttime; 646 } 647 648 static u64 get_vtime_delta(struct vtime *vtime) 649 { 650 u64 delta = vtime_delta(vtime); 651 u64 other; 652 653 /* 654 * Unlike tick based timing, vtime based timing never has lost 655 * ticks, and no need for steal time accounting to make up for 656 * lost ticks. Vtime accounts a rounded version of actual 657 * elapsed time. Limit account_other_time to prevent rounding 658 * errors from causing elapsed vtime to go negative. 659 */ 660 other = account_other_time(delta); 661 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); 662 vtime->starttime += delta; 663 664 return delta - other; 665 } 666 667 static void vtime_account_system(struct task_struct *tsk, 668 struct vtime *vtime) 669 { 670 vtime->stime += get_vtime_delta(vtime); 671 if (vtime->stime >= TICK_NSEC) { 672 account_system_time(tsk, irq_count(), vtime->stime); 673 vtime->stime = 0; 674 } 675 } 676 677 static void vtime_account_guest(struct task_struct *tsk, 678 struct vtime *vtime) 679 { 680 vtime->gtime += get_vtime_delta(vtime); 681 if (vtime->gtime >= TICK_NSEC) { 682 account_guest_time(tsk, vtime->gtime); 683 vtime->gtime = 0; 684 } 685 } 686 687 static void __vtime_account_kernel(struct task_struct *tsk, 688 struct vtime *vtime) 689 { 690 /* We might have scheduled out from guest path */ 691 if (vtime->state == VTIME_GUEST) 692 vtime_account_guest(tsk, vtime); 693 else 694 vtime_account_system(tsk, vtime); 695 } 696 697 void vtime_account_kernel(struct task_struct *tsk) 698 { 699 struct vtime *vtime = &tsk->vtime; 700 701 if (!vtime_delta(vtime)) 702 return; 703 704 write_seqcount_begin(&vtime->seqcount); 705 __vtime_account_kernel(tsk, vtime); 706 write_seqcount_end(&vtime->seqcount); 707 } 708 709 void vtime_user_enter(struct task_struct *tsk) 710 { 711 struct vtime *vtime = &tsk->vtime; 712 713 write_seqcount_begin(&vtime->seqcount); 714 vtime_account_system(tsk, vtime); 715 vtime->state = VTIME_USER; 716 write_seqcount_end(&vtime->seqcount); 717 } 718 719 void vtime_user_exit(struct task_struct *tsk) 720 { 721 struct vtime *vtime = &tsk->vtime; 722 723 write_seqcount_begin(&vtime->seqcount); 724 vtime->utime += get_vtime_delta(vtime); 725 if (vtime->utime >= TICK_NSEC) { 726 account_user_time(tsk, vtime->utime); 727 vtime->utime = 0; 728 } 729 vtime->state = VTIME_SYS; 730 write_seqcount_end(&vtime->seqcount); 731 } 732 733 void vtime_guest_enter(struct task_struct *tsk) 734 { 735 struct vtime *vtime = &tsk->vtime; 736 /* 737 * The flags must be updated under the lock with 738 * the vtime_starttime flush and update. 739 * That enforces a right ordering and update sequence 740 * synchronization against the reader (task_gtime()) 741 * that can thus safely catch up with a tickless delta. 742 */ 743 write_seqcount_begin(&vtime->seqcount); 744 vtime_account_system(tsk, vtime); 745 tsk->flags |= PF_VCPU; 746 vtime->state = VTIME_GUEST; 747 write_seqcount_end(&vtime->seqcount); 748 } 749 EXPORT_SYMBOL_GPL(vtime_guest_enter); 750 751 void vtime_guest_exit(struct task_struct *tsk) 752 { 753 struct vtime *vtime = &tsk->vtime; 754 755 write_seqcount_begin(&vtime->seqcount); 756 vtime_account_guest(tsk, vtime); 757 tsk->flags &= ~PF_VCPU; 758 vtime->state = VTIME_SYS; 759 write_seqcount_end(&vtime->seqcount); 760 } 761 EXPORT_SYMBOL_GPL(vtime_guest_exit); 762 763 void vtime_account_idle(struct task_struct *tsk) 764 { 765 account_idle_time(get_vtime_delta(&tsk->vtime)); 766 } 767 768 void vtime_task_switch_generic(struct task_struct *prev) 769 { 770 struct vtime *vtime = &prev->vtime; 771 772 write_seqcount_begin(&vtime->seqcount); 773 if (vtime->state == VTIME_IDLE) 774 vtime_account_idle(prev); 775 else 776 __vtime_account_kernel(prev, vtime); 777 vtime->state = VTIME_INACTIVE; 778 vtime->cpu = -1; 779 write_seqcount_end(&vtime->seqcount); 780 781 vtime = ¤t->vtime; 782 783 write_seqcount_begin(&vtime->seqcount); 784 if (is_idle_task(current)) 785 vtime->state = VTIME_IDLE; 786 else if (current->flags & PF_VCPU) 787 vtime->state = VTIME_GUEST; 788 else 789 vtime->state = VTIME_SYS; 790 vtime->starttime = sched_clock(); 791 vtime->cpu = smp_processor_id(); 792 write_seqcount_end(&vtime->seqcount); 793 } 794 795 void vtime_init_idle(struct task_struct *t, int cpu) 796 { 797 struct vtime *vtime = &t->vtime; 798 unsigned long flags; 799 800 local_irq_save(flags); 801 write_seqcount_begin(&vtime->seqcount); 802 vtime->state = VTIME_IDLE; 803 vtime->starttime = sched_clock(); 804 vtime->cpu = cpu; 805 write_seqcount_end(&vtime->seqcount); 806 local_irq_restore(flags); 807 } 808 809 u64 task_gtime(struct task_struct *t) 810 { 811 struct vtime *vtime = &t->vtime; 812 unsigned int seq; 813 u64 gtime; 814 815 if (!vtime_accounting_enabled()) 816 return t->gtime; 817 818 do { 819 seq = read_seqcount_begin(&vtime->seqcount); 820 821 gtime = t->gtime; 822 if (vtime->state == VTIME_GUEST) 823 gtime += vtime->gtime + vtime_delta(vtime); 824 825 } while (read_seqcount_retry(&vtime->seqcount, seq)); 826 827 return gtime; 828 } 829 830 /* 831 * Fetch cputime raw values from fields of task_struct and 832 * add up the pending nohz execution time since the last 833 * cputime snapshot. 834 */ 835 bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 836 { 837 struct vtime *vtime = &t->vtime; 838 unsigned int seq; 839 u64 delta; 840 int ret; 841 842 if (!vtime_accounting_enabled()) { 843 *utime = t->utime; 844 *stime = t->stime; 845 return false; 846 } 847 848 do { 849 ret = false; 850 seq = read_seqcount_begin(&vtime->seqcount); 851 852 *utime = t->utime; 853 *stime = t->stime; 854 855 /* Task is sleeping or idle, nothing to add */ 856 if (vtime->state < VTIME_SYS) 857 continue; 858 859 ret = true; 860 delta = vtime_delta(vtime); 861 862 /* 863 * Task runs either in user (including guest) or kernel space, 864 * add pending nohz time to the right place. 865 */ 866 if (vtime->state == VTIME_SYS) 867 *stime += vtime->stime + delta; 868 else 869 *utime += vtime->utime + delta; 870 } while (read_seqcount_retry(&vtime->seqcount, seq)); 871 872 return ret; 873 } 874 875 static int vtime_state_fetch(struct vtime *vtime, int cpu) 876 { 877 int state = READ_ONCE(vtime->state); 878 879 /* 880 * We raced against a context switch, fetch the 881 * kcpustat task again. 882 */ 883 if (vtime->cpu != cpu && vtime->cpu != -1) 884 return -EAGAIN; 885 886 /* 887 * Two possible things here: 888 * 1) We are seeing the scheduling out task (prev) or any past one. 889 * 2) We are seeing the scheduling in task (next) but it hasn't 890 * passed though vtime_task_switch() yet so the pending 891 * cputime of the prev task may not be flushed yet. 892 * 893 * Case 1) is ok but 2) is not. So wait for a safe VTIME state. 894 */ 895 if (state == VTIME_INACTIVE) 896 return -EAGAIN; 897 898 return state; 899 } 900 901 static u64 kcpustat_user_vtime(struct vtime *vtime) 902 { 903 if (vtime->state == VTIME_USER) 904 return vtime->utime + vtime_delta(vtime); 905 else if (vtime->state == VTIME_GUEST) 906 return vtime->gtime + vtime_delta(vtime); 907 return 0; 908 } 909 910 static int kcpustat_field_vtime(u64 *cpustat, 911 struct task_struct *tsk, 912 enum cpu_usage_stat usage, 913 int cpu, u64 *val) 914 { 915 struct vtime *vtime = &tsk->vtime; 916 unsigned int seq; 917 918 do { 919 int state; 920 921 seq = read_seqcount_begin(&vtime->seqcount); 922 923 state = vtime_state_fetch(vtime, cpu); 924 if (state < 0) 925 return state; 926 927 *val = cpustat[usage]; 928 929 /* 930 * Nice VS unnice cputime accounting may be inaccurate if 931 * the nice value has changed since the last vtime update. 932 * But proper fix would involve interrupting target on nice 933 * updates which is a no go on nohz_full (although the scheduler 934 * may still interrupt the target if rescheduling is needed...) 935 */ 936 switch (usage) { 937 case CPUTIME_SYSTEM: 938 if (state == VTIME_SYS) 939 *val += vtime->stime + vtime_delta(vtime); 940 break; 941 case CPUTIME_USER: 942 if (task_nice(tsk) <= 0) 943 *val += kcpustat_user_vtime(vtime); 944 break; 945 case CPUTIME_NICE: 946 if (task_nice(tsk) > 0) 947 *val += kcpustat_user_vtime(vtime); 948 break; 949 case CPUTIME_GUEST: 950 if (state == VTIME_GUEST && task_nice(tsk) <= 0) 951 *val += vtime->gtime + vtime_delta(vtime); 952 break; 953 case CPUTIME_GUEST_NICE: 954 if (state == VTIME_GUEST && task_nice(tsk) > 0) 955 *val += vtime->gtime + vtime_delta(vtime); 956 break; 957 default: 958 break; 959 } 960 } while (read_seqcount_retry(&vtime->seqcount, seq)); 961 962 return 0; 963 } 964 965 u64 kcpustat_field(struct kernel_cpustat *kcpustat, 966 enum cpu_usage_stat usage, int cpu) 967 { 968 u64 *cpustat = kcpustat->cpustat; 969 u64 val = cpustat[usage]; 970 struct rq *rq; 971 int err; 972 973 if (!vtime_accounting_enabled_cpu(cpu)) 974 return val; 975 976 rq = cpu_rq(cpu); 977 978 for (;;) { 979 struct task_struct *curr; 980 981 rcu_read_lock(); 982 curr = rcu_dereference(rq->curr); 983 if (WARN_ON_ONCE(!curr)) { 984 rcu_read_unlock(); 985 return cpustat[usage]; 986 } 987 988 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); 989 rcu_read_unlock(); 990 991 if (!err) 992 return val; 993 994 cpu_relax(); 995 } 996 } 997 EXPORT_SYMBOL_GPL(kcpustat_field); 998 999 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, 1000 const struct kernel_cpustat *src, 1001 struct task_struct *tsk, int cpu) 1002 { 1003 struct vtime *vtime = &tsk->vtime; 1004 unsigned int seq; 1005 1006 do { 1007 u64 *cpustat; 1008 u64 delta; 1009 int state; 1010 1011 seq = read_seqcount_begin(&vtime->seqcount); 1012 1013 state = vtime_state_fetch(vtime, cpu); 1014 if (state < 0) 1015 return state; 1016 1017 *dst = *src; 1018 cpustat = dst->cpustat; 1019 1020 /* Task is sleeping, dead or idle, nothing to add */ 1021 if (state < VTIME_SYS) 1022 continue; 1023 1024 delta = vtime_delta(vtime); 1025 1026 /* 1027 * Task runs either in user (including guest) or kernel space, 1028 * add pending nohz time to the right place. 1029 */ 1030 if (state == VTIME_SYS) { 1031 cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; 1032 } else if (state == VTIME_USER) { 1033 if (task_nice(tsk) > 0) 1034 cpustat[CPUTIME_NICE] += vtime->utime + delta; 1035 else 1036 cpustat[CPUTIME_USER] += vtime->utime + delta; 1037 } else { 1038 WARN_ON_ONCE(state != VTIME_GUEST); 1039 if (task_nice(tsk) > 0) { 1040 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; 1041 cpustat[CPUTIME_NICE] += vtime->gtime + delta; 1042 } else { 1043 cpustat[CPUTIME_GUEST] += vtime->gtime + delta; 1044 cpustat[CPUTIME_USER] += vtime->gtime + delta; 1045 } 1046 } 1047 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1048 1049 return 0; 1050 } 1051 1052 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) 1053 { 1054 const struct kernel_cpustat *src = &kcpustat_cpu(cpu); 1055 struct rq *rq; 1056 int err; 1057 1058 if (!vtime_accounting_enabled_cpu(cpu)) { 1059 *dst = *src; 1060 return; 1061 } 1062 1063 rq = cpu_rq(cpu); 1064 1065 for (;;) { 1066 struct task_struct *curr; 1067 1068 rcu_read_lock(); 1069 curr = rcu_dereference(rq->curr); 1070 if (WARN_ON_ONCE(!curr)) { 1071 rcu_read_unlock(); 1072 *dst = *src; 1073 return; 1074 } 1075 1076 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); 1077 rcu_read_unlock(); 1078 1079 if (!err) 1080 return; 1081 1082 cpu_relax(); 1083 } 1084 } 1085 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); 1086 1087 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 1088