1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Simple CPU accounting cgroup controller 4 */ 5 #include <linux/sched/cputime.h> 6 #include <linux/tsacct_kern.h> 7 #include "sched.h" 8 9 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 10 #include <asm/cputime.h> 11 #endif 12 13 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 14 15 /* 16 * There are no locks covering percpu hardirq/softirq time. 17 * They are only modified in vtime_account, on corresponding CPU 18 * with interrupts disabled. So, writes are safe. 19 * They are read and saved off onto struct rq in update_rq_clock(). 20 * This may result in other CPU reading this CPU's IRQ time and can 21 * race with irq/vtime_account on this CPU. We would either get old 22 * or new value with a side effect of accounting a slice of IRQ time to wrong 23 * task when IRQ is in progress while we read rq->clock. That is a worthy 24 * compromise in place of having locks on each IRQ in account_system_time. 25 */ 26 DEFINE_PER_CPU(struct irqtime, cpu_irqtime); 27 28 int sched_clock_irqtime; 29 30 void enable_sched_clock_irqtime(void) 31 { 32 sched_clock_irqtime = 1; 33 } 34 35 void disable_sched_clock_irqtime(void) 36 { 37 sched_clock_irqtime = 0; 38 } 39 40 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, 41 enum cpu_usage_stat idx) 42 { 43 u64 *cpustat = kcpustat_this_cpu->cpustat; 44 45 u64_stats_update_begin(&irqtime->sync); 46 cpustat[idx] += delta; 47 irqtime->total += delta; 48 irqtime->tick_delta += delta; 49 u64_stats_update_end(&irqtime->sync); 50 } 51 52 /* 53 * Called after incrementing preempt_count on {soft,}irq_enter 54 * and before decrementing preempt_count on {soft,}irq_exit. 55 */ 56 void irqtime_account_irq(struct task_struct *curr, unsigned int offset) 57 { 58 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 59 unsigned int pc; 60 s64 delta; 61 int cpu; 62 63 if (!irqtime_enabled()) 64 return; 65 66 cpu = smp_processor_id(); 67 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; 68 irqtime->irq_start_time += delta; 69 pc = irq_count() - offset; 70 71 /* 72 * We do not account for softirq time from ksoftirqd here. 73 * We want to continue accounting softirq time to ksoftirqd thread 74 * in that case, so as not to confuse scheduler with a special task 75 * that do not consume any time, but still wants to run. 76 */ 77 if (pc & HARDIRQ_MASK) 78 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); 79 else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) 80 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); 81 } 82 83 static u64 irqtime_tick_accounted(u64 maxtime) 84 { 85 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 86 u64 delta; 87 88 delta = min(irqtime->tick_delta, maxtime); 89 irqtime->tick_delta -= delta; 90 91 return delta; 92 } 93 94 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 95 96 static u64 irqtime_tick_accounted(u64 dummy) 97 { 98 return 0; 99 } 100 101 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 102 103 static inline void task_group_account_field(struct task_struct *p, int index, 104 u64 tmp) 105 { 106 /* 107 * Since all updates are sure to touch the root cgroup, we 108 * get ourselves ahead and touch it first. If the root cgroup 109 * is the only cgroup, then nothing else should be necessary. 110 * 111 */ 112 __this_cpu_add(kernel_cpustat.cpustat[index], tmp); 113 114 cgroup_account_cputime_field(p, index, tmp); 115 } 116 117 /* 118 * Account user CPU time to a process. 119 * @p: the process that the CPU time gets accounted to 120 * @cputime: the CPU time spent in user space since the last update 121 */ 122 void account_user_time(struct task_struct *p, u64 cputime) 123 { 124 int index; 125 126 /* Add user time to process. */ 127 p->utime += cputime; 128 account_group_user_time(p, cputime); 129 130 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 131 132 /* Add user time to cpustat. */ 133 task_group_account_field(p, index, cputime); 134 135 /* Account for user time used */ 136 acct_account_cputime(p); 137 } 138 139 /* 140 * Account guest CPU time to a process. 141 * @p: the process that the CPU time gets accounted to 142 * @cputime: the CPU time spent in virtual machine since the last update 143 */ 144 void account_guest_time(struct task_struct *p, u64 cputime) 145 { 146 u64 *cpustat = kcpustat_this_cpu->cpustat; 147 148 /* Add guest time to process. */ 149 p->utime += cputime; 150 account_group_user_time(p, cputime); 151 p->gtime += cputime; 152 153 /* Add guest time to cpustat. */ 154 if (task_nice(p) > 0) { 155 task_group_account_field(p, CPUTIME_NICE, cputime); 156 cpustat[CPUTIME_GUEST_NICE] += cputime; 157 } else { 158 task_group_account_field(p, CPUTIME_USER, cputime); 159 cpustat[CPUTIME_GUEST] += cputime; 160 } 161 } 162 163 /* 164 * Account system CPU time to a process and desired cpustat field 165 * @p: the process that the CPU time gets accounted to 166 * @cputime: the CPU time spent in kernel space since the last update 167 * @index: pointer to cpustat field that has to be updated 168 */ 169 void account_system_index_time(struct task_struct *p, 170 u64 cputime, enum cpu_usage_stat index) 171 { 172 /* Add system time to process. */ 173 p->stime += cputime; 174 account_group_system_time(p, cputime); 175 176 /* Add system time to cpustat. */ 177 task_group_account_field(p, index, cputime); 178 179 /* Account for system time used */ 180 acct_account_cputime(p); 181 } 182 183 /* 184 * Account system CPU time to a process. 185 * @p: the process that the CPU time gets accounted to 186 * @hardirq_offset: the offset to subtract from hardirq_count() 187 * @cputime: the CPU time spent in kernel space since the last update 188 */ 189 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 190 { 191 int index; 192 193 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 194 account_guest_time(p, cputime); 195 return; 196 } 197 198 if (hardirq_count() - hardirq_offset) 199 index = CPUTIME_IRQ; 200 else if (in_serving_softirq()) 201 index = CPUTIME_SOFTIRQ; 202 else 203 index = CPUTIME_SYSTEM; 204 205 account_system_index_time(p, cputime, index); 206 } 207 208 /* 209 * Account for involuntary wait time. 210 * @cputime: the CPU time spent in involuntary wait 211 */ 212 void account_steal_time(u64 cputime) 213 { 214 u64 *cpustat = kcpustat_this_cpu->cpustat; 215 216 cpustat[CPUTIME_STEAL] += cputime; 217 } 218 219 /* 220 * Account for idle time. 221 * @cputime: the CPU time spent in idle wait 222 */ 223 void account_idle_time(u64 cputime) 224 { 225 u64 *cpustat = kcpustat_this_cpu->cpustat; 226 struct rq *rq = this_rq(); 227 228 if (atomic_read(&rq->nr_iowait) > 0) 229 cpustat[CPUTIME_IOWAIT] += cputime; 230 else 231 cpustat[CPUTIME_IDLE] += cputime; 232 } 233 234 235 #ifdef CONFIG_SCHED_CORE 236 /* 237 * Account for forceidle time due to core scheduling. 238 * 239 * REQUIRES: schedstat is enabled. 240 */ 241 void __account_forceidle_time(struct task_struct *p, u64 delta) 242 { 243 __schedstat_add(p->stats.core_forceidle_sum, delta); 244 245 task_group_account_field(p, CPUTIME_FORCEIDLE, delta); 246 } 247 #endif /* CONFIG_SCHED_CORE */ 248 249 /* 250 * When a guest is interrupted for a longer amount of time, missed clock 251 * ticks are not redelivered later. Due to that, this function may on 252 * occasion account more time than the calling functions think elapsed. 253 */ 254 #ifdef CONFIG_PARAVIRT 255 struct static_key paravirt_steal_enabled; 256 257 #ifdef CONFIG_HAVE_PV_STEAL_CLOCK_GEN 258 static u64 native_steal_clock(int cpu) 259 { 260 return 0; 261 } 262 263 DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); 264 #endif 265 #endif 266 267 static __always_inline u64 steal_account_process_time(u64 maxtime) 268 { 269 #ifdef CONFIG_PARAVIRT 270 if (static_key_false(¶virt_steal_enabled)) { 271 u64 steal; 272 273 steal = paravirt_steal_clock(smp_processor_id()); 274 steal -= this_rq()->prev_steal_time; 275 steal = min(steal, maxtime); 276 account_steal_time(steal); 277 this_rq()->prev_steal_time += steal; 278 279 return steal; 280 } 281 #endif /* CONFIG_PARAVIRT */ 282 return 0; 283 } 284 285 /* 286 * Account how much elapsed time was spent in steal, IRQ, or softirq time. 287 */ 288 static inline u64 account_other_time(u64 max) 289 { 290 u64 accounted; 291 292 lockdep_assert_irqs_disabled(); 293 294 accounted = steal_account_process_time(max); 295 296 if (accounted < max) 297 accounted += irqtime_tick_accounted(max - accounted); 298 299 return accounted; 300 } 301 302 #ifdef CONFIG_64BIT 303 static inline u64 read_sum_exec_runtime(struct task_struct *t) 304 { 305 return t->se.sum_exec_runtime; 306 } 307 #else /* !CONFIG_64BIT: */ 308 static u64 read_sum_exec_runtime(struct task_struct *t) 309 { 310 u64 ns; 311 struct rq_flags rf; 312 struct rq *rq; 313 314 rq = task_rq_lock(t, &rf); 315 ns = t->se.sum_exec_runtime; 316 task_rq_unlock(rq, t, &rf); 317 318 return ns; 319 } 320 #endif /* !CONFIG_64BIT */ 321 322 /* 323 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 324 * tasks (sum on group iteration) belonging to @tsk's group. 325 */ 326 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 327 { 328 struct signal_struct *sig = tsk->signal; 329 struct task_struct *t; 330 u64 utime, stime; 331 332 /* 333 * Update current task runtime to account pending time since last 334 * scheduler action or thread_group_cputime() call. This thread group 335 * might have other running tasks on different CPUs, but updating 336 * their runtime can affect syscall performance, so we skip account 337 * those pending times and rely only on values updated on tick or 338 * other scheduler action. 339 */ 340 if (same_thread_group(current, tsk)) 341 (void) task_sched_runtime(current); 342 343 guard(rcu)(); 344 scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { 345 times->utime = sig->utime; 346 times->stime = sig->stime; 347 times->sum_exec_runtime = sig->sum_sched_runtime; 348 349 __for_each_thread(sig, t) { 350 task_cputime(t, &utime, &stime); 351 times->utime += utime; 352 times->stime += stime; 353 times->sum_exec_runtime += read_sum_exec_runtime(t); 354 } 355 } 356 } 357 358 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 359 /* 360 * Account a tick to a process and cpustat 361 * @p: the process that the CPU time gets accounted to 362 * @user_tick: is the tick from userspace 363 * @rq: the pointer to rq 364 * 365 * Tick demultiplexing follows the order 366 * - pending hardirq update 367 * - pending softirq update 368 * - user_time 369 * - idle_time 370 * - system time 371 * - check for guest_time 372 * - else account as system_time 373 * 374 * Check for hardirq is done both for system and user time as there is 375 * no timer going off while we are on hardirq and hence we may never get an 376 * opportunity to update it solely in system time. 377 * p->stime and friends are only updated on system time and not on IRQ 378 * softirq as those do not count in task exec_runtime any more. 379 */ 380 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 381 int ticks) 382 { 383 u64 other, cputime = TICK_NSEC * ticks; 384 385 /* 386 * When returning from idle, many ticks can get accounted at 387 * once, including some ticks of steal, IRQ, and softirq time. 388 * Subtract those ticks from the amount of time accounted to 389 * idle, or potentially user or system time. Due to rounding, 390 * other time can exceed ticks occasionally. 391 */ 392 other = account_other_time(ULONG_MAX); 393 if (other >= cputime) 394 return; 395 396 cputime -= other; 397 398 if (this_cpu_ksoftirqd() == p) { 399 /* 400 * ksoftirqd time do not get accounted in cpu_softirq_time. 401 * So, we have to handle it separately here. 402 * Also, p->stime needs to be updated for ksoftirqd. 403 */ 404 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); 405 } else if (user_tick) { 406 account_user_time(p, cputime); 407 } else if (p == this_rq()->idle) { 408 account_idle_time(cputime); 409 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 410 account_guest_time(p, cputime); 411 } else { 412 account_system_index_time(p, cputime, CPUTIME_SYSTEM); 413 } 414 } 415 416 static void irqtime_account_idle_ticks(int ticks) 417 { 418 irqtime_account_process_tick(current, 0, ticks); 419 } 420 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 421 static inline void irqtime_account_idle_ticks(int ticks) { } 422 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 423 int nr_ticks) { } 424 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 425 426 /* 427 * Use precise platform statistics if available: 428 */ 429 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 430 431 void vtime_account_irq(struct task_struct *tsk, unsigned int offset) 432 { 433 unsigned int pc = irq_count() - offset; 434 435 if (pc & HARDIRQ_OFFSET) { 436 vtime_account_hardirq(tsk); 437 } else if (pc & SOFTIRQ_OFFSET) { 438 vtime_account_softirq(tsk); 439 } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && 440 is_idle_task(tsk)) { 441 vtime_account_idle(tsk); 442 } else { 443 vtime_account_kernel(tsk); 444 } 445 } 446 447 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 448 u64 *ut, u64 *st) 449 { 450 *ut = curr->utime; 451 *st = curr->stime; 452 } 453 454 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 455 { 456 *ut = p->utime; 457 *st = p->stime; 458 } 459 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 460 461 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 462 { 463 struct task_cputime cputime; 464 465 thread_group_cputime(p, &cputime); 466 467 *ut = cputime.utime; 468 *st = cputime.stime; 469 } 470 471 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ 472 473 /* 474 * Account a single tick of CPU time. 475 * @p: the process that the CPU time gets accounted to 476 * @user_tick: indicates if the tick is a user or a system tick 477 */ 478 void account_process_tick(struct task_struct *p, int user_tick) 479 { 480 u64 cputime, steal; 481 482 if (vtime_accounting_enabled_this_cpu()) 483 return; 484 485 if (irqtime_enabled()) { 486 irqtime_account_process_tick(p, user_tick, 1); 487 return; 488 } 489 490 cputime = TICK_NSEC; 491 steal = steal_account_process_time(ULONG_MAX); 492 493 if (steal >= cputime) 494 return; 495 496 cputime -= steal; 497 498 if (user_tick) 499 account_user_time(p, cputime); 500 else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) 501 account_system_time(p, HARDIRQ_OFFSET, cputime); 502 else 503 account_idle_time(cputime); 504 } 505 506 /* 507 * Account multiple ticks of idle time. 508 * @ticks: number of stolen ticks 509 */ 510 void account_idle_ticks(unsigned long ticks) 511 { 512 u64 cputime, steal; 513 514 if (irqtime_enabled()) { 515 irqtime_account_idle_ticks(ticks); 516 return; 517 } 518 519 cputime = ticks * TICK_NSEC; 520 steal = steal_account_process_time(ULONG_MAX); 521 522 if (steal >= cputime) 523 return; 524 525 cputime -= steal; 526 account_idle_time(cputime); 527 } 528 529 /* 530 * Adjust tick based cputime random precision against scheduler runtime 531 * accounting. 532 * 533 * Tick based cputime accounting depend on random scheduling timeslices of a 534 * task to be interrupted or not by the timer. Depending on these 535 * circumstances, the number of these interrupts may be over or 536 * under-optimistic, matching the real user and system cputime with a variable 537 * precision. 538 * 539 * Fix this by scaling these tick based values against the total runtime 540 * accounted by the CFS scheduler. 541 * 542 * This code provides the following guarantees: 543 * 544 * stime + utime == rtime 545 * stime_i+1 >= stime_i, utime_i+1 >= utime_i 546 * 547 * Assuming that rtime_i+1 >= rtime_i. 548 */ 549 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 550 u64 *ut, u64 *st) 551 { 552 u64 rtime, stime, utime; 553 unsigned long flags; 554 555 /* Serialize concurrent callers such that we can honour our guarantees */ 556 raw_spin_lock_irqsave(&prev->lock, flags); 557 rtime = curr->sum_exec_runtime; 558 559 /* 560 * This is possible under two circumstances: 561 * - rtime isn't monotonic after all (a bug); 562 * - we got reordered by the lock. 563 * 564 * In both cases this acts as a filter such that the rest of the code 565 * can assume it is monotonic regardless of anything else. 566 */ 567 if (prev->stime + prev->utime >= rtime) 568 goto out; 569 570 stime = curr->stime; 571 utime = curr->utime; 572 573 /* 574 * If either stime or utime are 0, assume all runtime is userspace. 575 * Once a task gets some ticks, the monotonicity code at 'update:' 576 * will ensure things converge to the observed ratio. 577 */ 578 if (stime == 0) { 579 utime = rtime; 580 goto update; 581 } 582 583 if (utime == 0) { 584 stime = rtime; 585 goto update; 586 } 587 588 stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); 589 /* 590 * Because mul_u64_u64_div_u64() can approximate on some 591 * achitectures; enforce the constraint that: a*b/(b+c) <= a. 592 */ 593 if (unlikely(stime > rtime)) 594 stime = rtime; 595 596 update: 597 /* 598 * Make sure stime doesn't go backwards; this preserves monotonicity 599 * for utime because rtime is monotonic. 600 * 601 * utime_i+1 = rtime_i+1 - stime_i 602 * = rtime_i+1 - (rtime_i - utime_i) 603 * = (rtime_i+1 - rtime_i) + utime_i 604 * >= utime_i 605 */ 606 if (stime < prev->stime) 607 stime = prev->stime; 608 utime = rtime - stime; 609 610 /* 611 * Make sure utime doesn't go backwards; this still preserves 612 * monotonicity for stime, analogous argument to above. 613 */ 614 if (utime < prev->utime) { 615 utime = prev->utime; 616 stime = rtime - utime; 617 } 618 619 prev->stime = stime; 620 prev->utime = utime; 621 out: 622 *ut = prev->utime; 623 *st = prev->stime; 624 raw_spin_unlock_irqrestore(&prev->lock, flags); 625 } 626 627 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 628 { 629 struct task_cputime cputime = { 630 .sum_exec_runtime = p->se.sum_exec_runtime, 631 }; 632 633 if (task_cputime(p, &cputime.utime, &cputime.stime)) 634 cputime.sum_exec_runtime = task_sched_runtime(p); 635 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 636 } 637 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 638 639 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 640 { 641 struct task_cputime cputime; 642 643 thread_group_cputime(p, &cputime); 644 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 645 } 646 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 647 648 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 649 static u64 vtime_delta(struct vtime *vtime) 650 { 651 unsigned long long clock; 652 653 clock = sched_clock(); 654 if (clock < vtime->starttime) 655 return 0; 656 657 return clock - vtime->starttime; 658 } 659 660 static u64 get_vtime_delta(struct vtime *vtime) 661 { 662 u64 delta = vtime_delta(vtime); 663 u64 other; 664 665 /* 666 * Unlike tick based timing, vtime based timing never has lost 667 * ticks, and no need for steal time accounting to make up for 668 * lost ticks. Vtime accounts a rounded version of actual 669 * elapsed time. Limit account_other_time to prevent rounding 670 * errors from causing elapsed vtime to go negative. 671 */ 672 other = account_other_time(delta); 673 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); 674 vtime->starttime += delta; 675 676 return delta - other; 677 } 678 679 static void vtime_account_system(struct task_struct *tsk, 680 struct vtime *vtime) 681 { 682 vtime->stime += get_vtime_delta(vtime); 683 if (vtime->stime >= TICK_NSEC) { 684 account_system_time(tsk, irq_count(), vtime->stime); 685 vtime->stime = 0; 686 } 687 } 688 689 static void vtime_account_guest(struct task_struct *tsk, 690 struct vtime *vtime) 691 { 692 vtime->gtime += get_vtime_delta(vtime); 693 if (vtime->gtime >= TICK_NSEC) { 694 account_guest_time(tsk, vtime->gtime); 695 vtime->gtime = 0; 696 } 697 } 698 699 static void __vtime_account_kernel(struct task_struct *tsk, 700 struct vtime *vtime) 701 { 702 /* We might have scheduled out from guest path */ 703 if (vtime->state == VTIME_GUEST) 704 vtime_account_guest(tsk, vtime); 705 else 706 vtime_account_system(tsk, vtime); 707 } 708 709 void vtime_account_kernel(struct task_struct *tsk) 710 { 711 struct vtime *vtime = &tsk->vtime; 712 713 if (!vtime_delta(vtime)) 714 return; 715 716 write_seqcount_begin(&vtime->seqcount); 717 __vtime_account_kernel(tsk, vtime); 718 write_seqcount_end(&vtime->seqcount); 719 } 720 721 void vtime_user_enter(struct task_struct *tsk) 722 { 723 struct vtime *vtime = &tsk->vtime; 724 725 write_seqcount_begin(&vtime->seqcount); 726 vtime_account_system(tsk, vtime); 727 vtime->state = VTIME_USER; 728 write_seqcount_end(&vtime->seqcount); 729 } 730 731 void vtime_user_exit(struct task_struct *tsk) 732 { 733 struct vtime *vtime = &tsk->vtime; 734 735 write_seqcount_begin(&vtime->seqcount); 736 vtime->utime += get_vtime_delta(vtime); 737 if (vtime->utime >= TICK_NSEC) { 738 account_user_time(tsk, vtime->utime); 739 vtime->utime = 0; 740 } 741 vtime->state = VTIME_SYS; 742 write_seqcount_end(&vtime->seqcount); 743 } 744 745 void vtime_guest_enter(struct task_struct *tsk) 746 { 747 struct vtime *vtime = &tsk->vtime; 748 /* 749 * The flags must be updated under the lock with 750 * the vtime_starttime flush and update. 751 * That enforces a right ordering and update sequence 752 * synchronization against the reader (task_gtime()) 753 * that can thus safely catch up with a tickless delta. 754 */ 755 write_seqcount_begin(&vtime->seqcount); 756 vtime_account_system(tsk, vtime); 757 tsk->flags |= PF_VCPU; 758 vtime->state = VTIME_GUEST; 759 write_seqcount_end(&vtime->seqcount); 760 } 761 EXPORT_SYMBOL_GPL(vtime_guest_enter); 762 763 void vtime_guest_exit(struct task_struct *tsk) 764 { 765 struct vtime *vtime = &tsk->vtime; 766 767 write_seqcount_begin(&vtime->seqcount); 768 vtime_account_guest(tsk, vtime); 769 tsk->flags &= ~PF_VCPU; 770 vtime->state = VTIME_SYS; 771 write_seqcount_end(&vtime->seqcount); 772 } 773 EXPORT_SYMBOL_GPL(vtime_guest_exit); 774 775 void vtime_account_idle(struct task_struct *tsk) 776 { 777 account_idle_time(get_vtime_delta(&tsk->vtime)); 778 } 779 780 void vtime_task_switch_generic(struct task_struct *prev) 781 { 782 struct vtime *vtime = &prev->vtime; 783 784 write_seqcount_begin(&vtime->seqcount); 785 if (vtime->state == VTIME_IDLE) 786 vtime_account_idle(prev); 787 else 788 __vtime_account_kernel(prev, vtime); 789 vtime->state = VTIME_INACTIVE; 790 vtime->cpu = -1; 791 write_seqcount_end(&vtime->seqcount); 792 793 vtime = ¤t->vtime; 794 795 write_seqcount_begin(&vtime->seqcount); 796 if (is_idle_task(current)) 797 vtime->state = VTIME_IDLE; 798 else if (current->flags & PF_VCPU) 799 vtime->state = VTIME_GUEST; 800 else 801 vtime->state = VTIME_SYS; 802 vtime->starttime = sched_clock(); 803 vtime->cpu = smp_processor_id(); 804 write_seqcount_end(&vtime->seqcount); 805 } 806 807 void vtime_init_idle(struct task_struct *t, int cpu) 808 { 809 struct vtime *vtime = &t->vtime; 810 unsigned long flags; 811 812 local_irq_save(flags); 813 write_seqcount_begin(&vtime->seqcount); 814 vtime->state = VTIME_IDLE; 815 vtime->starttime = sched_clock(); 816 vtime->cpu = cpu; 817 write_seqcount_end(&vtime->seqcount); 818 local_irq_restore(flags); 819 } 820 821 u64 task_gtime(struct task_struct *t) 822 { 823 struct vtime *vtime = &t->vtime; 824 unsigned int seq; 825 u64 gtime; 826 827 if (!vtime_accounting_enabled()) 828 return t->gtime; 829 830 do { 831 seq = read_seqcount_begin(&vtime->seqcount); 832 833 gtime = t->gtime; 834 if (vtime->state == VTIME_GUEST) 835 gtime += vtime->gtime + vtime_delta(vtime); 836 837 } while (read_seqcount_retry(&vtime->seqcount, seq)); 838 839 return gtime; 840 } 841 842 /* 843 * Fetch cputime raw values from fields of task_struct and 844 * add up the pending nohz execution time since the last 845 * cputime snapshot. 846 */ 847 bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 848 { 849 struct vtime *vtime = &t->vtime; 850 unsigned int seq; 851 u64 delta; 852 int ret; 853 854 if (!vtime_accounting_enabled()) { 855 *utime = t->utime; 856 *stime = t->stime; 857 return false; 858 } 859 860 do { 861 ret = false; 862 seq = read_seqcount_begin(&vtime->seqcount); 863 864 *utime = t->utime; 865 *stime = t->stime; 866 867 /* Task is sleeping or idle, nothing to add */ 868 if (vtime->state < VTIME_SYS) 869 continue; 870 871 ret = true; 872 delta = vtime_delta(vtime); 873 874 /* 875 * Task runs either in user (including guest) or kernel space, 876 * add pending nohz time to the right place. 877 */ 878 if (vtime->state == VTIME_SYS) 879 *stime += vtime->stime + delta; 880 else 881 *utime += vtime->utime + delta; 882 } while (read_seqcount_retry(&vtime->seqcount, seq)); 883 884 return ret; 885 } 886 887 static int vtime_state_fetch(struct vtime *vtime, int cpu) 888 { 889 int state = READ_ONCE(vtime->state); 890 891 /* 892 * We raced against a context switch, fetch the 893 * kcpustat task again. 894 */ 895 if (vtime->cpu != cpu && vtime->cpu != -1) 896 return -EAGAIN; 897 898 /* 899 * Two possible things here: 900 * 1) We are seeing the scheduling out task (prev) or any past one. 901 * 2) We are seeing the scheduling in task (next) but it hasn't 902 * passed though vtime_task_switch() yet so the pending 903 * cputime of the prev task may not be flushed yet. 904 * 905 * Case 1) is ok but 2) is not. So wait for a safe VTIME state. 906 */ 907 if (state == VTIME_INACTIVE) 908 return -EAGAIN; 909 910 return state; 911 } 912 913 static u64 kcpustat_user_vtime(struct vtime *vtime) 914 { 915 if (vtime->state == VTIME_USER) 916 return vtime->utime + vtime_delta(vtime); 917 else if (vtime->state == VTIME_GUEST) 918 return vtime->gtime + vtime_delta(vtime); 919 return 0; 920 } 921 922 static int kcpustat_field_vtime(u64 *cpustat, 923 struct task_struct *tsk, 924 enum cpu_usage_stat usage, 925 int cpu, u64 *val) 926 { 927 struct vtime *vtime = &tsk->vtime; 928 unsigned int seq; 929 930 do { 931 int state; 932 933 seq = read_seqcount_begin(&vtime->seqcount); 934 935 state = vtime_state_fetch(vtime, cpu); 936 if (state < 0) 937 return state; 938 939 *val = cpustat[usage]; 940 941 /* 942 * Nice VS unnice cputime accounting may be inaccurate if 943 * the nice value has changed since the last vtime update. 944 * But proper fix would involve interrupting target on nice 945 * updates which is a no go on nohz_full (although the scheduler 946 * may still interrupt the target if rescheduling is needed...) 947 */ 948 switch (usage) { 949 case CPUTIME_SYSTEM: 950 if (state == VTIME_SYS) 951 *val += vtime->stime + vtime_delta(vtime); 952 break; 953 case CPUTIME_USER: 954 if (task_nice(tsk) <= 0) 955 *val += kcpustat_user_vtime(vtime); 956 break; 957 case CPUTIME_NICE: 958 if (task_nice(tsk) > 0) 959 *val += kcpustat_user_vtime(vtime); 960 break; 961 case CPUTIME_GUEST: 962 if (state == VTIME_GUEST && task_nice(tsk) <= 0) 963 *val += vtime->gtime + vtime_delta(vtime); 964 break; 965 case CPUTIME_GUEST_NICE: 966 if (state == VTIME_GUEST && task_nice(tsk) > 0) 967 *val += vtime->gtime + vtime_delta(vtime); 968 break; 969 default: 970 break; 971 } 972 } while (read_seqcount_retry(&vtime->seqcount, seq)); 973 974 return 0; 975 } 976 977 u64 kcpustat_field(struct kernel_cpustat *kcpustat, 978 enum cpu_usage_stat usage, int cpu) 979 { 980 u64 *cpustat = kcpustat->cpustat; 981 u64 val = cpustat[usage]; 982 struct rq *rq; 983 int err; 984 985 if (!vtime_accounting_enabled_cpu(cpu)) 986 return val; 987 988 rq = cpu_rq(cpu); 989 990 for (;;) { 991 struct task_struct *curr; 992 993 rcu_read_lock(); 994 curr = rcu_dereference(rq->curr); 995 if (WARN_ON_ONCE(!curr)) { 996 rcu_read_unlock(); 997 return cpustat[usage]; 998 } 999 1000 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); 1001 rcu_read_unlock(); 1002 1003 if (!err) 1004 return val; 1005 1006 cpu_relax(); 1007 } 1008 } 1009 EXPORT_SYMBOL_GPL(kcpustat_field); 1010 1011 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, 1012 const struct kernel_cpustat *src, 1013 struct task_struct *tsk, int cpu) 1014 { 1015 struct vtime *vtime = &tsk->vtime; 1016 unsigned int seq; 1017 1018 do { 1019 u64 *cpustat; 1020 u64 delta; 1021 int state; 1022 1023 seq = read_seqcount_begin(&vtime->seqcount); 1024 1025 state = vtime_state_fetch(vtime, cpu); 1026 if (state < 0) 1027 return state; 1028 1029 *dst = *src; 1030 cpustat = dst->cpustat; 1031 1032 /* Task is sleeping, dead or idle, nothing to add */ 1033 if (state < VTIME_SYS) 1034 continue; 1035 1036 delta = vtime_delta(vtime); 1037 1038 /* 1039 * Task runs either in user (including guest) or kernel space, 1040 * add pending nohz time to the right place. 1041 */ 1042 if (state == VTIME_SYS) { 1043 cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; 1044 } else if (state == VTIME_USER) { 1045 if (task_nice(tsk) > 0) 1046 cpustat[CPUTIME_NICE] += vtime->utime + delta; 1047 else 1048 cpustat[CPUTIME_USER] += vtime->utime + delta; 1049 } else { 1050 WARN_ON_ONCE(state != VTIME_GUEST); 1051 if (task_nice(tsk) > 0) { 1052 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; 1053 cpustat[CPUTIME_NICE] += vtime->gtime + delta; 1054 } else { 1055 cpustat[CPUTIME_GUEST] += vtime->gtime + delta; 1056 cpustat[CPUTIME_USER] += vtime->gtime + delta; 1057 } 1058 } 1059 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1060 1061 return 0; 1062 } 1063 1064 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) 1065 { 1066 const struct kernel_cpustat *src = &kcpustat_cpu(cpu); 1067 struct rq *rq; 1068 int err; 1069 1070 if (!vtime_accounting_enabled_cpu(cpu)) { 1071 *dst = *src; 1072 return; 1073 } 1074 1075 rq = cpu_rq(cpu); 1076 1077 for (;;) { 1078 struct task_struct *curr; 1079 1080 rcu_read_lock(); 1081 curr = rcu_dereference(rq->curr); 1082 if (WARN_ON_ONCE(!curr)) { 1083 rcu_read_unlock(); 1084 *dst = *src; 1085 return; 1086 } 1087 1088 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); 1089 rcu_read_unlock(); 1090 1091 if (!err) 1092 return; 1093 1094 cpu_relax(); 1095 } 1096 } 1097 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); 1098 1099 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 1100