// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple CPU accounting cgroup controller
 */
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include "sched.h"

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on the corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's IRQ time and can
 * race with irq/vtime_account on this CPU. We would either get the old
 * or the new value, with a side effect of accounting a slice of IRQ time
 * to the wrong task when an IRQ is in progress while we read rq->clock.
 * That is a worthy compromise in place of having locks on each IRQ in
 * account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
				  enum cpu_usage_stat idx)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	u64_stats_update_begin(&irqtime->sync);
	cpustat[idx] += delta;
	irqtime->total += delta;
	irqtime->tick_delta += delta;
	u64_stats_update_end(&irqtime->sync);
}

/*
 * Called after incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	unsigned int pc;
	s64 delta;
	int cpu;

	if (!irqtime_enabled())
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
	irqtime->irq_start_time += delta;
	pc = irq_count() - offset;

	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special
	 * task that does not consume any time, but still wants to run.
	 */
	if (pc & HARDIRQ_MASK)
		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
	else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}

static u64 irqtime_tick_accounted(u64 maxtime)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 delta;

	delta = min(irqtime->tick_delta, maxtime);
	irqtime->tick_delta -= delta;

	return delta;
}

#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
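
/*
 * Illustrative sketch: readers of the per-CPU irqtime written above do not
 * take a lock either; they sample it under the u64_stats seqcount, roughly
 * the way irq_time_read() in sched.h does:
 *
 *	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
 *	unsigned int seq;
 *	u64 total;
 *
 *	do {
 *		seq = u64_stats_fetch_begin(&irqtime->sync);
 *		total = irqtime->total;
 *	} while (u64_stats_fetch_retry(&irqtime->sync, seq));
 *
 * On 32-bit the retry loop protects against torn 64-bit reads; on 64-bit it
 * compiles away.
 */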

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cgroup_account_cputime_field(p, index, tmp);
}

/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in a virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		task_group_account_field(p, CPUTIME_NICE, cputime);
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		task_group_account_field(p, CPUTIME_USER, cputime);
		cpustat[CPUTIME_GUEST] += cputime;
	}
}

/*
 * Account system CPU time to a process and the desired cpustat field
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += cputime;
	else
		cpustat[CPUTIME_IDLE] += cputime;
}
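
/*
 * For orientation (illustrative, not used by this file): the CPUTIME_*
 * buckets filled in above are what fs/proc/stat.c exposes as the per-CPU
 * columns of /proc/stat, in the order
 *
 *	user nice system idle iowait irq softirq steal guest guest_nice
 *
 * so e.g. time accounted via account_steal_time() ends up in the "steal"
 * column and account_idle_time() feeds either "idle" or "iowait".
 */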

#ifdef CONFIG_SCHED_CORE
/*
 * Account for forceidle time due to core scheduling.
 *
 * REQUIRES: schedstat is enabled.
 */
void __account_forceidle_time(struct task_struct *p, u64 delta)
{
	__schedstat_add(p->stats.core_forceidle_sum, delta);

	task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
#endif /* CONFIG_SCHED_CORE */

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif /* CONFIG_PARAVIRT */
	return 0;
}

/*
 * Account how much elapsed time was spent in steal, IRQ, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
	u64 accounted;

	lockdep_assert_irqs_disabled();

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_tick_accounted(max - accounted);

	return accounted;
}

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
	return t->se.sum_exec_runtime;
}
#else /* !CONFIG_64BIT: */
static u64 read_sum_exec_runtime(struct task_struct *t)
{
	u64 ns;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(t, &rf);
	ns = t->se.sum_exec_runtime;
	task_rq_unlock(rq, t, &rf);

	return ns;
}
#endif /* !CONFIG_64BIT */
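
/*
 * Worked example (illustrative): with max = 4,000,000 ns (one tick at
 * HZ=250), if steal_account_process_time() finds 3,000,000 ns of steal,
 * account_other_time() offers irqtime_tick_accounted() only the remaining
 * 1,000,000 ns, so the combined "other" time can never exceed @max.
 */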

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	struct task_struct *t;
	u64 utime, stime;

	/*
	 * Update the current task's runtime to account for pending time since
	 * the last scheduler action or thread_group_cputime() call. This
	 * thread group might have other running tasks on different CPUs, but
	 * updating their runtime can affect syscall performance, so we skip
	 * accounting those pending times and rely only on values updated on
	 * tick or other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	guard(rcu)();
	scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		__for_each_thread(sig, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
	}
}
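
/*
 * Usage sketch (illustrative): callers pass in a struct task_cputime and sum
 * the fields themselves, e.g.
 *
 *	struct task_cputime times;
 *
 *	thread_group_cputime(tsk, &times);
 *	total = times.utime + times.stime;
 *
 * This is the backend for thread_group_cputime_adjusted() below and,
 * roughly, for process-wide clock samples such as the posix CPU timers.
 */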

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @ticks: the number of ticks to account
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time as there is
 * no timer going off while we are in hardirq context and hence we may never
 * get an opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on IRQ or
 * softirq time, as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, IRQ, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time,
		 * so we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == this_rq()->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
}

static void irqtime_account_idle_ticks(int ticks)
{
	irqtime_account_process_tick(current, 0, ticks);
}
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						int nr_ticks) { }
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
	unsigned int pc = irq_count() - offset;

	if (pc & HARDIRQ_OFFSET) {
		vtime_account_hardirq(tsk);
	} else if (pc & SOFTIRQ_OFFSET) {
		vtime_account_softirq(tsk);
	} else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
		   is_idle_task(tsk)) {
		vtime_account_idle(tsk);
	} else {
		vtime_account_kernel(tsk);
	}
}

void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	*ut = curr->utime;
	*st = curr->stime;
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */

/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	u64 cputime, steal;

	if (vtime_accounting_enabled_this_cpu())
		return;

	if (irqtime_enabled()) {
		irqtime_account_process_tick(p, user_tick, 1);
		return;
	}

	cputime = TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (irqtime_enabled()) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = ticks * TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}
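
/*
 * Worked example (illustrative): at HZ=250 one tick is TICK_NSEC =
 * 4,000,000 ns. If account_process_tick() sees 1,500,000 ns of steal time,
 * only the remaining 2,500,000 ns of that tick are charged to user, system
 * or idle time; a tick fully consumed by steal is not charged to the task
 * at all.
 */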

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on the random scheduling timeslices
 * of a task being interrupted or not by the timer. Depending on these
 * circumstances, the number of these interrupts may be over- or
 * under-estimated, matching the real user and system cputime with a variable
 * precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = curr->sum_exec_runtime;

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	/*
	 * If either stime or utime are 0, assume all runtime is userspace.
	 * Once a task gets some ticks, the monotonicity code at 'update:'
	 * will ensure things converge to the observed ratio.
	 */
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
	/*
	 * Because mul_u64_u64_div_u64() can approximate on some
	 * architectures, enforce the constraint that: a*b/(b+c) <= a.
	 */
	if (unlikely(stime > rtime))
		stime = rtime;

update:
	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}
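
/*
 * Worked example (illustrative): suppose the task has run rtime = 100 ms of
 * real CFS runtime, but tick sampling happened to catch it 1 tick in kernel
 * mode and 3 ticks in user mode (4 ms per tick at HZ=250, i.e. stime = 4 ms,
 * utime = 12 ms). The scaling above yields
 *
 *	stime = 4 * 100 / (4 + 12) = 25 ms
 *	utime = 100 - 25           = 75 ms
 *
 * so the reported split follows the observed tick ratio while still summing
 * to the precisely accounted runtime.
 */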

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	if (task_cputime(p, &cputime.utime, &cputime.stime))
		cputime.sum_exec_runtime = task_sched_runtime(p);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct vtime *vtime)
{
	unsigned long long clock;

	clock = sched_clock();
	if (clock < vtime->starttime)
		return 0;

	return clock - vtime->starttime;
}

static u64 get_vtime_delta(struct vtime *vtime)
{
	u64 delta = vtime_delta(vtime);
	u64 other;

	/*
	 * Unlike tick based timing, vtime based timing never has lost
	 * ticks, and there is no need for steal time accounting to make
	 * up for lost ticks. Vtime accounts a rounded version of actual
	 * elapsed time. Limit account_other_time to prevent rounding
	 * errors from causing elapsed vtime to go negative.
	 */
	other = account_other_time(delta);
	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
	vtime->starttime += delta;

	return delta - other;
}

static void vtime_account_system(struct task_struct *tsk,
				 struct vtime *vtime)
{
	vtime->stime += get_vtime_delta(vtime);
	if (vtime->stime >= TICK_NSEC) {
		account_system_time(tsk, irq_count(), vtime->stime);
		vtime->stime = 0;
	}
}

static void vtime_account_guest(struct task_struct *tsk,
				struct vtime *vtime)
{
	vtime->gtime += get_vtime_delta(vtime);
	if (vtime->gtime >= TICK_NSEC) {
		account_guest_time(tsk, vtime->gtime);
		vtime->gtime = 0;
	}
}

static void __vtime_account_kernel(struct task_struct *tsk,
				   struct vtime *vtime)
{
	/* We might have scheduled out from guest path */
	if (vtime->state == VTIME_GUEST)
		vtime_account_guest(tsk, vtime);
	else
		vtime_account_system(tsk, vtime);
}

void vtime_account_kernel(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	if (!vtime_delta(vtime))
		return;

	write_seqcount_begin(&vtime->seqcount);
	__vtime_account_kernel(tsk, vtime);
	write_seqcount_end(&vtime->seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime_account_system(tsk, vtime);
	vtime->state = VTIME_USER;
	write_seqcount_end(&vtime->seqcount);
}

void vtime_user_exit(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime->utime += get_vtime_delta(vtime);
	if (vtime->utime >= TICK_NSEC) {
		account_user_time(tsk, vtime->utime);
		vtime->utime = 0;
	}
	vtime->state = VTIME_SYS;
	write_seqcount_end(&vtime->seqcount);
}
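
/*
 * Batching example (illustrative): the vtime buckets above are only flushed
 * once they reach TICK_NSEC. With HZ=1000 (TICK_NSEC = 1,000,000 ns), three
 * successive kernel entries of 400,000 ns each accumulate in vtime->stime;
 * on the third one the bucket holds 1,200,000 ns >= TICK_NSEC, so the whole
 * amount is flushed via account_system_time() and the bucket is reset to 0.
 */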

void vtime_guest_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	/*
	 * The flags must be updated under the lock with
	 * the vtime_starttime flush and update.
	 * That enforces the right ordering and update sequence
	 * synchronization against the reader (task_gtime())
	 * that can thus safely catch up with a tickless delta.
	 */
	write_seqcount_begin(&vtime->seqcount);
	vtime_account_system(tsk, vtime);
	tsk->flags |= PF_VCPU;
	vtime->state = VTIME_GUEST;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime_account_guest(tsk, vtime);
	tsk->flags &= ~PF_VCPU;
	vtime->state = VTIME_SYS;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
	account_idle_time(get_vtime_delta(&tsk->vtime));
}

void vtime_task_switch_generic(struct task_struct *prev)
{
	struct vtime *vtime = &prev->vtime;

	write_seqcount_begin(&vtime->seqcount);
	if (vtime->state == VTIME_IDLE)
		vtime_account_idle(prev);
	else
		__vtime_account_kernel(prev, vtime);
	vtime->state = VTIME_INACTIVE;
	vtime->cpu = -1;
	write_seqcount_end(&vtime->seqcount);

	vtime = &current->vtime;

	write_seqcount_begin(&vtime->seqcount);
	if (is_idle_task(current))
		vtime->state = VTIME_IDLE;
	else if (current->flags & PF_VCPU)
		vtime->state = VTIME_GUEST;
	else
		vtime->state = VTIME_SYS;
	vtime->starttime = sched_clock();
	vtime->cpu = smp_processor_id();
	write_seqcount_end(&vtime->seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
	struct vtime *vtime = &t->vtime;
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&vtime->seqcount);
	vtime->state = VTIME_IDLE;
	vtime->starttime = sched_clock();
	vtime->cpu = cpu;
	write_seqcount_end(&vtime->seqcount);
	local_irq_restore(flags);
}
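
/*
 * State machine summary (illustrative), as driven by the functions above:
 *
 *	VTIME_IDLE     - set for the idle task at init and on context switch
 *	VTIME_SYS      - kernel mode; entered via vtime_user_exit(),
 *	                 vtime_guest_exit() or a context switch
 *	VTIME_USER     - entered via vtime_user_enter()
 *	VTIME_GUEST    - entered via vtime_guest_enter() (PF_VCPU set)
 *	VTIME_INACTIVE - transient value written by
 *	                 vtime_task_switch_generic() for the scheduled-out task
 *
 * All transitions are made inside the seqcount write side so that lockless
 * readers (task_gtime(), task_cputime(), the kcpustat helpers below) retry
 * across them.
 */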

u64 task_gtime(struct task_struct *t)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&vtime->seqcount);

		gtime = t->gtime;
		if (vtime->state == VTIME_GUEST)
			gtime += vtime->gtime + vtime_delta(vtime);

	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 delta;
	int ret;

	if (!vtime_accounting_enabled()) {
		*utime = t->utime;
		*stime = t->stime;
		return false;
	}

	do {
		ret = false;
		seq = read_seqcount_begin(&vtime->seqcount);

		*utime = t->utime;
		*stime = t->stime;

		/* Task is sleeping or idle, nothing to add */
		if (vtime->state < VTIME_SYS)
			continue;

		ret = true;
		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (vtime->state == VTIME_SYS)
			*stime += vtime->stime + delta;
		else
			*utime += vtime->utime + delta;
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return ret;
}

static int vtime_state_fetch(struct vtime *vtime, int cpu)
{
	int state = READ_ONCE(vtime->state);

	/*
	 * We raced against a context switch, fetch the
	 * kcpustat task again.
	 */
	if (vtime->cpu != cpu && vtime->cpu != -1)
		return -EAGAIN;

	/*
	 * Two possible things here:
	 * 1) We are seeing the scheduling out task (prev) or any past one.
	 * 2) We are seeing the scheduling in task (next) but it hasn't
	 *    passed through vtime_task_switch() yet so the pending
	 *    cputime of the prev task may not be flushed yet.
	 *
	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
	 */
	if (state == VTIME_INACTIVE)
		return -EAGAIN;

	return state;
}

static u64 kcpustat_user_vtime(struct vtime *vtime)
{
	if (vtime->state == VTIME_USER)
		return vtime->utime + vtime_delta(vtime);
	else if (vtime->state == VTIME_GUEST)
		return vtime->gtime + vtime_delta(vtime);
	return 0;
}

static int kcpustat_field_vtime(u64 *cpustat,
				struct task_struct *tsk,
				enum cpu_usage_stat usage,
				int cpu, u64 *val)
{
	struct vtime *vtime = &tsk->vtime;
	unsigned int seq;

	do {
		int state;

		seq = read_seqcount_begin(&vtime->seqcount);

		state = vtime_state_fetch(vtime, cpu);
		if (state < 0)
			return state;

		*val = cpustat[usage];

		/*
		 * Nice vs. unnice cputime accounting may be inaccurate if
		 * the nice value has changed since the last vtime update.
		 * But a proper fix would involve interrupting the target on
		 * nice updates, which is a no-go on nohz_full (although the
		 * scheduler may still interrupt the target if rescheduling
		 * is needed...)
		 */
		switch (usage) {
		case CPUTIME_SYSTEM:
			if (state == VTIME_SYS)
				*val += vtime->stime + vtime_delta(vtime);
			break;
		case CPUTIME_USER:
			if (task_nice(tsk) <= 0)
				*val += kcpustat_user_vtime(vtime);
			break;
		case CPUTIME_NICE:
			if (task_nice(tsk) > 0)
				*val += kcpustat_user_vtime(vtime);
			break;
		case CPUTIME_GUEST:
			if (state == VTIME_GUEST && task_nice(tsk) <= 0)
				*val += vtime->gtime + vtime_delta(vtime);
			break;
		case CPUTIME_GUEST_NICE:
			if (state == VTIME_GUEST && task_nice(tsk) > 0)
				*val += vtime->gtime + vtime_delta(vtime);
			break;
		default:
			break;
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

u64 kcpustat_field(struct kernel_cpustat *kcpustat,
		   enum cpu_usage_stat usage, int cpu)
{
	u64 *cpustat = kcpustat->cpustat;
	u64 val = cpustat[usage];
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu))
		return val;

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			return cpustat[usage];
		}

		err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
		rcu_read_unlock();

		if (!err)
			return val;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_field);
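
/*
 * Usage sketch (illustrative): a reader that wants a single nohz_full-aware
 * counter, e.g. the iowait time of a CPU, can do
 *
 *	u64 iowait = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_IOWAIT, cpu);
 *
 * which falls back to the raw per-CPU value when vtime accounting is not
 * enabled on that CPU, and otherwise adds the pending, not yet flushed
 * vtime of the task currently running there.
 */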

static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
				    const struct kernel_cpustat *src,
				    struct task_struct *tsk, int cpu)
{
	struct vtime *vtime = &tsk->vtime;
	unsigned int seq;

	do {
		u64 *cpustat;
		u64 delta;
		int state;

		seq = read_seqcount_begin(&vtime->seqcount);

		state = vtime_state_fetch(vtime, cpu);
		if (state < 0)
			return state;

		*dst = *src;
		cpustat = dst->cpustat;

		/* Task is sleeping, dead or idle, nothing to add */
		if (state < VTIME_SYS)
			continue;

		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (state == VTIME_SYS) {
			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
		} else if (state == VTIME_USER) {
			if (task_nice(tsk) > 0)
				cpustat[CPUTIME_NICE] += vtime->utime + delta;
			else
				cpustat[CPUTIME_USER] += vtime->utime + delta;
		} else {
			WARN_ON_ONCE(state != VTIME_GUEST);
			if (task_nice(tsk) > 0) {
				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
			} else {
				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
				cpustat[CPUTIME_USER] += vtime->gtime + delta;
			}
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu)) {
		*dst = *src;
		return;
	}

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			*dst = *src;
			return;
		}

		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
		rcu_read_unlock();

		if (!err)
			return;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);

#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */