1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Simple CPU accounting cgroup controller 4 */ 5 #include <linux/sched/clock.h> 6 #include <linux/sched/cputime.h> 7 #include <linux/tsacct_kern.h> 8 #include "sched.h" 9 10 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 11 #include <asm/cputime.h> 12 #endif 13 14 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 15 16 DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); 17 18 /* 19 * There are no locks covering percpu hardirq/softirq time. 20 * They are only modified in vtime_account, on corresponding CPU 21 * with interrupts disabled. So, writes are safe. 22 * They are read and saved off onto struct rq in update_rq_clock(). 23 * This may result in other CPU reading this CPU's IRQ time and can 24 * race with irq/vtime_account on this CPU. We would either get old 25 * or new value with a side effect of accounting a slice of IRQ time to wrong 26 * task when IRQ is in progress while we read rq->clock. That is a worthy 27 * compromise in place of having locks on each IRQ in account_system_time. 28 */ 29 DEFINE_PER_CPU(struct irqtime, cpu_irqtime); 30 31 void enable_sched_clock_irqtime(void) 32 { 33 static_branch_enable(&sched_clock_irqtime); 34 } 35 36 void disable_sched_clock_irqtime(void) 37 { 38 if (irqtime_enabled()) 39 static_branch_disable(&sched_clock_irqtime); 40 } 41 42 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, 43 enum cpu_usage_stat idx) 44 { 45 u64 *cpustat = kcpustat_this_cpu->cpustat; 46 47 u64_stats_update_begin(&irqtime->sync); 48 cpustat[idx] += delta; 49 irqtime->total += delta; 50 if (!kcpustat_idle_dyntick()) 51 irqtime->tick_delta += delta; 52 u64_stats_update_end(&irqtime->sync); 53 } 54 55 /* 56 * Called after incrementing preempt_count on {soft,}irq_enter 57 * and before decrementing preempt_count on {soft,}irq_exit. 58 */ 59 void irqtime_account_irq(struct task_struct *curr, unsigned int offset) 60 { 61 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 62 unsigned int pc; 63 s64 delta; 64 int cpu; 65 66 if (!irqtime_enabled()) 67 return; 68 69 cpu = smp_processor_id(); 70 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; 71 irqtime->irq_start_time += delta; 72 pc = irq_count() - offset; 73 74 /* 75 * We do not account for softirq time from ksoftirqd here. 76 * We want to continue accounting softirq time to ksoftirqd thread 77 * in that case, so as not to confuse scheduler with a special task 78 * that do not consume any time, but still wants to run. 79 */ 80 if (pc & HARDIRQ_MASK) 81 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); 82 else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) 83 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); 84 } 85 86 static u64 irqtime_tick_accounted(u64 maxtime) 87 { 88 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 89 u64 delta; 90 91 delta = min(irqtime->tick_delta, maxtime); 92 irqtime->tick_delta -= delta; 93 94 return delta; 95 } 96 97 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 98 99 static u64 irqtime_tick_accounted(u64 dummy) 100 { 101 return 0; 102 } 103 104 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 105 106 static inline void task_group_account_field(struct task_struct *p, int index, 107 u64 tmp) 108 { 109 /* 110 * Since all updates are sure to touch the root cgroup, we 111 * get ourselves ahead and touch it first. If the root cgroup 112 * is the only cgroup, then nothing else should be necessary. 113 * 114 */ 115 __this_cpu_add(kernel_cpustat.cpustat[index], tmp); 116 117 cgroup_account_cputime_field(p, index, tmp); 118 } 119 120 /* 121 * Account user CPU time to a process. 122 * @p: the process that the CPU time gets accounted to 123 * @cputime: the CPU time spent in user space since the last update 124 */ 125 void account_user_time(struct task_struct *p, u64 cputime) 126 { 127 int index; 128 129 /* Add user time to process. */ 130 p->utime += cputime; 131 account_group_user_time(p, cputime); 132 133 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 134 135 /* Add user time to cpustat. */ 136 task_group_account_field(p, index, cputime); 137 138 /* Account for user time used */ 139 acct_account_cputime(p); 140 } 141 142 /* 143 * Account guest CPU time to a process. 144 * @p: the process that the CPU time gets accounted to 145 * @cputime: the CPU time spent in virtual machine since the last update 146 */ 147 void account_guest_time(struct task_struct *p, u64 cputime) 148 { 149 u64 *cpustat = kcpustat_this_cpu->cpustat; 150 151 /* Add guest time to process. */ 152 p->utime += cputime; 153 account_group_user_time(p, cputime); 154 p->gtime += cputime; 155 156 /* Add guest time to cpustat. */ 157 if (task_nice(p) > 0) { 158 task_group_account_field(p, CPUTIME_NICE, cputime); 159 cpustat[CPUTIME_GUEST_NICE] += cputime; 160 } else { 161 task_group_account_field(p, CPUTIME_USER, cputime); 162 cpustat[CPUTIME_GUEST] += cputime; 163 } 164 } 165 166 /* 167 * Account system CPU time to a process and desired cpustat field 168 * @p: the process that the CPU time gets accounted to 169 * @cputime: the CPU time spent in kernel space since the last update 170 * @index: pointer to cpustat field that has to be updated 171 */ 172 void account_system_index_time(struct task_struct *p, 173 u64 cputime, enum cpu_usage_stat index) 174 { 175 /* Add system time to process. */ 176 p->stime += cputime; 177 account_group_system_time(p, cputime); 178 179 /* Add system time to cpustat. */ 180 task_group_account_field(p, index, cputime); 181 182 /* Account for system time used */ 183 acct_account_cputime(p); 184 } 185 186 /* 187 * Account system CPU time to a process. 188 * @p: the process that the CPU time gets accounted to 189 * @hardirq_offset: the offset to subtract from hardirq_count() 190 * @cputime: the CPU time spent in kernel space since the last update 191 */ 192 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 193 { 194 int index; 195 196 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 197 account_guest_time(p, cputime); 198 return; 199 } 200 201 if (hardirq_count() - hardirq_offset) 202 index = CPUTIME_IRQ; 203 else if (in_serving_softirq()) 204 index = CPUTIME_SOFTIRQ; 205 else 206 index = CPUTIME_SYSTEM; 207 208 account_system_index_time(p, cputime, index); 209 } 210 211 /* 212 * Account for involuntary wait time. 213 * @cputime: the CPU time spent in involuntary wait 214 */ 215 void account_steal_time(u64 cputime) 216 { 217 u64 *cpustat = kcpustat_this_cpu->cpustat; 218 219 cpustat[CPUTIME_STEAL] += cputime; 220 } 221 222 /* 223 * Account for idle time. 224 * @cputime: the CPU time spent in idle wait 225 */ 226 void account_idle_time(u64 cputime) 227 { 228 u64 *cpustat = kcpustat_this_cpu->cpustat; 229 struct rq *rq = this_rq(); 230 231 if (atomic_read(&rq->nr_iowait) > 0) 232 cpustat[CPUTIME_IOWAIT] += cputime; 233 else 234 cpustat[CPUTIME_IDLE] += cputime; 235 } 236 237 238 #ifdef CONFIG_SCHED_CORE 239 /* 240 * Account for forceidle time due to core scheduling. 241 * 242 * REQUIRES: schedstat is enabled. 243 */ 244 void __account_forceidle_time(struct task_struct *p, u64 delta) 245 { 246 __schedstat_add(p->stats.core_forceidle_sum, delta); 247 248 task_group_account_field(p, CPUTIME_FORCEIDLE, delta); 249 } 250 #endif /* CONFIG_SCHED_CORE */ 251 252 /* 253 * When a guest is interrupted for a longer amount of time, missed clock 254 * ticks are not redelivered later. Due to that, this function may on 255 * occasion account more time than the calling functions think elapsed. 256 */ 257 #ifdef CONFIG_PARAVIRT 258 struct static_key paravirt_steal_enabled; 259 260 #ifdef CONFIG_HAVE_PV_STEAL_CLOCK_GEN 261 static u64 native_steal_clock(int cpu) 262 { 263 return 0; 264 } 265 266 DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); 267 #endif 268 #endif 269 270 static __always_inline u64 steal_account_process_time(u64 maxtime) 271 { 272 #ifdef CONFIG_PARAVIRT 273 if (static_key_false(¶virt_steal_enabled)) { 274 u64 steal; 275 276 steal = paravirt_steal_clock(smp_processor_id()); 277 steal -= this_rq()->prev_steal_time; 278 steal = min(steal, maxtime); 279 account_steal_time(steal); 280 this_rq()->prev_steal_time += steal; 281 282 return steal; 283 } 284 #endif /* CONFIG_PARAVIRT */ 285 return 0; 286 } 287 288 /* 289 * Account how much elapsed time was spent in steal, IRQ, or softirq time. 290 */ 291 static inline u64 account_other_time(u64 max) 292 { 293 u64 accounted; 294 295 lockdep_assert_irqs_disabled(); 296 297 accounted = steal_account_process_time(max); 298 299 if (accounted < max) 300 accounted += irqtime_tick_accounted(max - accounted); 301 302 return accounted; 303 } 304 305 #ifdef CONFIG_64BIT 306 static inline u64 read_sum_exec_runtime(struct task_struct *t) 307 { 308 return t->se.sum_exec_runtime; 309 } 310 #else /* !CONFIG_64BIT: */ 311 static u64 read_sum_exec_runtime(struct task_struct *t) 312 { 313 u64 ns; 314 struct rq_flags rf; 315 struct rq *rq; 316 317 rq = task_rq_lock(t, &rf); 318 ns = t->se.sum_exec_runtime; 319 task_rq_unlock(rq, t, &rf); 320 321 return ns; 322 } 323 #endif /* !CONFIG_64BIT */ 324 325 /* 326 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 327 * tasks (sum on group iteration) belonging to @tsk's group. 328 */ 329 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 330 { 331 struct signal_struct *sig = tsk->signal; 332 struct task_struct *t; 333 u64 utime, stime; 334 335 /* 336 * Update current task runtime to account pending time since last 337 * scheduler action or thread_group_cputime() call. This thread group 338 * might have other running tasks on different CPUs, but updating 339 * their runtime can affect syscall performance, so we skip account 340 * those pending times and rely only on values updated on tick or 341 * other scheduler action. 342 */ 343 if (same_thread_group(current, tsk)) 344 (void) task_sched_runtime(current); 345 346 guard(rcu)(); 347 scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { 348 times->utime = sig->utime; 349 times->stime = sig->stime; 350 times->sum_exec_runtime = sig->sum_sched_runtime; 351 352 __for_each_thread(sig, t) { 353 task_cputime(t, &utime, &stime); 354 times->utime += utime; 355 times->stime += stime; 356 times->sum_exec_runtime += read_sum_exec_runtime(t); 357 } 358 } 359 } 360 361 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 362 /* 363 * Account a tick to a process and cpustat 364 * @p: the process that the CPU time gets accounted to 365 * @user_tick: is the tick from userspace 366 * @rq: the pointer to rq 367 * 368 * Tick demultiplexing follows the order 369 * - pending hardirq update 370 * - pending softirq update 371 * - user_time 372 * - idle_time 373 * - system time 374 * - check for guest_time 375 * - else account as system_time 376 * 377 * Check for hardirq is done both for system and user time as there is 378 * no timer going off while we are on hardirq and hence we may never get an 379 * opportunity to update it solely in system time. 380 * p->stime and friends are only updated on system time and not on IRQ 381 * softirq as those do not count in task exec_runtime any more. 382 */ 383 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 384 int ticks) 385 { 386 u64 other, cputime = TICK_NSEC * ticks; 387 388 /* 389 * When returning from idle, many ticks can get accounted at 390 * once, including some ticks of steal, IRQ, and softirq time. 391 * Subtract those ticks from the amount of time accounted to 392 * idle, or potentially user or system time. Due to rounding, 393 * other time can exceed ticks occasionally. 394 */ 395 other = account_other_time(ULONG_MAX); 396 if (other >= cputime) 397 return; 398 399 cputime -= other; 400 401 if (this_cpu_ksoftirqd() == p) { 402 /* 403 * ksoftirqd time do not get accounted in cpu_softirq_time. 404 * So, we have to handle it separately here. 405 * Also, p->stime needs to be updated for ksoftirqd. 406 */ 407 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); 408 } else if (user_tick) { 409 account_user_time(p, cputime); 410 } else if (p == this_rq()->idle) { 411 account_idle_time(cputime); 412 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 413 account_guest_time(p, cputime); 414 } else { 415 account_system_index_time(p, cputime, CPUTIME_SYSTEM); 416 } 417 } 418 419 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 420 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 421 int nr_ticks) { } 422 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 423 424 #ifdef CONFIG_NO_HZ_COMMON 425 static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now) 426 { 427 u64 *cpustat = kc->cpustat; 428 u64 delta, steal, steal_delta; 429 int iowait; 430 431 if (!kc->idle_elapse) 432 return; 433 434 iowait = nr_iowait_cpu(smp_processor_id()) > 0; 435 delta = now - kc->idle_entrytime; 436 steal = steal_account_process_time(delta); 437 438 /* 439 * Record the idle time after substracting the steal time from 440 * previous update sequence. Don't substract the steal time from 441 * the current update sequence to avoid readers moving backward. 442 */ 443 write_seqcount_begin(&kc->idle_sleeptime_seq); 444 steal_delta = min_t(u64, kc->idle_stealtime[iowait], delta); 445 delta -= steal_delta; 446 kc->idle_stealtime[iowait] -= steal_delta; 447 448 if (iowait) 449 cpustat[CPUTIME_IOWAIT] += delta; 450 else 451 cpustat[CPUTIME_IDLE] += delta; 452 453 kc->idle_stealtime[iowait] += steal; 454 kc->idle_entrytime = now; 455 kc->idle_elapse = false; 456 write_seqcount_end(&kc->idle_sleeptime_seq); 457 } 458 459 static void kcpustat_idle_start(struct kernel_cpustat *kc, u64 now) 460 { 461 /* Irqtime accounting might have been enabled in the middle of the IRQ */ 462 if (kc->idle_elapse) 463 return; 464 465 write_seqcount_begin(&kc->idle_sleeptime_seq); 466 kc->idle_entrytime = now; 467 kc->idle_elapse = true; 468 write_seqcount_end(&kc->idle_sleeptime_seq); 469 } 470 471 void kcpustat_dyntick_stop(u64 now) 472 { 473 struct kernel_cpustat *kc = kcpustat_this_cpu; 474 475 if (!vtime_generic_enabled_this_cpu()) { 476 WARN_ON_ONCE(!kc->idle_dyntick); 477 kcpustat_idle_stop(kc, now); 478 kc->idle_dyntick = false; 479 vtime_dyntick_stop(); 480 } 481 } 482 483 void kcpustat_dyntick_start(u64 now) 484 { 485 struct kernel_cpustat *kc = kcpustat_this_cpu; 486 487 if (!vtime_generic_enabled_this_cpu()) { 488 vtime_dyntick_start(); 489 kc->idle_dyntick = true; 490 kcpustat_idle_start(kc, now); 491 } 492 } 493 494 void kcpustat_irq_enter(u64 now) 495 { 496 struct kernel_cpustat *kc = kcpustat_this_cpu; 497 498 if (!vtime_generic_enabled_this_cpu() && 499 (irqtime_enabled() || vtime_accounting_enabled_this_cpu())) 500 kcpustat_idle_stop(kc, now); 501 } 502 503 void kcpustat_irq_exit(u64 now) 504 { 505 struct kernel_cpustat *kc = kcpustat_this_cpu; 506 507 /* 508 * Generic vtime already does its own idle accounting. 509 * But irqtime accounting or arch vtime which also accounts IRQs 510 * need to pause nohz accounting. Resume nohz accounting as long 511 * as the irqtime config is enabled to handle case where irqtime 512 * accounting got runtime disabled in the middle of an IRQ. 513 */ 514 if (!vtime_generic_enabled_this_cpu() && 515 (IS_ENABLED(CONFIG_IRQ_TIME_ACCOUNTING) || vtime_accounting_enabled_this_cpu())) 516 kcpustat_idle_start(kc, now); 517 } 518 519 static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx, 520 bool compute_delta, u64 now) 521 { 522 struct kernel_cpustat *kc = &kcpustat_cpu(cpu); 523 int iowait = idx == CPUTIME_IOWAIT; 524 u64 *cpustat = kc->cpustat; 525 unsigned int seq; 526 u64 idle; 527 528 do { 529 seq = read_seqcount_begin(&kc->idle_sleeptime_seq); 530 531 idle = cpustat[idx]; 532 533 if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime) { 534 u64 delta = now - kc->idle_entrytime; 535 536 delta -= min_t(u64, kc->idle_stealtime[iowait], delta); 537 idle += delta; 538 } 539 } while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq)); 540 541 return idle; 542 } 543 544 u64 kcpustat_field_idle(int cpu) 545 { 546 return kcpustat_field_dyntick(cpu, CPUTIME_IDLE, 547 !nr_iowait_cpu(cpu), ktime_get()); 548 } 549 EXPORT_SYMBOL_GPL(kcpustat_field_idle); 550 551 u64 kcpustat_field_iowait(int cpu) 552 { 553 return kcpustat_field_dyntick(cpu, CPUTIME_IOWAIT, 554 nr_iowait_cpu(cpu), ktime_get()); 555 } 556 EXPORT_SYMBOL_GPL(kcpustat_field_iowait); 557 #else 558 static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx, 559 bool compute_delta, ktime_t now) 560 { 561 return kcpustat_cpu(cpu).cpustat[idx]; 562 } 563 #endif /* CONFIG_NO_HZ_COMMON */ 564 565 static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx, 566 bool compute_delta, u64 *last_update_time) 567 { 568 ktime_t now = ktime_get(); 569 u64 res; 570 571 if (vtime_generic_enabled_cpu(cpu)) 572 res = kcpustat_field(idx, cpu); 573 else 574 res = kcpustat_field_dyntick(cpu, idx, compute_delta, now); 575 576 do_div(res, NSEC_PER_USEC); 577 578 if (last_update_time) 579 *last_update_time = ktime_to_us(now); 580 581 return res; 582 } 583 584 /** 585 * get_cpu_idle_time_us - get the total idle time of a CPU 586 * @cpu: CPU number to query 587 * @last_update_time: variable to store update time in. Do not update 588 * counters if NULL. 589 * 590 * Return the cumulative idle time (since boot) for a given 591 * CPU, in microseconds. Note that this is partially broken due to 592 * the counter of iowait tasks that can be remotely updated without 593 * any synchronization. Therefore it is possible to observe backward 594 * values within two consecutive reads. 595 * 596 * This time is measured via accounting rather than sampling, 597 * and is as accurate as ktime_get() is. 598 * 599 * Return: total idle time of the @cpu 600 */ 601 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 602 { 603 return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE, 604 !nr_iowait_cpu(cpu), last_update_time); 605 } 606 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 607 608 /** 609 * get_cpu_iowait_time_us - get the total iowait time of a CPU 610 * @cpu: CPU number to query 611 * @last_update_time: variable to store update time in. Do not update 612 * counters if NULL. 613 * 614 * Return the cumulative iowait time (since boot) for a given 615 * CPU, in microseconds. Note this is partially broken due to 616 * the counter of iowait tasks that can be remotely updated without 617 * any synchronization. Therefore it is possible to observe backward 618 * values within two consecutive reads. 619 * 620 * This time is measured via accounting rather than sampling, 621 * and is as accurate as ktime_get() is. 622 * 623 * Return: total iowait time of @cpu 624 */ 625 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 626 { 627 return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT, 628 nr_iowait_cpu(cpu), last_update_time); 629 } 630 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 631 632 /* 633 * Use precise platform statistics if available: 634 */ 635 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 636 637 void vtime_account_irq(struct task_struct *tsk, unsigned int offset) 638 { 639 unsigned int pc = irq_count() - offset; 640 641 if (pc & HARDIRQ_OFFSET) { 642 vtime_account_hardirq(tsk); 643 } else if (pc & SOFTIRQ_OFFSET) { 644 vtime_account_softirq(tsk); 645 } else if (!kcpustat_idle_dyntick()) { 646 if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && 647 is_idle_task(tsk)) { 648 vtime_account_idle(tsk); 649 } else { 650 vtime_account_kernel(tsk); 651 } 652 } else { 653 vtime_reset(); 654 } 655 } 656 657 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 658 u64 *ut, u64 *st) 659 { 660 *ut = curr->utime; 661 *st = curr->stime; 662 } 663 664 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 665 { 666 *ut = p->utime; 667 *st = p->stime; 668 } 669 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 670 671 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 672 { 673 struct task_cputime cputime; 674 675 thread_group_cputime(p, &cputime); 676 677 *ut = cputime.utime; 678 *st = cputime.stime; 679 } 680 681 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ 682 683 /* 684 * Account a single tick of CPU time. 685 * @p: the process that the CPU time gets accounted to 686 * @user_tick: indicates if the tick is a user or a system tick 687 */ 688 void account_process_tick(struct task_struct *p, int user_tick) 689 { 690 u64 cputime, steal; 691 692 if (vtime_accounting_enabled_this_cpu()) 693 return; 694 695 if (kcpustat_idle_dyntick()) 696 return; 697 698 if (irqtime_enabled()) { 699 irqtime_account_process_tick(p, user_tick, 1); 700 return; 701 } 702 703 cputime = TICK_NSEC; 704 steal = steal_account_process_time(ULONG_MAX); 705 706 if (steal >= cputime) 707 return; 708 709 cputime -= steal; 710 711 if (user_tick) 712 account_user_time(p, cputime); 713 else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) 714 account_system_time(p, HARDIRQ_OFFSET, cputime); 715 else 716 account_idle_time(cputime); 717 } 718 719 /* 720 * Adjust tick based cputime random precision against scheduler runtime 721 * accounting. 722 * 723 * Tick based cputime accounting depend on random scheduling timeslices of a 724 * task to be interrupted or not by the timer. Depending on these 725 * circumstances, the number of these interrupts may be over or 726 * under-optimistic, matching the real user and system cputime with a variable 727 * precision. 728 * 729 * Fix this by scaling these tick based values against the total runtime 730 * accounted by the CFS scheduler. 731 * 732 * This code provides the following guarantees: 733 * 734 * stime + utime == rtime 735 * stime_i+1 >= stime_i, utime_i+1 >= utime_i 736 * 737 * Assuming that rtime_i+1 >= rtime_i. 738 */ 739 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 740 u64 *ut, u64 *st) 741 { 742 u64 rtime, stime, utime; 743 unsigned long flags; 744 745 /* Serialize concurrent callers such that we can honour our guarantees */ 746 raw_spin_lock_irqsave(&prev->lock, flags); 747 rtime = curr->sum_exec_runtime; 748 749 /* 750 * This is possible under two circumstances: 751 * - rtime isn't monotonic after all (a bug); 752 * - we got reordered by the lock. 753 * 754 * In both cases this acts as a filter such that the rest of the code 755 * can assume it is monotonic regardless of anything else. 756 */ 757 if (prev->stime + prev->utime >= rtime) 758 goto out; 759 760 stime = curr->stime; 761 utime = curr->utime; 762 763 /* 764 * If either stime or utime are 0, assume all runtime is userspace. 765 * Once a task gets some ticks, the monotonicity code at 'update:' 766 * will ensure things converge to the observed ratio. 767 */ 768 if (stime == 0) { 769 utime = rtime; 770 goto update; 771 } 772 773 if (utime == 0) { 774 stime = rtime; 775 goto update; 776 } 777 778 stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); 779 780 update: 781 /* 782 * Make sure stime doesn't go backwards; this preserves monotonicity 783 * for utime because rtime is monotonic. 784 * 785 * utime_i+1 = rtime_i+1 - stime_i 786 * = rtime_i+1 - (rtime_i - utime_i) 787 * = (rtime_i+1 - rtime_i) + utime_i 788 * >= utime_i 789 */ 790 if (stime < prev->stime) 791 stime = prev->stime; 792 utime = rtime - stime; 793 794 /* 795 * Make sure utime doesn't go backwards; this still preserves 796 * monotonicity for stime, analogous argument to above. 797 */ 798 if (utime < prev->utime) { 799 utime = prev->utime; 800 stime = rtime - utime; 801 } 802 803 prev->stime = stime; 804 prev->utime = utime; 805 out: 806 *ut = prev->utime; 807 *st = prev->stime; 808 raw_spin_unlock_irqrestore(&prev->lock, flags); 809 } 810 811 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 812 { 813 struct task_cputime cputime = { 814 .sum_exec_runtime = p->se.sum_exec_runtime, 815 }; 816 817 if (task_cputime(p, &cputime.utime, &cputime.stime)) 818 cputime.sum_exec_runtime = task_sched_runtime(p); 819 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 820 } 821 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 822 823 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 824 { 825 struct task_cputime cputime; 826 827 thread_group_cputime(p, &cputime); 828 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 829 } 830 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 831 832 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 833 static u64 vtime_delta(struct vtime *vtime) 834 { 835 unsigned long long clock; 836 837 clock = sched_clock(); 838 if (clock < vtime->starttime) 839 return 0; 840 841 return clock - vtime->starttime; 842 } 843 844 static u64 get_vtime_delta(struct vtime *vtime) 845 { 846 u64 delta = vtime_delta(vtime); 847 u64 other; 848 849 /* 850 * Unlike tick based timing, vtime based timing never has lost 851 * ticks, and no need for steal time accounting to make up for 852 * lost ticks. Vtime accounts a rounded version of actual 853 * elapsed time. Limit account_other_time to prevent rounding 854 * errors from causing elapsed vtime to go negative. 855 */ 856 other = account_other_time(delta); 857 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); 858 vtime->starttime += delta; 859 860 return delta - other; 861 } 862 863 static void vtime_account_system(struct task_struct *tsk, 864 struct vtime *vtime) 865 { 866 vtime->stime += get_vtime_delta(vtime); 867 if (vtime->stime >= TICK_NSEC) { 868 account_system_time(tsk, irq_count(), vtime->stime); 869 vtime->stime = 0; 870 } 871 } 872 873 static void vtime_account_guest(struct task_struct *tsk, 874 struct vtime *vtime) 875 { 876 vtime->gtime += get_vtime_delta(vtime); 877 if (vtime->gtime >= TICK_NSEC) { 878 account_guest_time(tsk, vtime->gtime); 879 vtime->gtime = 0; 880 } 881 } 882 883 static void __vtime_account_kernel(struct task_struct *tsk, 884 struct vtime *vtime) 885 { 886 /* We might have scheduled out from guest path */ 887 if (vtime->state == VTIME_GUEST) 888 vtime_account_guest(tsk, vtime); 889 else 890 vtime_account_system(tsk, vtime); 891 } 892 893 void vtime_account_kernel(struct task_struct *tsk) 894 { 895 struct vtime *vtime = &tsk->vtime; 896 897 if (!vtime_delta(vtime)) 898 return; 899 900 write_seqcount_begin(&vtime->seqcount); 901 __vtime_account_kernel(tsk, vtime); 902 write_seqcount_end(&vtime->seqcount); 903 } 904 905 void vtime_user_enter(struct task_struct *tsk) 906 { 907 struct vtime *vtime = &tsk->vtime; 908 909 write_seqcount_begin(&vtime->seqcount); 910 vtime_account_system(tsk, vtime); 911 vtime->state = VTIME_USER; 912 write_seqcount_end(&vtime->seqcount); 913 } 914 915 void vtime_user_exit(struct task_struct *tsk) 916 { 917 struct vtime *vtime = &tsk->vtime; 918 919 write_seqcount_begin(&vtime->seqcount); 920 vtime->utime += get_vtime_delta(vtime); 921 if (vtime->utime >= TICK_NSEC) { 922 account_user_time(tsk, vtime->utime); 923 vtime->utime = 0; 924 } 925 vtime->state = VTIME_SYS; 926 write_seqcount_end(&vtime->seqcount); 927 } 928 929 void vtime_guest_enter(struct task_struct *tsk) 930 { 931 struct vtime *vtime = &tsk->vtime; 932 /* 933 * The flags must be updated under the lock with 934 * the vtime_starttime flush and update. 935 * That enforces a right ordering and update sequence 936 * synchronization against the reader (task_gtime()) 937 * that can thus safely catch up with a tickless delta. 938 */ 939 write_seqcount_begin(&vtime->seqcount); 940 vtime_account_system(tsk, vtime); 941 tsk->flags |= PF_VCPU; 942 vtime->state = VTIME_GUEST; 943 write_seqcount_end(&vtime->seqcount); 944 } 945 EXPORT_SYMBOL_GPL(vtime_guest_enter); 946 947 void vtime_guest_exit(struct task_struct *tsk) 948 { 949 struct vtime *vtime = &tsk->vtime; 950 951 write_seqcount_begin(&vtime->seqcount); 952 vtime_account_guest(tsk, vtime); 953 tsk->flags &= ~PF_VCPU; 954 vtime->state = VTIME_SYS; 955 write_seqcount_end(&vtime->seqcount); 956 } 957 EXPORT_SYMBOL_GPL(vtime_guest_exit); 958 959 static void __vtime_account_idle(struct vtime *vtime) 960 { 961 account_idle_time(get_vtime_delta(vtime)); 962 } 963 964 void vtime_task_switch_generic(struct task_struct *prev) 965 { 966 struct vtime *vtime = &prev->vtime; 967 968 write_seqcount_begin(&vtime->seqcount); 969 if (vtime->state == VTIME_IDLE) 970 __vtime_account_idle(vtime); 971 else 972 __vtime_account_kernel(prev, vtime); 973 vtime->state = VTIME_INACTIVE; 974 vtime->cpu = -1; 975 write_seqcount_end(&vtime->seqcount); 976 977 vtime = ¤t->vtime; 978 979 write_seqcount_begin(&vtime->seqcount); 980 if (is_idle_task(current)) 981 vtime->state = VTIME_IDLE; 982 else if (current->flags & PF_VCPU) 983 vtime->state = VTIME_GUEST; 984 else 985 vtime->state = VTIME_SYS; 986 vtime->starttime = sched_clock(); 987 vtime->cpu = smp_processor_id(); 988 write_seqcount_end(&vtime->seqcount); 989 } 990 991 void vtime_init_idle(struct task_struct *t, int cpu) 992 { 993 struct vtime *vtime = &t->vtime; 994 unsigned long flags; 995 996 local_irq_save(flags); 997 write_seqcount_begin(&vtime->seqcount); 998 vtime->state = VTIME_IDLE; 999 vtime->starttime = sched_clock(); 1000 vtime->cpu = cpu; 1001 write_seqcount_end(&vtime->seqcount); 1002 local_irq_restore(flags); 1003 } 1004 1005 u64 task_gtime(struct task_struct *t) 1006 { 1007 struct vtime *vtime = &t->vtime; 1008 unsigned int seq; 1009 u64 gtime; 1010 1011 if (!vtime_accounting_enabled()) 1012 return t->gtime; 1013 1014 do { 1015 seq = read_seqcount_begin(&vtime->seqcount); 1016 1017 gtime = t->gtime; 1018 if (vtime->state == VTIME_GUEST) 1019 gtime += vtime->gtime + vtime_delta(vtime); 1020 1021 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1022 1023 return gtime; 1024 } 1025 1026 /* 1027 * Fetch cputime raw values from fields of task_struct and 1028 * add up the pending nohz execution time since the last 1029 * cputime snapshot. 1030 */ 1031 bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 1032 { 1033 struct vtime *vtime = &t->vtime; 1034 unsigned int seq; 1035 u64 delta; 1036 int ret; 1037 1038 if (!vtime_accounting_enabled()) { 1039 *utime = t->utime; 1040 *stime = t->stime; 1041 return false; 1042 } 1043 1044 do { 1045 ret = false; 1046 seq = read_seqcount_begin(&vtime->seqcount); 1047 1048 *utime = t->utime; 1049 *stime = t->stime; 1050 1051 /* Task is sleeping or idle, nothing to add */ 1052 if (vtime->state < VTIME_SYS) 1053 continue; 1054 1055 ret = true; 1056 delta = vtime_delta(vtime); 1057 1058 /* 1059 * Task runs either in user (including guest) or kernel space, 1060 * add pending nohz time to the right place. 1061 */ 1062 if (vtime->state == VTIME_SYS) 1063 *stime += vtime->stime + delta; 1064 else 1065 *utime += vtime->utime + delta; 1066 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1067 1068 return ret; 1069 } 1070 1071 static int vtime_state_fetch(struct vtime *vtime, int cpu) 1072 { 1073 int state = READ_ONCE(vtime->state); 1074 1075 /* 1076 * We raced against a context switch, fetch the 1077 * kcpustat task again. 1078 */ 1079 if (vtime->cpu != cpu && vtime->cpu != -1) 1080 return -EAGAIN; 1081 1082 /* 1083 * Two possible things here: 1084 * 1) We are seeing the scheduling out task (prev) or any past one. 1085 * 2) We are seeing the scheduling in task (next) but it hasn't 1086 * passed though vtime_task_switch() yet so the pending 1087 * cputime of the prev task may not be flushed yet. 1088 * 1089 * Case 1) is ok but 2) is not. So wait for a safe VTIME state. 1090 */ 1091 if (state == VTIME_INACTIVE) 1092 return -EAGAIN; 1093 1094 return state; 1095 } 1096 1097 static u64 kcpustat_user_vtime(struct vtime *vtime) 1098 { 1099 if (vtime->state == VTIME_USER) 1100 return vtime->utime + vtime_delta(vtime); 1101 else if (vtime->state == VTIME_GUEST) 1102 return vtime->gtime + vtime_delta(vtime); 1103 return 0; 1104 } 1105 1106 static int kcpustat_field_vtime(u64 *cpustat, 1107 struct task_struct *tsk, 1108 enum cpu_usage_stat usage, 1109 int cpu, u64 *val) 1110 { 1111 struct vtime *vtime = &tsk->vtime; 1112 struct rq *rq = cpu_rq(cpu); 1113 unsigned int seq; 1114 1115 do { 1116 int state; 1117 1118 seq = read_seqcount_begin(&vtime->seqcount); 1119 1120 state = vtime_state_fetch(vtime, cpu); 1121 if (state < 0) 1122 return state; 1123 1124 *val = cpustat[usage]; 1125 1126 /* 1127 * Nice VS unnice cputime accounting may be inaccurate if 1128 * the nice value has changed since the last vtime update. 1129 * But proper fix would involve interrupting target on nice 1130 * updates which is a no go on nohz_full (although the scheduler 1131 * may still interrupt the target if rescheduling is needed...) 1132 */ 1133 switch (usage) { 1134 case CPUTIME_SYSTEM: 1135 if (state == VTIME_SYS) 1136 *val += vtime->stime + vtime_delta(vtime); 1137 break; 1138 case CPUTIME_USER: 1139 if (task_nice(tsk) <= 0) 1140 *val += kcpustat_user_vtime(vtime); 1141 break; 1142 case CPUTIME_NICE: 1143 if (task_nice(tsk) > 0) 1144 *val += kcpustat_user_vtime(vtime); 1145 break; 1146 case CPUTIME_GUEST: 1147 if (state == VTIME_GUEST && task_nice(tsk) <= 0) 1148 *val += vtime->gtime + vtime_delta(vtime); 1149 break; 1150 case CPUTIME_GUEST_NICE: 1151 if (state == VTIME_GUEST && task_nice(tsk) > 0) 1152 *val += vtime->gtime + vtime_delta(vtime); 1153 break; 1154 case CPUTIME_IDLE: 1155 if (state == VTIME_IDLE && !atomic_read(&rq->nr_iowait)) 1156 *val += vtime_delta(vtime); 1157 break; 1158 case CPUTIME_IOWAIT: 1159 if (state == VTIME_IDLE && atomic_read(&rq->nr_iowait) > 0) 1160 *val += vtime_delta(vtime); 1161 break; 1162 default: 1163 break; 1164 } 1165 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1166 1167 return 0; 1168 } 1169 1170 u64 kcpustat_field(enum cpu_usage_stat usage, int cpu) 1171 { 1172 u64 *cpustat = kcpustat_cpu(cpu).cpustat; 1173 u64 val = cpustat[usage]; 1174 struct rq *rq; 1175 int err; 1176 1177 if (!vtime_generic_enabled_cpu(cpu)) 1178 return kcpustat_field_default(usage, cpu); 1179 1180 rq = cpu_rq(cpu); 1181 1182 for (;;) { 1183 struct task_struct *curr; 1184 1185 rcu_read_lock(); 1186 curr = rcu_dereference(rq->curr); 1187 if (WARN_ON_ONCE(!curr)) { 1188 rcu_read_unlock(); 1189 return cpustat[usage]; 1190 } 1191 1192 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); 1193 rcu_read_unlock(); 1194 1195 if (!err) 1196 return val; 1197 1198 cpu_relax(); 1199 } 1200 } 1201 EXPORT_SYMBOL_GPL(kcpustat_field); 1202 1203 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, 1204 const struct kernel_cpustat *src, 1205 struct task_struct *tsk, int cpu) 1206 { 1207 struct vtime *vtime = &tsk->vtime; 1208 unsigned int seq; 1209 1210 do { 1211 u64 *cpustat; 1212 u64 delta; 1213 int state; 1214 1215 seq = read_seqcount_begin(&vtime->seqcount); 1216 1217 state = vtime_state_fetch(vtime, cpu); 1218 if (state < 0) 1219 return state; 1220 1221 *dst = *src; 1222 cpustat = dst->cpustat; 1223 1224 /* Task is sleeping or dead, nothing to add */ 1225 if (state < VTIME_IDLE) 1226 continue; 1227 1228 delta = vtime_delta(vtime); 1229 1230 /* 1231 * Task runs either in user (including guest) or kernel space, 1232 * add pending nohz time to the right place. 1233 */ 1234 switch (state) { 1235 case VTIME_SYS: 1236 cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; 1237 break; 1238 case VTIME_USER: 1239 if (task_nice(tsk) > 0) 1240 cpustat[CPUTIME_NICE] += vtime->utime + delta; 1241 else 1242 cpustat[CPUTIME_USER] += vtime->utime + delta; 1243 break; 1244 case VTIME_GUEST: 1245 if (task_nice(tsk) > 0) { 1246 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; 1247 cpustat[CPUTIME_NICE] += vtime->gtime + delta; 1248 } else { 1249 cpustat[CPUTIME_GUEST] += vtime->gtime + delta; 1250 cpustat[CPUTIME_USER] += vtime->gtime + delta; 1251 } 1252 break; 1253 case VTIME_IDLE: 1254 if (atomic_read(&cpu_rq(cpu)->nr_iowait) > 0) 1255 cpustat[CPUTIME_IOWAIT] += delta; 1256 else 1257 cpustat[CPUTIME_IDLE] += delta; 1258 break; 1259 default: 1260 WARN_ON_ONCE(1); 1261 } 1262 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1263 1264 return 0; 1265 } 1266 1267 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) 1268 { 1269 const struct kernel_cpustat *src = &kcpustat_cpu(cpu); 1270 struct rq *rq; 1271 int err; 1272 1273 if (!vtime_generic_enabled_cpu(cpu)) { 1274 kcpustat_cpu_fetch_default(dst, cpu); 1275 return; 1276 } 1277 1278 rq = cpu_rq(cpu); 1279 1280 for (;;) { 1281 struct task_struct *curr; 1282 1283 rcu_read_lock(); 1284 curr = rcu_dereference(rq->curr); 1285 if (WARN_ON_ONCE(!curr)) { 1286 rcu_read_unlock(); 1287 kcpustat_cpu_fetch_default(dst, cpu); 1288 return; 1289 } 1290 1291 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); 1292 rcu_read_unlock(); 1293 1294 if (!err) 1295 return; 1296 1297 cpu_relax(); 1298 } 1299 } 1300 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); 1301 1302 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 1303