1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Simple CPU accounting cgroup controller 4 */ 5 #include <linux/sched/cputime.h> 6 #include <linux/tsacct_kern.h> 7 #include "sched.h" 8 9 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 10 #include <asm/cputime.h> 11 #endif 12 13 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 14 15 /* 16 * There are no locks covering percpu hardirq/softirq time. 17 * They are only modified in vtime_account, on corresponding CPU 18 * with interrupts disabled. So, writes are safe. 19 * They are read and saved off onto struct rq in update_rq_clock(). 20 * This may result in other CPU reading this CPU's IRQ time and can 21 * race with irq/vtime_account on this CPU. We would either get old 22 * or new value with a side effect of accounting a slice of IRQ time to wrong 23 * task when IRQ is in progress while we read rq->clock. That is a worthy 24 * compromise in place of having locks on each IRQ in account_system_time. 25 */ 26 DEFINE_PER_CPU(struct irqtime, cpu_irqtime); 27 28 int sched_clock_irqtime; 29 30 void enable_sched_clock_irqtime(void) 31 { 32 sched_clock_irqtime = 1; 33 } 34 35 void disable_sched_clock_irqtime(void) 36 { 37 sched_clock_irqtime = 0; 38 } 39 40 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, 41 enum cpu_usage_stat idx) 42 { 43 u64 *cpustat = kcpustat_this_cpu->cpustat; 44 45 u64_stats_update_begin(&irqtime->sync); 46 cpustat[idx] += delta; 47 irqtime->total += delta; 48 irqtime->tick_delta += delta; 49 u64_stats_update_end(&irqtime->sync); 50 } 51 52 /* 53 * Called after incrementing preempt_count on {soft,}irq_enter 54 * and before decrementing preempt_count on {soft,}irq_exit. 55 */ 56 void irqtime_account_irq(struct task_struct *curr, unsigned int offset) 57 { 58 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 59 unsigned int pc; 60 s64 delta; 61 int cpu; 62 63 if (!irqtime_enabled()) 64 return; 65 66 cpu = smp_processor_id(); 67 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; 68 irqtime->irq_start_time += delta; 69 pc = irq_count() - offset; 70 71 /* 72 * We do not account for softirq time from ksoftirqd here. 73 * We want to continue accounting softirq time to ksoftirqd thread 74 * in that case, so as not to confuse scheduler with a special task 75 * that do not consume any time, but still wants to run. 76 */ 77 if (pc & HARDIRQ_MASK) 78 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); 79 else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) 80 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); 81 } 82 83 static u64 irqtime_tick_accounted(u64 maxtime) 84 { 85 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); 86 u64 delta; 87 88 delta = min(irqtime->tick_delta, maxtime); 89 irqtime->tick_delta -= delta; 90 91 return delta; 92 } 93 94 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 95 96 static u64 irqtime_tick_accounted(u64 dummy) 97 { 98 return 0; 99 } 100 101 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 102 103 static inline void task_group_account_field(struct task_struct *p, int index, 104 u64 tmp) 105 { 106 /* 107 * Since all updates are sure to touch the root cgroup, we 108 * get ourselves ahead and touch it first. If the root cgroup 109 * is the only cgroup, then nothing else should be necessary. 110 * 111 */ 112 __this_cpu_add(kernel_cpustat.cpustat[index], tmp); 113 114 cgroup_account_cputime_field(p, index, tmp); 115 } 116 117 /* 118 * Account user CPU time to a process. 119 * @p: the process that the CPU time gets accounted to 120 * @cputime: the CPU time spent in user space since the last update 121 */ 122 void account_user_time(struct task_struct *p, u64 cputime) 123 { 124 int index; 125 126 /* Add user time to process. */ 127 p->utime += cputime; 128 account_group_user_time(p, cputime); 129 130 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 131 132 /* Add user time to cpustat. */ 133 task_group_account_field(p, index, cputime); 134 135 /* Account for user time used */ 136 acct_account_cputime(p); 137 } 138 139 /* 140 * Account guest CPU time to a process. 141 * @p: the process that the CPU time gets accounted to 142 * @cputime: the CPU time spent in virtual machine since the last update 143 */ 144 void account_guest_time(struct task_struct *p, u64 cputime) 145 { 146 u64 *cpustat = kcpustat_this_cpu->cpustat; 147 148 /* Add guest time to process. */ 149 p->utime += cputime; 150 account_group_user_time(p, cputime); 151 p->gtime += cputime; 152 153 /* Add guest time to cpustat. */ 154 if (task_nice(p) > 0) { 155 task_group_account_field(p, CPUTIME_NICE, cputime); 156 cpustat[CPUTIME_GUEST_NICE] += cputime; 157 } else { 158 task_group_account_field(p, CPUTIME_USER, cputime); 159 cpustat[CPUTIME_GUEST] += cputime; 160 } 161 } 162 163 /* 164 * Account system CPU time to a process and desired cpustat field 165 * @p: the process that the CPU time gets accounted to 166 * @cputime: the CPU time spent in kernel space since the last update 167 * @index: pointer to cpustat field that has to be updated 168 */ 169 void account_system_index_time(struct task_struct *p, 170 u64 cputime, enum cpu_usage_stat index) 171 { 172 /* Add system time to process. */ 173 p->stime += cputime; 174 account_group_system_time(p, cputime); 175 176 /* Add system time to cpustat. */ 177 task_group_account_field(p, index, cputime); 178 179 /* Account for system time used */ 180 acct_account_cputime(p); 181 } 182 183 /* 184 * Account system CPU time to a process. 185 * @p: the process that the CPU time gets accounted to 186 * @hardirq_offset: the offset to subtract from hardirq_count() 187 * @cputime: the CPU time spent in kernel space since the last update 188 */ 189 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 190 { 191 int index; 192 193 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 194 account_guest_time(p, cputime); 195 return; 196 } 197 198 if (hardirq_count() - hardirq_offset) 199 index = CPUTIME_IRQ; 200 else if (in_serving_softirq()) 201 index = CPUTIME_SOFTIRQ; 202 else 203 index = CPUTIME_SYSTEM; 204 205 account_system_index_time(p, cputime, index); 206 } 207 208 /* 209 * Account for involuntary wait time. 210 * @cputime: the CPU time spent in involuntary wait 211 */ 212 void account_steal_time(u64 cputime) 213 { 214 u64 *cpustat = kcpustat_this_cpu->cpustat; 215 216 cpustat[CPUTIME_STEAL] += cputime; 217 } 218 219 /* 220 * Account for idle time. 221 * @cputime: the CPU time spent in idle wait 222 */ 223 void account_idle_time(u64 cputime) 224 { 225 u64 *cpustat = kcpustat_this_cpu->cpustat; 226 struct rq *rq = this_rq(); 227 228 if (atomic_read(&rq->nr_iowait) > 0) 229 cpustat[CPUTIME_IOWAIT] += cputime; 230 else 231 cpustat[CPUTIME_IDLE] += cputime; 232 } 233 234 235 #ifdef CONFIG_SCHED_CORE 236 /* 237 * Account for forceidle time due to core scheduling. 238 * 239 * REQUIRES: schedstat is enabled. 240 */ 241 void __account_forceidle_time(struct task_struct *p, u64 delta) 242 { 243 __schedstat_add(p->stats.core_forceidle_sum, delta); 244 245 task_group_account_field(p, CPUTIME_FORCEIDLE, delta); 246 } 247 #endif /* CONFIG_SCHED_CORE */ 248 249 /* 250 * When a guest is interrupted for a longer amount of time, missed clock 251 * ticks are not redelivered later. Due to that, this function may on 252 * occasion account more time than the calling functions think elapsed. 253 */ 254 static __always_inline u64 steal_account_process_time(u64 maxtime) 255 { 256 #ifdef CONFIG_PARAVIRT 257 if (static_key_false(¶virt_steal_enabled)) { 258 u64 steal; 259 260 steal = paravirt_steal_clock(smp_processor_id()); 261 steal -= this_rq()->prev_steal_time; 262 steal = min(steal, maxtime); 263 account_steal_time(steal); 264 this_rq()->prev_steal_time += steal; 265 266 return steal; 267 } 268 #endif /* CONFIG_PARAVIRT */ 269 return 0; 270 } 271 272 /* 273 * Account how much elapsed time was spent in steal, IRQ, or softirq time. 274 */ 275 static inline u64 account_other_time(u64 max) 276 { 277 u64 accounted; 278 279 lockdep_assert_irqs_disabled(); 280 281 accounted = steal_account_process_time(max); 282 283 if (accounted < max) 284 accounted += irqtime_tick_accounted(max - accounted); 285 286 return accounted; 287 } 288 289 #ifdef CONFIG_64BIT 290 static inline u64 read_sum_exec_runtime(struct task_struct *t) 291 { 292 return t->se.sum_exec_runtime; 293 } 294 #else /* !CONFIG_64BIT: */ 295 static u64 read_sum_exec_runtime(struct task_struct *t) 296 { 297 u64 ns; 298 struct rq_flags rf; 299 struct rq *rq; 300 301 rq = task_rq_lock(t, &rf); 302 ns = t->se.sum_exec_runtime; 303 task_rq_unlock(rq, t, &rf); 304 305 return ns; 306 } 307 #endif /* !CONFIG_64BIT */ 308 309 /* 310 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 311 * tasks (sum on group iteration) belonging to @tsk's group. 312 */ 313 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) 314 { 315 struct signal_struct *sig = tsk->signal; 316 u64 utime, stime; 317 struct task_struct *t; 318 unsigned int seq, nextseq; 319 unsigned long flags; 320 321 /* 322 * Update current task runtime to account pending time since last 323 * scheduler action or thread_group_cputime() call. This thread group 324 * might have other running tasks on different CPUs, but updating 325 * their runtime can affect syscall performance, so we skip account 326 * those pending times and rely only on values updated on tick or 327 * other scheduler action. 328 */ 329 if (same_thread_group(current, tsk)) 330 (void) task_sched_runtime(current); 331 332 rcu_read_lock(); 333 /* Attempt a lockless read on the first round. */ 334 nextseq = 0; 335 do { 336 seq = nextseq; 337 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); 338 times->utime = sig->utime; 339 times->stime = sig->stime; 340 times->sum_exec_runtime = sig->sum_sched_runtime; 341 342 for_each_thread(tsk, t) { 343 task_cputime(t, &utime, &stime); 344 times->utime += utime; 345 times->stime += stime; 346 times->sum_exec_runtime += read_sum_exec_runtime(t); 347 } 348 /* If lockless access failed, take the lock. */ 349 nextseq = 1; 350 } while (need_seqretry(&sig->stats_lock, seq)); 351 done_seqretry_irqrestore(&sig->stats_lock, seq, flags); 352 rcu_read_unlock(); 353 } 354 355 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 356 /* 357 * Account a tick to a process and cpustat 358 * @p: the process that the CPU time gets accounted to 359 * @user_tick: is the tick from userspace 360 * @rq: the pointer to rq 361 * 362 * Tick demultiplexing follows the order 363 * - pending hardirq update 364 * - pending softirq update 365 * - user_time 366 * - idle_time 367 * - system time 368 * - check for guest_time 369 * - else account as system_time 370 * 371 * Check for hardirq is done both for system and user time as there is 372 * no timer going off while we are on hardirq and hence we may never get an 373 * opportunity to update it solely in system time. 374 * p->stime and friends are only updated on system time and not on IRQ 375 * softirq as those do not count in task exec_runtime any more. 376 */ 377 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 378 int ticks) 379 { 380 u64 other, cputime = TICK_NSEC * ticks; 381 382 /* 383 * When returning from idle, many ticks can get accounted at 384 * once, including some ticks of steal, IRQ, and softirq time. 385 * Subtract those ticks from the amount of time accounted to 386 * idle, or potentially user or system time. Due to rounding, 387 * other time can exceed ticks occasionally. 388 */ 389 other = account_other_time(ULONG_MAX); 390 if (other >= cputime) 391 return; 392 393 cputime -= other; 394 395 if (this_cpu_ksoftirqd() == p) { 396 /* 397 * ksoftirqd time do not get accounted in cpu_softirq_time. 398 * So, we have to handle it separately here. 399 * Also, p->stime needs to be updated for ksoftirqd. 400 */ 401 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); 402 } else if (user_tick) { 403 account_user_time(p, cputime); 404 } else if (p == this_rq()->idle) { 405 account_idle_time(cputime); 406 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 407 account_guest_time(p, cputime); 408 } else { 409 account_system_index_time(p, cputime, CPUTIME_SYSTEM); 410 } 411 } 412 413 static void irqtime_account_idle_ticks(int ticks) 414 { 415 irqtime_account_process_tick(current, 0, ticks); 416 } 417 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 418 static inline void irqtime_account_idle_ticks(int ticks) { } 419 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 420 int nr_ticks) { } 421 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 422 423 /* 424 * Use precise platform statistics if available: 425 */ 426 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 427 428 void vtime_account_irq(struct task_struct *tsk, unsigned int offset) 429 { 430 unsigned int pc = irq_count() - offset; 431 432 if (pc & HARDIRQ_OFFSET) { 433 vtime_account_hardirq(tsk); 434 } else if (pc & SOFTIRQ_OFFSET) { 435 vtime_account_softirq(tsk); 436 } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && 437 is_idle_task(tsk)) { 438 vtime_account_idle(tsk); 439 } else { 440 vtime_account_kernel(tsk); 441 } 442 } 443 444 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 445 u64 *ut, u64 *st) 446 { 447 *ut = curr->utime; 448 *st = curr->stime; 449 } 450 451 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 452 { 453 *ut = p->utime; 454 *st = p->stime; 455 } 456 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 457 458 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 459 { 460 struct task_cputime cputime; 461 462 thread_group_cputime(p, &cputime); 463 464 *ut = cputime.utime; 465 *st = cputime.stime; 466 } 467 468 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ 469 470 /* 471 * Account a single tick of CPU time. 472 * @p: the process that the CPU time gets accounted to 473 * @user_tick: indicates if the tick is a user or a system tick 474 */ 475 void account_process_tick(struct task_struct *p, int user_tick) 476 { 477 u64 cputime, steal; 478 479 if (vtime_accounting_enabled_this_cpu()) 480 return; 481 482 if (irqtime_enabled()) { 483 irqtime_account_process_tick(p, user_tick, 1); 484 return; 485 } 486 487 cputime = TICK_NSEC; 488 steal = steal_account_process_time(ULONG_MAX); 489 490 if (steal >= cputime) 491 return; 492 493 cputime -= steal; 494 495 if (user_tick) 496 account_user_time(p, cputime); 497 else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) 498 account_system_time(p, HARDIRQ_OFFSET, cputime); 499 else 500 account_idle_time(cputime); 501 } 502 503 /* 504 * Account multiple ticks of idle time. 505 * @ticks: number of stolen ticks 506 */ 507 void account_idle_ticks(unsigned long ticks) 508 { 509 u64 cputime, steal; 510 511 if (irqtime_enabled()) { 512 irqtime_account_idle_ticks(ticks); 513 return; 514 } 515 516 cputime = ticks * TICK_NSEC; 517 steal = steal_account_process_time(ULONG_MAX); 518 519 if (steal >= cputime) 520 return; 521 522 cputime -= steal; 523 account_idle_time(cputime); 524 } 525 526 /* 527 * Adjust tick based cputime random precision against scheduler runtime 528 * accounting. 529 * 530 * Tick based cputime accounting depend on random scheduling timeslices of a 531 * task to be interrupted or not by the timer. Depending on these 532 * circumstances, the number of these interrupts may be over or 533 * under-optimistic, matching the real user and system cputime with a variable 534 * precision. 535 * 536 * Fix this by scaling these tick based values against the total runtime 537 * accounted by the CFS scheduler. 538 * 539 * This code provides the following guarantees: 540 * 541 * stime + utime == rtime 542 * stime_i+1 >= stime_i, utime_i+1 >= utime_i 543 * 544 * Assuming that rtime_i+1 >= rtime_i. 545 */ 546 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, 547 u64 *ut, u64 *st) 548 { 549 u64 rtime, stime, utime; 550 unsigned long flags; 551 552 /* Serialize concurrent callers such that we can honour our guarantees */ 553 raw_spin_lock_irqsave(&prev->lock, flags); 554 rtime = curr->sum_exec_runtime; 555 556 /* 557 * This is possible under two circumstances: 558 * - rtime isn't monotonic after all (a bug); 559 * - we got reordered by the lock. 560 * 561 * In both cases this acts as a filter such that the rest of the code 562 * can assume it is monotonic regardless of anything else. 563 */ 564 if (prev->stime + prev->utime >= rtime) 565 goto out; 566 567 stime = curr->stime; 568 utime = curr->utime; 569 570 /* 571 * If either stime or utime are 0, assume all runtime is userspace. 572 * Once a task gets some ticks, the monotonicity code at 'update:' 573 * will ensure things converge to the observed ratio. 574 */ 575 if (stime == 0) { 576 utime = rtime; 577 goto update; 578 } 579 580 if (utime == 0) { 581 stime = rtime; 582 goto update; 583 } 584 585 stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); 586 /* 587 * Because mul_u64_u64_div_u64() can approximate on some 588 * achitectures; enforce the constraint that: a*b/(b+c) <= a. 589 */ 590 if (unlikely(stime > rtime)) 591 stime = rtime; 592 593 update: 594 /* 595 * Make sure stime doesn't go backwards; this preserves monotonicity 596 * for utime because rtime is monotonic. 597 * 598 * utime_i+1 = rtime_i+1 - stime_i 599 * = rtime_i+1 - (rtime_i - utime_i) 600 * = (rtime_i+1 - rtime_i) + utime_i 601 * >= utime_i 602 */ 603 if (stime < prev->stime) 604 stime = prev->stime; 605 utime = rtime - stime; 606 607 /* 608 * Make sure utime doesn't go backwards; this still preserves 609 * monotonicity for stime, analogous argument to above. 610 */ 611 if (utime < prev->utime) { 612 utime = prev->utime; 613 stime = rtime - utime; 614 } 615 616 prev->stime = stime; 617 prev->utime = utime; 618 out: 619 *ut = prev->utime; 620 *st = prev->stime; 621 raw_spin_unlock_irqrestore(&prev->lock, flags); 622 } 623 624 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 625 { 626 struct task_cputime cputime = { 627 .sum_exec_runtime = p->se.sum_exec_runtime, 628 }; 629 630 if (task_cputime(p, &cputime.utime, &cputime.stime)) 631 cputime.sum_exec_runtime = task_sched_runtime(p); 632 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 633 } 634 EXPORT_SYMBOL_GPL(task_cputime_adjusted); 635 636 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) 637 { 638 struct task_cputime cputime; 639 640 thread_group_cputime(p, &cputime); 641 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 642 } 643 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 644 645 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 646 static u64 vtime_delta(struct vtime *vtime) 647 { 648 unsigned long long clock; 649 650 clock = sched_clock(); 651 if (clock < vtime->starttime) 652 return 0; 653 654 return clock - vtime->starttime; 655 } 656 657 static u64 get_vtime_delta(struct vtime *vtime) 658 { 659 u64 delta = vtime_delta(vtime); 660 u64 other; 661 662 /* 663 * Unlike tick based timing, vtime based timing never has lost 664 * ticks, and no need for steal time accounting to make up for 665 * lost ticks. Vtime accounts a rounded version of actual 666 * elapsed time. Limit account_other_time to prevent rounding 667 * errors from causing elapsed vtime to go negative. 668 */ 669 other = account_other_time(delta); 670 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); 671 vtime->starttime += delta; 672 673 return delta - other; 674 } 675 676 static void vtime_account_system(struct task_struct *tsk, 677 struct vtime *vtime) 678 { 679 vtime->stime += get_vtime_delta(vtime); 680 if (vtime->stime >= TICK_NSEC) { 681 account_system_time(tsk, irq_count(), vtime->stime); 682 vtime->stime = 0; 683 } 684 } 685 686 static void vtime_account_guest(struct task_struct *tsk, 687 struct vtime *vtime) 688 { 689 vtime->gtime += get_vtime_delta(vtime); 690 if (vtime->gtime >= TICK_NSEC) { 691 account_guest_time(tsk, vtime->gtime); 692 vtime->gtime = 0; 693 } 694 } 695 696 static void __vtime_account_kernel(struct task_struct *tsk, 697 struct vtime *vtime) 698 { 699 /* We might have scheduled out from guest path */ 700 if (vtime->state == VTIME_GUEST) 701 vtime_account_guest(tsk, vtime); 702 else 703 vtime_account_system(tsk, vtime); 704 } 705 706 void vtime_account_kernel(struct task_struct *tsk) 707 { 708 struct vtime *vtime = &tsk->vtime; 709 710 if (!vtime_delta(vtime)) 711 return; 712 713 write_seqcount_begin(&vtime->seqcount); 714 __vtime_account_kernel(tsk, vtime); 715 write_seqcount_end(&vtime->seqcount); 716 } 717 718 void vtime_user_enter(struct task_struct *tsk) 719 { 720 struct vtime *vtime = &tsk->vtime; 721 722 write_seqcount_begin(&vtime->seqcount); 723 vtime_account_system(tsk, vtime); 724 vtime->state = VTIME_USER; 725 write_seqcount_end(&vtime->seqcount); 726 } 727 728 void vtime_user_exit(struct task_struct *tsk) 729 { 730 struct vtime *vtime = &tsk->vtime; 731 732 write_seqcount_begin(&vtime->seqcount); 733 vtime->utime += get_vtime_delta(vtime); 734 if (vtime->utime >= TICK_NSEC) { 735 account_user_time(tsk, vtime->utime); 736 vtime->utime = 0; 737 } 738 vtime->state = VTIME_SYS; 739 write_seqcount_end(&vtime->seqcount); 740 } 741 742 void vtime_guest_enter(struct task_struct *tsk) 743 { 744 struct vtime *vtime = &tsk->vtime; 745 /* 746 * The flags must be updated under the lock with 747 * the vtime_starttime flush and update. 748 * That enforces a right ordering and update sequence 749 * synchronization against the reader (task_gtime()) 750 * that can thus safely catch up with a tickless delta. 751 */ 752 write_seqcount_begin(&vtime->seqcount); 753 vtime_account_system(tsk, vtime); 754 tsk->flags |= PF_VCPU; 755 vtime->state = VTIME_GUEST; 756 write_seqcount_end(&vtime->seqcount); 757 } 758 EXPORT_SYMBOL_GPL(vtime_guest_enter); 759 760 void vtime_guest_exit(struct task_struct *tsk) 761 { 762 struct vtime *vtime = &tsk->vtime; 763 764 write_seqcount_begin(&vtime->seqcount); 765 vtime_account_guest(tsk, vtime); 766 tsk->flags &= ~PF_VCPU; 767 vtime->state = VTIME_SYS; 768 write_seqcount_end(&vtime->seqcount); 769 } 770 EXPORT_SYMBOL_GPL(vtime_guest_exit); 771 772 void vtime_account_idle(struct task_struct *tsk) 773 { 774 account_idle_time(get_vtime_delta(&tsk->vtime)); 775 } 776 777 void vtime_task_switch_generic(struct task_struct *prev) 778 { 779 struct vtime *vtime = &prev->vtime; 780 781 write_seqcount_begin(&vtime->seqcount); 782 if (vtime->state == VTIME_IDLE) 783 vtime_account_idle(prev); 784 else 785 __vtime_account_kernel(prev, vtime); 786 vtime->state = VTIME_INACTIVE; 787 vtime->cpu = -1; 788 write_seqcount_end(&vtime->seqcount); 789 790 vtime = ¤t->vtime; 791 792 write_seqcount_begin(&vtime->seqcount); 793 if (is_idle_task(current)) 794 vtime->state = VTIME_IDLE; 795 else if (current->flags & PF_VCPU) 796 vtime->state = VTIME_GUEST; 797 else 798 vtime->state = VTIME_SYS; 799 vtime->starttime = sched_clock(); 800 vtime->cpu = smp_processor_id(); 801 write_seqcount_end(&vtime->seqcount); 802 } 803 804 void vtime_init_idle(struct task_struct *t, int cpu) 805 { 806 struct vtime *vtime = &t->vtime; 807 unsigned long flags; 808 809 local_irq_save(flags); 810 write_seqcount_begin(&vtime->seqcount); 811 vtime->state = VTIME_IDLE; 812 vtime->starttime = sched_clock(); 813 vtime->cpu = cpu; 814 write_seqcount_end(&vtime->seqcount); 815 local_irq_restore(flags); 816 } 817 818 u64 task_gtime(struct task_struct *t) 819 { 820 struct vtime *vtime = &t->vtime; 821 unsigned int seq; 822 u64 gtime; 823 824 if (!vtime_accounting_enabled()) 825 return t->gtime; 826 827 do { 828 seq = read_seqcount_begin(&vtime->seqcount); 829 830 gtime = t->gtime; 831 if (vtime->state == VTIME_GUEST) 832 gtime += vtime->gtime + vtime_delta(vtime); 833 834 } while (read_seqcount_retry(&vtime->seqcount, seq)); 835 836 return gtime; 837 } 838 839 /* 840 * Fetch cputime raw values from fields of task_struct and 841 * add up the pending nohz execution time since the last 842 * cputime snapshot. 843 */ 844 bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 845 { 846 struct vtime *vtime = &t->vtime; 847 unsigned int seq; 848 u64 delta; 849 int ret; 850 851 if (!vtime_accounting_enabled()) { 852 *utime = t->utime; 853 *stime = t->stime; 854 return false; 855 } 856 857 do { 858 ret = false; 859 seq = read_seqcount_begin(&vtime->seqcount); 860 861 *utime = t->utime; 862 *stime = t->stime; 863 864 /* Task is sleeping or idle, nothing to add */ 865 if (vtime->state < VTIME_SYS) 866 continue; 867 868 ret = true; 869 delta = vtime_delta(vtime); 870 871 /* 872 * Task runs either in user (including guest) or kernel space, 873 * add pending nohz time to the right place. 874 */ 875 if (vtime->state == VTIME_SYS) 876 *stime += vtime->stime + delta; 877 else 878 *utime += vtime->utime + delta; 879 } while (read_seqcount_retry(&vtime->seqcount, seq)); 880 881 return ret; 882 } 883 884 static int vtime_state_fetch(struct vtime *vtime, int cpu) 885 { 886 int state = READ_ONCE(vtime->state); 887 888 /* 889 * We raced against a context switch, fetch the 890 * kcpustat task again. 891 */ 892 if (vtime->cpu != cpu && vtime->cpu != -1) 893 return -EAGAIN; 894 895 /* 896 * Two possible things here: 897 * 1) We are seeing the scheduling out task (prev) or any past one. 898 * 2) We are seeing the scheduling in task (next) but it hasn't 899 * passed though vtime_task_switch() yet so the pending 900 * cputime of the prev task may not be flushed yet. 901 * 902 * Case 1) is ok but 2) is not. So wait for a safe VTIME state. 903 */ 904 if (state == VTIME_INACTIVE) 905 return -EAGAIN; 906 907 return state; 908 } 909 910 static u64 kcpustat_user_vtime(struct vtime *vtime) 911 { 912 if (vtime->state == VTIME_USER) 913 return vtime->utime + vtime_delta(vtime); 914 else if (vtime->state == VTIME_GUEST) 915 return vtime->gtime + vtime_delta(vtime); 916 return 0; 917 } 918 919 static int kcpustat_field_vtime(u64 *cpustat, 920 struct task_struct *tsk, 921 enum cpu_usage_stat usage, 922 int cpu, u64 *val) 923 { 924 struct vtime *vtime = &tsk->vtime; 925 unsigned int seq; 926 927 do { 928 int state; 929 930 seq = read_seqcount_begin(&vtime->seqcount); 931 932 state = vtime_state_fetch(vtime, cpu); 933 if (state < 0) 934 return state; 935 936 *val = cpustat[usage]; 937 938 /* 939 * Nice VS unnice cputime accounting may be inaccurate if 940 * the nice value has changed since the last vtime update. 941 * But proper fix would involve interrupting target on nice 942 * updates which is a no go on nohz_full (although the scheduler 943 * may still interrupt the target if rescheduling is needed...) 944 */ 945 switch (usage) { 946 case CPUTIME_SYSTEM: 947 if (state == VTIME_SYS) 948 *val += vtime->stime + vtime_delta(vtime); 949 break; 950 case CPUTIME_USER: 951 if (task_nice(tsk) <= 0) 952 *val += kcpustat_user_vtime(vtime); 953 break; 954 case CPUTIME_NICE: 955 if (task_nice(tsk) > 0) 956 *val += kcpustat_user_vtime(vtime); 957 break; 958 case CPUTIME_GUEST: 959 if (state == VTIME_GUEST && task_nice(tsk) <= 0) 960 *val += vtime->gtime + vtime_delta(vtime); 961 break; 962 case CPUTIME_GUEST_NICE: 963 if (state == VTIME_GUEST && task_nice(tsk) > 0) 964 *val += vtime->gtime + vtime_delta(vtime); 965 break; 966 default: 967 break; 968 } 969 } while (read_seqcount_retry(&vtime->seqcount, seq)); 970 971 return 0; 972 } 973 974 u64 kcpustat_field(struct kernel_cpustat *kcpustat, 975 enum cpu_usage_stat usage, int cpu) 976 { 977 u64 *cpustat = kcpustat->cpustat; 978 u64 val = cpustat[usage]; 979 struct rq *rq; 980 int err; 981 982 if (!vtime_accounting_enabled_cpu(cpu)) 983 return val; 984 985 rq = cpu_rq(cpu); 986 987 for (;;) { 988 struct task_struct *curr; 989 990 rcu_read_lock(); 991 curr = rcu_dereference(rq->curr); 992 if (WARN_ON_ONCE(!curr)) { 993 rcu_read_unlock(); 994 return cpustat[usage]; 995 } 996 997 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); 998 rcu_read_unlock(); 999 1000 if (!err) 1001 return val; 1002 1003 cpu_relax(); 1004 } 1005 } 1006 EXPORT_SYMBOL_GPL(kcpustat_field); 1007 1008 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, 1009 const struct kernel_cpustat *src, 1010 struct task_struct *tsk, int cpu) 1011 { 1012 struct vtime *vtime = &tsk->vtime; 1013 unsigned int seq; 1014 1015 do { 1016 u64 *cpustat; 1017 u64 delta; 1018 int state; 1019 1020 seq = read_seqcount_begin(&vtime->seqcount); 1021 1022 state = vtime_state_fetch(vtime, cpu); 1023 if (state < 0) 1024 return state; 1025 1026 *dst = *src; 1027 cpustat = dst->cpustat; 1028 1029 /* Task is sleeping, dead or idle, nothing to add */ 1030 if (state < VTIME_SYS) 1031 continue; 1032 1033 delta = vtime_delta(vtime); 1034 1035 /* 1036 * Task runs either in user (including guest) or kernel space, 1037 * add pending nohz time to the right place. 1038 */ 1039 if (state == VTIME_SYS) { 1040 cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; 1041 } else if (state == VTIME_USER) { 1042 if (task_nice(tsk) > 0) 1043 cpustat[CPUTIME_NICE] += vtime->utime + delta; 1044 else 1045 cpustat[CPUTIME_USER] += vtime->utime + delta; 1046 } else { 1047 WARN_ON_ONCE(state != VTIME_GUEST); 1048 if (task_nice(tsk) > 0) { 1049 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; 1050 cpustat[CPUTIME_NICE] += vtime->gtime + delta; 1051 } else { 1052 cpustat[CPUTIME_GUEST] += vtime->gtime + delta; 1053 cpustat[CPUTIME_USER] += vtime->gtime + delta; 1054 } 1055 } 1056 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1057 1058 return 0; 1059 } 1060 1061 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) 1062 { 1063 const struct kernel_cpustat *src = &kcpustat_cpu(cpu); 1064 struct rq *rq; 1065 int err; 1066 1067 if (!vtime_accounting_enabled_cpu(cpu)) { 1068 *dst = *src; 1069 return; 1070 } 1071 1072 rq = cpu_rq(cpu); 1073 1074 for (;;) { 1075 struct task_struct *curr; 1076 1077 rcu_read_lock(); 1078 curr = rcu_dereference(rq->curr); 1079 if (WARN_ON_ONCE(!curr)) { 1080 rcu_read_unlock(); 1081 *dst = *src; 1082 return; 1083 } 1084 1085 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); 1086 rcu_read_unlock(); 1087 1088 if (!err) 1089 return; 1090 1091 cpu_relax(); 1092 } 1093 } 1094 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); 1095 1096 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 1097