1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2010 The FreeBSD Foundation 5 * All rights reserved. 6 * 7 * This software was developed by Edward Tomasz Napierala under sponsorship 8 * from the FreeBSD Foundation. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * $FreeBSD$ 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include "opt_sched.h" 38 39 #include <sys/param.h> 40 #include <sys/buf.h> 41 #include <sys/systm.h> 42 #include <sys/eventhandler.h> 43 #include <sys/jail.h> 44 #include <sys/kernel.h> 45 #include <sys/kthread.h> 46 #include <sys/lock.h> 47 #include <sys/loginclass.h> 48 #include <sys/malloc.h> 49 #include <sys/mutex.h> 50 #include <sys/proc.h> 51 #include <sys/racct.h> 52 #include <sys/resourcevar.h> 53 #include <sys/sbuf.h> 54 #include <sys/sched.h> 55 #include <sys/sdt.h> 56 #include <sys/smp.h> 57 #include <sys/sx.h> 58 #include <sys/sysctl.h> 59 #include <sys/sysent.h> 60 #include <sys/sysproto.h> 61 #include <sys/umtx.h> 62 #include <machine/smp.h> 63 64 #ifdef RCTL 65 #include <sys/rctl.h> 66 #endif 67 68 #ifdef RACCT 69 70 FEATURE(racct, "Resource Accounting"); 71 72 /* 73 * Do not block processes that have their %cpu usage <= pcpu_threshold. 74 */ 75 static int pcpu_threshold = 1; 76 #ifdef RACCT_DEFAULT_TO_DISABLED 77 bool __read_frequently racct_enable = false; 78 #else 79 bool __read_frequently racct_enable = true; 80 #endif 81 82 SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 83 "Resource Accounting"); 84 SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 85 0, "Enable RACCT/RCTL"); 86 SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 87 0, "Processes with higher %cpu usage than this value can be throttled."); 88 89 /* 90 * How many seconds it takes to use the scheduler %cpu calculations. When a 91 * process starts, we compute its %cpu usage by dividing its runtime by the 92 * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 93 * provided by the scheduler. 94 */ 95 #define RACCT_PCPU_SECS 3 96 97 struct mtx racct_lock; 98 MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 99 100 static uma_zone_t racct_zone; 101 102 static void racct_sub_racct(struct racct *dest, const struct racct *src); 103 static void racct_sub_cred_locked(struct ucred *cred, int resource, 104 uint64_t amount); 105 static void racct_add_cred_locked(struct ucred *cred, int resource, 106 uint64_t amount); 107 108 SDT_PROVIDER_DEFINE(racct); 109 SDT_PROBE_DEFINE3(racct, , rusage, add, 110 "struct proc *", "int", "uint64_t"); 111 SDT_PROBE_DEFINE3(racct, , rusage, add__failure, 112 "struct proc *", "int", "uint64_t"); 113 SDT_PROBE_DEFINE3(racct, , rusage, add__buf, 114 "struct proc *", "const struct buf *", "int"); 115 SDT_PROBE_DEFINE3(racct, , rusage, add__cred, 116 "struct ucred *", "int", "uint64_t"); 117 SDT_PROBE_DEFINE3(racct, , rusage, add__force, 118 "struct proc *", "int", "uint64_t"); 119 SDT_PROBE_DEFINE3(racct, , rusage, set, 120 "struct proc *", "int", "uint64_t"); 121 SDT_PROBE_DEFINE3(racct, , rusage, set__failure, 122 "struct proc *", "int", "uint64_t"); 123 SDT_PROBE_DEFINE3(racct, , rusage, set__force, 124 "struct proc *", "int", "uint64_t"); 125 SDT_PROBE_DEFINE3(racct, , rusage, sub, 126 "struct proc *", "int", "uint64_t"); 127 SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, 128 "struct ucred *", "int", "uint64_t"); 129 SDT_PROBE_DEFINE1(racct, , racct, create, 130 "struct racct *"); 131 SDT_PROBE_DEFINE1(racct, , racct, destroy, 132 "struct racct *"); 133 SDT_PROBE_DEFINE2(racct, , racct, join, 134 "struct racct *", "struct racct *"); 135 SDT_PROBE_DEFINE2(racct, , racct, join__failure, 136 "struct racct *", "struct racct *"); 137 SDT_PROBE_DEFINE2(racct, , racct, leave, 138 "struct racct *", "struct racct *"); 139 140 int racct_types[] = { 141 [RACCT_CPU] = 142 RACCT_IN_MILLIONS, 143 [RACCT_DATA] = 144 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 145 [RACCT_STACK] = 146 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 147 [RACCT_CORE] = 148 RACCT_DENIABLE, 149 [RACCT_RSS] = 150 RACCT_RECLAIMABLE, 151 [RACCT_MEMLOCK] = 152 RACCT_RECLAIMABLE | RACCT_DENIABLE, 153 [RACCT_NPROC] = 154 RACCT_RECLAIMABLE | RACCT_DENIABLE, 155 [RACCT_NOFILE] = 156 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 157 [RACCT_VMEM] = 158 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 159 [RACCT_NPTS] = 160 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 161 [RACCT_SWAP] = 162 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 163 [RACCT_NTHR] = 164 RACCT_RECLAIMABLE | RACCT_DENIABLE, 165 [RACCT_MSGQQUEUED] = 166 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 167 [RACCT_MSGQSIZE] = 168 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 169 [RACCT_NMSGQ] = 170 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 171 [RACCT_NSEM] = 172 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 173 [RACCT_NSEMOP] = 174 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 175 [RACCT_NSHM] = 176 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 177 [RACCT_SHMSIZE] = 178 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 179 [RACCT_WALLCLOCK] = 180 RACCT_IN_MILLIONS, 181 [RACCT_PCTCPU] = 182 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, 183 [RACCT_READBPS] = 184 RACCT_DECAYING, 185 [RACCT_WRITEBPS] = 186 RACCT_DECAYING, 187 [RACCT_READIOPS] = 188 RACCT_DECAYING, 189 [RACCT_WRITEIOPS] = 190 RACCT_DECAYING }; 191 192 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 193 194 #ifdef SCHED_4BSD 195 /* 196 * Contains intermediate values for %cpu calculations to avoid using floating 197 * point in the kernel. 198 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 199 * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 200 * zero so the calculations are more straightforward. 201 */ 202 fixpt_t ccpu_exp[] = { 203 [0] = FSCALE * 1, 204 [1] = FSCALE * 0.95122942450071400909, 205 [2] = FSCALE * 0.90483741803595957316, 206 [3] = FSCALE * 0.86070797642505780722, 207 [4] = FSCALE * 0.81873075307798185866, 208 [5] = FSCALE * 0.77880078307140486824, 209 [6] = FSCALE * 0.74081822068171786606, 210 [7] = FSCALE * 0.70468808971871343435, 211 [8] = FSCALE * 0.67032004603563930074, 212 [9] = FSCALE * 0.63762815162177329314, 213 [10] = FSCALE * 0.60653065971263342360, 214 [11] = FSCALE * 0.57694981038048669531, 215 [12] = FSCALE * 0.54881163609402643262, 216 [13] = FSCALE * 0.52204577676101604789, 217 [14] = FSCALE * 0.49658530379140951470, 218 [15] = FSCALE * 0.47236655274101470713, 219 [16] = FSCALE * 0.44932896411722159143, 220 [17] = FSCALE * 0.42741493194872666992, 221 [18] = FSCALE * 0.40656965974059911188, 222 [19] = FSCALE * 0.38674102345450120691, 223 [20] = FSCALE * 0.36787944117144232159, 224 [21] = FSCALE * 0.34993774911115535467, 225 [22] = FSCALE * 0.33287108369807955328, 226 [23] = FSCALE * 0.31663676937905321821, 227 [24] = FSCALE * 0.30119421191220209664, 228 [25] = FSCALE * 0.28650479686019010032, 229 [26] = FSCALE * 0.27253179303401260312, 230 [27] = FSCALE * 0.25924026064589150757, 231 [28] = FSCALE * 0.24659696394160647693, 232 [29] = FSCALE * 0.23457028809379765313, 233 [30] = FSCALE * 0.22313016014842982893, 234 [31] = FSCALE * 0.21224797382674305771, 235 [32] = FSCALE * 0.20189651799465540848, 236 [33] = FSCALE * 0.19204990862075411423, 237 [34] = FSCALE * 0.18268352405273465022, 238 [35] = FSCALE * 0.17377394345044512668, 239 [36] = FSCALE * 0.16529888822158653829, 240 [37] = FSCALE * 0.15723716631362761621, 241 [38] = FSCALE * 0.14956861922263505264, 242 [39] = FSCALE * 0.14227407158651357185, 243 [40] = FSCALE * 0.13533528323661269189, 244 [41] = FSCALE * 0.12873490358780421886, 245 [42] = FSCALE * 0.12245642825298191021, 246 [43] = FSCALE * 0.11648415777349695786, 247 [44] = FSCALE * 0.11080315836233388333, 248 [45] = FSCALE * 0.10539922456186433678, 249 [46] = FSCALE * 0.10025884372280373372, 250 [47] = FSCALE * 0.09536916221554961888, 251 [48] = FSCALE * 0.09071795328941250337, 252 [49] = FSCALE * 0.08629358649937051097, 253 [50] = FSCALE * 0.08208499862389879516, 254 [51] = FSCALE * 0.07808166600115315231, 255 [52] = FSCALE * 0.07427357821433388042, 256 [53] = FSCALE * 0.07065121306042958674, 257 [54] = FSCALE * 0.06720551273974976512, 258 [55] = FSCALE * 0.06392786120670757270, 259 [56] = FSCALE * 0.06081006262521796499, 260 [57] = FSCALE * 0.05784432087483846296, 261 [58] = FSCALE * 0.05502322005640722902, 262 [59] = FSCALE * 0.05233970594843239308, 263 [60] = FSCALE * 0.04978706836786394297, 264 [61] = FSCALE * 0.04735892439114092119, 265 [62] = FSCALE * 0.04504920239355780606, 266 [63] = FSCALE * 0.04285212686704017991, 267 [64] = FSCALE * 0.04076220397836621516, 268 [65] = FSCALE * 0.03877420783172200988, 269 [66] = FSCALE * 0.03688316740124000544, 270 [67] = FSCALE * 0.03508435410084502588, 271 [68] = FSCALE * 0.03337326996032607948, 272 [69] = FSCALE * 0.03174563637806794323, 273 [70] = FSCALE * 0.03019738342231850073, 274 [71] = FSCALE * 0.02872463965423942912, 275 [72] = FSCALE * 0.02732372244729256080, 276 [73] = FSCALE * 0.02599112877875534358, 277 [74] = FSCALE * 0.02472352647033939120, 278 [75] = FSCALE * 0.02351774585600910823, 279 [76] = FSCALE * 0.02237077185616559577, 280 [77] = FSCALE * 0.02127973643837716938, 281 [78] = FSCALE * 0.02024191144580438847, 282 [79] = FSCALE * 0.01925470177538692429, 283 [80] = FSCALE * 0.01831563888873418029, 284 [81] = FSCALE * 0.01742237463949351138, 285 [82] = FSCALE * 0.01657267540176124754, 286 [83] = FSCALE * 0.01576441648485449082, 287 [84] = FSCALE * 0.01499557682047770621, 288 [85] = FSCALE * 0.01426423390899925527, 289 [86] = FSCALE * 0.01356855901220093175, 290 [87] = FSCALE * 0.01290681258047986886, 291 [88] = FSCALE * 0.01227733990306844117, 292 [89] = FSCALE * 0.01167856697039544521, 293 [90] = FSCALE * 0.01110899653824230649, 294 [91] = FSCALE * 0.01056720438385265337, 295 [92] = FSCALE * 0.01005183574463358164, 296 [93] = FSCALE * 0.00956160193054350793, 297 [94] = FSCALE * 0.00909527710169581709, 298 [95] = FSCALE * 0.00865169520312063417, 299 [96] = FSCALE * 0.00822974704902002884, 300 [97] = FSCALE * 0.00782837754922577143, 301 [98] = FSCALE * 0.00744658307092434051, 302 [99] = FSCALE * 0.00708340892905212004, 303 [100] = FSCALE * 0.00673794699908546709, 304 [101] = FSCALE * 0.00640933344625638184, 305 [102] = FSCALE * 0.00609674656551563610, 306 [103] = FSCALE * 0.00579940472684214321, 307 [104] = FSCALE * 0.00551656442076077241, 308 [105] = FSCALE * 0.00524751839918138427, 309 [106] = FSCALE * 0.00499159390691021621, 310 [107] = FSCALE * 0.00474815099941147558, 311 [108] = FSCALE * 0.00451658094261266798, 312 [109] = FSCALE * 0.00429630469075234057, 313 [110] = FSCALE * 0.00408677143846406699, 314 }; 315 #endif 316 317 #define CCPU_EXP_MAX 110 318 319 /* 320 * This function is analogical to the getpcpu() function in the ps(1) command. 321 * They should both calculate in the same way so that the racct %cpu 322 * calculations are consistent with the values showed by the ps(1) tool. 323 * The calculations are more complex in the 4BSD scheduler because of the value 324 * of the ccpu variable. In ULE it is defined to be zero which saves us some 325 * work. 326 */ 327 static uint64_t 328 racct_getpcpu(struct proc *p, u_int pcpu) 329 { 330 u_int swtime; 331 #ifdef SCHED_4BSD 332 fixpt_t pctcpu, pctcpu_next; 333 #endif 334 #ifdef SMP 335 struct pcpu *pc; 336 int found; 337 #endif 338 fixpt_t p_pctcpu; 339 struct thread *td; 340 341 ASSERT_RACCT_ENABLED(); 342 343 /* 344 * If the process is swapped out, we count its %cpu usage as zero. 345 * This behaviour is consistent with the userland ps(1) tool. 346 */ 347 if ((p->p_flag & P_INMEM) == 0) 348 return (0); 349 swtime = (ticks - p->p_swtick) / hz; 350 351 /* 352 * For short-lived processes, the sched_pctcpu() returns small 353 * values even for cpu intensive processes. Therefore we use 354 * our own estimate in this case. 355 */ 356 if (swtime < RACCT_PCPU_SECS) 357 return (pcpu); 358 359 p_pctcpu = 0; 360 FOREACH_THREAD_IN_PROC(p, td) { 361 if (td == PCPU_GET(idlethread)) 362 continue; 363 #ifdef SMP 364 found = 0; 365 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 366 if (td == pc->pc_idlethread) { 367 found = 1; 368 break; 369 } 370 } 371 if (found) 372 continue; 373 #endif 374 thread_lock(td); 375 #ifdef SCHED_4BSD 376 pctcpu = sched_pctcpu(td); 377 /* Count also the yet unfinished second. */ 378 pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 379 pctcpu_next += sched_pctcpu_delta(td); 380 p_pctcpu += max(pctcpu, pctcpu_next); 381 #else 382 /* 383 * In ULE the %cpu statistics are updated on every 384 * sched_pctcpu() call. So special calculations to 385 * account for the latest (unfinished) second are 386 * not needed. 387 */ 388 p_pctcpu += sched_pctcpu(td); 389 #endif 390 thread_unlock(td); 391 } 392 393 #ifdef SCHED_4BSD 394 if (swtime <= CCPU_EXP_MAX) 395 return ((100 * (uint64_t)p_pctcpu * 1000000) / 396 (FSCALE - ccpu_exp[swtime])); 397 #endif 398 399 return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 400 } 401 402 static void 403 racct_add_racct(struct racct *dest, const struct racct *src) 404 { 405 int i; 406 407 ASSERT_RACCT_ENABLED(); 408 RACCT_LOCK_ASSERT(); 409 410 /* 411 * Update resource usage in dest. 412 */ 413 for (i = 0; i <= RACCT_MAX; i++) { 414 KASSERT(dest->r_resources[i] >= 0, 415 ("%s: resource %d propagation meltdown: dest < 0", 416 __func__, i)); 417 KASSERT(src->r_resources[i] >= 0, 418 ("%s: resource %d propagation meltdown: src < 0", 419 __func__, i)); 420 dest->r_resources[i] += src->r_resources[i]; 421 } 422 } 423 424 static void 425 racct_sub_racct(struct racct *dest, const struct racct *src) 426 { 427 int i; 428 429 ASSERT_RACCT_ENABLED(); 430 RACCT_LOCK_ASSERT(); 431 432 /* 433 * Update resource usage in dest. 434 */ 435 for (i = 0; i <= RACCT_MAX; i++) { 436 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 437 KASSERT(dest->r_resources[i] >= 0, 438 ("%s: resource %d propagation meltdown: dest < 0", 439 __func__, i)); 440 KASSERT(src->r_resources[i] >= 0, 441 ("%s: resource %d propagation meltdown: src < 0", 442 __func__, i)); 443 KASSERT(src->r_resources[i] <= dest->r_resources[i], 444 ("%s: resource %d propagation meltdown: src > dest", 445 __func__, i)); 446 } 447 if (RACCT_CAN_DROP(i)) { 448 dest->r_resources[i] -= src->r_resources[i]; 449 if (dest->r_resources[i] < 0) 450 dest->r_resources[i] = 0; 451 } 452 } 453 } 454 455 void 456 racct_create(struct racct **racctp) 457 { 458 459 if (!racct_enable) 460 return; 461 462 SDT_PROBE1(racct, , racct, create, racctp); 463 464 KASSERT(*racctp == NULL, ("racct already allocated")); 465 466 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 467 } 468 469 static void 470 racct_destroy_locked(struct racct **racctp) 471 { 472 struct racct *racct; 473 int i; 474 475 ASSERT_RACCT_ENABLED(); 476 477 SDT_PROBE1(racct, , racct, destroy, racctp); 478 479 RACCT_LOCK_ASSERT(); 480 KASSERT(racctp != NULL, ("NULL racctp")); 481 KASSERT(*racctp != NULL, ("NULL racct")); 482 483 racct = *racctp; 484 485 for (i = 0; i <= RACCT_MAX; i++) { 486 if (RACCT_IS_SLOPPY(i)) 487 continue; 488 if (!RACCT_IS_RECLAIMABLE(i)) 489 continue; 490 KASSERT(racct->r_resources[i] == 0, 491 ("destroying non-empty racct: " 492 "%ju allocated for resource %d\n", 493 racct->r_resources[i], i)); 494 } 495 uma_zfree(racct_zone, racct); 496 *racctp = NULL; 497 } 498 499 void 500 racct_destroy(struct racct **racct) 501 { 502 503 if (!racct_enable) 504 return; 505 506 RACCT_LOCK(); 507 racct_destroy_locked(racct); 508 RACCT_UNLOCK(); 509 } 510 511 /* 512 * Increase consumption of 'resource' by 'amount' for 'racct', 513 * but not its parents. Differently from other cases, 'amount' here 514 * may be less than zero. 515 */ 516 static void 517 racct_adjust_resource(struct racct *racct, int resource, 518 int64_t amount) 519 { 520 521 ASSERT_RACCT_ENABLED(); 522 RACCT_LOCK_ASSERT(); 523 KASSERT(racct != NULL, ("NULL racct")); 524 525 racct->r_resources[resource] += amount; 526 if (racct->r_resources[resource] < 0) { 527 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 528 ("%s: resource %d usage < 0", __func__, resource)); 529 racct->r_resources[resource] = 0; 530 } 531 532 /* 533 * There are some cases where the racct %cpu resource would grow 534 * beyond 100% per core. For example in racct_proc_exit() we add 535 * the process %cpu usage to the ucred racct containers. If too 536 * many processes terminated in a short time span, the ucred %cpu 537 * resource could grow too much. Also, the 4BSD scheduler sometimes 538 * returns for a thread more than 100% cpu usage. So we set a sane 539 * boundary here to 100% * the maxumum number of CPUs. 540 */ 541 if ((resource == RACCT_PCTCPU) && 542 (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) 543 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; 544 } 545 546 static int 547 racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) 548 { 549 #ifdef RCTL 550 int error; 551 #endif 552 553 ASSERT_RACCT_ENABLED(); 554 555 /* 556 * We need proc lock to dereference p->p_ucred. 557 */ 558 PROC_LOCK_ASSERT(p, MA_OWNED); 559 560 #ifdef RCTL 561 error = rctl_enforce(p, resource, amount); 562 if (error && !force && RACCT_IS_DENIABLE(resource)) { 563 SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); 564 return (error); 565 } 566 #endif 567 racct_adjust_resource(p->p_racct, resource, amount); 568 racct_add_cred_locked(p->p_ucred, resource, amount); 569 570 return (0); 571 } 572 573 /* 574 * Increase allocation of 'resource' by 'amount' for process 'p'. 575 * Return 0 if it's below limits, or errno, if it's not. 576 */ 577 int 578 racct_add(struct proc *p, int resource, uint64_t amount) 579 { 580 int error; 581 582 if (!racct_enable) 583 return (0); 584 585 SDT_PROBE3(racct, , rusage, add, p, resource, amount); 586 587 RACCT_LOCK(); 588 error = racct_add_locked(p, resource, amount, 0); 589 RACCT_UNLOCK(); 590 return (error); 591 } 592 593 /* 594 * Increase allocation of 'resource' by 'amount' for process 'p'. 595 * Doesn't check for limits and never fails. 596 */ 597 void 598 racct_add_force(struct proc *p, int resource, uint64_t amount) 599 { 600 601 if (!racct_enable) 602 return; 603 604 SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); 605 606 RACCT_LOCK(); 607 racct_add_locked(p, resource, amount, 1); 608 RACCT_UNLOCK(); 609 } 610 611 static void 612 racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 613 { 614 struct prison *pr; 615 616 ASSERT_RACCT_ENABLED(); 617 618 racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 619 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 620 racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, 621 amount); 622 racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); 623 } 624 625 /* 626 * Increase allocation of 'resource' by 'amount' for credential 'cred'. 627 * Doesn't check for limits and never fails. 628 */ 629 void 630 racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 631 { 632 633 if (!racct_enable) 634 return; 635 636 SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); 637 638 RACCT_LOCK(); 639 racct_add_cred_locked(cred, resource, amount); 640 RACCT_UNLOCK(); 641 } 642 643 /* 644 * Account for disk IO resource consumption. Checks for limits, 645 * but never fails, due to disk limits being undeniable. 646 */ 647 void 648 racct_add_buf(struct proc *p, const struct buf *bp, int is_write) 649 { 650 651 ASSERT_RACCT_ENABLED(); 652 PROC_LOCK_ASSERT(p, MA_OWNED); 653 654 SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); 655 656 RACCT_LOCK(); 657 if (is_write) { 658 racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); 659 racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); 660 } else { 661 racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); 662 racct_add_locked(curproc, RACCT_READIOPS, 1, 1); 663 } 664 RACCT_UNLOCK(); 665 } 666 667 static int 668 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) 669 { 670 int64_t old_amount, decayed_amount, diff_proc, diff_cred; 671 #ifdef RCTL 672 int error; 673 #endif 674 675 ASSERT_RACCT_ENABLED(); 676 677 /* 678 * We need proc lock to dereference p->p_ucred. 679 */ 680 PROC_LOCK_ASSERT(p, MA_OWNED); 681 682 old_amount = p->p_racct->r_resources[resource]; 683 /* 684 * The diffs may be negative. 685 */ 686 diff_proc = amount - old_amount; 687 if (resource == RACCT_PCTCPU) { 688 /* 689 * Resources in per-credential racct containers may decay. 690 * If this is the case, we need to calculate the difference 691 * between the new amount and the proportional value of the 692 * old amount that has decayed in the ucred racct containers. 693 */ 694 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 695 diff_cred = amount - decayed_amount; 696 } else 697 diff_cred = diff_proc; 698 #ifdef notyet 699 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 700 ("%s: usage of non-droppable resource %d dropping", __func__, 701 resource)); 702 #endif 703 #ifdef RCTL 704 if (diff_proc > 0) { 705 error = rctl_enforce(p, resource, diff_proc); 706 if (error && !force && RACCT_IS_DENIABLE(resource)) { 707 SDT_PROBE3(racct, , rusage, set__failure, p, resource, 708 amount); 709 return (error); 710 } 711 } 712 #endif 713 racct_adjust_resource(p->p_racct, resource, diff_proc); 714 if (diff_cred > 0) 715 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 716 else if (diff_cred < 0) 717 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 718 719 return (0); 720 } 721 722 /* 723 * Set allocation of 'resource' to 'amount' for process 'p'. 724 * Return 0 if it's below limits, or errno, if it's not. 725 * 726 * Note that decreasing the allocation always returns 0, 727 * even if it's above the limit. 728 */ 729 int 730 racct_set_unlocked(struct proc *p, int resource, uint64_t amount) 731 { 732 int error; 733 734 ASSERT_RACCT_ENABLED(); 735 PROC_LOCK(p); 736 error = racct_set(p, resource, amount); 737 PROC_UNLOCK(p); 738 return (error); 739 } 740 741 int 742 racct_set(struct proc *p, int resource, uint64_t amount) 743 { 744 int error; 745 746 if (!racct_enable) 747 return (0); 748 749 SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); 750 751 RACCT_LOCK(); 752 error = racct_set_locked(p, resource, amount, 0); 753 RACCT_UNLOCK(); 754 return (error); 755 } 756 757 void 758 racct_set_force(struct proc *p, int resource, uint64_t amount) 759 { 760 761 if (!racct_enable) 762 return; 763 764 SDT_PROBE3(racct, , rusage, set, p, resource, amount); 765 766 RACCT_LOCK(); 767 racct_set_locked(p, resource, amount, 1); 768 RACCT_UNLOCK(); 769 } 770 771 /* 772 * Returns amount of 'resource' the process 'p' can keep allocated. 773 * Allocating more than that would be denied, unless the resource 774 * is marked undeniable. Amount of already allocated resource does 775 * not matter. 776 */ 777 uint64_t 778 racct_get_limit(struct proc *p, int resource) 779 { 780 #ifdef RCTL 781 uint64_t available; 782 783 if (!racct_enable) 784 return (UINT64_MAX); 785 786 RACCT_LOCK(); 787 available = rctl_get_limit(p, resource); 788 RACCT_UNLOCK(); 789 790 return (available); 791 #else 792 793 return (UINT64_MAX); 794 #endif 795 } 796 797 /* 798 * Returns amount of 'resource' the process 'p' can keep allocated. 799 * Allocating more than that would be denied, unless the resource 800 * is marked undeniable. Amount of already allocated resource does 801 * matter. 802 */ 803 uint64_t 804 racct_get_available(struct proc *p, int resource) 805 { 806 #ifdef RCTL 807 uint64_t available; 808 809 if (!racct_enable) 810 return (UINT64_MAX); 811 812 RACCT_LOCK(); 813 available = rctl_get_available(p, resource); 814 RACCT_UNLOCK(); 815 816 return (available); 817 #else 818 819 return (UINT64_MAX); 820 #endif 821 } 822 823 /* 824 * Returns amount of the %cpu resource that process 'p' can add to its %cpu 825 * utilization. Adding more than that would lead to the process being 826 * throttled. 827 */ 828 static int64_t 829 racct_pcpu_available(struct proc *p) 830 { 831 #ifdef RCTL 832 uint64_t available; 833 834 ASSERT_RACCT_ENABLED(); 835 836 RACCT_LOCK(); 837 available = rctl_pcpu_available(p); 838 RACCT_UNLOCK(); 839 840 return (available); 841 #else 842 843 return (INT64_MAX); 844 #endif 845 } 846 847 /* 848 * Decrease allocation of 'resource' by 'amount' for process 'p'. 849 */ 850 void 851 racct_sub(struct proc *p, int resource, uint64_t amount) 852 { 853 854 if (!racct_enable) 855 return; 856 857 SDT_PROBE3(racct, , rusage, sub, p, resource, amount); 858 859 /* 860 * We need proc lock to dereference p->p_ucred. 861 */ 862 PROC_LOCK_ASSERT(p, MA_OWNED); 863 KASSERT(RACCT_CAN_DROP(resource), 864 ("%s: called for non-droppable resource %d", __func__, resource)); 865 866 RACCT_LOCK(); 867 KASSERT(amount <= p->p_racct->r_resources[resource], 868 ("%s: freeing %ju of resource %d, which is more " 869 "than allocated %jd for %s (pid %d)", __func__, amount, resource, 870 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 871 872 racct_adjust_resource(p->p_racct, resource, -amount); 873 racct_sub_cred_locked(p->p_ucred, resource, amount); 874 RACCT_UNLOCK(); 875 } 876 877 static void 878 racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 879 { 880 struct prison *pr; 881 882 ASSERT_RACCT_ENABLED(); 883 884 racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 885 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 886 racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, 887 -amount); 888 racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); 889 } 890 891 /* 892 * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 893 */ 894 void 895 racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 896 { 897 898 if (!racct_enable) 899 return; 900 901 SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); 902 903 #ifdef notyet 904 KASSERT(RACCT_CAN_DROP(resource), 905 ("%s: called for resource %d which can not drop", __func__, 906 resource)); 907 #endif 908 909 RACCT_LOCK(); 910 racct_sub_cred_locked(cred, resource, amount); 911 RACCT_UNLOCK(); 912 } 913 914 /* 915 * Inherit resource usage information from the parent process. 916 */ 917 int 918 racct_proc_fork(struct proc *parent, struct proc *child) 919 { 920 int i, error = 0; 921 922 if (!racct_enable) 923 return (0); 924 925 /* 926 * Create racct for the child process. 927 */ 928 racct_create(&child->p_racct); 929 930 PROC_LOCK(parent); 931 PROC_LOCK(child); 932 RACCT_LOCK(); 933 934 #ifdef RCTL 935 error = rctl_proc_fork(parent, child); 936 if (error != 0) 937 goto out; 938 #endif 939 940 /* Init process cpu time. */ 941 child->p_prev_runtime = 0; 942 child->p_throttled = 0; 943 944 /* 945 * Inherit resource usage. 946 */ 947 for (i = 0; i <= RACCT_MAX; i++) { 948 if (parent->p_racct->r_resources[i] == 0 || 949 !RACCT_IS_INHERITABLE(i)) 950 continue; 951 952 error = racct_set_locked(child, i, 953 parent->p_racct->r_resources[i], 0); 954 if (error != 0) 955 goto out; 956 } 957 958 error = racct_add_locked(child, RACCT_NPROC, 1, 0); 959 error += racct_add_locked(child, RACCT_NTHR, 1, 0); 960 961 out: 962 RACCT_UNLOCK(); 963 PROC_UNLOCK(child); 964 PROC_UNLOCK(parent); 965 966 if (error != 0) 967 racct_proc_exit(child); 968 969 return (error); 970 } 971 972 /* 973 * Called at the end of fork1(), to handle rules that require the process 974 * to be fully initialized. 975 */ 976 void 977 racct_proc_fork_done(struct proc *child) 978 { 979 980 if (!racct_enable) 981 return; 982 983 #ifdef RCTL 984 PROC_LOCK(child); 985 RACCT_LOCK(); 986 rctl_enforce(child, RACCT_NPROC, 0); 987 rctl_enforce(child, RACCT_NTHR, 0); 988 RACCT_UNLOCK(); 989 PROC_UNLOCK(child); 990 #endif 991 } 992 993 void 994 racct_proc_exit(struct proc *p) 995 { 996 struct timeval wallclock; 997 uint64_t pct_estimate, pct, runtime; 998 int i; 999 1000 if (!racct_enable) 1001 return; 1002 1003 PROC_LOCK(p); 1004 /* 1005 * We don't need to calculate rux, proc_reap() has already done this. 1006 */ 1007 runtime = cputick2usec(p->p_rux.rux_runtime); 1008 #ifdef notyet 1009 KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 1010 #else 1011 if (runtime < p->p_prev_runtime) 1012 runtime = p->p_prev_runtime; 1013 #endif 1014 microuptime(&wallclock); 1015 timevalsub(&wallclock, &p->p_stats->p_start); 1016 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1017 pct_estimate = (1000000 * runtime * 100) / 1018 ((uint64_t)wallclock.tv_sec * 1000000 + 1019 wallclock.tv_usec); 1020 } else 1021 pct_estimate = 0; 1022 pct = racct_getpcpu(p, pct_estimate); 1023 1024 RACCT_LOCK(); 1025 racct_set_locked(p, RACCT_CPU, runtime, 0); 1026 racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 1027 1028 KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, 1029 ("process reaped with %ju allocated for RSS\n", 1030 p->p_racct->r_resources[RACCT_RSS])); 1031 for (i = 0; i <= RACCT_MAX; i++) { 1032 if (p->p_racct->r_resources[i] == 0) 1033 continue; 1034 if (!RACCT_IS_RECLAIMABLE(i)) 1035 continue; 1036 racct_set_locked(p, i, 0, 0); 1037 } 1038 1039 #ifdef RCTL 1040 rctl_racct_release(p->p_racct); 1041 #endif 1042 racct_destroy_locked(&p->p_racct); 1043 RACCT_UNLOCK(); 1044 PROC_UNLOCK(p); 1045 } 1046 1047 /* 1048 * Called after credentials change, to move resource utilisation 1049 * between raccts. 1050 */ 1051 void 1052 racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 1053 struct ucred *newcred) 1054 { 1055 struct uidinfo *olduip, *newuip; 1056 struct loginclass *oldlc, *newlc; 1057 struct prison *oldpr, *newpr, *pr; 1058 1059 if (!racct_enable) 1060 return; 1061 1062 PROC_LOCK_ASSERT(p, MA_OWNED); 1063 1064 newuip = newcred->cr_ruidinfo; 1065 olduip = oldcred->cr_ruidinfo; 1066 newlc = newcred->cr_loginclass; 1067 oldlc = oldcred->cr_loginclass; 1068 newpr = newcred->cr_prison; 1069 oldpr = oldcred->cr_prison; 1070 1071 RACCT_LOCK(); 1072 if (newuip != olduip) { 1073 racct_sub_racct(olduip->ui_racct, p->p_racct); 1074 racct_add_racct(newuip->ui_racct, p->p_racct); 1075 } 1076 if (newlc != oldlc) { 1077 racct_sub_racct(oldlc->lc_racct, p->p_racct); 1078 racct_add_racct(newlc->lc_racct, p->p_racct); 1079 } 1080 if (newpr != oldpr) { 1081 for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 1082 racct_sub_racct(pr->pr_prison_racct->prr_racct, 1083 p->p_racct); 1084 for (pr = newpr; pr != NULL; pr = pr->pr_parent) 1085 racct_add_racct(pr->pr_prison_racct->prr_racct, 1086 p->p_racct); 1087 } 1088 RACCT_UNLOCK(); 1089 } 1090 1091 void 1092 racct_move(struct racct *dest, struct racct *src) 1093 { 1094 1095 ASSERT_RACCT_ENABLED(); 1096 1097 RACCT_LOCK(); 1098 racct_add_racct(dest, src); 1099 racct_sub_racct(src, src); 1100 RACCT_UNLOCK(); 1101 } 1102 1103 void 1104 racct_proc_throttled(struct proc *p) 1105 { 1106 1107 ASSERT_RACCT_ENABLED(); 1108 1109 PROC_LOCK(p); 1110 while (p->p_throttled != 0) { 1111 msleep(p->p_racct, &p->p_mtx, 0, "racct", 1112 p->p_throttled < 0 ? 0 : p->p_throttled); 1113 if (p->p_throttled > 0) 1114 p->p_throttled = 0; 1115 } 1116 PROC_UNLOCK(p); 1117 } 1118 1119 /* 1120 * Make the process sleep in userret() for 'timeout' ticks. Setting 1121 * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). 1122 */ 1123 void 1124 racct_proc_throttle(struct proc *p, int timeout) 1125 { 1126 struct thread *td; 1127 #ifdef SMP 1128 int cpuid; 1129 #endif 1130 1131 KASSERT(timeout != 0, ("timeout %d", timeout)); 1132 ASSERT_RACCT_ENABLED(); 1133 PROC_LOCK_ASSERT(p, MA_OWNED); 1134 1135 /* 1136 * Do not block kernel processes. Also do not block processes with 1137 * low %cpu utilization to improve interactivity. 1138 */ 1139 if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) 1140 return; 1141 1142 if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) 1143 return; 1144 1145 p->p_throttled = timeout; 1146 1147 FOREACH_THREAD_IN_PROC(p, td) { 1148 thread_lock(td); 1149 td->td_flags |= TDF_ASTPENDING; 1150 1151 switch (td->td_state) { 1152 case TDS_RUNQ: 1153 /* 1154 * If the thread is on the scheduler run-queue, we can 1155 * not just remove it from there. So we set the flag 1156 * TDF_NEEDRESCHED for the thread, so that once it is 1157 * running, it is taken off the cpu as soon as possible. 1158 */ 1159 td->td_flags |= TDF_NEEDRESCHED; 1160 break; 1161 case TDS_RUNNING: 1162 /* 1163 * If the thread is running, we request a context 1164 * switch for it by setting the TDF_NEEDRESCHED flag. 1165 */ 1166 td->td_flags |= TDF_NEEDRESCHED; 1167 #ifdef SMP 1168 cpuid = td->td_oncpu; 1169 if ((cpuid != NOCPU) && (td != curthread)) 1170 ipi_cpu(cpuid, IPI_AST); 1171 #endif 1172 break; 1173 default: 1174 break; 1175 } 1176 thread_unlock(td); 1177 } 1178 } 1179 1180 static void 1181 racct_proc_wakeup(struct proc *p) 1182 { 1183 1184 ASSERT_RACCT_ENABLED(); 1185 1186 PROC_LOCK_ASSERT(p, MA_OWNED); 1187 1188 if (p->p_throttled != 0) { 1189 p->p_throttled = 0; 1190 wakeup(p->p_racct); 1191 } 1192 } 1193 1194 static void 1195 racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) 1196 { 1197 int64_t r_old, r_new; 1198 1199 ASSERT_RACCT_ENABLED(); 1200 RACCT_LOCK_ASSERT(); 1201 1202 #ifdef RCTL 1203 rctl_throttle_decay(racct, RACCT_READBPS); 1204 rctl_throttle_decay(racct, RACCT_WRITEBPS); 1205 rctl_throttle_decay(racct, RACCT_READIOPS); 1206 rctl_throttle_decay(racct, RACCT_WRITEIOPS); 1207 #endif 1208 1209 r_old = racct->r_resources[RACCT_PCTCPU]; 1210 1211 /* If there is nothing to decay, just exit. */ 1212 if (r_old <= 0) 1213 return; 1214 1215 r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1216 racct->r_resources[RACCT_PCTCPU] = r_new; 1217 } 1218 1219 static void 1220 racct_decay_pre(void) 1221 { 1222 1223 RACCT_LOCK(); 1224 } 1225 1226 static void 1227 racct_decay_post(void) 1228 { 1229 1230 RACCT_UNLOCK(); 1231 } 1232 1233 static void 1234 racct_decay(void) 1235 { 1236 1237 ASSERT_RACCT_ENABLED(); 1238 1239 ui_racct_foreach(racct_decay_callback, racct_decay_pre, 1240 racct_decay_post, NULL, NULL); 1241 loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, 1242 racct_decay_post, NULL, NULL); 1243 prison_racct_foreach(racct_decay_callback, racct_decay_pre, 1244 racct_decay_post, NULL, NULL); 1245 } 1246 1247 static void 1248 racctd(void) 1249 { 1250 struct thread *td; 1251 struct proc *p; 1252 struct timeval wallclock; 1253 uint64_t pct, pct_estimate, runtime; 1254 1255 ASSERT_RACCT_ENABLED(); 1256 1257 for (;;) { 1258 racct_decay(); 1259 1260 sx_slock(&allproc_lock); 1261 1262 FOREACH_PROC_IN_SYSTEM(p) { 1263 PROC_LOCK(p); 1264 if (p->p_state != PRS_NORMAL) { 1265 if (p->p_state == PRS_ZOMBIE) 1266 racct_set(p, RACCT_PCTCPU, 0); 1267 PROC_UNLOCK(p); 1268 continue; 1269 } 1270 1271 microuptime(&wallclock); 1272 timevalsub(&wallclock, &p->p_stats->p_start); 1273 PROC_STATLOCK(p); 1274 FOREACH_THREAD_IN_PROC(p, td) 1275 ruxagg(p, td); 1276 runtime = cputick2usec(p->p_rux.rux_runtime); 1277 PROC_STATUNLOCK(p); 1278 #ifdef notyet 1279 KASSERT(runtime >= p->p_prev_runtime, 1280 ("runtime < p_prev_runtime")); 1281 #else 1282 if (runtime < p->p_prev_runtime) 1283 runtime = p->p_prev_runtime; 1284 #endif 1285 p->p_prev_runtime = runtime; 1286 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1287 pct_estimate = (1000000 * runtime * 100) / 1288 ((uint64_t)wallclock.tv_sec * 1000000 + 1289 wallclock.tv_usec); 1290 } else 1291 pct_estimate = 0; 1292 pct = racct_getpcpu(p, pct_estimate); 1293 RACCT_LOCK(); 1294 #ifdef RCTL 1295 rctl_throttle_decay(p->p_racct, RACCT_READBPS); 1296 rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); 1297 rctl_throttle_decay(p->p_racct, RACCT_READIOPS); 1298 rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); 1299 #endif 1300 racct_set_locked(p, RACCT_PCTCPU, pct, 1); 1301 racct_set_locked(p, RACCT_CPU, runtime, 0); 1302 racct_set_locked(p, RACCT_WALLCLOCK, 1303 (uint64_t)wallclock.tv_sec * 1000000 + 1304 wallclock.tv_usec, 0); 1305 RACCT_UNLOCK(); 1306 PROC_UNLOCK(p); 1307 } 1308 1309 /* 1310 * To ensure that processes are throttled in a fair way, we need 1311 * to iterate over all processes again and check the limits 1312 * for %cpu resource only after ucred racct containers have been 1313 * properly filled. 1314 */ 1315 FOREACH_PROC_IN_SYSTEM(p) { 1316 PROC_LOCK(p); 1317 if (p->p_state != PRS_NORMAL) { 1318 PROC_UNLOCK(p); 1319 continue; 1320 } 1321 1322 if (racct_pcpu_available(p) <= 0) { 1323 if (p->p_racct->r_resources[RACCT_PCTCPU] > 1324 pcpu_threshold) 1325 racct_proc_throttle(p, -1); 1326 } else if (p->p_throttled == -1) { 1327 racct_proc_wakeup(p); 1328 } 1329 PROC_UNLOCK(p); 1330 } 1331 sx_sunlock(&allproc_lock); 1332 pause("-", hz); 1333 } 1334 } 1335 1336 static struct kproc_desc racctd_kp = { 1337 "racctd", 1338 racctd, 1339 NULL 1340 }; 1341 1342 static void 1343 racctd_init(void) 1344 { 1345 if (!racct_enable) 1346 return; 1347 1348 kproc_start(&racctd_kp); 1349 } 1350 SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); 1351 1352 static void 1353 racct_init(void) 1354 { 1355 if (!racct_enable) 1356 return; 1357 1358 racct_zone = uma_zcreate("racct", sizeof(struct racct), 1359 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1360 /* 1361 * XXX: Move this somewhere. 1362 */ 1363 prison0.pr_prison_racct = prison_racct_find("0"); 1364 } 1365 SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1366 1367 #endif /* !RACCT */ 1368