/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 The FreeBSD Foundation
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/umtxvar.h>
#include <machine/smp.h>

#ifdef RCTL
#include <sys/rctl.h>
#endif

#ifdef RACCT

FEATURE(racct, "Resource Accounting");

/*
 * Do not block processes that have their %cpu usage <= pcpu_threshold.
 */
static int pcpu_threshold = 1;
#ifdef RACCT_DEFAULT_TO_DISABLED
bool __read_frequently racct_enable = false;
#else
bool __read_frequently racct_enable = true;
#endif

SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Resource Accounting");
SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
    0, "Enable RACCT/RCTL");
SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
    0, "Processes with higher %cpu usage than this value can be throttled.");

/*
 * How many seconds to wait before using the scheduler %cpu calculations.
 * When a process starts, we compute its %cpu usage by dividing its runtime
 * by the process wall clock time.  After RACCT_PCPU_SECS pass, we use the
 * value provided by the scheduler.
 */
#define	RACCT_PCPU_SECS		3

struct mtx racct_lock;
MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);

static uma_zone_t racct_zone;

static void racct_sub_racct(struct racct *dest, const struct racct *src);
static void racct_sub_cred_locked(struct ucred *cred, int resource,
    uint64_t amount);
static void racct_add_cred_locked(struct ucred *cred, int resource,
    uint64_t amount);

SDT_PROVIDER_DEFINE(racct);
SDT_PROBE_DEFINE3(racct, , rusage, add,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, add__failure,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, add__buf,
    "struct proc *", "const struct buf *", "int");
SDT_PROBE_DEFINE3(racct, , rusage, add__cred,
    "struct ucred *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, add__force,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, set,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, set__failure,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, set__force,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, sub,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
    "struct ucred *", "int", "uint64_t");
SDT_PROBE_DEFINE1(racct, , racct, create,
    "struct racct *");
SDT_PROBE_DEFINE1(racct, , racct, destroy,
    "struct racct *");
SDT_PROBE_DEFINE2(racct, , racct, join,
    "struct racct *", "struct racct *");
SDT_PROBE_DEFINE2(racct, , racct, join__failure,
    "struct racct *", "struct racct *");
SDT_PROBE_DEFINE2(racct, , racct, leave,
    "struct racct *", "struct racct *");

int racct_types[] = {
	[RACCT_CPU] =
		RACCT_IN_MILLIONS,
	[RACCT_DATA] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_STACK] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_CORE] =
		RACCT_DENIABLE,
	[RACCT_RSS] =
		RACCT_RECLAIMABLE,
	[RACCT_MEMLOCK] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_NPROC] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_NOFILE] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_VMEM] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_NPTS] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_SWAP] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NTHR] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_MSGQQUEUED] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_MSGQSIZE] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NMSGQ] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NSEM] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NSEMOP] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_NSHM] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_SHMSIZE] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_WALLCLOCK] =
		RACCT_IN_MILLIONS,
	[RACCT_PCTCPU] =
		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
	[RACCT_READBPS] =
		RACCT_DECAYING,
	[RACCT_WRITEBPS] =
		RACCT_DECAYING,
	[RACCT_READIOPS] =
		RACCT_DECAYING,
	[RACCT_WRITEIOPS] =
		RACCT_DECAYING };

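/*
 * Every racctd pass (roughly once per second) multiplies the decaying
 * per-credential RACCT_PCTCPU usage by this fixed-point factor;
 * racct_set_locked() accounts for the same decay when computing the delta
 * to apply to the ucred containers.
 */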
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;

#ifdef SCHED_4BSD
/*
 * Contains intermediate values for %cpu calculations to avoid using floating
 * point in the kernel.
 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
 * It is needed only for the 4BSD scheduler, because in ULE ccpu equals zero,
 * so the calculations are more straightforward.
 */
fixpt_t ccpu_exp[] = {
	[0] = FSCALE * 1,
	[1] = FSCALE * 0.95122942450071400909,
	[2] = FSCALE * 0.90483741803595957316,
	[3] = FSCALE * 0.86070797642505780722,
	[4] = FSCALE * 0.81873075307798185866,
	[5] = FSCALE * 0.77880078307140486824,
	[6] = FSCALE * 0.74081822068171786606,
	[7] = FSCALE * 0.70468808971871343435,
	[8] = FSCALE * 0.67032004603563930074,
	[9] = FSCALE * 0.63762815162177329314,
	[10] = FSCALE * 0.60653065971263342360,
	[11] = FSCALE * 0.57694981038048669531,
	[12] = FSCALE * 0.54881163609402643262,
	[13] = FSCALE * 0.52204577676101604789,
	[14] = FSCALE * 0.49658530379140951470,
	[15] = FSCALE * 0.47236655274101470713,
	[16] = FSCALE * 0.44932896411722159143,
	[17] = FSCALE * 0.42741493194872666992,
	[18] = FSCALE * 0.40656965974059911188,
	[19] = FSCALE * 0.38674102345450120691,
	[20] = FSCALE * 0.36787944117144232159,
	[21] = FSCALE * 0.34993774911115535467,
	[22] = FSCALE * 0.33287108369807955328,
	[23] = FSCALE * 0.31663676937905321821,
	[24] = FSCALE * 0.30119421191220209664,
	[25] = FSCALE * 0.28650479686019010032,
	[26] = FSCALE * 0.27253179303401260312,
	[27] = FSCALE * 0.25924026064589150757,
	[28] = FSCALE * 0.24659696394160647693,
	[29] = FSCALE * 0.23457028809379765313,
	[30] = FSCALE * 0.22313016014842982893,
	[31] = FSCALE * 0.21224797382674305771,
	[32] = FSCALE * 0.20189651799465540848,
	[33] = FSCALE * 0.19204990862075411423,
	[34] = FSCALE * 0.18268352405273465022,
	[35] = FSCALE * 0.17377394345044512668,
	[36] = FSCALE * 0.16529888822158653829,
	[37] = FSCALE * 0.15723716631362761621,
	[38] = FSCALE * 0.14956861922263505264,
	[39] = FSCALE * 0.14227407158651357185,
	[40] = FSCALE * 0.13533528323661269189,
	[41] = FSCALE * 0.12873490358780421886,
	[42] = FSCALE * 0.12245642825298191021,
	[43] = FSCALE * 0.11648415777349695786,
	[44] = FSCALE * 0.11080315836233388333,
	[45] = FSCALE * 0.10539922456186433678,
	[46] = FSCALE * 0.10025884372280373372,
	[47] = FSCALE * 0.09536916221554961888,
	[48] = FSCALE * 0.09071795328941250337,
	[49] = FSCALE * 0.08629358649937051097,
	[50] = FSCALE * 0.08208499862389879516,
	[51] = FSCALE * 0.07808166600115315231,
	[52] = FSCALE * 0.07427357821433388042,
	[53] = FSCALE * 0.07065121306042958674,
	[54] = FSCALE * 0.06720551273974976512,
	[55] = FSCALE * 0.06392786120670757270,
	[56] = FSCALE * 0.06081006262521796499,
	[57] = FSCALE * 0.05784432087483846296,
	[58] = FSCALE * 0.05502322005640722902,
	[59] = FSCALE * 0.05233970594843239308,
	[60] = FSCALE * 0.04978706836786394297,
	[61] = FSCALE * 0.04735892439114092119,
	[62] = FSCALE * 0.04504920239355780606,
	[63] = FSCALE * 0.04285212686704017991,
	[64] = FSCALE * 0.04076220397836621516,
	[65] = FSCALE * 0.03877420783172200988,
	[66] = FSCALE * 0.03688316740124000544,
	[67] = FSCALE * 0.03508435410084502588,
	[68] = FSCALE * 0.03337326996032607948,
	[69] = FSCALE * 0.03174563637806794323,
	[70] = FSCALE * 0.03019738342231850073,
	[71] = FSCALE * 0.02872463965423942912,
	[72] = FSCALE * 0.02732372244729256080,
	[73] = FSCALE * 0.02599112877875534358,
	[74] = FSCALE * 0.02472352647033939120,
	[75] = FSCALE * 0.02351774585600910823,
	[76] = FSCALE * 0.02237077185616559577,
	[77] = FSCALE * 0.02127973643837716938,
	[78] = FSCALE * 0.02024191144580438847,
	[79] = FSCALE * 0.01925470177538692429,
	[80] = FSCALE * 0.01831563888873418029,
	[81] = FSCALE * 0.01742237463949351138,
	[82] = FSCALE * 0.01657267540176124754,
	[83] = FSCALE * 0.01576441648485449082,
	[84] = FSCALE * 0.01499557682047770621,
	[85] = FSCALE * 0.01426423390899925527,
	[86] = FSCALE * 0.01356855901220093175,
	[87] = FSCALE * 0.01290681258047986886,
	[88] = FSCALE * 0.01227733990306844117,
	[89] = FSCALE * 0.01167856697039544521,
	[90] = FSCALE * 0.01110899653824230649,
	[91] = FSCALE * 0.01056720438385265337,
	[92] = FSCALE * 0.01005183574463358164,
	[93] = FSCALE * 0.00956160193054350793,
	[94] = FSCALE * 0.00909527710169581709,
	[95] = FSCALE * 0.00865169520312063417,
	[96] = FSCALE * 0.00822974704902002884,
	[97] = FSCALE * 0.00782837754922577143,
	[98] = FSCALE * 0.00744658307092434051,
	[99] = FSCALE * 0.00708340892905212004,
	[100] = FSCALE * 0.00673794699908546709,
	[101] = FSCALE * 0.00640933344625638184,
	[102] = FSCALE * 0.00609674656551563610,
	[103] = FSCALE * 0.00579940472684214321,
	[104] = FSCALE * 0.00551656442076077241,
	[105] = FSCALE * 0.00524751839918138427,
	[106] = FSCALE * 0.00499159390691021621,
	[107] = FSCALE * 0.00474815099941147558,
	[108] = FSCALE * 0.00451658094261266798,
	[109] = FSCALE * 0.00429630469075234057,
	[110] = FSCALE * 0.00408677143846406699,
};
#endif

#define	CCPU_EXP_MAX	110

/*
 * This function is analogous to the getpcpu() function in the ps(1) command.
 * They should both calculate the %cpu in the same way, so that the racct
 * values are consistent with those shown by the ps(1) tool.  The calculations
 * are more complex in the 4BSD scheduler because of the value of the ccpu
 * variable.  In ULE it is defined to be zero, which saves us some work.
 */
static uint64_t
racct_getpcpu(struct proc *p, u_int pcpu)
{
	u_int swtime;
#ifdef SCHED_4BSD
	fixpt_t pctcpu, pctcpu_next;
#endif
#ifdef SMP
	struct pcpu *pc;
	int found;
#endif
	fixpt_t p_pctcpu;
	struct thread *td;

	ASSERT_RACCT_ENABLED();

	/*
	 * If the process is swapped out, we count its %cpu usage as zero.
	 * This behaviour is consistent with the userland ps(1) tool.
	 */
	if ((p->p_flag & P_INMEM) == 0)
		return (0);
	swtime = (ticks - p->p_swtick) / hz;

	/*
	 * For short-lived processes, sched_pctcpu() returns small values
	 * even for cpu-intensive ones.  Therefore we use our own estimate
	 * in this case.
	 */
	if (swtime < RACCT_PCPU_SECS)
		return (pcpu);

	p_pctcpu = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		if (td == PCPU_GET(idlethread))
			continue;
#ifdef SMP
		found = 0;
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			if (td == pc->pc_idlethread) {
				found = 1;
				break;
			}
		}
		if (found)
			continue;
#endif
		thread_lock(td);
#ifdef SCHED_4BSD
		pctcpu = sched_pctcpu(td);
		/* Count also the yet unfinished second. */
		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
		pctcpu_next += sched_pctcpu_delta(td);
		p_pctcpu += max(pctcpu, pctcpu_next);
#else
		/*
		 * In ULE the %cpu statistics are updated on every
		 * sched_pctcpu() call.  So special calculations to
		 * account for the latest (unfinished) second are
		 * not needed.
		 */
		p_pctcpu += sched_pctcpu(td);
#endif
		thread_unlock(td);
	}

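	/*
	 * Scale the summed fixed-point fraction to the units used for
	 * RACCT_PCTCPU, i.e. millionths of a percent (RACCT_IN_MILLIONS).
	 * For 4BSD, dividing by (FSCALE - ccpu_exp[swtime]) compensates for
	 * a decaying average that has only been accumulating for 'swtime'
	 * seconds, which is what ps(1) does as well.
	 */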
#ifdef SCHED_4BSD
	if (swtime <= CCPU_EXP_MAX)
		return ((100 * (uint64_t)p_pctcpu * 1000000) /
		    (FSCALE - ccpu_exp[swtime]));
#endif

	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
}

static void
racct_add_racct(struct racct *dest, const struct racct *src)
{
	int i;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	/*
	 * Update resource usage in dest.
	 */
	for (i = 0; i <= RACCT_MAX; i++) {
		KASSERT(dest->r_resources[i] >= 0,
		    ("%s: resource %d propagation meltdown: dest < 0",
		    __func__, i));
		KASSERT(src->r_resources[i] >= 0,
		    ("%s: resource %d propagation meltdown: src < 0",
		    __func__, i));
		dest->r_resources[i] += src->r_resources[i];
	}
}

static void
racct_sub_racct(struct racct *dest, const struct racct *src)
{
	int i;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	/*
	 * Update resource usage in dest.
	 */
	for (i = 0; i <= RACCT_MAX; i++) {
		if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
			KASSERT(dest->r_resources[i] >= 0,
			    ("%s: resource %d propagation meltdown: dest < 0",
			    __func__, i));
			KASSERT(src->r_resources[i] >= 0,
			    ("%s: resource %d propagation meltdown: src < 0",
			    __func__, i));
			KASSERT(src->r_resources[i] <= dest->r_resources[i],
			    ("%s: resource %d propagation meltdown: src > dest",
			    __func__, i));
		}
		if (RACCT_CAN_DROP(i)) {
			dest->r_resources[i] -= src->r_resources[i];
			if (dest->r_resources[i] < 0)
				dest->r_resources[i] = 0;
		}
	}
}

void
racct_create(struct racct **racctp)
{

	if (!racct_enable)
		return;

	SDT_PROBE1(racct, , racct, create, racctp);

	KASSERT(*racctp == NULL, ("racct already allocated"));

	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
}

static void
racct_destroy_locked(struct racct **racctp)
{
	struct racct *racct;
	int i;

	ASSERT_RACCT_ENABLED();

	SDT_PROBE1(racct, , racct, destroy, racctp);

	RACCT_LOCK_ASSERT();
	KASSERT(racctp != NULL, ("NULL racctp"));
	KASSERT(*racctp != NULL, ("NULL racct"));

	racct = *racctp;

	for (i = 0; i <= RACCT_MAX; i++) {
		if (RACCT_IS_SLOPPY(i))
			continue;
		if (!RACCT_IS_RECLAIMABLE(i))
			continue;
		KASSERT(racct->r_resources[i] == 0,
		    ("destroying non-empty racct: "
		    "%ju allocated for resource %d\n",
		    racct->r_resources[i], i));
	}
	uma_zfree(racct_zone, racct);
	*racctp = NULL;
}

void
racct_destroy(struct racct **racct)
{

	if (!racct_enable)
		return;

	RACCT_LOCK();
	racct_destroy_locked(racct);
	RACCT_UNLOCK();
}

/*
 * Increase consumption of 'resource' by 'amount' for 'racct',
 * but not its parents.  Unlike in other cases, 'amount' here
 * may be less than zero.
 */
static void
racct_adjust_resource(struct racct *racct, int resource,
    int64_t amount)
{

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();
	KASSERT(racct != NULL, ("NULL racct"));

	racct->r_resources[resource] += amount;
	if (racct->r_resources[resource] < 0) {
		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
		    ("%s: resource %d usage < 0", __func__, resource));
		racct->r_resources[resource] = 0;
	}

	/*
	 * There are some cases where the racct %cpu resource would grow
	 * beyond 100% per core.  For example in racct_proc_exit() we add
	 * the process %cpu usage to the ucred racct containers.  If too
	 * many processes terminated in a short time span, the ucred %cpu
	 * resource could grow too much.  Also, the 4BSD scheduler sometimes
	 * returns more than 100% cpu usage for a thread.  So we set a sane
	 * boundary here to 100% * the maximum number of CPUs.
	 */
	if ((resource == RACCT_PCTCPU) &&
	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
}

static int
racct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
{
#ifdef RCTL
	int error;
#endif

	ASSERT_RACCT_ENABLED();

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);

#ifdef RCTL
	error = rctl_enforce(p, resource, amount);
	if (error && !force && RACCT_IS_DENIABLE(resource)) {
		SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
		return (error);
	}
#endif
	racct_adjust_resource(p->p_racct, resource, amount);
	racct_add_cred_locked(p->p_ucred, resource, amount);

	return (0);
}

/*
 * Increase allocation of 'resource' by 'amount' for process 'p'.
 * Return 0 if it's below limits, or errno, if it's not.
 */
int
racct_add(struct proc *p, int resource, uint64_t amount)
{
	int error;

	if (!racct_enable)
		return (0);

	SDT_PROBE3(racct, , rusage, add, p, resource, amount);

	RACCT_LOCK();
	error = racct_add_locked(p, resource, amount, 0);
	RACCT_UNLOCK();
	return (error);
}

/*
 * Increase allocation of 'resource' by 'amount' for process 'p'.
 * Doesn't check for limits and never fails.
 */
void
racct_add_force(struct proc *p, int resource, uint64_t amount)
{

	if (!racct_enable)
		return;

	SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);

	RACCT_LOCK();
	racct_add_locked(p, resource, amount, 1);
	RACCT_UNLOCK();
}

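/*
 * Charge 'amount' of 'resource' to every per-credential container the
 * credential belongs to: its real uid, each prison up the jail hierarchy,
 * and its login class.
 */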
static void
racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
	struct prison *pr;

	ASSERT_RACCT_ENABLED();

	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
		    amount);
	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount);
}

/*
 * Increase allocation of 'resource' by 'amount' for credential 'cred'.
 * Doesn't check for limits and never fails.
 */
void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
{

	if (!racct_enable)
		return;

	SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount);

	RACCT_LOCK();
	racct_add_cred_locked(cred, resource, amount);
	RACCT_UNLOCK();
}

/*
 * Account for disk IO resource consumption.  Checks for limits,
 * but never fails, due to disk limits being undeniable.
 */
void
racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
{

	ASSERT_RACCT_ENABLED();
	PROC_LOCK_ASSERT(p, MA_OWNED);

	SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write);

	RACCT_LOCK();
	if (is_write) {
		racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
		racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
	} else {
		racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
		racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
	}
	RACCT_UNLOCK();
}

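/*
 * Like racct_add_locked(), but set the resource to 'amount' instead of
 * adding to it: compute the signed difference from the current value,
 * enforce limits only when the usage grows, and propagate the change to
 * the per-credential containers.  For RACCT_PCTCPU the credential-side
 * difference is taken against the decayed old value, since the ucred
 * containers are decayed periodically by racctd.
 */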
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
{
	int64_t old_amount, decayed_amount, diff_proc, diff_cred;
#ifdef RCTL
	int error;
#endif

	ASSERT_RACCT_ENABLED();

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);

	old_amount = p->p_racct->r_resources[resource];
	/*
	 * The diffs may be negative.
	 */
	diff_proc = amount - old_amount;
	if (resource == RACCT_PCTCPU) {
		/*
		 * Resources in per-credential racct containers may decay.
		 * If this is the case, we need to calculate the difference
		 * between the new amount and the proportional value of the
		 * old amount that has decayed in the ucred racct containers.
		 */
		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
		diff_cred = amount - decayed_amount;
	} else
		diff_cred = diff_proc;
#ifdef notyet
	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
	    ("%s: usage of non-droppable resource %d dropping", __func__,
	    resource));
#endif
#ifdef RCTL
	if (diff_proc > 0) {
		error = rctl_enforce(p, resource, diff_proc);
		if (error && !force && RACCT_IS_DENIABLE(resource)) {
			SDT_PROBE3(racct, , rusage, set__failure, p, resource,
			    amount);
			return (error);
		}
	}
#endif
	racct_adjust_resource(p->p_racct, resource, diff_proc);
	if (diff_cred > 0)
		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
	else if (diff_cred < 0)
		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);

	return (0);
}

/*
 * Set allocation of 'resource' to 'amount' for process 'p'.
 * Return 0 if it's below limits, or errno, if it's not.
 *
 * Note that decreasing the allocation always returns 0,
 * even if it's above the limit.
 */
int
racct_set_unlocked(struct proc *p, int resource, uint64_t amount)
{
	int error;

	ASSERT_RACCT_ENABLED();
	PROC_LOCK(p);
	error = racct_set(p, resource, amount);
	PROC_UNLOCK(p);
	return (error);
}

int
racct_set(struct proc *p, int resource, uint64_t amount)
{
	int error;

	if (!racct_enable)
		return (0);

	SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);

	RACCT_LOCK();
	error = racct_set_locked(p, resource, amount, 0);
	RACCT_UNLOCK();
	return (error);
}

void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{

	if (!racct_enable)
		return;

	SDT_PROBE3(racct, , rusage, set, p, resource, amount);

	RACCT_LOCK();
	racct_set_locked(p, resource, amount, 1);
	RACCT_UNLOCK();
}

/*
 * Returns amount of 'resource' the process 'p' can keep allocated.
 * Allocating more than that would be denied, unless the resource
 * is marked undeniable.  Amount of already allocated resource does
 * not matter.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{
#ifdef RCTL
	uint64_t available;

	if (!racct_enable)
		return (UINT64_MAX);

	RACCT_LOCK();
	available = rctl_get_limit(p, resource);
	RACCT_UNLOCK();

	return (available);
#else

	return (UINT64_MAX);
#endif
}

/*
 * Returns amount of 'resource' the process 'p' can keep allocated.
 * Allocating more than that would be denied, unless the resource
 * is marked undeniable.  Amount of already allocated resource does
 * matter.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{
#ifdef RCTL
	uint64_t available;

	if (!racct_enable)
		return (UINT64_MAX);

	RACCT_LOCK();
	available = rctl_get_available(p, resource);
	RACCT_UNLOCK();

	return (available);
#else

	return (UINT64_MAX);
#endif
}

/*
 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
 * utilization.  Adding more than that would lead to the process being
 * throttled.
 */
static int64_t
racct_pcpu_available(struct proc *p)
{
#ifdef RCTL
	uint64_t available;

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	available = rctl_pcpu_available(p);
	RACCT_UNLOCK();

	return (available);
#else

	return (INT64_MAX);
#endif
}

/*
 * Decrease allocation of 'resource' by 'amount' for process 'p'.
 */
void
racct_sub(struct proc *p, int resource, uint64_t amount)
{

	if (!racct_enable)
		return;

	SDT_PROBE3(racct, , rusage, sub, p, resource, amount);

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);
	KASSERT(RACCT_CAN_DROP(resource),
	    ("%s: called for non-droppable resource %d", __func__, resource));

	RACCT_LOCK();
	KASSERT(amount <= p->p_racct->r_resources[resource],
	    ("%s: freeing %ju of resource %d, which is more "
	    "than allocated %jd for %s (pid %d)", __func__, amount, resource,
	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));

	racct_adjust_resource(p->p_racct, resource, -amount);
	racct_sub_cred_locked(p->p_ucred, resource, amount);
	RACCT_UNLOCK();
}

static void
racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
	struct prison *pr;

	ASSERT_RACCT_ENABLED();

	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
		    -amount);
	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount);
}

/*
 * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
 */
void
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
{

	if (!racct_enable)
		return;

	SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount);

#ifdef notyet
	KASSERT(RACCT_CAN_DROP(resource),
	    ("%s: called for resource %d which can not drop", __func__,
	    resource));
#endif

	RACCT_LOCK();
	racct_sub_cred_locked(cred, resource, amount);
	RACCT_UNLOCK();
}

/*
 * Inherit resource usage information from the parent process.
 */
int
racct_proc_fork(struct proc *parent, struct proc *child)
{
	int i, error = 0;

	if (!racct_enable)
		return (0);

	/*
	 * Create racct for the child process.
	 */
	racct_create(&child->p_racct);

	PROC_LOCK(parent);
	PROC_LOCK(child);
	RACCT_LOCK();

#ifdef RCTL
	error = rctl_proc_fork(parent, child);
	if (error != 0)
		goto out;
#endif

	/* Init process cpu time. */
	child->p_prev_runtime = 0;
	child->p_throttled = 0;

	/*
	 * Inherit resource usage.
	 */
	for (i = 0; i <= RACCT_MAX; i++) {
		if (parent->p_racct->r_resources[i] == 0 ||
		    !RACCT_IS_INHERITABLE(i))
			continue;

		error = racct_set_locked(child, i,
		    parent->p_racct->r_resources[i], 0);
		if (error != 0)
			goto out;
	}

	error = racct_add_locked(child, RACCT_NPROC, 1, 0);
	error += racct_add_locked(child, RACCT_NTHR, 1, 0);

out:
	RACCT_UNLOCK();
	PROC_UNLOCK(child);
	PROC_UNLOCK(parent);

	if (error != 0)
		racct_proc_exit(child);

	return (error);
}

/*
 * Called at the end of fork1(), to handle rules that require the process
 * to be fully initialized.
 */
void
racct_proc_fork_done(struct proc *child)
{

	if (!racct_enable)
		return;

#ifdef RCTL
	PROC_LOCK(child);
	RACCT_LOCK();
	rctl_enforce(child, RACCT_NPROC, 0);
	rctl_enforce(child, RACCT_NTHR, 0);
	RACCT_UNLOCK();
	PROC_UNLOCK(child);
#endif
}

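/*
 * Resource accounting for an exiting process: charge its final CPU time
 * and %cpu usage to the per-credential containers, release all reclaimable
 * resources it still holds and destroy its racct.
 */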
void
racct_proc_exit(struct proc *p)
{
	struct timeval wallclock;
	uint64_t pct_estimate, pct, runtime;
	int i;

	if (!racct_enable)
		return;

	PROC_LOCK(p);
	/*
	 * We don't need to calculate rux, proc_reap() has already done this.
	 */
	runtime = cputick2usec(p->p_rux.rux_runtime);
#ifdef notyet
	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
#else
	if (runtime < p->p_prev_runtime)
		runtime = p->p_prev_runtime;
#endif
	microuptime(&wallclock);
	timevalsub(&wallclock, &p->p_stats->p_start);
	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
		pct_estimate = (1000000 * runtime * 100) /
		    ((uint64_t)wallclock.tv_sec * 1000000 +
		    wallclock.tv_usec);
	} else
		pct_estimate = 0;
	pct = racct_getpcpu(p, pct_estimate);

	RACCT_LOCK();
	racct_set_locked(p, RACCT_CPU, runtime, 0);
	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);

	KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
	    ("process reaped with %ju allocated for RSS\n",
	    p->p_racct->r_resources[RACCT_RSS]));
	for (i = 0; i <= RACCT_MAX; i++) {
		if (p->p_racct->r_resources[i] == 0)
			continue;
		if (!RACCT_IS_RECLAIMABLE(i))
			continue;
		racct_set_locked(p, i, 0, 0);
	}

#ifdef RCTL
	rctl_racct_release(p->p_racct);
#endif
	racct_destroy_locked(&p->p_racct);
	RACCT_UNLOCK();
	PROC_UNLOCK(p);
}

/*
 * Called after credentials change, to move resource utilisation
 * between raccts.
 */
void
racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
    struct ucred *newcred)
{
	struct uidinfo *olduip, *newuip;
	struct loginclass *oldlc, *newlc;
	struct prison *oldpr, *newpr, *pr;

	if (!racct_enable)
		return;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	newuip = newcred->cr_ruidinfo;
	olduip = oldcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	oldlc = oldcred->cr_loginclass;
	newpr = newcred->cr_prison;
	oldpr = oldcred->cr_prison;

	RACCT_LOCK();
	if (newuip != olduip) {
		racct_sub_racct(olduip->ui_racct, p->p_racct);
		racct_add_racct(newuip->ui_racct, p->p_racct);
	}
	if (newlc != oldlc) {
		racct_sub_racct(oldlc->lc_racct, p->p_racct);
		racct_add_racct(newlc->lc_racct, p->p_racct);
	}
	if (newpr != oldpr) {
		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
			racct_sub_racct(pr->pr_prison_racct->prr_racct,
			    p->p_racct);
		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
			racct_add_racct(pr->pr_prison_racct->prr_racct,
			    p->p_racct);
	}
	RACCT_UNLOCK();
}

void
racct_move(struct racct *dest, struct racct *src)
{

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	racct_add_racct(dest, src);
	racct_sub_racct(src, src);
	RACCT_UNLOCK();
}

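/*
 * AST handler run on return to user mode.  If the process has been
 * throttled, sleep until the timeout expires or racct_proc_wakeup() clears
 * the throttle; p_throttled < 0 means sleep with no timeout.
 */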
static void
ast_racct(struct thread *td, int tda __unused)
{
	struct proc *p;

	ASSERT_RACCT_ENABLED();

	p = td->td_proc;
	if (p->p_throttled == 0)
		return;

	PROC_LOCK(p);
	while (p->p_throttled != 0) {
		msleep(p->p_racct, &p->p_mtx, 0, "racct",
		    p->p_throttled < 0 ? 0 : p->p_throttled);
		if (p->p_throttled > 0)
			p->p_throttled = 0;
	}
	PROC_UNLOCK(p);
}

/*
 * Make the process sleep in userret() for 'timeout' ticks.  Setting
 * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
 */
void
racct_proc_throttle(struct proc *p, int timeout)
{
	struct thread *td;
#ifdef SMP
	int cpuid;
#endif

	KASSERT(timeout != 0, ("timeout %d", timeout));
	ASSERT_RACCT_ENABLED();
	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * Do not block kernel processes.  Also do not block processes with
	 * low %cpu utilization to improve interactivity.
	 */
	if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
		return;

	if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
		return;

	p->p_throttled = timeout;

	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		ast_sched_locked(td, TDA_RACCT);

		switch (TD_GET_STATE(td)) {
		case TDS_RUNQ:
			/*
			 * If the thread is on the scheduler run-queue, we can
			 * not just remove it from there.  So we set the flag
			 * TDA_SCHED for the thread, so that once it is
			 * running, it is taken off the cpu as soon as
			 * possible.
			 */
			ast_sched_locked(td, TDA_SCHED);
			break;
		case TDS_RUNNING:
			/*
			 * If the thread is running, we request a context
			 * switch for it by setting the TDA_SCHED flag.
			 */
			ast_sched_locked(td, TDA_SCHED);
#ifdef SMP
			cpuid = td->td_oncpu;
			if ((cpuid != NOCPU) && (td != curthread))
				ipi_cpu(cpuid, IPI_AST);
#endif
			break;
		default:
			break;
		}
		thread_unlock(td);
	}
}

static void
racct_proc_wakeup(struct proc *p)
{

	ASSERT_RACCT_ENABLED();

	PROC_LOCK_ASSERT(p, MA_OWNED);

	if (p->p_throttled != 0) {
		p->p_throttled = 0;
		wakeup(p->p_racct);
	}
}

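/*
 * Decay step applied to a single per-credential racct (uidinfo, loginclass
 * or prison) on every racctd pass: run the rctl throttling decay for the
 * I/O rate resources and scale the accumulated %cpu usage down by
 * RACCT_DECAY_FACTOR.
 */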
static void
racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
{
	int64_t r_old, r_new;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

#ifdef RCTL
	rctl_throttle_decay(racct, RACCT_READBPS);
	rctl_throttle_decay(racct, RACCT_WRITEBPS);
	rctl_throttle_decay(racct, RACCT_READIOPS);
	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
#endif

	r_old = racct->r_resources[RACCT_PCTCPU];

	/* If there is nothing to decay, just exit. */
	if (r_old <= 0)
		return;

	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
	racct->r_resources[RACCT_PCTCPU] = r_new;
}

static void
racct_decay_pre(void)
{

	RACCT_LOCK();
}

static void
racct_decay_post(void)
{

	RACCT_UNLOCK();
}

static void
racct_decay(void)
{

	ASSERT_RACCT_ENABLED();

	ui_racct_foreach(racct_decay_callback, racct_decay_pre,
	    racct_decay_post, NULL, NULL);
	loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
	    racct_decay_post, NULL, NULL);
	prison_racct_foreach(racct_decay_callback, racct_decay_pre,
	    racct_decay_post, NULL, NULL);
}

static void
racctd(void)
{
	struct thread *td;
	struct proc *p;
	struct timeval wallclock;
	uint64_t pct, pct_estimate, runtime;

	ASSERT_RACCT_ENABLED();

	for (;;) {
		racct_decay();

		sx_slock(&allproc_lock);

		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL) {
				if (p->p_state == PRS_ZOMBIE)
					racct_set(p, RACCT_PCTCPU, 0);
				PROC_UNLOCK(p);
				continue;
			}

			microuptime(&wallclock);
			timevalsub(&wallclock, &p->p_stats->p_start);
			PROC_STATLOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				ruxagg(p, td);
			runtime = cputick2usec(p->p_rux.rux_runtime);
			PROC_STATUNLOCK(p);
#ifdef notyet
			KASSERT(runtime >= p->p_prev_runtime,
			    ("runtime < p_prev_runtime"));
#else
			if (runtime < p->p_prev_runtime)
				runtime = p->p_prev_runtime;
#endif
			p->p_prev_runtime = runtime;
			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
				pct_estimate = (1000000 * runtime * 100) /
				    ((uint64_t)wallclock.tv_sec * 1000000 +
				    wallclock.tv_usec);
			} else
				pct_estimate = 0;
			pct = racct_getpcpu(p, pct_estimate);
			RACCT_LOCK();
#ifdef RCTL
			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
			rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
#endif
			racct_set_locked(p, RACCT_PCTCPU, pct, 1);
			racct_set_locked(p, RACCT_CPU, runtime, 0);
			racct_set_locked(p, RACCT_WALLCLOCK,
			    (uint64_t)wallclock.tv_sec * 1000000 +
			    wallclock.tv_usec, 0);
			RACCT_UNLOCK();
			PROC_UNLOCK(p);
		}

		/*
		 * To ensure that processes are throttled in a fair way, we
		 * need to iterate over all processes again and check the
		 * limits for the %cpu resource only after the ucred racct
		 * containers have been properly filled.
		 */
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL) {
				PROC_UNLOCK(p);
				continue;
			}

			if (racct_pcpu_available(p) <= 0) {
				if (p->p_racct->r_resources[RACCT_PCTCPU] >
				    pcpu_threshold)
					racct_proc_throttle(p, -1);
			} else if (p->p_throttled == -1) {
				racct_proc_wakeup(p);
			}
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		pause("-", hz);
	}
}

static struct kproc_desc racctd_kp = {
	"racctd",
	racctd,
	NULL
};

static void
racctd_init(void)
{
	if (!racct_enable)
		return;

	kproc_start(&racctd_kp);
}
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);

static void
racct_init(void)
{
	if (!racct_enable)
		return;

	racct_zone = uma_zcreate("racct", sizeof(struct racct),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	ast_register(TDA_RACCT, ASTR_ASTF_REQUIRED, 0, ast_racct);

	/*
	 * XXX: Move this somewhere.
	 */
	prison0.pr_prison_racct = prison_racct_find("0");
}
SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);

#endif /* !RACCT */