1 /*- 2 * Copyright (c) 2010 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD$ 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_sched.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/eventhandler.h> 40 #include <sys/jail.h> 41 #include <sys/kernel.h> 42 #include <sys/kthread.h> 43 #include <sys/lock.h> 44 #include <sys/loginclass.h> 45 #include <sys/malloc.h> 46 #include <sys/mutex.h> 47 #include <sys/proc.h> 48 #include <sys/racct.h> 49 #include <sys/resourcevar.h> 50 #include <sys/sbuf.h> 51 #include <sys/sched.h> 52 #include <sys/sdt.h> 53 #include <sys/smp.h> 54 #include <sys/sx.h> 55 #include <sys/sysctl.h> 56 #include <sys/sysent.h> 57 #include <sys/sysproto.h> 58 #include <sys/umtx.h> 59 #include <machine/smp.h> 60 61 #ifdef RCTL 62 #include <sys/rctl.h> 63 #endif 64 65 #ifdef RACCT 66 67 FEATURE(racct, "Resource Accounting"); 68 69 /* 70 * Do not block processes that have their %cpu usage <= pcpu_threshold. 71 */ 72 static int pcpu_threshold = 1; 73 #ifdef RACCT_DEFAULT_TO_DISABLED 74 int racct_enable = 0; 75 #else 76 int racct_enable = 1; 77 #endif 78 79 SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 80 SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 81 0, "Enable RACCT/RCTL"); 82 SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 83 0, "Processes with higher %cpu usage than this value can be throttled."); 84 85 /* 86 * How many seconds it takes to use the scheduler %cpu calculations. When a 87 * process starts, we compute its %cpu usage by dividing its runtime by the 88 * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 89 * provided by the scheduler. 90 */ 91 #define RACCT_PCPU_SECS 3 92 93 static struct mtx racct_lock; 94 MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 95 96 static uma_zone_t racct_zone; 97 98 static void racct_sub_racct(struct racct *dest, const struct racct *src); 99 static void racct_sub_cred_locked(struct ucred *cred, int resource, 100 uint64_t amount); 101 static void racct_add_cred_locked(struct ucred *cred, int resource, 102 uint64_t amount); 103 104 SDT_PROVIDER_DEFINE(racct); 105 SDT_PROBE_DEFINE3(racct, , rusage, add, 106 "struct proc *", "int", "uint64_t"); 107 SDT_PROBE_DEFINE3(racct, , rusage, add__failure, 108 "struct proc *", "int", "uint64_t"); 109 SDT_PROBE_DEFINE3(racct, , rusage, add__cred, 110 "struct ucred *", "int", "uint64_t"); 111 SDT_PROBE_DEFINE3(racct, , rusage, add__force, 112 "struct proc *", "int", "uint64_t"); 113 SDT_PROBE_DEFINE3(racct, , rusage, set, 114 "struct proc *", "int", "uint64_t"); 115 SDT_PROBE_DEFINE3(racct, , rusage, set__failure, 116 "struct proc *", "int", "uint64_t"); 117 SDT_PROBE_DEFINE3(racct, , rusage, sub, 118 "struct proc *", "int", "uint64_t"); 119 SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, 120 "struct ucred *", "int", "uint64_t"); 121 SDT_PROBE_DEFINE1(racct, , racct, create, 122 "struct racct *"); 123 SDT_PROBE_DEFINE1(racct, , racct, destroy, 124 "struct racct *"); 125 SDT_PROBE_DEFINE2(racct, , racct, join, 126 "struct racct *", "struct racct *"); 127 SDT_PROBE_DEFINE2(racct, , racct, join__failure, 128 "struct racct *", "struct racct *"); 129 SDT_PROBE_DEFINE2(racct, , racct, leave, 130 "struct racct *", "struct racct *"); 131 132 int racct_types[] = { 133 [RACCT_CPU] = 134 RACCT_IN_MILLIONS, 135 [RACCT_DATA] = 136 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 137 [RACCT_STACK] = 138 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 139 [RACCT_CORE] = 140 RACCT_DENIABLE, 141 [RACCT_RSS] = 142 RACCT_RECLAIMABLE, 143 [RACCT_MEMLOCK] = 144 RACCT_RECLAIMABLE | RACCT_DENIABLE, 145 [RACCT_NPROC] = 146 RACCT_RECLAIMABLE | RACCT_DENIABLE, 147 [RACCT_NOFILE] = 148 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 149 [RACCT_VMEM] = 150 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 151 [RACCT_NPTS] = 152 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 153 [RACCT_SWAP] = 154 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 155 [RACCT_NTHR] = 156 RACCT_RECLAIMABLE | RACCT_DENIABLE, 157 [RACCT_MSGQQUEUED] = 158 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 159 [RACCT_MSGQSIZE] = 160 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 161 [RACCT_NMSGQ] = 162 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 163 [RACCT_NSEM] = 164 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 165 [RACCT_NSEMOP] = 166 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 167 [RACCT_NSHM] = 168 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 169 [RACCT_SHMSIZE] = 170 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 171 [RACCT_WALLCLOCK] = 172 RACCT_IN_MILLIONS, 173 [RACCT_PCTCPU] = 174 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; 175 176 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 177 178 #ifdef SCHED_4BSD 179 /* 180 * Contains intermediate values for %cpu calculations to avoid using floating 181 * point in the kernel. 182 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 183 * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 184 * zero so the calculations are more straightforward. 185 */ 186 fixpt_t ccpu_exp[] = { 187 [0] = FSCALE * 1, 188 [1] = FSCALE * 0.95122942450071400909, 189 [2] = FSCALE * 0.90483741803595957316, 190 [3] = FSCALE * 0.86070797642505780722, 191 [4] = FSCALE * 0.81873075307798185866, 192 [5] = FSCALE * 0.77880078307140486824, 193 [6] = FSCALE * 0.74081822068171786606, 194 [7] = FSCALE * 0.70468808971871343435, 195 [8] = FSCALE * 0.67032004603563930074, 196 [9] = FSCALE * 0.63762815162177329314, 197 [10] = FSCALE * 0.60653065971263342360, 198 [11] = FSCALE * 0.57694981038048669531, 199 [12] = FSCALE * 0.54881163609402643262, 200 [13] = FSCALE * 0.52204577676101604789, 201 [14] = FSCALE * 0.49658530379140951470, 202 [15] = FSCALE * 0.47236655274101470713, 203 [16] = FSCALE * 0.44932896411722159143, 204 [17] = FSCALE * 0.42741493194872666992, 205 [18] = FSCALE * 0.40656965974059911188, 206 [19] = FSCALE * 0.38674102345450120691, 207 [20] = FSCALE * 0.36787944117144232159, 208 [21] = FSCALE * 0.34993774911115535467, 209 [22] = FSCALE * 0.33287108369807955328, 210 [23] = FSCALE * 0.31663676937905321821, 211 [24] = FSCALE * 0.30119421191220209664, 212 [25] = FSCALE * 0.28650479686019010032, 213 [26] = FSCALE * 0.27253179303401260312, 214 [27] = FSCALE * 0.25924026064589150757, 215 [28] = FSCALE * 0.24659696394160647693, 216 [29] = FSCALE * 0.23457028809379765313, 217 [30] = FSCALE * 0.22313016014842982893, 218 [31] = FSCALE * 0.21224797382674305771, 219 [32] = FSCALE * 0.20189651799465540848, 220 [33] = FSCALE * 0.19204990862075411423, 221 [34] = FSCALE * 0.18268352405273465022, 222 [35] = FSCALE * 0.17377394345044512668, 223 [36] = FSCALE * 0.16529888822158653829, 224 [37] = FSCALE * 0.15723716631362761621, 225 [38] = FSCALE * 0.14956861922263505264, 226 [39] = FSCALE * 0.14227407158651357185, 227 [40] = FSCALE * 0.13533528323661269189, 228 [41] = FSCALE * 0.12873490358780421886, 229 [42] = FSCALE * 0.12245642825298191021, 230 [43] = FSCALE * 0.11648415777349695786, 231 [44] = FSCALE * 0.11080315836233388333, 232 [45] = FSCALE * 0.10539922456186433678, 233 [46] = FSCALE * 0.10025884372280373372, 234 [47] = FSCALE * 0.09536916221554961888, 235 [48] = FSCALE * 0.09071795328941250337, 236 [49] = FSCALE * 0.08629358649937051097, 237 [50] = FSCALE * 0.08208499862389879516, 238 [51] = FSCALE * 0.07808166600115315231, 239 [52] = FSCALE * 0.07427357821433388042, 240 [53] = FSCALE * 0.07065121306042958674, 241 [54] = FSCALE * 0.06720551273974976512, 242 [55] = FSCALE * 0.06392786120670757270, 243 [56] = FSCALE * 0.06081006262521796499, 244 [57] = FSCALE * 0.05784432087483846296, 245 [58] = FSCALE * 0.05502322005640722902, 246 [59] = FSCALE * 0.05233970594843239308, 247 [60] = FSCALE * 0.04978706836786394297, 248 [61] = FSCALE * 0.04735892439114092119, 249 [62] = FSCALE * 0.04504920239355780606, 250 [63] = FSCALE * 0.04285212686704017991, 251 [64] = FSCALE * 0.04076220397836621516, 252 [65] = FSCALE * 0.03877420783172200988, 253 [66] = FSCALE * 0.03688316740124000544, 254 [67] = FSCALE * 0.03508435410084502588, 255 [68] = FSCALE * 0.03337326996032607948, 256 [69] = FSCALE * 0.03174563637806794323, 257 [70] = FSCALE * 0.03019738342231850073, 258 [71] = FSCALE * 0.02872463965423942912, 259 [72] = FSCALE * 0.02732372244729256080, 260 [73] = FSCALE * 0.02599112877875534358, 261 [74] = FSCALE * 0.02472352647033939120, 262 [75] = FSCALE * 0.02351774585600910823, 263 [76] = FSCALE * 0.02237077185616559577, 264 [77] = FSCALE * 0.02127973643837716938, 265 [78] = FSCALE * 0.02024191144580438847, 266 [79] = FSCALE * 0.01925470177538692429, 267 [80] = FSCALE * 0.01831563888873418029, 268 [81] = FSCALE * 0.01742237463949351138, 269 [82] = FSCALE * 0.01657267540176124754, 270 [83] = FSCALE * 0.01576441648485449082, 271 [84] = FSCALE * 0.01499557682047770621, 272 [85] = FSCALE * 0.01426423390899925527, 273 [86] = FSCALE * 0.01356855901220093175, 274 [87] = FSCALE * 0.01290681258047986886, 275 [88] = FSCALE * 0.01227733990306844117, 276 [89] = FSCALE * 0.01167856697039544521, 277 [90] = FSCALE * 0.01110899653824230649, 278 [91] = FSCALE * 0.01056720438385265337, 279 [92] = FSCALE * 0.01005183574463358164, 280 [93] = FSCALE * 0.00956160193054350793, 281 [94] = FSCALE * 0.00909527710169581709, 282 [95] = FSCALE * 0.00865169520312063417, 283 [96] = FSCALE * 0.00822974704902002884, 284 [97] = FSCALE * 0.00782837754922577143, 285 [98] = FSCALE * 0.00744658307092434051, 286 [99] = FSCALE * 0.00708340892905212004, 287 [100] = FSCALE * 0.00673794699908546709, 288 [101] = FSCALE * 0.00640933344625638184, 289 [102] = FSCALE * 0.00609674656551563610, 290 [103] = FSCALE * 0.00579940472684214321, 291 [104] = FSCALE * 0.00551656442076077241, 292 [105] = FSCALE * 0.00524751839918138427, 293 [106] = FSCALE * 0.00499159390691021621, 294 [107] = FSCALE * 0.00474815099941147558, 295 [108] = FSCALE * 0.00451658094261266798, 296 [109] = FSCALE * 0.00429630469075234057, 297 [110] = FSCALE * 0.00408677143846406699, 298 }; 299 #endif 300 301 #define CCPU_EXP_MAX 110 302 303 /* 304 * This function is analogical to the getpcpu() function in the ps(1) command. 305 * They should both calculate in the same way so that the racct %cpu 306 * calculations are consistent with the values showed by the ps(1) tool. 307 * The calculations are more complex in the 4BSD scheduler because of the value 308 * of the ccpu variable. In ULE it is defined to be zero which saves us some 309 * work. 310 */ 311 static uint64_t 312 racct_getpcpu(struct proc *p, u_int pcpu) 313 { 314 u_int swtime; 315 #ifdef SCHED_4BSD 316 fixpt_t pctcpu, pctcpu_next; 317 #endif 318 #ifdef SMP 319 struct pcpu *pc; 320 int found; 321 #endif 322 fixpt_t p_pctcpu; 323 struct thread *td; 324 325 ASSERT_RACCT_ENABLED(); 326 327 /* 328 * If the process is swapped out, we count its %cpu usage as zero. 329 * This behaviour is consistent with the userland ps(1) tool. 330 */ 331 if ((p->p_flag & P_INMEM) == 0) 332 return (0); 333 swtime = (ticks - p->p_swtick) / hz; 334 335 /* 336 * For short-lived processes, the sched_pctcpu() returns small 337 * values even for cpu intensive processes. Therefore we use 338 * our own estimate in this case. 339 */ 340 if (swtime < RACCT_PCPU_SECS) 341 return (pcpu); 342 343 p_pctcpu = 0; 344 FOREACH_THREAD_IN_PROC(p, td) { 345 if (td == PCPU_GET(idlethread)) 346 continue; 347 #ifdef SMP 348 found = 0; 349 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 350 if (td == pc->pc_idlethread) { 351 found = 1; 352 break; 353 } 354 } 355 if (found) 356 continue; 357 #endif 358 thread_lock(td); 359 #ifdef SCHED_4BSD 360 pctcpu = sched_pctcpu(td); 361 /* Count also the yet unfinished second. */ 362 pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 363 pctcpu_next += sched_pctcpu_delta(td); 364 p_pctcpu += max(pctcpu, pctcpu_next); 365 #else 366 /* 367 * In ULE the %cpu statistics are updated on every 368 * sched_pctcpu() call. So special calculations to 369 * account for the latest (unfinished) second are 370 * not needed. 371 */ 372 p_pctcpu += sched_pctcpu(td); 373 #endif 374 thread_unlock(td); 375 } 376 377 #ifdef SCHED_4BSD 378 if (swtime <= CCPU_EXP_MAX) 379 return ((100 * (uint64_t)p_pctcpu * 1000000) / 380 (FSCALE - ccpu_exp[swtime])); 381 #endif 382 383 return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 384 } 385 386 static void 387 racct_add_racct(struct racct *dest, const struct racct *src) 388 { 389 int i; 390 391 ASSERT_RACCT_ENABLED(); 392 mtx_assert(&racct_lock, MA_OWNED); 393 394 /* 395 * Update resource usage in dest. 396 */ 397 for (i = 0; i <= RACCT_MAX; i++) { 398 KASSERT(dest->r_resources[i] >= 0, 399 ("%s: resource %d propagation meltdown: dest < 0", 400 __func__, i)); 401 KASSERT(src->r_resources[i] >= 0, 402 ("%s: resource %d propagation meltdown: src < 0", 403 __func__, i)); 404 dest->r_resources[i] += src->r_resources[i]; 405 } 406 } 407 408 static void 409 racct_sub_racct(struct racct *dest, const struct racct *src) 410 { 411 int i; 412 413 ASSERT_RACCT_ENABLED(); 414 mtx_assert(&racct_lock, MA_OWNED); 415 416 /* 417 * Update resource usage in dest. 418 */ 419 for (i = 0; i <= RACCT_MAX; i++) { 420 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 421 KASSERT(dest->r_resources[i] >= 0, 422 ("%s: resource %d propagation meltdown: dest < 0", 423 __func__, i)); 424 KASSERT(src->r_resources[i] >= 0, 425 ("%s: resource %d propagation meltdown: src < 0", 426 __func__, i)); 427 KASSERT(src->r_resources[i] <= dest->r_resources[i], 428 ("%s: resource %d propagation meltdown: src > dest", 429 __func__, i)); 430 } 431 if (RACCT_CAN_DROP(i)) { 432 dest->r_resources[i] -= src->r_resources[i]; 433 if (dest->r_resources[i] < 0) { 434 KASSERT(RACCT_IS_SLOPPY(i) || 435 RACCT_IS_DECAYING(i), 436 ("%s: resource %d usage < 0", __func__, i)); 437 dest->r_resources[i] = 0; 438 } 439 } 440 } 441 } 442 443 void 444 racct_create(struct racct **racctp) 445 { 446 447 if (!racct_enable) 448 return; 449 450 SDT_PROBE1(racct, , racct, create, racctp); 451 452 KASSERT(*racctp == NULL, ("racct already allocated")); 453 454 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 455 } 456 457 static void 458 racct_destroy_locked(struct racct **racctp) 459 { 460 int i; 461 struct racct *racct; 462 463 ASSERT_RACCT_ENABLED(); 464 465 SDT_PROBE1(racct, , racct, destroy, racctp); 466 467 mtx_assert(&racct_lock, MA_OWNED); 468 KASSERT(racctp != NULL, ("NULL racctp")); 469 KASSERT(*racctp != NULL, ("NULL racct")); 470 471 racct = *racctp; 472 473 for (i = 0; i <= RACCT_MAX; i++) { 474 if (RACCT_IS_SLOPPY(i)) 475 continue; 476 if (!RACCT_IS_RECLAIMABLE(i)) 477 continue; 478 KASSERT(racct->r_resources[i] == 0, 479 ("destroying non-empty racct: " 480 "%ju allocated for resource %d\n", 481 racct->r_resources[i], i)); 482 } 483 uma_zfree(racct_zone, racct); 484 *racctp = NULL; 485 } 486 487 void 488 racct_destroy(struct racct **racct) 489 { 490 491 if (!racct_enable) 492 return; 493 494 mtx_lock(&racct_lock); 495 racct_destroy_locked(racct); 496 mtx_unlock(&racct_lock); 497 } 498 499 /* 500 * Increase consumption of 'resource' by 'amount' for 'racct', 501 * but not its parents. Differently from other cases, 'amount' here 502 * may be less than zero. 503 */ 504 static void 505 racct_adjust_resource(struct racct *racct, int resource, 506 int64_t amount) 507 { 508 509 ASSERT_RACCT_ENABLED(); 510 mtx_assert(&racct_lock, MA_OWNED); 511 KASSERT(racct != NULL, ("NULL racct")); 512 513 racct->r_resources[resource] += amount; 514 if (racct->r_resources[resource] < 0) { 515 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 516 ("%s: resource %d usage < 0", __func__, resource)); 517 racct->r_resources[resource] = 0; 518 } 519 520 /* 521 * There are some cases where the racct %cpu resource would grow 522 * beyond 100% per core. For example in racct_proc_exit() we add 523 * the process %cpu usage to the ucred racct containers. If too 524 * many processes terminated in a short time span, the ucred %cpu 525 * resource could grow too much. Also, the 4BSD scheduler sometimes 526 * returns for a thread more than 100% cpu usage. So we set a sane 527 * boundary here to 100% * the maxumum number of CPUs. 528 */ 529 if ((resource == RACCT_PCTCPU) && 530 (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) 531 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; 532 } 533 534 static int 535 racct_add_locked(struct proc *p, int resource, uint64_t amount) 536 { 537 #ifdef RCTL 538 int error; 539 #endif 540 541 ASSERT_RACCT_ENABLED(); 542 543 SDT_PROBE3(racct, , rusage, add, p, resource, amount); 544 545 /* 546 * We need proc lock to dereference p->p_ucred. 547 */ 548 PROC_LOCK_ASSERT(p, MA_OWNED); 549 550 #ifdef RCTL 551 error = rctl_enforce(p, resource, amount); 552 if (error && RACCT_IS_DENIABLE(resource)) { 553 SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); 554 return (error); 555 } 556 #endif 557 racct_adjust_resource(p->p_racct, resource, amount); 558 racct_add_cred_locked(p->p_ucred, resource, amount); 559 560 return (0); 561 } 562 563 /* 564 * Increase allocation of 'resource' by 'amount' for process 'p'. 565 * Return 0 if it's below limits, or errno, if it's not. 566 */ 567 int 568 racct_add(struct proc *p, int resource, uint64_t amount) 569 { 570 int error; 571 572 if (!racct_enable) 573 return (0); 574 575 mtx_lock(&racct_lock); 576 error = racct_add_locked(p, resource, amount); 577 mtx_unlock(&racct_lock); 578 return (error); 579 } 580 581 static void 582 racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 583 { 584 struct prison *pr; 585 586 ASSERT_RACCT_ENABLED(); 587 588 SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); 589 590 racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 591 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 592 racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, 593 amount); 594 racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); 595 } 596 597 /* 598 * Increase allocation of 'resource' by 'amount' for credential 'cred'. 599 * Doesn't check for limits and never fails. 600 * 601 * XXX: Shouldn't this ever return an error? 602 */ 603 void 604 racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 605 { 606 607 if (!racct_enable) 608 return; 609 610 mtx_lock(&racct_lock); 611 racct_add_cred_locked(cred, resource, amount); 612 mtx_unlock(&racct_lock); 613 } 614 615 /* 616 * Increase allocation of 'resource' by 'amount' for process 'p'. 617 * Doesn't check for limits and never fails. 618 */ 619 void 620 racct_add_force(struct proc *p, int resource, uint64_t amount) 621 { 622 623 if (!racct_enable) 624 return; 625 626 SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); 627 628 /* 629 * We need proc lock to dereference p->p_ucred. 630 */ 631 PROC_LOCK_ASSERT(p, MA_OWNED); 632 633 mtx_lock(&racct_lock); 634 racct_adjust_resource(p->p_racct, resource, amount); 635 racct_add_cred_locked(p->p_ucred, resource, amount); 636 mtx_unlock(&racct_lock); 637 } 638 639 static int 640 racct_set_locked(struct proc *p, int resource, uint64_t amount) 641 { 642 int64_t old_amount, decayed_amount; 643 int64_t diff_proc, diff_cred; 644 #ifdef RCTL 645 int error; 646 #endif 647 648 ASSERT_RACCT_ENABLED(); 649 650 SDT_PROBE3(racct, , rusage, set, p, resource, amount); 651 652 /* 653 * We need proc lock to dereference p->p_ucred. 654 */ 655 PROC_LOCK_ASSERT(p, MA_OWNED); 656 657 old_amount = p->p_racct->r_resources[resource]; 658 /* 659 * The diffs may be negative. 660 */ 661 diff_proc = amount - old_amount; 662 if (RACCT_IS_DECAYING(resource)) { 663 /* 664 * Resources in per-credential racct containers may decay. 665 * If this is the case, we need to calculate the difference 666 * between the new amount and the proportional value of the 667 * old amount that has decayed in the ucred racct containers. 668 */ 669 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 670 diff_cred = amount - decayed_amount; 671 } else 672 diff_cred = diff_proc; 673 #ifdef notyet 674 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 675 ("%s: usage of non-droppable resource %d dropping", __func__, 676 resource)); 677 #endif 678 #ifdef RCTL 679 if (diff_proc > 0) { 680 error = rctl_enforce(p, resource, diff_proc); 681 if (error && RACCT_IS_DENIABLE(resource)) { 682 SDT_PROBE3(racct, , rusage, set__failure, p, resource, 683 amount); 684 return (error); 685 } 686 } 687 #endif 688 racct_adjust_resource(p->p_racct, resource, diff_proc); 689 if (diff_cred > 0) 690 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 691 else if (diff_cred < 0) 692 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 693 694 return (0); 695 } 696 697 /* 698 * Set allocation of 'resource' to 'amount' for process 'p'. 699 * Return 0 if it's below limits, or errno, if it's not. 700 * 701 * Note that decreasing the allocation always returns 0, 702 * even if it's above the limit. 703 */ 704 int 705 racct_set(struct proc *p, int resource, uint64_t amount) 706 { 707 int error; 708 709 if (!racct_enable) 710 return (0); 711 712 mtx_lock(&racct_lock); 713 error = racct_set_locked(p, resource, amount); 714 mtx_unlock(&racct_lock); 715 return (error); 716 } 717 718 static void 719 racct_set_force_locked(struct proc *p, int resource, uint64_t amount) 720 { 721 int64_t old_amount, decayed_amount; 722 int64_t diff_proc, diff_cred; 723 724 ASSERT_RACCT_ENABLED(); 725 726 SDT_PROBE3(racct, , rusage, set, p, resource, amount); 727 728 /* 729 * We need proc lock to dereference p->p_ucred. 730 */ 731 PROC_LOCK_ASSERT(p, MA_OWNED); 732 733 old_amount = p->p_racct->r_resources[resource]; 734 /* 735 * The diffs may be negative. 736 */ 737 diff_proc = amount - old_amount; 738 if (RACCT_IS_DECAYING(resource)) { 739 /* 740 * Resources in per-credential racct containers may decay. 741 * If this is the case, we need to calculate the difference 742 * between the new amount and the proportional value of the 743 * old amount that has decayed in the ucred racct containers. 744 */ 745 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 746 diff_cred = amount - decayed_amount; 747 } else 748 diff_cred = diff_proc; 749 750 racct_adjust_resource(p->p_racct, resource, diff_proc); 751 if (diff_cred > 0) 752 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 753 else if (diff_cred < 0) 754 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 755 } 756 757 void 758 racct_set_force(struct proc *p, int resource, uint64_t amount) 759 { 760 761 if (!racct_enable) 762 return; 763 764 mtx_lock(&racct_lock); 765 racct_set_force_locked(p, resource, amount); 766 mtx_unlock(&racct_lock); 767 } 768 769 /* 770 * Returns amount of 'resource' the process 'p' can keep allocated. 771 * Allocating more than that would be denied, unless the resource 772 * is marked undeniable. Amount of already allocated resource does 773 * not matter. 774 */ 775 uint64_t 776 racct_get_limit(struct proc *p, int resource) 777 { 778 779 if (!racct_enable) 780 return (UINT64_MAX); 781 782 #ifdef RCTL 783 return (rctl_get_limit(p, resource)); 784 #else 785 return (UINT64_MAX); 786 #endif 787 } 788 789 /* 790 * Returns amount of 'resource' the process 'p' can keep allocated. 791 * Allocating more than that would be denied, unless the resource 792 * is marked undeniable. Amount of already allocated resource does 793 * matter. 794 */ 795 uint64_t 796 racct_get_available(struct proc *p, int resource) 797 { 798 799 if (!racct_enable) 800 return (UINT64_MAX); 801 802 #ifdef RCTL 803 return (rctl_get_available(p, resource)); 804 #else 805 return (UINT64_MAX); 806 #endif 807 } 808 809 /* 810 * Returns amount of the %cpu resource that process 'p' can add to its %cpu 811 * utilization. Adding more than that would lead to the process being 812 * throttled. 813 */ 814 static int64_t 815 racct_pcpu_available(struct proc *p) 816 { 817 818 ASSERT_RACCT_ENABLED(); 819 820 #ifdef RCTL 821 return (rctl_pcpu_available(p)); 822 #else 823 return (INT64_MAX); 824 #endif 825 } 826 827 /* 828 * Decrease allocation of 'resource' by 'amount' for process 'p'. 829 */ 830 void 831 racct_sub(struct proc *p, int resource, uint64_t amount) 832 { 833 834 if (!racct_enable) 835 return; 836 837 SDT_PROBE3(racct, , rusage, sub, p, resource, amount); 838 839 /* 840 * We need proc lock to dereference p->p_ucred. 841 */ 842 PROC_LOCK_ASSERT(p, MA_OWNED); 843 KASSERT(RACCT_CAN_DROP(resource), 844 ("%s: called for non-droppable resource %d", __func__, resource)); 845 846 mtx_lock(&racct_lock); 847 KASSERT(amount <= p->p_racct->r_resources[resource], 848 ("%s: freeing %ju of resource %d, which is more " 849 "than allocated %jd for %s (pid %d)", __func__, amount, resource, 850 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 851 852 racct_adjust_resource(p->p_racct, resource, -amount); 853 racct_sub_cred_locked(p->p_ucred, resource, amount); 854 mtx_unlock(&racct_lock); 855 } 856 857 static void 858 racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 859 { 860 struct prison *pr; 861 862 ASSERT_RACCT_ENABLED(); 863 864 SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); 865 866 #ifdef notyet 867 KASSERT(RACCT_CAN_DROP(resource), 868 ("%s: called for resource %d which can not drop", __func__, 869 resource)); 870 #endif 871 872 racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 873 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 874 racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, 875 -amount); 876 racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); 877 } 878 879 /* 880 * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 881 */ 882 void 883 racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 884 { 885 886 if (!racct_enable) 887 return; 888 889 mtx_lock(&racct_lock); 890 racct_sub_cred_locked(cred, resource, amount); 891 mtx_unlock(&racct_lock); 892 } 893 894 /* 895 * Inherit resource usage information from the parent process. 896 */ 897 int 898 racct_proc_fork(struct proc *parent, struct proc *child) 899 { 900 int i, error = 0; 901 902 if (!racct_enable) 903 return (0); 904 905 /* 906 * Create racct for the child process. 907 */ 908 racct_create(&child->p_racct); 909 910 PROC_LOCK(parent); 911 PROC_LOCK(child); 912 mtx_lock(&racct_lock); 913 914 #ifdef RCTL 915 error = rctl_proc_fork(parent, child); 916 if (error != 0) 917 goto out; 918 #endif 919 920 /* Init process cpu time. */ 921 child->p_prev_runtime = 0; 922 child->p_throttled = 0; 923 924 /* 925 * Inherit resource usage. 926 */ 927 for (i = 0; i <= RACCT_MAX; i++) { 928 if (parent->p_racct->r_resources[i] == 0 || 929 !RACCT_IS_INHERITABLE(i)) 930 continue; 931 932 error = racct_set_locked(child, i, 933 parent->p_racct->r_resources[i]); 934 if (error != 0) 935 goto out; 936 } 937 938 error = racct_add_locked(child, RACCT_NPROC, 1); 939 error += racct_add_locked(child, RACCT_NTHR, 1); 940 941 out: 942 mtx_unlock(&racct_lock); 943 PROC_UNLOCK(child); 944 PROC_UNLOCK(parent); 945 946 if (error != 0) 947 racct_proc_exit(child); 948 949 return (error); 950 } 951 952 /* 953 * Called at the end of fork1(), to handle rules that require the process 954 * to be fully initialized. 955 */ 956 void 957 racct_proc_fork_done(struct proc *child) 958 { 959 960 PROC_LOCK_ASSERT(child, MA_OWNED); 961 #ifdef RCTL 962 if (!racct_enable) 963 return; 964 965 mtx_lock(&racct_lock); 966 rctl_enforce(child, RACCT_NPROC, 0); 967 rctl_enforce(child, RACCT_NTHR, 0); 968 mtx_unlock(&racct_lock); 969 #endif 970 } 971 972 void 973 racct_proc_exit(struct proc *p) 974 { 975 int i; 976 uint64_t runtime; 977 struct timeval wallclock; 978 uint64_t pct_estimate, pct; 979 980 if (!racct_enable) 981 return; 982 983 PROC_LOCK(p); 984 /* 985 * We don't need to calculate rux, proc_reap() has already done this. 986 */ 987 runtime = cputick2usec(p->p_rux.rux_runtime); 988 #ifdef notyet 989 KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 990 #else 991 if (runtime < p->p_prev_runtime) 992 runtime = p->p_prev_runtime; 993 #endif 994 microuptime(&wallclock); 995 timevalsub(&wallclock, &p->p_stats->p_start); 996 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 997 pct_estimate = (1000000 * runtime * 100) / 998 ((uint64_t)wallclock.tv_sec * 1000000 + 999 wallclock.tv_usec); 1000 } else 1001 pct_estimate = 0; 1002 pct = racct_getpcpu(p, pct_estimate); 1003 1004 mtx_lock(&racct_lock); 1005 racct_set_locked(p, RACCT_CPU, runtime); 1006 racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 1007 1008 for (i = 0; i <= RACCT_MAX; i++) { 1009 if (p->p_racct->r_resources[i] == 0) 1010 continue; 1011 if (!RACCT_IS_RECLAIMABLE(i)) 1012 continue; 1013 racct_set_locked(p, i, 0); 1014 } 1015 1016 mtx_unlock(&racct_lock); 1017 PROC_UNLOCK(p); 1018 1019 #ifdef RCTL 1020 rctl_racct_release(p->p_racct); 1021 #endif 1022 racct_destroy(&p->p_racct); 1023 } 1024 1025 /* 1026 * Called after credentials change, to move resource utilisation 1027 * between raccts. 1028 */ 1029 void 1030 racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 1031 struct ucred *newcred) 1032 { 1033 struct uidinfo *olduip, *newuip; 1034 struct loginclass *oldlc, *newlc; 1035 struct prison *oldpr, *newpr, *pr; 1036 1037 if (!racct_enable) 1038 return; 1039 1040 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 1041 1042 newuip = newcred->cr_ruidinfo; 1043 olduip = oldcred->cr_ruidinfo; 1044 newlc = newcred->cr_loginclass; 1045 oldlc = oldcred->cr_loginclass; 1046 newpr = newcred->cr_prison; 1047 oldpr = oldcred->cr_prison; 1048 1049 mtx_lock(&racct_lock); 1050 if (newuip != olduip) { 1051 racct_sub_racct(olduip->ui_racct, p->p_racct); 1052 racct_add_racct(newuip->ui_racct, p->p_racct); 1053 } 1054 if (newlc != oldlc) { 1055 racct_sub_racct(oldlc->lc_racct, p->p_racct); 1056 racct_add_racct(newlc->lc_racct, p->p_racct); 1057 } 1058 if (newpr != oldpr) { 1059 for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 1060 racct_sub_racct(pr->pr_prison_racct->prr_racct, 1061 p->p_racct); 1062 for (pr = newpr; pr != NULL; pr = pr->pr_parent) 1063 racct_add_racct(pr->pr_prison_racct->prr_racct, 1064 p->p_racct); 1065 } 1066 mtx_unlock(&racct_lock); 1067 1068 #ifdef RCTL 1069 rctl_proc_ucred_changed(p, newcred); 1070 #endif 1071 } 1072 1073 void 1074 racct_move(struct racct *dest, struct racct *src) 1075 { 1076 1077 ASSERT_RACCT_ENABLED(); 1078 1079 mtx_lock(&racct_lock); 1080 1081 racct_add_racct(dest, src); 1082 racct_sub_racct(src, src); 1083 1084 mtx_unlock(&racct_lock); 1085 } 1086 1087 static void 1088 racct_proc_throttle(struct proc *p) 1089 { 1090 struct thread *td; 1091 #ifdef SMP 1092 int cpuid; 1093 #endif 1094 1095 ASSERT_RACCT_ENABLED(); 1096 PROC_LOCK_ASSERT(p, MA_OWNED); 1097 1098 /* 1099 * Do not block kernel processes. Also do not block processes with 1100 * low %cpu utilization to improve interactivity. 1101 */ 1102 if (((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) || 1103 (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) 1104 return; 1105 p->p_throttled = 1; 1106 1107 FOREACH_THREAD_IN_PROC(p, td) { 1108 thread_lock(td); 1109 switch (td->td_state) { 1110 case TDS_RUNQ: 1111 /* 1112 * If the thread is on the scheduler run-queue, we can 1113 * not just remove it from there. So we set the flag 1114 * TDF_NEEDRESCHED for the thread, so that once it is 1115 * running, it is taken off the cpu as soon as possible. 1116 */ 1117 td->td_flags |= TDF_NEEDRESCHED; 1118 break; 1119 case TDS_RUNNING: 1120 /* 1121 * If the thread is running, we request a context 1122 * switch for it by setting the TDF_NEEDRESCHED flag. 1123 */ 1124 td->td_flags |= TDF_NEEDRESCHED; 1125 #ifdef SMP 1126 cpuid = td->td_oncpu; 1127 if ((cpuid != NOCPU) && (td != curthread)) 1128 ipi_cpu(cpuid, IPI_AST); 1129 #endif 1130 break; 1131 default: 1132 break; 1133 } 1134 thread_unlock(td); 1135 } 1136 } 1137 1138 static void 1139 racct_proc_wakeup(struct proc *p) 1140 { 1141 1142 ASSERT_RACCT_ENABLED(); 1143 1144 PROC_LOCK_ASSERT(p, MA_OWNED); 1145 1146 if (p->p_throttled) { 1147 p->p_throttled = 0; 1148 wakeup(p->p_racct); 1149 } 1150 } 1151 1152 static void 1153 racct_decay_resource(struct racct *racct, void * res, void* dummy) 1154 { 1155 int resource; 1156 int64_t r_old, r_new; 1157 1158 ASSERT_RACCT_ENABLED(); 1159 mtx_assert(&racct_lock, MA_OWNED); 1160 1161 resource = *(int *)res; 1162 r_old = racct->r_resources[resource]; 1163 1164 /* If there is nothing to decay, just exit. */ 1165 if (r_old <= 0) 1166 return; 1167 1168 r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1169 racct->r_resources[resource] = r_new; 1170 } 1171 1172 static void 1173 racct_decay_pre(void) 1174 { 1175 1176 mtx_lock(&racct_lock); 1177 } 1178 1179 static void 1180 racct_decay_post(void) 1181 { 1182 1183 mtx_unlock(&racct_lock); 1184 } 1185 1186 static void 1187 racct_decay(int resource) 1188 { 1189 1190 ASSERT_RACCT_ENABLED(); 1191 1192 ui_racct_foreach(racct_decay_resource, racct_decay_pre, 1193 racct_decay_post, &resource, NULL); 1194 loginclass_racct_foreach(racct_decay_resource, racct_decay_pre, 1195 racct_decay_post, &resource, NULL); 1196 prison_racct_foreach(racct_decay_resource, racct_decay_pre, 1197 racct_decay_post, &resource, NULL); 1198 } 1199 1200 static void 1201 racctd(void) 1202 { 1203 struct thread *td; 1204 struct proc *p; 1205 struct timeval wallclock; 1206 uint64_t runtime; 1207 uint64_t pct, pct_estimate; 1208 1209 ASSERT_RACCT_ENABLED(); 1210 1211 for (;;) { 1212 racct_decay(RACCT_PCTCPU); 1213 1214 sx_slock(&allproc_lock); 1215 1216 LIST_FOREACH(p, &zombproc, p_list) { 1217 PROC_LOCK(p); 1218 racct_set(p, RACCT_PCTCPU, 0); 1219 PROC_UNLOCK(p); 1220 } 1221 1222 FOREACH_PROC_IN_SYSTEM(p) { 1223 PROC_LOCK(p); 1224 if (p->p_state != PRS_NORMAL) { 1225 PROC_UNLOCK(p); 1226 continue; 1227 } 1228 1229 microuptime(&wallclock); 1230 timevalsub(&wallclock, &p->p_stats->p_start); 1231 PROC_STATLOCK(p); 1232 FOREACH_THREAD_IN_PROC(p, td) 1233 ruxagg(p, td); 1234 runtime = cputick2usec(p->p_rux.rux_runtime); 1235 PROC_STATUNLOCK(p); 1236 #ifdef notyet 1237 KASSERT(runtime >= p->p_prev_runtime, 1238 ("runtime < p_prev_runtime")); 1239 #else 1240 if (runtime < p->p_prev_runtime) 1241 runtime = p->p_prev_runtime; 1242 #endif 1243 p->p_prev_runtime = runtime; 1244 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1245 pct_estimate = (1000000 * runtime * 100) / 1246 ((uint64_t)wallclock.tv_sec * 1000000 + 1247 wallclock.tv_usec); 1248 } else 1249 pct_estimate = 0; 1250 pct = racct_getpcpu(p, pct_estimate); 1251 mtx_lock(&racct_lock); 1252 racct_set_force_locked(p, RACCT_PCTCPU, pct); 1253 racct_set_locked(p, RACCT_CPU, runtime); 1254 racct_set_locked(p, RACCT_WALLCLOCK, 1255 (uint64_t)wallclock.tv_sec * 1000000 + 1256 wallclock.tv_usec); 1257 mtx_unlock(&racct_lock); 1258 PROC_UNLOCK(p); 1259 } 1260 1261 /* 1262 * To ensure that processes are throttled in a fair way, we need 1263 * to iterate over all processes again and check the limits 1264 * for %cpu resource only after ucred racct containers have been 1265 * properly filled. 1266 */ 1267 FOREACH_PROC_IN_SYSTEM(p) { 1268 PROC_LOCK(p); 1269 if (p->p_state != PRS_NORMAL) { 1270 PROC_UNLOCK(p); 1271 continue; 1272 } 1273 1274 if (racct_pcpu_available(p) <= 0) 1275 racct_proc_throttle(p); 1276 else if (p->p_throttled) 1277 racct_proc_wakeup(p); 1278 PROC_UNLOCK(p); 1279 } 1280 sx_sunlock(&allproc_lock); 1281 pause("-", hz); 1282 } 1283 } 1284 1285 static struct kproc_desc racctd_kp = { 1286 "racctd", 1287 racctd, 1288 NULL 1289 }; 1290 1291 static void 1292 racctd_init(void) 1293 { 1294 if (!racct_enable) 1295 return; 1296 1297 kproc_start(&racctd_kp); 1298 } 1299 SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); 1300 1301 static void 1302 racct_init(void) 1303 { 1304 if (!racct_enable) 1305 return; 1306 1307 racct_zone = uma_zcreate("racct", sizeof(struct racct), 1308 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1309 /* 1310 * XXX: Move this somewhere. 1311 */ 1312 prison0.pr_prison_racct = prison_racct_find("0"); 1313 } 1314 SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1315 1316 #endif /* !RACCT */ 1317