/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The fair share scheduling class ensures that collections of processes * (zones and projects) each get their configured share of CPU. This is in * contrast to the TS class which considers individual processes. * * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on * projects using the project.cpu-shares rctl. By default the value is 1 * and it can range from 0 - 64k. A value of 0 means that processes in the * collection will only get CPU resources when there are no other processes * that need CPU. The cpu-share is used as one of the inputs to calculate a * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls * in the range 0-59. FSS calculates other, internal, priorities which are not * visible outside of the FSS class. * * The FSS class should approximate TS behavior when there are excess CPU * resources. When there is a backlog of runnable processes, then the share * is used as input into the runnable process's priority calculation, where * the final umdpri is used by the scheduler to determine when the process runs. * * Projects in a zone compete with each other for CPU time, receiving CPU * allocation within a zone proportional to the project's share; at a higher * level zones compete with each other, receiving allocation in a pset * proportional to the zone's share. * * The FSS priority calculation consists of several parts. * * 1) Once per second the fss_update function runs. The first thing it does is * call fss_decay_usage. This function does three things. * * a) fss_decay_usage first decays the maxfsspri value for the pset. This * value is used in the per-process priority calculation described in step * (2b). The maxfsspri is decayed using the following formula: * * maxfsspri * fss_nice_decay[NZERO]) * maxfsspri = ------------------------------------ * FSS_DECAY_BASE * * * - NZERO is the default process priority (i.e. 20) * * The fss_nice_decay array is a fixed set of values used to adjust the * decay rate of processes based on their nice value. 
Entries in this * array are initialized in fss_init using the following formula: * * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i * FSS_DECAY_MIN + ------------------------------------- * FSS_NICE_RANGE - 1 * * - FSS_DECAY_MIN is 83 = approximates 65% (83/128) * - FSS_DECAY_MAX is 108 = approximates 85% (108/128) * - FSS_NICE_RANGE is 40 (range is 0 - 39) * * b) The second thing fss_decay_usage does is update each project's "usage" * for the last second and then recalculates the project's "share usage". * * The usage value is the recent CPU usage for all of the threads in the * project. It is decayed and updated this way: * * (usage * FSS_DECAY_USG) * usage = ------------------------- + ticks; * FSS_DECAY_BASE * * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide * - FSS_DECAY_USG is 96 - approximates 75% (96/128) * - ticks is updated whenever a process in this project is running * when the scheduler's tick processing fires. This is not a simple * counter; the values are based on the entries in the fss_nice_tick * array (see section 3 below). ticks is then reset to 0 so it can track * the next second's worth of nice-adjusted time for the project. * * c) The third thing fss_decay_usage does is update each project's "share * usage" (shusage). This is the normalized usage value for the project and * is calculated this way: * * pset_shares^2 zone_int_shares^2 * usage * ------------- * ------------------ * kpj_shares^2 zone_ext_shares^2 * * - usage - see (1b) for more details * - pset_shares is the total of all *active* zone shares in the pset (by * default there is only one pset) * - kpj_shares is the individual project's share (project.cpu-shares rctl) * - zone_int_shares is the sum of shares of all active projects within the * zone (the zone-internal total) * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl) * * The shusage is used in step (2b) to calculate the thread's new internal * priority. A larger shusage value leads to a lower priority. * * 2) The fss_update function then calls fss_update_list to update the priority * of all threads. This does two things. * * a) First the thread's internal priority is decayed using the following * formula: * * fsspri * fss_nice_decay[nice_value] * fsspri = ------------------------------------ * FSS_DECAY_BASE * * - FSS_DECAY_BASE is 128 as described above * * b) Second, if the thread is runnable (TS_RUN or TS_WAIT), fss_newpri is * called to update the user-mode priority (umdpri) of the runnable thread. * Threads that are running (TS_ONPROC) or waiting for an event (TS_SLEEP) * are not updated at this time. The updated user-mode priority can cause * threads to change their position in the run queue. * * The process's new internal fsspri is calculated using the following * formula. All runnable threads in the project will use the same shusage * and nrunnable values in their calculation. * * fsspri += shusage * nrunnable * ticks * * - shusage is the project's share usage, calculated in (1c) * - nrunnable is the number of runnable threads in the project * - ticks is the number of ticks this thread ran since the last fss_newpri * invocation. * * Finally the process's new user-mode priority is calculated using the * following formula: * * (fsspri * umdprirange) * umdpri = maxumdpri - ------------------------ * maxfsspri * * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59) * - umdprirange is maxumdpri - 1 (i.e.
58) * - maxfsspri is the largest fsspri seen so far, as we're iterating all * runnable processes * * Thus, a higher internal priority (fsspri) leads to a lower user-mode * priority which means the thread runs less. The fsspri is higher when * the project's normalized share usage is higher, when the project has * more runnable threads, or when the thread has accumulated more run-time. * * This code has various checks to ensure the resulting umdpri is in the * range 1-59. See fss_newpri for more details. * * To reiterate, the above processing is performed once per second to recompute * the runnable thread user-mode priorities. * * 3) The final major component in the priority calculation is the tick * processing which occurs on a thread that is running when the clock * calls fss_tick. * * A thread can run continuously in user-land (compute-bound) for the * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties). * The fss_quantum defaults to 11 (i.e. 11 ticks). * * Once the quantum has been consumed, the thread will call fss_newpri to * recompute its umdpri priority, as described above in (2b). Threads that * were T_ONPROC at the one second interval when runnable thread priorities * were recalculated will have their umdpri priority recalculated when their * quanta expires. * * To ensure that runnable threads within a project see the expected * round-robin behavior, there is a special case in fss_newpri for a thread * that has run for its quanta within the one second update interval. See * the handling for the quanta_up parameter within fss_newpri. * * Also of interest, the fss_tick code increments the project's tick value * using the fss_nice_tick array entry for the thread's nice value. The idea * behind the fss_nice_tick array is that the cost of a tick is lower at * positive nice values (so that it doesn't increase the project's usage * as much as normal) with a 50% drop at the maximum level and a 50% * increase at the minimum level. See (1b). The fss_nice_tick array is * initialized in fss_init using the following formula: * * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i) * -------------------------------------------------- * FSS_NICE_RANGE * * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0 * * FSS Data Structures: * * fsszone * ----- ----- * ----- | | | | * | |-------->| |<------->| |<---->... * | | ----- ----- * | | ^ ^ ^ * | |--- | \ \ * ----- | | \ \ * fsspset | | \ \ * | | \ \ * | ----- ----- ----- * -->| |<--->| |<--->| | * | | | | | | * ----- ----- ----- * fssproj * * That is, fsspsets contain a list of fsszone's that are currently active in * the pset, and a list of fssproj's, corresponding to projects with runnable * threads on the pset. fssproj's in turn point to the fsszone which they * are a member of. * * An fssproj_t is removed when there are no threads in it. * * An fsszone_t is removed when there are no projects with threads in it. */ static pri_t fss_init(id_t, int, classfuncs_t **); static struct sclass fss = { "FSS", fss_init, 0 }; extern struct mod_ops mod_schedops; /* * Module linkage information for the kernel. */ static struct modlsched modlsched = { &mod_schedops, "fair share scheduling class", &fss }; static struct modlinkage modlinkage = { MODREV_1, (void *)&modlsched, NULL }; #define FSS_MAXUPRI 60 /* * The fssproc_t structures are kept in an array of circular doubly linked * lists. A hash on the thread pointer is used to determine which list each * thread should be placed in. 
Each list has a dummy "head" which is never * removed, so the list is never empty. fss_update traverses these lists to * update the priorities of threads that have been waiting on the run queue. */ #define FSS_LISTS 16 /* number of lists, must be power of 2 */ #define FSS_LIST_HASH(t) (((uintptr_t)(t) >> 9) & (FSS_LISTS - 1)) #define FSS_LIST_NEXT(i) (((i) + 1) & (FSS_LISTS - 1)) #define FSS_LIST_INSERT(fssproc) \ { \ int index = FSS_LIST_HASH(fssproc->fss_tp); \ kmutex_t *lockp = &fss_listlock[index]; \ fssproc_t *headp = &fss_listhead[index]; \ mutex_enter(lockp); \ fssproc->fss_next = headp->fss_next; \ fssproc->fss_prev = headp; \ headp->fss_next->fss_prev = fssproc; \ headp->fss_next = fssproc; \ mutex_exit(lockp); \ } #define FSS_LIST_DELETE(fssproc) \ { \ int index = FSS_LIST_HASH(fssproc->fss_tp); \ kmutex_t *lockp = &fss_listlock[index]; \ mutex_enter(lockp); \ fssproc->fss_prev->fss_next = fssproc->fss_next; \ fssproc->fss_next->fss_prev = fssproc->fss_prev; \ mutex_exit(lockp); \ } #define FSS_TICK_COST 1000 /* tick cost for threads with nice level = 0 */ /* * Decay rate percentages are based on n/128 rather than n/100 so that * calculations can avoid having to do an integer divide by 100 (divide * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift). * * FSS_DECAY_MIN = 83/128 ~= 65% * FSS_DECAY_MAX = 108/128 ~= 85% * FSS_DECAY_USG = 96/128 ~= 75% */ #define FSS_DECAY_MIN 83 /* fsspri decay pct for threads w/ nice -20 */ #define FSS_DECAY_MAX 108 /* fsspri decay pct for threads w/ nice +19 */ #define FSS_DECAY_USG 96 /* fssusage decay pct for projects */ #define FSS_DECAY_BASE 128 /* base for decay percentages above */ #define FSS_NICE_MIN 0 #define FSS_NICE_MAX (2 * NZERO - 1) #define FSS_NICE_RANGE (FSS_NICE_MAX - FSS_NICE_MIN + 1) static int fss_nice_tick[FSS_NICE_RANGE]; static int fss_nice_decay[FSS_NICE_RANGE]; static pri_t fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */ static pri_t fss_maxumdpri; /* maximum user mode fss priority */ static pri_t fss_maxglobpri; /* maximum global priority used by fss class */ static pri_t fss_minglobpri; /* minimum global priority */ static fssproc_t fss_listhead[FSS_LISTS]; static kmutex_t fss_listlock[FSS_LISTS]; static fsspset_t *fsspsets; static kmutex_t fsspsets_lock; /* protects fsspsets */ static id_t fss_cid; static time_t fss_minrun = 2; /* t_pri becomes 59 within 2 secs */ static time_t fss_minslp = 2; /* min time on sleep queue for hardswap */ static int fss_quantum = 11; static void fss_newpri(fssproc_t *, boolean_t); static void fss_update(void *); static int fss_update_list(int); static void fss_change_priority(kthread_t *, fssproc_t *); static int fss_admin(caddr_t, cred_t *); static int fss_getclinfo(void *); static int fss_parmsin(void *); static int fss_parmsout(void *, pc_vaparms_t *); static int fss_vaparmsin(void *, pc_vaparms_t *); static int fss_vaparmsout(void *, pc_vaparms_t *); static int fss_getclpri(pcpri_t *); static int fss_alloc(void **, int); static void fss_free(void *); static int fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *); static void fss_exitclass(void *); static int fss_canexit(kthread_t *, cred_t *); static int fss_fork(kthread_t *, kthread_t *, void *); static void fss_forkret(kthread_t *, kthread_t *); static void fss_parmsget(kthread_t *, void *); static int fss_parmsset(kthread_t *, void *, id_t, cred_t *); static void fss_stop(kthread_t *, int, int); static void fss_exit(kthread_t *); static void fss_active(kthread_t *); static void fss_inactive(kthread_t *); 
static pri_t fss_swapin(kthread_t *, int); static pri_t fss_swapout(kthread_t *, int); static void fss_trapret(kthread_t *); static void fss_preempt(kthread_t *); static void fss_setrun(kthread_t *); static void fss_sleep(kthread_t *); static void fss_tick(kthread_t *); static void fss_wakeup(kthread_t *); static int fss_donice(kthread_t *, cred_t *, int, int *); static int fss_doprio(kthread_t *, cred_t *, int, int *); static pri_t fss_globpri(kthread_t *); static void fss_yield(kthread_t *); static void fss_nullsys(); static struct classfuncs fss_classfuncs = { /* class functions */ fss_admin, fss_getclinfo, fss_parmsin, fss_parmsout, fss_vaparmsin, fss_vaparmsout, fss_getclpri, fss_alloc, fss_free, /* thread functions */ fss_enterclass, fss_exitclass, fss_canexit, fss_fork, fss_forkret, fss_parmsget, fss_parmsset, fss_stop, fss_exit, fss_active, fss_inactive, fss_swapin, fss_swapout, fss_trapret, fss_preempt, fss_setrun, fss_sleep, fss_tick, fss_wakeup, fss_donice, fss_globpri, fss_nullsys, /* set_process_group */ fss_yield, fss_doprio, }; int _init() { return (mod_install(&modlinkage)); } int _fini() { return (EBUSY); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } /*ARGSUSED*/ static int fss_project_walker(kproject_t *kpj, void *buf) { return (0); } void * fss_allocbuf(int op, int type) { fssbuf_t *fssbuf; void **fsslist; int cnt; int i; size_t size; ASSERT(op == FSS_NPSET_BUF || op == FSS_NPROJ_BUF || op == FSS_ONE_BUF); ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE); ASSERT(MUTEX_HELD(&cpu_lock)); fssbuf = kmem_zalloc(sizeof (fssbuf_t), KM_SLEEP); switch (op) { case FSS_NPSET_BUF: cnt = cpupart_list(NULL, 0, CP_NONEMPTY); break; case FSS_NPROJ_BUF: cnt = project_walk_all(ALL_ZONES, fss_project_walker, NULL); break; case FSS_ONE_BUF: cnt = 1; break; } switch (type) { case FSS_ALLOC_PROJ: size = sizeof (fssproj_t); break; case FSS_ALLOC_ZONE: size = sizeof (fsszone_t); break; } fsslist = kmem_zalloc(cnt * sizeof (void *), KM_SLEEP); fssbuf->fssb_size = cnt; fssbuf->fssb_list = fsslist; for (i = 0; i < cnt; i++) fsslist[i] = kmem_zalloc(size, KM_SLEEP); return (fssbuf); } void fss_freebuf(fssbuf_t *fssbuf, int type) { void **fsslist; int i; size_t size; ASSERT(fssbuf != NULL); ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE); fsslist = fssbuf->fssb_list; switch (type) { case FSS_ALLOC_PROJ: size = sizeof (fssproj_t); break; case FSS_ALLOC_ZONE: size = sizeof (fsszone_t); break; } for (i = 0; i < fssbuf->fssb_size; i++) { if (fsslist[i] != NULL) kmem_free(fsslist[i], size); } kmem_free(fsslist, sizeof (void *) * fssbuf->fssb_size); kmem_free(fssbuf, sizeof (fssbuf_t)); } static fsspset_t * fss_find_fsspset(cpupart_t *cpupart) { int i; fsspset_t *fsspset = NULL; int found = 0; ASSERT(cpupart != NULL); ASSERT(MUTEX_HELD(&fsspsets_lock)); /* * Search for the cpupart pointer in the array of fsspsets. */ for (i = 0; i < max_ncpus; i++) { fsspset = &fsspsets[i]; if (fsspset->fssps_cpupart == cpupart) { ASSERT(fsspset->fssps_nproj > 0); found = 1; break; } } if (found == 0) { /* * If we didn't find anything, then use the first * available slot in the fsspsets array. 
*/ for (i = 0; i < max_ncpus; i++) { fsspset = &fsspsets[i]; if (fsspset->fssps_cpupart == NULL) { ASSERT(fsspset->fssps_nproj == 0); found = 1; break; } } fsspset->fssps_cpupart = cpupart; } ASSERT(found == 1); return (fsspset); } static void fss_del_fsspset(fsspset_t *fsspset) { ASSERT(MUTEX_HELD(&fsspsets_lock)); ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); ASSERT(fsspset->fssps_nproj == 0); ASSERT(fsspset->fssps_list == NULL); ASSERT(fsspset->fssps_zones == NULL); fsspset->fssps_cpupart = NULL; fsspset->fssps_maxfsspri = 0; fsspset->fssps_shares = 0; } /* * The following routine returns a pointer to the fsszone structure which * belongs to zone "zone" and cpu partition fsspset, if such structure exists. */ static fsszone_t * fss_find_fsszone(fsspset_t *fsspset, zone_t *zone) { fsszone_t *fsszone; ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); if (fsspset->fssps_list != NULL) { /* * There are projects/zones active on this cpu partition * already. Try to find our zone among them. */ fsszone = fsspset->fssps_zones; do { if (fsszone->fssz_zone == zone) { return (fsszone); } fsszone = fsszone->fssz_next; } while (fsszone != fsspset->fssps_zones); } return (NULL); } /* * The following routine links new fsszone structure into doubly linked list of * zones active on the specified cpu partition. */ static void fss_insert_fsszone(fsspset_t *fsspset, zone_t *zone, fsszone_t *fsszone) { ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); fsszone->fssz_zone = zone; fsszone->fssz_rshares = zone->zone_shares; if (fsspset->fssps_zones == NULL) { /* * This will be the first fsszone for this fsspset */ fsszone->fssz_next = fsszone->fssz_prev = fsszone; fsspset->fssps_zones = fsszone; } else { /* * Insert this fsszone to the doubly linked list. */ fsszone_t *fssz_head = fsspset->fssps_zones; fsszone->fssz_next = fssz_head; fsszone->fssz_prev = fssz_head->fssz_prev; fssz_head->fssz_prev->fssz_next = fsszone; fssz_head->fssz_prev = fsszone; fsspset->fssps_zones = fsszone; } } /* * The following routine removes a single fsszone structure from the doubly * linked list of zones active on the specified cpu partition. Note that * global fsspsets_lock must be held in case this fsszone structure is the last * on the above mentioned list. Also note that the fsszone structure is not * freed here, it is the responsibility of the caller to call kmem_free for it. */ static void fss_remove_fsszone(fsspset_t *fsspset, fsszone_t *fsszone) { ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); ASSERT(fsszone->fssz_nproj == 0); ASSERT(fsszone->fssz_shares == 0); ASSERT(fsszone->fssz_runnable == 0); if (fsszone->fssz_next != fsszone) { /* * This is not the last zone in the list. */ fsszone->fssz_prev->fssz_next = fsszone->fssz_next; fsszone->fssz_next->fssz_prev = fsszone->fssz_prev; if (fsspset->fssps_zones == fsszone) fsspset->fssps_zones = fsszone->fssz_next; } else { /* * This was the last zone active in this cpu partition. */ fsspset->fssps_zones = NULL; } } /* * The following routine returns a pointer to the fssproj structure * which belongs to project kpj and cpu partition fsspset, if such structure * exists. */ static fssproj_t * fss_find_fssproj(fsspset_t *fsspset, kproject_t *kpj) { fssproj_t *fssproj; ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); if (fsspset->fssps_list != NULL) { /* * There are projects running on this cpu partition already. * Try to find our project among them. 
*/ fssproj = fsspset->fssps_list; do { if (fssproj->fssp_proj == kpj) { ASSERT(fssproj->fssp_pset == fsspset); return (fssproj); } fssproj = fssproj->fssp_next; } while (fssproj != fsspset->fssps_list); } return (NULL); } /* * The following routine links new fssproj structure into doubly linked list * of projects running on the specified cpu partition. */ static void fss_insert_fssproj(fsspset_t *fsspset, kproject_t *kpj, fsszone_t *fsszone, fssproj_t *fssproj) { ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); fssproj->fssp_pset = fsspset; fssproj->fssp_proj = kpj; fssproj->fssp_shares = kpj->kpj_shares; fsspset->fssps_nproj++; if (fsspset->fssps_list == NULL) { /* * This will be the first fssproj for this fsspset */ fssproj->fssp_next = fssproj->fssp_prev = fssproj; fsspset->fssps_list = fssproj; } else { /* * Insert this fssproj to the doubly linked list. */ fssproj_t *fssp_head = fsspset->fssps_list; fssproj->fssp_next = fssp_head; fssproj->fssp_prev = fssp_head->fssp_prev; fssp_head->fssp_prev->fssp_next = fssproj; fssp_head->fssp_prev = fssproj; fsspset->fssps_list = fssproj; } fssproj->fssp_fsszone = fsszone; fsszone->fssz_nproj++; ASSERT(fsszone->fssz_nproj != 0); } /* * The following routine removes a single fssproj structure from the doubly * linked list of projects running on the specified cpu partition. Note that * global fsspsets_lock must be held in case if this fssproj structure is the * last on the above mentioned list. Also note that the fssproj structure is * not freed here, it is the responsibility of the caller to call kmem_free * for it. */ static void fss_remove_fssproj(fsspset_t *fsspset, fssproj_t *fssproj) { fsszone_t *fsszone; ASSERT(MUTEX_HELD(&fsspsets_lock)); ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); ASSERT(fssproj->fssp_runnable == 0); fsspset->fssps_nproj--; fsszone = fssproj->fssp_fsszone; fsszone->fssz_nproj--; if (fssproj->fssp_next != fssproj) { /* * This is not the last part in the list. */ fssproj->fssp_prev->fssp_next = fssproj->fssp_next; fssproj->fssp_next->fssp_prev = fssproj->fssp_prev; if (fsspset->fssps_list == fssproj) fsspset->fssps_list = fssproj->fssp_next; if (fsszone->fssz_nproj == 0) fss_remove_fsszone(fsspset, fsszone); } else { /* * This was the last project part running * at this cpu partition. 
*/ fsspset->fssps_list = NULL; ASSERT(fsspset->fssps_nproj == 0); ASSERT(fsszone->fssz_nproj == 0); fss_remove_fsszone(fsspset, fsszone); fss_del_fsspset(fsspset); } } static void fss_inactive(kthread_t *t) { fssproc_t *fssproc; fssproj_t *fssproj; fsspset_t *fsspset; fsszone_t *fsszone; ASSERT(THREAD_LOCK_HELD(t)); fssproc = FSSPROC(t); fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) /* if this thread already exited */ return; fsspset = FSSPROJ2FSSPSET(fssproj); fsszone = fssproj->fssp_fsszone; disp_lock_enter_high(&fsspset->fssps_displock); ASSERT(fssproj->fssp_runnable > 0); if (--fssproj->fssp_runnable == 0) { fsszone->fssz_shares -= fssproj->fssp_shares; if (--fsszone->fssz_runnable == 0) fsspset->fssps_shares -= fsszone->fssz_rshares; } ASSERT(fssproc->fss_runnable == 1); fssproc->fss_runnable = 0; disp_lock_exit_high(&fsspset->fssps_displock); } static void fss_active(kthread_t *t) { fssproc_t *fssproc; fssproj_t *fssproj; fsspset_t *fsspset; fsszone_t *fsszone; ASSERT(THREAD_LOCK_HELD(t)); fssproc = FSSPROC(t); fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) /* if this thread already exited */ return; fsspset = FSSPROJ2FSSPSET(fssproj); fsszone = fssproj->fssp_fsszone; disp_lock_enter_high(&fsspset->fssps_displock); if (++fssproj->fssp_runnable == 1) { fsszone->fssz_shares += fssproj->fssp_shares; if (++fsszone->fssz_runnable == 1) fsspset->fssps_shares += fsszone->fssz_rshares; } ASSERT(fssproc->fss_runnable == 0); fssproc->fss_runnable = 1; disp_lock_exit_high(&fsspset->fssps_displock); } /* * Fair share scheduler initialization. Called by dispinit() at boot time. * We can ignore clparmsz argument since we know that the smallest possible * parameter buffer is big enough for us. */ /*ARGSUSED*/ static pri_t fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) { int i; ASSERT(MUTEX_HELD(&cpu_lock)); fss_cid = cid; fss_maxumdpri = minclsyspri - 1; fss_maxglobpri = minclsyspri; fss_minglobpri = 0; fsspsets = kmem_zalloc(sizeof (fsspset_t) * max_ncpus, KM_SLEEP); /* * Initialize the fssproc hash table. */ for (i = 0; i < FSS_LISTS; i++) fss_listhead[i].fss_next = fss_listhead[i].fss_prev = &fss_listhead[i]; *clfuncspp = &fss_classfuncs; /* * Fill in fss_nice_tick and fss_nice_decay arrays: * The cost of a tick is lower at positive nice values (so that it * will not increase its project's usage as much as normal) with 50% * drop at the maximum level and 50% increase at the minimum level. * The fsspri decay is slower at positive nice values. fsspri values * of processes with negative nice levels must decay faster to receive * time slices more frequently than normal. */ for (i = 0; i < FSS_NICE_RANGE; i++) { fss_nice_tick[i] = (FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i)) / FSS_NICE_RANGE; fss_nice_decay[i] = FSS_DECAY_MIN + ((FSS_DECAY_MAX - FSS_DECAY_MIN) * i) / (FSS_NICE_RANGE - 1); } return (fss_maxglobpri); } /* * Calculate the new fss_umdpri based on the usage, the normalized share usage * and the number of active threads. Reset the tick counter for this thread. * * When calculating the new priority using the standard formula we can hit * a scenario where we don't have good round-robin behavior. This would be * most commonly seen when there is a zone with lots of runnable threads. 
* In the bad scenario we will see the following behavior when using the * standard formula and these conditions: * * - there are multiple runnable threads in the zone (project) * - the fssps_maxfsspri is a very large value * - (we also know all of these threads will use the project's * fssp_shusage) * * Under these conditions, a thread with a low fss_fsspri value is chosen * to run and the thread gets a high fss_umdpri. This thread can run for * its full quanta (fss_timeleft) at which time fss_newpri is called to * calculate the thread's new priority. * * In this case, because the newly calculated fsspri value is much smaller * (orders of magnitude) than the fssps_maxfsspri value, if we used the * standard formula the thread will still get a high fss_umdpri value and * will run again for another quanta, even though there are other runnable * threads in the project. * * For a thread that is runnable for a long time, the thread can continue * to run for many quanta (totaling many seconds) before the thread's fsspri * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back * down to 1. This behavior also keeps the fssps_maxfsspri at a high value, * so that the next runnable thread might repeat this cycle. * * This leads to the case where we don't have round-robin behavior at quanta * granularity, but instead, runnable threads within the project only run * at several second intervals. * * To prevent this scenario from occurring, when a thread has consumed its * quanta and there are multiple runnable threads in the project, we * immediately cause the thread to hit fssps_maxfsspri so that it gets * reset back to 1 and another runnable thread in the project can run. */ static void fss_newpri(fssproc_t *fssproc, boolean_t quanta_up) { kthread_t *tp; fssproj_t *fssproj; fsspset_t *fsspset; fsszone_t *fsszone; fsspri_t fsspri, maxfsspri; uint32_t n_runnable; pri_t invpri; uint32_t ticks; tp = fssproc->fss_tp; ASSERT(tp != NULL); if (tp->t_cid != fss_cid) return; ASSERT(THREAD_LOCK_HELD(tp)); fssproj = FSSPROC2FSSPROJ(fssproc); fsszone = FSSPROJ2FSSZONE(fssproj); if (fssproj == NULL) /* * No need to change priority of exited threads. */ return; fsspset = FSSPROJ2FSSPSET(fssproj); disp_lock_enter_high(&fsspset->fssps_displock); ticks = fssproc->fss_ticks; fssproc->fss_ticks = 0; if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) { /* * Special case: threads with no shares. */ fssproc->fss_umdpri = fss_minglobpri; disp_lock_exit_high(&fsspset->fssps_displock); return; } maxfsspri = fsspset->fssps_maxfsspri; n_runnable = fssproj->fssp_runnable; if (quanta_up && n_runnable > 1) { fsspri = maxfsspri; } else { /* * fsspri += fssp_shusage * nrunnable * ticks * If all three values are non-0, this typically calculates to * a large number (sometimes > 1M, sometimes > 100B) due to * fssp_shusage which can be > 1T. */ fsspri = fssproc->fss_fsspri; fsspri += fssproj->fssp_shusage * n_runnable * ticks; } fssproc->fss_fsspri = fsspri; /* * fss_maxumdpri is normally 59, since FSS priorities are 0-59. * If the previous calculation resulted in 0 (e.g. was 0 and added 0 * because ticks == 0), then instead of 0, we use the largest priority, * which is still small in comparison to the large numbers we typically * see.
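 * As a purely illustrative example (the numbers are assumed, not taken
 * from a real system): if a thread's fsspri works out to 0 and is clamped
 * to 59 below, while fssps_maxfsspri is on the order of 1,000,000,000, the
 * formula that follows gives invpri = (59 * 58) / 1000000000 == 0, so the
 * thread still ends up with umdpri = 59, the highest user-mode priority.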
*/ if (fsspri < fss_maxumdpri) fsspri = fss_maxumdpri; /* so that maxfsspri is != 0 */ /* * The general priority formula: * * (fsspri * umdprirange) * pri = maxumdpri - ------------------------ * maxfsspri * * If this thread's fsspri is greater than the previous largest * fsspri, then record it as the new high and priority for this * thread will be one (the lowest priority assigned to a thread * that has non-zero shares). Because of this check, maxfsspri can * change as this function is called via the * fss_update -> fss_update_list -> fss_newpri code path to update * all runnable threads. See the code in fss_update for how we * mitigate this issue. * * Note that this formula cannot produce out of bounds priority * values (0-59); if it is changed, additional checks may need to be * added. */ if (fsspri >= maxfsspri) { fsspset->fssps_maxfsspri = fsspri; disp_lock_exit_high(&fsspset->fssps_displock); fssproc->fss_umdpri = 1; } else { disp_lock_exit_high(&fsspset->fssps_displock); invpri = (fsspri * (fss_maxumdpri - 1)) / maxfsspri; fssproc->fss_umdpri = fss_maxumdpri - invpri; } } /* * Decays usages of all running projects, resets their tick counters and * calculates the projects' normalized share usage. Called once per second from * fss_update(). */ static void fss_decay_usage() { uint32_t zone_ext_shares, zone_int_shares; uint32_t kpj_shares, pset_shares; fsspset_t *fsspset; fssproj_t *fssproj; fsszone_t *fsszone; fsspri_t maxfsspri; int psetid; struct zone *zp; mutex_enter(&fsspsets_lock); /* * Go through all active processor sets and decay usages of projects * running on them. */ for (psetid = 0; psetid < max_ncpus; psetid++) { fsspset = &fsspsets[psetid]; mutex_enter(&fsspset->fssps_lock); fsspset->fssps_gen++; if (fsspset->fssps_cpupart == NULL || (fssproj = fsspset->fssps_list) == NULL) { mutex_exit(&fsspset->fssps_lock); continue; } /* * Decay maxfsspri for this cpu partition with the * fastest possible decay rate. */ disp_lock_enter(&fsspset->fssps_displock); pset_shares = fsspset->fssps_shares; maxfsspri = (fsspset->fssps_maxfsspri * fss_nice_decay[NZERO]) / FSS_DECAY_BASE; if (maxfsspri < fss_maxumdpri) maxfsspri = fss_maxumdpri; fsspset->fssps_maxfsspri = maxfsspri; do { fsszone = fssproj->fssp_fsszone; zp = fsszone->fssz_zone; /* * Reset zone's FSS stats if they are from a * previous cycle. */ if (fsspset->fssps_gen != zp->zone_fss_gen) { zp->zone_fss_gen = fsspset->fssps_gen; zp->zone_run_ticks = 0; } /* * Decay project usage, then add in this cycle's * nice tick value. */ fssproj->fssp_usage = (fssproj->fssp_usage * FSS_DECAY_USG) / FSS_DECAY_BASE + fssproj->fssp_ticks; fssproj->fssp_ticks = 0; zp->zone_run_ticks += fssproj->fssp_tick_cnt; fssproj->fssp_tick_cnt = 0; /* * Readjust the project's number of shares if it has * changed since we checked it last time. */ kpj_shares = fssproj->fssp_proj->kpj_shares; if (fssproj->fssp_shares != kpj_shares) { if (fssproj->fssp_runnable != 0) { fsszone->fssz_shares -= fssproj->fssp_shares; fsszone->fssz_shares += kpj_shares; } fssproj->fssp_shares = kpj_shares; } /* * Readjust the zone's number of shares if it * has changed since we checked it last time.
*/ zone_ext_shares = zp->zone_shares; if (fsszone->fssz_rshares != zone_ext_shares) { if (fsszone->fssz_runnable != 0) { fsspset->fssps_shares -= fsszone->fssz_rshares; fsspset->fssps_shares += zone_ext_shares; pset_shares = fsspset->fssps_shares; } fsszone->fssz_rshares = zone_ext_shares; } zone_int_shares = fsszone->fssz_shares; /* * If anything is runnable in the project, track the * overall project share percent for monitoring usage. */ if (fssproj->fssp_runnable > 0) { uint32_t zone_shr_pct; uint32_t int_shr_pct; /* * Times 1000 to get tenths of a percent * * zone_ext_shares * zone_shr_pct = --------------- * pset_shares * * kpj_shares * int_shr_pct = --------------- * zone_int_shares */ if (pset_shares == 0 || zone_int_shares == 0) { fssproj->fssp_shr_pct = 0; } else { zone_shr_pct = (zone_ext_shares * 1000) / pset_shares; int_shr_pct = (kpj_shares * 1000) / zone_int_shares; fssproj->fssp_shr_pct = (zone_shr_pct * int_shr_pct) / 1000; } } else { DTRACE_PROBE1(fss__prj__norun, fssproj_t *, fssproj); } /* * Calculate fssp_shusage value to be used * for fsspri increments for the next second. */ if (kpj_shares == 0 || zone_ext_shares == 0) { fssproj->fssp_shusage = 0; } else if (FSSPROJ2KPROJ(fssproj) == proj0p) { uint32_t zone_shr_pct; /* * Project 0 in the global zone has 50% * of its zone. See calculation above for * the zone's share percent. */ if (pset_shares == 0) zone_shr_pct = 1000; else zone_shr_pct = (zone_ext_shares * 1000) / pset_shares; fssproj->fssp_shr_pct = zone_shr_pct / 2; fssproj->fssp_shusage = (fssproj->fssp_usage * zone_int_shares * zone_int_shares) / (zone_ext_shares * zone_ext_shares); } else { /* * Thread's priority is based on its project's * normalized usage (shusage) value which gets * calculated this way: * * pset_shares^2 zone_int_shares^2 * usage * ------------- * ------------------ * kpj_shares^2 zone_ext_shares^2 * * Where zone_int_shares is the sum of shares * of all active projects within the zone (and * the pset), and zone_ext_shares is the number * of zone shares (i.e., zone.cpu-shares). * * If there is only one zone active on the pset * the above reduces to: * * zone_int_shares^2 * shusage = usage * --------------------- * kpj_shares^2 * * If there's only one project active in the * zone this formula reduces to: * * pset_shares^2 * shusage = usage * ---------------------- * zone_ext_shares^2 * * shusage is one input to calculating fsspri * in fss_newpri(). Larger values tend toward * lower priorities for processes in the proj.
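 * As a rough, purely illustrative example (all values assumed): with
 * usage = 1000, pset_shares = 3, zone_ext_shares = 1, zone_int_shares = 2
 * and kpj_shares = 1, shusage = 1000 * (9 / 1) * (4 / 1) = 36000; raising
 * kpj_shares to 2 drops it to 1000 * (9 / 4) * (4 / 1) = 9000. More
 * project shares mean a smaller normalized usage and therefore a higher
 * user-mode priority for the project's threads.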
*/ fssproj->fssp_shusage = fssproj->fssp_usage * pset_shares * zone_int_shares; fssproj->fssp_shusage /= kpj_shares * zone_ext_shares; fssproj->fssp_shusage *= pset_shares * zone_int_shares; fssproj->fssp_shusage /= kpj_shares * zone_ext_shares; } fssproj = fssproj->fssp_next; } while (fssproj != fsspset->fssps_list); disp_lock_exit(&fsspset->fssps_displock); mutex_exit(&fsspset->fssps_lock); } mutex_exit(&fsspsets_lock); } static void fss_change_priority(kthread_t *t, fssproc_t *fssproc) { pri_t new_pri; ASSERT(THREAD_LOCK_HELD(t)); new_pri = fssproc->fss_umdpri; ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); t->t_cpri = fssproc->fss_upri; fssproc->fss_flags &= ~FSSRESTORE; if (t == curthread || t->t_state == TS_ONPROC) { /* * curthread is always onproc */ cpu_t *cp = t->t_disp_queue->disp_cpu; THREAD_CHANGE_PRI(t, new_pri); if (t == cp->cpu_dispthread) cp->cpu_dispatch_pri = DISP_PRIO(t); if (DISP_MUST_SURRENDER(t)) { fssproc->fss_flags |= FSSBACKQ; cpu_surrender(t); } else { fssproc->fss_timeleft = fss_quantum; } } else { /* * When the priority of a thread is changed, it may be * necessary to adjust its position on a sleep queue or * dispatch queue. The function thread_change_pri accomplishes * this. */ if (thread_change_pri(t, new_pri, 0)) { /* * The thread was on a run queue. */ fssproc->fss_timeleft = fss_quantum; } else { fssproc->fss_flags |= FSSBACKQ; } } } /* * Update priorities of all fair-sharing threads that are currently runnable * at a user mode priority based on the number of shares and current usage. * Called once per second via timeout which we reset here. * * There are several lists of fair-sharing threads broken up by a hash on the * thread pointer. Each list has its own lock. This avoids blocking all * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs. * fss_update traverses each list in turn. * * Each time we're run (once/second) we may start at the next list and iterate * through all of the lists. By starting with a different list, we mitigate any * effects we would see updating the fssps_maxfsspri value in fss_newpri. */ static void fss_update(void *arg) { int i; int new_marker = -1; static int fss_update_marker; /* * Decay and update usages for all projects. */ fss_decay_usage(); /* * Start with the fss_update_marker list, then do the rest. */ i = fss_update_marker; /* * Go around all threads, set new priorities and decay * per-thread CPU usages. */ do { /* * If this is the first list after the current marker to have * threads with priority updates, advance the marker to this * list for the next time fss_update runs. */ if (fss_update_list(i) && new_marker == -1 && i != fss_update_marker) new_marker = i; } while ((i = FSS_LIST_NEXT(i)) != fss_update_marker); /* * Advance marker for the next fss_update call */ if (new_marker != -1) fss_update_marker = new_marker; (void) timeout(fss_update, arg, hz); } /* * Updates priority for a list of threads. Returns 1 if the priority of one * of the threads was actually updated, 0 if none were for various reasons * (thread is no longer in the FSS class, is not runnable, has the preemption * control no-preempt bit set, etc.) */ static int fss_update_list(int i) { fssproc_t *fssproc; fssproj_t *fssproj; fsspri_t fsspri; pri_t fss_umdpri; kthread_t *t; int updated = 0; mutex_enter(&fss_listlock[i]); for (fssproc = fss_listhead[i].fss_next; fssproc != &fss_listhead[i]; fssproc = fssproc->fss_next) { t = fssproc->fss_tp; /* * Lock the thread and verify the state. 
*/ thread_lock(t); /* * Skip the thread if it is no longer in the FSS class or * is running with kernel mode priority. */ if (t->t_cid != fss_cid) goto next; if ((fssproc->fss_flags & FSSKPRI) != 0) goto next; fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) goto next; if (fssproj->fssp_shares != 0) { /* * Decay fsspri value. */ fsspri = fssproc->fss_fsspri; fsspri = (fsspri * fss_nice_decay[fssproc->fss_nice]) / FSS_DECAY_BASE; fssproc->fss_fsspri = fsspri; } if (t->t_schedctl && schedctl_get_nopreempt(t)) goto next; if (t->t_state != TS_RUN && t->t_state != TS_WAIT) { /* * Make next syscall/trap call fss_trapret */ t->t_trapret = 1; aston(t); if (t->t_state == TS_ONPROC) DTRACE_PROBE1(fss__onproc, fssproc_t *, fssproc); goto next; } fss_newpri(fssproc, B_FALSE); updated = 1; fss_umdpri = fssproc->fss_umdpri; /* * Only dequeue the thread if it needs to be moved; otherwise * it should just round-robin here. */ if (t->t_pri != fss_umdpri) fss_change_priority(t, fssproc); next: thread_unlock(t); } mutex_exit(&fss_listlock[i]); return (updated); } /*ARGSUSED*/ static int fss_admin(caddr_t uaddr, cred_t *reqpcredp) { fssadmin_t fssadmin; if (copyin(uaddr, &fssadmin, sizeof (fssadmin_t))) return (EFAULT); switch (fssadmin.fss_cmd) { case FSS_SETADMIN: if (secpolicy_dispadm(reqpcredp) != 0) return (EPERM); if (fssadmin.fss_quantum <= 0 || fssadmin.fss_quantum >= hz) return (EINVAL); fss_quantum = fssadmin.fss_quantum; break; case FSS_GETADMIN: fssadmin.fss_quantum = fss_quantum; if (copyout(&fssadmin, uaddr, sizeof (fssadmin_t))) return (EFAULT); break; default: return (EINVAL); } return (0); } static int fss_getclinfo(void *infop) { fssinfo_t *fssinfo = (fssinfo_t *)infop; fssinfo->fss_maxupri = fss_maxupri; return (0); } static int fss_parmsin(void *parmsp) { fssparms_t *fssparmsp = (fssparms_t *)parmsp; /* * Check validity of parameters. */ if ((fssparmsp->fss_uprilim > fss_maxupri || fssparmsp->fss_uprilim < -fss_maxupri) && fssparmsp->fss_uprilim != FSS_NOCHANGE) return (EINVAL); if ((fssparmsp->fss_upri > fss_maxupri || fssparmsp->fss_upri < -fss_maxupri) && fssparmsp->fss_upri != FSS_NOCHANGE) return (EINVAL); return (0); } /*ARGSUSED*/ static int fss_parmsout(void *parmsp, pc_vaparms_t *vaparmsp) { return (0); } static int fss_vaparmsin(void *parmsp, pc_vaparms_t *vaparmsp) { fssparms_t *fssparmsp = (fssparms_t *)parmsp; int priflag = 0; int limflag = 0; uint_t cnt; pc_vaparm_t *vpp = &vaparmsp->pc_parms[0]; /* * FSS_NOCHANGE (-32768) is outside of the range of values for * fss_uprilim and fss_upri. If the structure fssparms_t is changed, * FSS_NOCHANGE should be replaced by a flag word. */ fssparmsp->fss_uprilim = FSS_NOCHANGE; fssparmsp->fss_upri = FSS_NOCHANGE; /* * Get the varargs parameter and check validity of parameters. */ if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT) return (EINVAL); for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) { switch (vpp->pc_key) { case FSS_KY_UPRILIM: if (limflag++) return (EINVAL); fssparmsp->fss_uprilim = (pri_t)vpp->pc_parm; if (fssparmsp->fss_uprilim > fss_maxupri || fssparmsp->fss_uprilim < -fss_maxupri) return (EINVAL); break; case FSS_KY_UPRI: if (priflag++) return (EINVAL); fssparmsp->fss_upri = (pri_t)vpp->pc_parm; if (fssparmsp->fss_upri > fss_maxupri || fssparmsp->fss_upri < -fss_maxupri) return (EINVAL); break; default: return (EINVAL); } } if (vaparmsp->pc_vaparmscnt == 0) { /* * Use default parameters. 
*/ fssparmsp->fss_upri = fssparmsp->fss_uprilim = 0; } return (0); } /* * Copy all selected fair-sharing class parameters to the user. The parameters * are specified by a key. */ static int fss_vaparmsout(void *parmsp, pc_vaparms_t *vaparmsp) { fssparms_t *fssparmsp = (fssparms_t *)parmsp; int priflag = 0; int limflag = 0; uint_t cnt; pc_vaparm_t *vpp = &vaparmsp->pc_parms[0]; ASSERT(MUTEX_NOT_HELD(&curproc->p_lock)); if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT) return (EINVAL); for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) { switch (vpp->pc_key) { case FSS_KY_UPRILIM: if (limflag++) return (EINVAL); if (copyout(&fssparmsp->fss_uprilim, (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t))) return (EFAULT); break; case FSS_KY_UPRI: if (priflag++) return (EINVAL); if (copyout(&fssparmsp->fss_upri, (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t))) return (EFAULT); break; default: return (EINVAL); } } return (0); } /* * Return the user mode scheduling priority range. */ static int fss_getclpri(pcpri_t *pcprip) { pcprip->pc_clpmax = fss_maxupri; pcprip->pc_clpmin = -fss_maxupri; return (0); } static int fss_alloc(void **p, int flag) { void *bufp; if ((bufp = kmem_zalloc(sizeof (fssproc_t), flag)) == NULL) { return (ENOMEM); } else { *p = bufp; return (0); } } static void fss_free(void *bufp) { if (bufp) kmem_free(bufp, sizeof (fssproc_t)); } /* * Thread functions */ static int fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, void *bufp) { fssparms_t *fssparmsp = (fssparms_t *)parmsp; fssproc_t *fssproc; pri_t reqfssuprilim; pri_t reqfssupri; static uint32_t fssexists = 0; fsspset_t *fsspset; fssproj_t *fssproj; fsszone_t *fsszone; kproject_t *kpj; zone_t *zone; int fsszone_allocated = 0; fssproc = (fssproc_t *)bufp; ASSERT(fssproc != NULL); ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); /* * Only root can move threads to FSS class. */ if (reqpcredp != NULL && secpolicy_setpriority(reqpcredp) != 0) return (EPERM); /* * Initialize the fssproc structure. */ fssproc->fss_umdpri = fss_maxumdpri / 2; if (fssparmsp == NULL) { /* * Use default values. */ fssproc->fss_nice = NZERO; fssproc->fss_uprilim = fssproc->fss_upri = 0; } else { /* * Use supplied values. */ if (fssparmsp->fss_uprilim == FSS_NOCHANGE) { reqfssuprilim = 0; } else { if (fssparmsp->fss_uprilim > 0 && secpolicy_setpriority(reqpcredp) != 0) return (EPERM); reqfssuprilim = fssparmsp->fss_uprilim; } if (fssparmsp->fss_upri == FSS_NOCHANGE) { reqfssupri = reqfssuprilim; } else { if (fssparmsp->fss_upri > 0 && secpolicy_setpriority(reqpcredp) != 0) return (EPERM); /* * Set the user priority to the requested value or * the upri limit, whichever is lower. */ reqfssupri = fssparmsp->fss_upri; if (reqfssupri > reqfssuprilim) reqfssupri = reqfssuprilim; } fssproc->fss_uprilim = reqfssuprilim; fssproc->fss_upri = reqfssupri; fssproc->fss_nice = NZERO - (NZERO * reqfssupri) / fss_maxupri; if (fssproc->fss_nice > FSS_NICE_MAX) fssproc->fss_nice = FSS_NICE_MAX; } fssproc->fss_timeleft = fss_quantum; fssproc->fss_tp = t; cpucaps_sc_init(&fssproc->fss_caps); /* * Put a lock on our fsspset structure. 
*/ mutex_enter(&fsspsets_lock); fsspset = fss_find_fsspset(t->t_cpupart); mutex_enter(&fsspset->fssps_lock); mutex_exit(&fsspsets_lock); zone = ttoproc(t)->p_zone; if ((fsszone = fss_find_fsszone(fsspset, zone)) == NULL) { if ((fsszone = kmem_zalloc(sizeof (fsszone_t), KM_NOSLEEP)) == NULL) { mutex_exit(&fsspset->fssps_lock); return (ENOMEM); } else { fsszone_allocated = 1; fss_insert_fsszone(fsspset, zone, fsszone); } } kpj = ttoproj(t); if ((fssproj = fss_find_fssproj(fsspset, kpj)) == NULL) { if ((fssproj = kmem_zalloc(sizeof (fssproj_t), KM_NOSLEEP)) == NULL) { if (fsszone_allocated) { fss_remove_fsszone(fsspset, fsszone); kmem_free(fsszone, sizeof (fsszone_t)); } mutex_exit(&fsspset->fssps_lock); return (ENOMEM); } else { fss_insert_fssproj(fsspset, kpj, fsszone, fssproj); } } fssproj->fssp_threads++; fssproc->fss_proj = fssproj; /* * Reset priority. Process goes to a "user mode" priority here * regardless of whether or not it has slept since entering the kernel. */ thread_lock(t); t->t_clfuncs = &(sclass[cid].cl_funcs->thread); t->t_cid = cid; t->t_cldata = (void *)fssproc; t->t_schedflag |= TS_RUNQMATCH; fss_change_priority(t, fssproc); if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || t->t_state == TS_WAIT) fss_active(t); thread_unlock(t); mutex_exit(&fsspset->fssps_lock); /* * Link new structure into fssproc list. */ FSS_LIST_INSERT(fssproc); /* * If this is the first fair-sharing thread to occur since boot, * we set up the initial call to fss_update() here. Use an atomic * compare-and-swap since that's easier and faster than a mutex * (but check with an ordinary load first since most of the time * this will already be done). */ if (fssexists == 0 && atomic_cas_32(&fssexists, 0, 1) == 0) (void) timeout(fss_update, NULL, hz); return (0); } /* * Remove fssproc_t from the list. */ static void fss_exitclass(void *procp) { fssproc_t *fssproc = (fssproc_t *)procp; fssproj_t *fssproj; fsspset_t *fsspset; fsszone_t *fsszone; kthread_t *t = fssproc->fss_tp; /* * We should be either getting this thread off the deathrow or * this thread has already moved to another scheduling class and * we're being called with its old cldata buffer pointer. In both * cases, the content of this buffer can not be changed while we're * here. */ mutex_enter(&fsspsets_lock); thread_lock(t); if (t->t_cid != fss_cid) { /* * We're being called as a result of the priocntl() system * call -- someone is trying to move our thread to another * scheduling class. We can't call fss_inactive() here * because our thread's t_cldata pointer already points * to another scheduling class specific data. */ ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); fssproj = FSSPROC2FSSPROJ(fssproc); fsspset = FSSPROJ2FSSPSET(fssproj); fsszone = fssproj->fssp_fsszone; if (fssproc->fss_runnable) { disp_lock_enter_high(&fsspset->fssps_displock); if (--fssproj->fssp_runnable == 0) { fsszone->fssz_shares -= fssproj->fssp_shares; if (--fsszone->fssz_runnable == 0) fsspset->fssps_shares -= fsszone->fssz_rshares; } disp_lock_exit_high(&fsspset->fssps_displock); } thread_unlock(t); mutex_enter(&fsspset->fssps_lock); if (--fssproj->fssp_threads == 0) { fss_remove_fssproj(fsspset, fssproj); if (fsszone->fssz_nproj == 0) kmem_free(fsszone, sizeof (fsszone_t)); kmem_free(fssproj, sizeof (fssproj_t)); } mutex_exit(&fsspset->fssps_lock); } else { ASSERT(t->t_state == TS_FREE); /* * We're being called from thread_free() when our thread * is removed from the deathrow. 
There is nothing we need * do here since everything should've been done earlier * in fss_exit(). */ thread_unlock(t); } mutex_exit(&fsspsets_lock); FSS_LIST_DELETE(fssproc); fss_free(fssproc); } /*ARGSUSED*/ static int fss_canexit(kthread_t *t, cred_t *credp) { /* * A thread is allowed to exit FSS only if we have sufficient * privileges. */ if (credp != NULL && secpolicy_setpriority(credp) != 0) return (EPERM); else return (0); } /* * Initialize fair-share class specific proc structure for a child. */ static int fss_fork(kthread_t *pt, kthread_t *ct, void *bufp) { fssproc_t *pfssproc; /* ptr to parent's fssproc structure */ fssproc_t *cfssproc; /* ptr to child's fssproc structure */ fssproj_t *fssproj; fsspset_t *fsspset; ASSERT(MUTEX_HELD(&ttoproc(pt)->p_lock)); ASSERT(ct->t_state == TS_STOPPED); cfssproc = (fssproc_t *)bufp; ASSERT(cfssproc != NULL); bzero(cfssproc, sizeof (fssproc_t)); thread_lock(pt); pfssproc = FSSPROC(pt); fssproj = FSSPROC2FSSPROJ(pfssproc); fsspset = FSSPROJ2FSSPSET(fssproj); thread_unlock(pt); mutex_enter(&fsspset->fssps_lock); /* * Initialize child's fssproc structure. */ thread_lock(pt); ASSERT(FSSPROJ(pt) == fssproj); cfssproc->fss_proj = fssproj; cfssproc->fss_timeleft = fss_quantum; cfssproc->fss_umdpri = pfssproc->fss_umdpri; cfssproc->fss_fsspri = 0; cfssproc->fss_uprilim = pfssproc->fss_uprilim; cfssproc->fss_upri = pfssproc->fss_upri; cfssproc->fss_tp = ct; cfssproc->fss_nice = pfssproc->fss_nice; cpucaps_sc_init(&cfssproc->fss_caps); cfssproc->fss_flags = pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE); ct->t_cldata = (void *)cfssproc; ct->t_schedflag |= TS_RUNQMATCH; thread_unlock(pt); fssproj->fssp_threads++; mutex_exit(&fsspset->fssps_lock); /* * Link new structure into fssproc hash table. */ FSS_LIST_INSERT(cfssproc); return (0); } /* * Child is placed at back of dispatcher queue and parent gives up processor * so that the child runs first after the fork. This allows the child * immediately execing to break the multiple use of copy on write pages with no * disk home. The parent will get to steal them back rather than uselessly * copying them. */ static void fss_forkret(kthread_t *t, kthread_t *ct) { proc_t *pp = ttoproc(t); proc_t *cp = ttoproc(ct); fssproc_t *fssproc; ASSERT(t == curthread); ASSERT(MUTEX_HELD(&pidlock)); /* * Grab the child's p_lock before dropping pidlock to ensure the * process does not disappear before we set it running. */ mutex_enter(&cp->p_lock); continuelwps(cp); mutex_exit(&cp->p_lock); mutex_enter(&pp->p_lock); mutex_exit(&pidlock); continuelwps(pp); thread_lock(t); fssproc = FSSPROC(t); fss_newpri(fssproc, B_FALSE); fssproc->fss_timeleft = fss_quantum; t->t_pri = fssproc->fss_umdpri; ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); fssproc->fss_flags &= ~FSSKPRI; THREAD_TRANSITION(t); /* * We don't want to call fss_setrun(t) here because it may call * fss_active, which we don't need. */ fssproc->fss_flags &= ~FSSBACKQ; if (t->t_disp_time != ddi_get_lbolt()) setbackdq(t); else setfrontdq(t); thread_unlock(t); /* * Safe to drop p_lock now since it is safe to change * the scheduling class after this point. */ mutex_exit(&pp->p_lock); swtch(); } /* * Get the fair-sharing parameters of the thread pointed to by fssprocp into * the buffer pointed by fssparmsp. 
*/ static void fss_parmsget(kthread_t *t, void *parmsp) { fssproc_t *fssproc = FSSPROC(t); fssparms_t *fssparmsp = (fssparms_t *)parmsp; fssparmsp->fss_uprilim = fssproc->fss_uprilim; fssparmsp->fss_upri = fssproc->fss_upri; } /*ARGSUSED*/ static int fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp) { char nice; pri_t reqfssuprilim; pri_t reqfssupri; fssproc_t *fssproc = FSSPROC(t); fssparms_t *fssparmsp = (fssparms_t *)parmsp; ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); if (fssparmsp->fss_uprilim == FSS_NOCHANGE) reqfssuprilim = fssproc->fss_uprilim; else reqfssuprilim = fssparmsp->fss_uprilim; if (fssparmsp->fss_upri == FSS_NOCHANGE) reqfssupri = fssproc->fss_upri; else reqfssupri = fssparmsp->fss_upri; /* * Make sure the user priority doesn't exceed the upri limit. */ if (reqfssupri > reqfssuprilim) reqfssupri = reqfssuprilim; /* * Basic permissions enforced by generic kernel code for all classes * require that a thread attempting to change the scheduling parameters * of a target thread be privileged or have a real or effective UID * matching that of the target thread. We are not called unless these * basic permission checks have already passed. The fair-sharing class * requires in addition that the calling thread be privileged if it * is attempting to raise the upri limit above its current value. * This may have been checked previously but if our caller passed us * a non-NULL credential pointer we assume it hasn't and we check it * here. */ if ((reqpcredp != NULL) && (reqfssuprilim > fssproc->fss_uprilim) && secpolicy_raisepriority(reqpcredp) != 0) return (EPERM); /* * Set fss_nice to the nice value corresponding to the user priority we * are setting. Note that setting the nice field of the parameter * struct won't affect upri or nice. */ nice = NZERO - (reqfssupri * NZERO) / fss_maxupri; if (nice > FSS_NICE_MAX) nice = FSS_NICE_MAX; thread_lock(t); fssproc->fss_uprilim = reqfssuprilim; fssproc->fss_upri = reqfssupri; fssproc->fss_nice = nice; fss_newpri(fssproc, B_FALSE); if ((fssproc->fss_flags & FSSKPRI) != 0) { thread_unlock(t); return (0); } fss_change_priority(t, fssproc); thread_unlock(t); return (0); } /* * The thread is being stopped. */ /*ARGSUSED*/ static void fss_stop(kthread_t *t, int why, int what) { ASSERT(THREAD_LOCK_HELD(t)); ASSERT(t == curthread); fss_inactive(t); } /* * The current thread is exiting, do necessary adjustments to its project */ static void fss_exit(kthread_t *t) { fsspset_t *fsspset; fssproj_t *fssproj; fssproc_t *fssproc; fsszone_t *fsszone; int free = 0; /* * Thread t here is either a current thread (in which case we hold * its process' p_lock), or a thread being destroyed by forklwp_fail(), * in which case we hold pidlock and thread is no longer on the * thread list. 
*/ ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock) || MUTEX_HELD(&pidlock)); fssproc = FSSPROC(t); fssproj = FSSPROC2FSSPROJ(fssproc); fsspset = FSSPROJ2FSSPSET(fssproj); fsszone = fssproj->fssp_fsszone; mutex_enter(&fsspsets_lock); mutex_enter(&fsspset->fssps_lock); thread_lock(t); disp_lock_enter_high(&fsspset->fssps_displock); if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) { if (--fssproj->fssp_runnable == 0) { fsszone->fssz_shares -= fssproj->fssp_shares; if (--fsszone->fssz_runnable == 0) fsspset->fssps_shares -= fsszone->fssz_rshares; } ASSERT(fssproc->fss_runnable == 1); fssproc->fss_runnable = 0; } if (--fssproj->fssp_threads == 0) { fss_remove_fssproj(fsspset, fssproj); free = 1; } disp_lock_exit_high(&fsspset->fssps_displock); fssproc->fss_proj = NULL; /* mark this thread as already exited */ thread_unlock(t); if (free) { if (fsszone->fssz_nproj == 0) kmem_free(fsszone, sizeof (fsszone_t)); kmem_free(fssproj, sizeof (fssproj_t)); } mutex_exit(&fsspset->fssps_lock); mutex_exit(&fsspsets_lock); /* * A thread could be exiting in between clock ticks, so we need to * calculate how much CPU time it used since it was charged last time. * * CPU caps are not enforced on exiting processes - it is usually * desirable to exit as soon as possible to free resources. */ if (CPUCAPS_ON()) { thread_lock(t); fssproc = FSSPROC(t); (void) cpucaps_charge(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY); thread_unlock(t); } } static void fss_nullsys() { } /* * fss_swapin() returns -1 if the thread is loaded or is not eligible to be * swapped in. Otherwise, it returns the thread's effective priority based * on swapout time and size of process (0 <= epri <= SHRT_MAX). */ /*ARGSUSED*/ static pri_t fss_swapin(kthread_t *t, int flags) { fssproc_t *fssproc = FSSPROC(t); long epri = -1; proc_t *pp = ttoproc(t); ASSERT(THREAD_LOCK_HELD(t)); if (t->t_state == TS_RUN && (t->t_schedflag & TS_LOAD) == 0) { time_t swapout_time; swapout_time = (ddi_get_lbolt() - t->t_stime) / hz; if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI)) { epri = (long)DISP_PRIO(t) + swapout_time; } else { /* * Threads which have been out for a long time, * have high user mode priority and are associated * with a small address space are more deserving. */ epri = fssproc->fss_umdpri; ASSERT(epri >= 0 && epri <= fss_maxumdpri); epri += swapout_time - pp->p_swrss / nz(maxpgio)/2; } /* * Scale epri so that SHRT_MAX / 2 represents zero priority. */ epri += SHRT_MAX / 2; if (epri < 0) epri = 0; else if (epri > SHRT_MAX) epri = SHRT_MAX; } return ((pri_t)epri); } /* * fss_swapout() returns -1 if the thread isn't loaded or is not eligible to * be swapped out. Otherwise, it returns the thread's effective priority * based on whether the swapper is in softswap or hardswap mode.
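 * For illustration only (all numbers assumed): in hardswap mode a runnable
 * thread that has been in memory for swapin_time = 20 seconds and whose
 * address-space term comes to 5 gets epri = 20 - 5 - 59 = -44, which the
 * SHRT_MAX / 2 offset below maps to 16339 before clamping to the
 * 0..SHRT_MAX range.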
*/ static pri_t fss_swapout(kthread_t *t, int flags) { fssproc_t *fssproc = FSSPROC(t); long epri = -1; proc_t *pp = ttoproc(t); time_t swapin_time; ASSERT(THREAD_LOCK_HELD(t)); if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI) || (t->t_proc_flag & TP_LWPEXIT) || (t->t_state & (TS_ZOMB|TS_FREE|TS_STOPPED|TS_ONPROC|TS_WAIT)) || !(t->t_schedflag & TS_LOAD) || !(SWAP_OK(t))) return (-1); ASSERT(t->t_state & (TS_SLEEP | TS_RUN)); swapin_time = (ddi_get_lbolt() - t->t_stime) / hz; if (flags == SOFTSWAP) { if (t->t_state == TS_SLEEP && swapin_time > maxslp) { epri = 0; } else { return ((pri_t)epri); } } else { pri_t pri; if ((t->t_state == TS_SLEEP && swapin_time > fss_minslp) || (t->t_state == TS_RUN && swapin_time > fss_minrun)) { pri = fss_maxumdpri; epri = swapin_time - (rm_asrss(pp->p_as) / nz(maxpgio)/2) - (long)pri; } else { return ((pri_t)epri); } } /* * Scale epri so that SHRT_MAX / 2 represents zero priority. */ epri += SHRT_MAX / 2; if (epri < 0) epri = 0; else if (epri > SHRT_MAX) epri = SHRT_MAX; return ((pri_t)epri); } /* * If thread is currently at a kernel mode priority (has slept) and is * returning to the userland we assign it the appropriate user mode priority * and time quantum here. If we're lowering the thread's priority below that * of other runnable threads then we will set runrun via cpu_surrender() to * cause preemption. */ static void fss_trapret(kthread_t *t) { fssproc_t *fssproc = FSSPROC(t); cpu_t *cp = CPU; ASSERT(THREAD_LOCK_HELD(t)); ASSERT(t == curthread); ASSERT(cp->cpu_dispthread == t); ASSERT(t->t_state == TS_ONPROC); t->t_kpri_req = 0; if (fssproc->fss_flags & FSSKPRI) { /* * If thread has blocked in the kernel */ THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); cp->cpu_dispatch_pri = DISP_PRIO(t); ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); fssproc->fss_flags &= ~FSSKPRI; if (DISP_MUST_SURRENDER(t)) cpu_surrender(t); } /* * Swapout lwp if the swapper is waiting for this thread to reach * a safe point. */ if (t->t_schedflag & TS_SWAPENQ) { thread_unlock(t); swapout_lwp(ttolwp(t)); thread_lock(t); } } /* * Arrange for thread to be placed in appropriate location on dispatcher queue. * This is called with the current thread in TS_ONPROC and locked. */ static void fss_preempt(kthread_t *t) { fssproc_t *fssproc = FSSPROC(t); klwp_t *lwp; uint_t flags; ASSERT(t == curthread); ASSERT(THREAD_LOCK_HELD(curthread)); ASSERT(t->t_state == TS_ONPROC); /* * If preempted in the kernel, make sure the thread has a kernel * priority if needed. */ lwp = curthread->t_lwp; if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) { fssproc->fss_flags |= FSSKPRI; THREAD_CHANGE_PRI(t, minclsyspri); ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); t->t_trapret = 1; /* so that fss_trapret will run */ aston(t); } /* * This thread may be placed on wait queue by CPU Caps. In this case we * do not need to do anything until it is removed from the wait queue. * Do not enforce CPU caps on threads running at a kernel priority */ if (CPUCAPS_ON()) { (void) cpucaps_charge(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE); if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t)) return; } /* * If preempted in user-land mark the thread as swappable because it * cannot be holding any kernel locks. */ ASSERT(t->t_schedflag & TS_DONT_SWAP); if (lwp != NULL && lwp->lwp_state == LWP_USER) t->t_schedflag &= ~TS_DONT_SWAP; /* * Check to see if we're doing "preemption control" here. 
	 * If we are, and if the user has requested that this thread not
	 * be preempted, and if preemptions haven't been put off for
	 * too long, let the preemption happen here but try to make
	 * sure the thread is rescheduled as soon as possible.  We do
	 * this by putting it on the front of the highest priority run
	 * queue in the FSS class.  If the preemption has been put off
	 * for too long, clear the "nopreempt" bit and let the thread
	 * be preempted.
	 */
	if (t->t_schedctl && schedctl_get_nopreempt(t)) {
		if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
			DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
			if (!(fssproc->fss_flags & FSSKPRI)) {
				/*
				 * If not already remembered, remember current
				 * priority for restoration in fss_yield().
				 */
				if (!(fssproc->fss_flags & FSSRESTORE)) {
					fssproc->fss_scpri = t->t_pri;
					fssproc->fss_flags |= FSSRESTORE;
				}
				THREAD_CHANGE_PRI(t, fss_maxumdpri);
				t->t_schedflag |= TS_DONT_SWAP;
			}
			schedctl_set_yield(t, 1);
			setfrontdq(t);
			return;
		} else {
			if (fssproc->fss_flags & FSSRESTORE) {
				THREAD_CHANGE_PRI(t, fssproc->fss_scpri);
				fssproc->fss_flags &= ~FSSRESTORE;
			}
			schedctl_set_nopreempt(t, 0);
			DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
			/*
			 * Fall through and be preempted below.
			 */
		}
	}

	flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI);

	if (flags == FSSBACKQ) {
		fssproc->fss_timeleft = fss_quantum;
		fssproc->fss_flags &= ~FSSBACKQ;
		setbackdq(t);
	} else if (flags == (FSSBACKQ | FSSKPRI)) {
		fssproc->fss_flags &= ~FSSBACKQ;
		setbackdq(t);
	} else {
		setfrontdq(t);
	}
}

/*
 * Called when a thread is waking up and is to be placed on the run queue.
 */
static void
fss_setrun(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);

	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */

	if (t->t_state == TS_SLEEP || t->t_state == TS_STOPPED)
		fss_active(t);

	fssproc->fss_timeleft = fss_quantum;

	fssproc->fss_flags &= ~FSSBACKQ;
	/*
	 * If the thread was previously running at the kernel priority then
	 * keep that priority and the fss_timeleft doesn't matter.
	 */
	if ((fssproc->fss_flags & FSSKPRI) == 0)
		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);

	if (t->t_disp_time != ddi_get_lbolt())
		setbackdq(t);
	else
		setfrontdq(t);
}

/*
 * Prepare thread for sleep.  We reset the thread priority so it will run at
 * the kernel priority level when it wakes up.
 */
static void
fss_sleep(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);

	ASSERT(t == curthread);
	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(t->t_state == TS_ONPROC);

	/*
	 * Account for time spent on CPU before going to sleep.
	 */
	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);

	fss_inactive(t);

	/*
	 * Assign a system priority to the thread and arrange for it to be
	 * retained when the thread is next placed on the run queue (i.e.,
	 * when it wakes up) instead of being given a new pri.  Also arrange
	 * for trapret processing as the thread leaves the system call so it
	 * will drop back to normal priority range.
	 */
	if (t->t_kpri_req) {
		THREAD_CHANGE_PRI(t, minclsyspri);
		fssproc->fss_flags |= FSSKPRI;
		t->t_trapret = 1;	/* so that fss_trapret will run */
		aston(t);
	} else if (fssproc->fss_flags & FSSKPRI) {
		/*
		 * The thread has done a THREAD_KPRI_REQUEST(), slept, then
		 * done THREAD_KPRI_RELEASE() (so t_kpri_req is 0 again),
		 * then slept again all without finishing the current system
		 * call so trapret won't have cleared FSSKPRI.
		 */
		fssproc->fss_flags &= ~FSSKPRI;
		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
		if (DISP_MUST_SURRENDER(curthread))
			cpu_surrender(t);
	}
	t->t_stime = ddi_get_lbolt();	/* time stamp for the swapper */
}

/*
 * A tick interrupt has occurred on a running thread.
Check to see if our * time slice has expired. We must also clear the TS_DONT_SWAP flag in * t_schedflag if the thread is eligible to be swapped out. */ static void fss_tick(kthread_t *t) { fssproc_t *fssproc; fssproj_t *fssproj; klwp_t *lwp; boolean_t call_cpu_surrender = B_FALSE; boolean_t cpucaps_enforce = B_FALSE; ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); /* * It's safe to access fsspset and fssproj structures because we're * holding our p_lock here. */ thread_lock(t); fssproc = FSSPROC(t); fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj != NULL) { fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj); disp_lock_enter_high(&fsspset->fssps_displock); fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice]; fssproj->fssp_tick_cnt++; fssproc->fss_ticks++; disp_lock_exit_high(&fsspset->fssps_displock); } /* * Keep track of thread's project CPU usage. Note that projects * get charged even when threads are running in the kernel. * Do not surrender CPU if running in the SYS class. */ if (CPUCAPS_ON()) { cpucaps_enforce = cpucaps_charge(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) && !(fssproc->fss_flags & FSSKPRI); } /* * A thread's execution time for threads running in the SYS class * is not tracked. */ if ((fssproc->fss_flags & FSSKPRI) == 0) { /* * If thread is not in kernel mode, decrement its fss_timeleft */ if (--fssproc->fss_timeleft <= 0) { pri_t new_pri; /* * If we're doing preemption control and trying to * avoid preempting this thread, just note that the * thread should yield soon and let it keep running * (unless it's been a while). */ if (t->t_schedctl && schedctl_get_nopreempt(t)) { if (fssproc->fss_timeleft > -SC_MAX_TICKS) { DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t); schedctl_set_yield(t, 1); thread_unlock_nopreempt(t); return; } } fssproc->fss_flags &= ~FSSRESTORE; fss_newpri(fssproc, B_TRUE); new_pri = fssproc->fss_umdpri; ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); /* * When the priority of a thread is changed, it may * be necessary to adjust its position on a sleep queue * or dispatch queue. The function thread_change_pri * accomplishes this. */ if (thread_change_pri(t, new_pri, 0)) { if ((t->t_schedflag & TS_LOAD) && (lwp = t->t_lwp) && lwp->lwp_state == LWP_USER) t->t_schedflag &= ~TS_DONT_SWAP; fssproc->fss_timeleft = fss_quantum; } else { call_cpu_surrender = B_TRUE; } } else if (t->t_state == TS_ONPROC && t->t_pri < t->t_disp_queue->disp_maxrunpri) { /* * If there is a higher-priority thread which is * waiting for a processor, then thread surrenders * the processor. */ call_cpu_surrender = B_TRUE; } } if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) { /* * The thread used more than half of its quantum, so assume that * it used the whole quantum. * * Update thread's priority just before putting it on the wait * queue so that it gets charged for the CPU time from its * quantum even before that quantum expires. */ fss_newpri(fssproc, B_FALSE); if (t->t_pri != fssproc->fss_umdpri) fss_change_priority(t, fssproc); /* * We need to call cpu_surrender for this thread due to cpucaps * enforcement, but fss_change_priority may have already done * so. In this case FSSBACKQ is set and there is no need to call * cpu-surrender again. */ if (!(fssproc->fss_flags & FSSBACKQ)) call_cpu_surrender = B_TRUE; } if (call_cpu_surrender) { fssproc->fss_flags |= FSSBACKQ; cpu_surrender(t); } thread_unlock_nopreempt(t); /* clock thread can't be preempted */ } /* * Processes waking up go to the back of their queue. 
We don't need to assign * a time quantum here because thread is still at a kernel mode priority and * the time slicing is not done for threads running in the kernel after * sleeping. The proper time quantum will be assigned by fss_trapret before the * thread returns to user mode. */ static void fss_wakeup(kthread_t *t) { fssproc_t *fssproc; ASSERT(THREAD_LOCK_HELD(t)); ASSERT(t->t_state == TS_SLEEP); fss_active(t); t->t_stime = ddi_get_lbolt(); /* time stamp for the swapper */ fssproc = FSSPROC(t); fssproc->fss_flags &= ~FSSBACKQ; if (fssproc->fss_flags & FSSKPRI) { /* * If we already have a kernel priority assigned, then we * just use it. */ setbackdq(t); } else if (t->t_kpri_req) { /* * Give thread a priority boost if we were asked. */ fssproc->fss_flags |= FSSKPRI; THREAD_CHANGE_PRI(t, minclsyspri); setbackdq(t); t->t_trapret = 1; /* so that fss_trapret will run */ aston(t); } else { /* * Otherwise, we recalculate the priority. */ if (t->t_disp_time == ddi_get_lbolt()) { setfrontdq(t); } else { fssproc->fss_timeleft = fss_quantum; THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); setbackdq(t); } } } /* * fss_donice() is called when a nice(1) command is issued on the thread to * alter the priority. The nice(1) command exists in Solaris for compatibility. * Thread priority adjustments should be done via priocntl(1). */ static int fss_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp) { int newnice; fssproc_t *fssproc = FSSPROC(t); fssparms_t fssparms; /* * If there is no change to priority, just return current setting. */ if (incr == 0) { if (retvalp) *retvalp = fssproc->fss_nice - NZERO; return (0); } if ((incr < 0 || incr > 2 * NZERO) && secpolicy_raisepriority(cr) != 0) return (EPERM); /* * Specifying a nice increment greater than the upper limit of * FSS_NICE_MAX (== 2 * NZERO - 1) will result in the thread's nice * value being set to the upper limit. We check for this before * computing the new value because otherwise we could get overflow * if a privileged user specified some ridiculous increment. */ if (incr > FSS_NICE_MAX) incr = FSS_NICE_MAX; newnice = fssproc->fss_nice + incr; if (newnice > FSS_NICE_MAX) newnice = FSS_NICE_MAX; else if (newnice < FSS_NICE_MIN) newnice = FSS_NICE_MIN; fssparms.fss_uprilim = fssparms.fss_upri = -((newnice - NZERO) * fss_maxupri) / NZERO; /* * Reset the uprilim and upri values of the thread. */ (void) fss_parmsset(t, (void *)&fssparms, (id_t)0, (cred_t *)NULL); /* * Although fss_parmsset already reset fss_nice it may not have been * set to precisely the value calculated above because fss_parmsset * determines the nice value from the user priority and we may have * truncated during the integer conversion from nice value to user * priority and back. We reset fss_nice to the value we calculated * above. */ fssproc->fss_nice = (char)newnice; if (retvalp) *retvalp = newnice - NZERO; return (0); } /* * Increment the priority of the specified thread by incr and * return the new value in *retvalp. */ static int fss_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp) { int newpri; fssproc_t *fssproc = FSSPROC(t); fssparms_t fssparms; /* * If there is no change to priority, just return current setting. */ if (incr == 0) { *retvalp = fssproc->fss_upri; return (0); } newpri = fssproc->fss_upri + incr; if (newpri > fss_maxupri || newpri < -fss_maxupri) return (EINVAL); *retvalp = newpri; fssparms.fss_uprilim = fssparms.fss_upri = newpri; /* * Reset the uprilim and upri values of the thread. 
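	 * Unlike fss_donice() above, which passes a NULL credential, the
	 * caller's credentials are passed through here, so fss_parmsset()
	 * will apply its secpolicy_raisepriority() check before allowing the
	 * upri limit to be raised, and will clamp the requested upri to that
	 * limit.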
	 */
	return (fss_parmsset(t, &fssparms, (id_t)0, cr));
}

/*
 * Return the global scheduling priority that would be assigned to a thread
 * entering the fair-sharing class with the fss_upri.
 */
/*ARGSUSED*/
static pri_t
fss_globpri(kthread_t *t)
{
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	return (fss_maxumdpri / 2);
}

/*
 * Called from the yield(2) system call when a thread is yielding
 * (surrendering) the processor.  The kernel thread is placed at the back of
 * a dispatch queue.
 */
static void
fss_yield(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);

	ASSERT(t == curthread);
	ASSERT(THREAD_LOCK_HELD(t));

	/*
	 * Collect CPU usage spent before yielding.
	 */
	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);

	/*
	 * Clear the preemption control "yield" bit since the user is
	 * doing a yield.
	 */
	if (t->t_schedctl)
		schedctl_set_yield(t, 0);

	/*
	 * If fss_preempt() artificially increased the thread's priority
	 * to avoid preemption, restore the original priority now.
	 */
	if (fssproc->fss_flags & FSSRESTORE) {
		THREAD_CHANGE_PRI(t, fssproc->fss_scpri);
		fssproc->fss_flags &= ~FSSRESTORE;
	}

	if (fssproc->fss_timeleft < 0) {
		/*
		 * Time slice was artificially extended to avoid preemption,
		 * so pretend we're preempting it now.
		 */
		DTRACE_SCHED1(schedctl__yield, int, -fssproc->fss_timeleft);
		fssproc->fss_timeleft = fss_quantum;
	}

	fssproc->fss_flags &= ~FSSBACKQ;
	setbackdq(t);
}

void
fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf,
    fssbuf_t *zonebuf)
{
	kproject_t *kpj_new = kp;
	zone_t *zone = zp;
	fssproj_t *fssproj_old, *fssproj_new;
	fsspset_t *fsspset;
	kproject_t *kpj_old;
	fssproc_t *fssproc;
	fsszone_t *fsszone_old, *fsszone_new;
	int free = 0;
	int id;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	if (t->t_cid != fss_cid)
		return;

	fssproc = FSSPROC(t);
	mutex_enter(&fsspsets_lock);
	fssproj_old = FSSPROC2FSSPROJ(fssproc);
	if (fssproj_old == NULL) {
		mutex_exit(&fsspsets_lock);
		return;
	}

	fsspset = FSSPROJ2FSSPSET(fssproj_old);
	mutex_enter(&fsspset->fssps_lock);
	kpj_old = FSSPROJ2KPROJ(fssproj_old);
	fsszone_old = fssproj_old->fssp_fsszone;

	ASSERT(t->t_cpupart == fsspset->fssps_cpupart);

	if (kpj_old == kpj_new) {
		mutex_exit(&fsspset->fssps_lock);
		mutex_exit(&fsspsets_lock);
		return;
	}

	if ((fsszone_new = fss_find_fsszone(fsspset, zone)) == NULL) {
		/*
		 * If the zone for the new project is not currently active on
		 * the cpu partition we're on, get one of the pre-allocated
		 * buffers and link it in our per-pset zone list.  Such
		 * buffers should already exist.
		 */
		for (id = 0; id < zonebuf->fssb_size; id++) {
			if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) {
				fss_insert_fsszone(fsspset, zone, fsszone_new);
				zonebuf->fssb_list[id] = NULL;
				break;
			}
		}
	}
	ASSERT(fsszone_new != NULL);
	if ((fssproj_new = fss_find_fssproj(fsspset, kpj_new)) == NULL) {
		/*
		 * If our new project is not currently running
		 * on the cpu partition we're on, get one of the
		 * pre-allocated buffers and link it in our new cpu
		 * partition doubly linked list.  Such buffers should already
		 * exist.
*/ for (id = 0; id < projbuf->fssb_size; id++) { if ((fssproj_new = projbuf->fssb_list[id]) != NULL) { fss_insert_fssproj(fsspset, kpj_new, fsszone_new, fssproj_new); projbuf->fssb_list[id] = NULL; break; } } } ASSERT(fssproj_new != NULL); thread_lock(t); if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || t->t_state == TS_WAIT) fss_inactive(t); ASSERT(fssproj_old->fssp_threads > 0); if (--fssproj_old->fssp_threads == 0) { fss_remove_fssproj(fsspset, fssproj_old); free = 1; } fssproc->fss_proj = fssproj_new; fssproc->fss_fsspri = 0; fssproj_new->fssp_threads++; if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || t->t_state == TS_WAIT) fss_active(t); thread_unlock(t); if (free) { if (fsszone_old->fssz_nproj == 0) kmem_free(fsszone_old, sizeof (fsszone_t)); kmem_free(fssproj_old, sizeof (fssproj_t)); } mutex_exit(&fsspset->fssps_lock); mutex_exit(&fsspsets_lock); } void fss_changepset(kthread_t *t, void *newcp, fssbuf_t *projbuf, fssbuf_t *zonebuf) { fsspset_t *fsspset_old, *fsspset_new; fssproj_t *fssproj_old, *fssproj_new; fsszone_t *fsszone_old, *fsszone_new; fssproc_t *fssproc; kproject_t *kpj; zone_t *zone; int id; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&pidlock)); ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); if (t->t_cid != fss_cid) return; fssproc = FSSPROC(t); zone = ttoproc(t)->p_zone; mutex_enter(&fsspsets_lock); fssproj_old = FSSPROC2FSSPROJ(fssproc); if (fssproj_old == NULL) { mutex_exit(&fsspsets_lock); return; } fsszone_old = fssproj_old->fssp_fsszone; fsspset_old = FSSPROJ2FSSPSET(fssproj_old); kpj = FSSPROJ2KPROJ(fssproj_old); if (fsspset_old->fssps_cpupart == newcp) { mutex_exit(&fsspsets_lock); return; } ASSERT(ttoproj(t) == kpj); fsspset_new = fss_find_fsspset(newcp); mutex_enter(&fsspset_new->fssps_lock); if ((fsszone_new = fss_find_fsszone(fsspset_new, zone)) == NULL) { for (id = 0; id < zonebuf->fssb_size; id++) { if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) { fss_insert_fsszone(fsspset_new, zone, fsszone_new); zonebuf->fssb_list[id] = NULL; break; } } } ASSERT(fsszone_new != NULL); if ((fssproj_new = fss_find_fssproj(fsspset_new, kpj)) == NULL) { for (id = 0; id < projbuf->fssb_size; id++) { if ((fssproj_new = projbuf->fssb_list[id]) != NULL) { fss_insert_fssproj(fsspset_new, kpj, fsszone_new, fssproj_new); projbuf->fssb_list[id] = NULL; break; } } } ASSERT(fssproj_new != NULL); fssproj_new->fssp_threads++; thread_lock(t); if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || t->t_state == TS_WAIT) fss_inactive(t); fssproc->fss_proj = fssproj_new; fssproc->fss_fsspri = 0; if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || t->t_state == TS_WAIT) fss_active(t); thread_unlock(t); mutex_exit(&fsspset_new->fssps_lock); mutex_enter(&fsspset_old->fssps_lock); if (--fssproj_old->fssp_threads == 0) { fss_remove_fssproj(fsspset_old, fssproj_old); if (fsszone_old->fssz_nproj == 0) kmem_free(fsszone_old, sizeof (fsszone_t)); kmem_free(fssproj_old, sizeof (fssproj_t)); } mutex_exit(&fsspset_old->fssps_lock); mutex_exit(&fsspsets_lock); }